## Load Libraries

In [2]:
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install sklearn

Collecting numpy
  Downloading numpy-1.24.3-cp39-cp39-win_amd64.whl (14.9 MB)
                                              0.0/14.9 MB ? eta -:--:--
                                              0.3/14.9 MB 5.2 MB/s eta 0:00:03
     --                                       0.8/14.9 MB 7.5 MB/s eta 0:00:02
     ------                                   2.4/14.9 MB 13.9 MB/s eta 0:00:01
     --------                                 3.1/14.9 MB 13.1 MB/s eta 0:00:01
     -------------                            5.1/14.9 MB 16.1 MB/s eta 0:00:01
     ---------------                          5.9/14.9 MB 17.1 MB/s eta 0:00:01
     -----------------                        6.7/14.9 MB 17.0 MB/s eta 0:00:01
     --------------------                     7.6/14.9 MB 16.8 MB/s eta 0:00:01
     ---------------------                    8.1/14.9 MB 15.7 MB/s eta 0:00:01
     -------------------------                9.5/14.9 MB 15.6 MB/s eta 0:00:01
     ------------------------------          11.6/14

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec # subplots
import seaborn as sns 

#Import models from scikit learn module:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression, ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn import metrics 

ModuleNotFoundError: No module named 'sklearn'

## Load Data 

In [40]:
# Read the BGG top-100 table: the first row supplies the column names and the
# "Rank" column becomes the index.
df = pd.read_csv(
    "BGG100.csv",
    header=0,
    encoding="unicode_escape",
    index_col="Rank",
)

In [28]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0_level_0,Rating,Name,Subtitle,Year,MinPlayers,MaxPlayers,BestPlayers,MinPlayTime,MaxPlayTime,MinAge,Weight,Type
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,8.511,Gloomhaven,Vanquish monsters with strategic cardplay. Ful...,2017,1,4,3,60,120,14,3.87,"Strategy, Thematic"
2,8.442,Pandemic Legacy: Season 1,Mutating diseases are spreading around the wor...,2015,2,4,4,60,60,13,2.83,"Strategy, Thematic"
3,8.418,Brass: Birmingham,"Build networks, grow industries, and navigate ...",2018,2,4,"3, 4",60,120,14,3.9,Strategy
4,8.273,Terraforming Mars,Compete with rival CEOs to make Mars habitable...,2016,1,5,3,120,120,12,3.24,Strategy
5,8.262,Twilight Imperium: Fourth Edition,"Build an intergalactic empire through trade, r...",2017,3,6,6,240,480,14,4.26,"Strategy, Thematic"


## Data cleaning

In [14]:
# Print the column names, non-null counts and dtypes of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 1 to 100
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Rating       100 non-null    float64
 1   Name         100 non-null    object 
 2   Subtitle     100 non-null    object 
 3   Year         100 non-null    int64  
 4   MinPlayers   100 non-null    int64  
 5   MaxPlayers   100 non-null    int64  
 6   BestPlayers  100 non-null    object 
 7   MinPlayTime  100 non-null    int64  
 8   MaxPlayTime  100 non-null    int64  
 9   MinAge       100 non-null    int64  
 10  Weight       100 non-null    float64
 11  Type         100 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 10.2+ KB


In [45]:
# Separate the target variable from the other parameters.
# Rating (the first column) is the target; every remaining column
# (Name ... Type) is kept as a candidate predictor.
# .copy() makes X an independent frame, so the value-by-value edits done
# during cleaning do not trigger pandas' SettingWithCopyWarning.
X = df.drop(columns="Rating").copy()
y = df["Rating"]

In [55]:
# Turn the textual BestPlayers entries into numbers and select the columns
# that follow Name and Subtitle.

def _parse_best_players(value):
    """Convert one raw BestPlayers entry into an int or a list of ints.

    'None'  -> 0
    '3'     -> 3
    '3, 4'  -> [3, 4]   (every listed player count is kept)

    Values that are not strings are returned unchanged, which makes the
    cell safe to run more than once.
    """
    if not isinstance(value, str):
        return value
    if value == 'None':
        return 0
    # int() tolerates the blank that follows each comma.
    counts = [int(token) for token in value.strip().split(',')]
    return counts[0] if len(counts) == 1 else counts


# Build the converted column in one pass and bind X to a new frame instead of
# writing into it cell by cell.
X = X.assign(BestPlayers=X['BestPlayers'].map(_parse_best_players))

# Positions 2..10 are Year through Type, i.e. everything except Name and
# Subtitle.
# NOTE(review): this slice still contains the text column Type (position 10)
# and the mixed int/list BestPlayers column - confirm that is intended for a
# frame called X_numbers.
X_numbers = X[X.columns[2:11]]

In [81]:
# Collect every distinct game type that appears in the comma-separated
# Type column.
uniqueWords = set()
for raw_types in X['Type']:
    # Replace non-breaking spaces (\xa0) with ordinary spaces before splitting.
    cleaned = raw_types.replace("\xa0\xa0", " ").replace("\xa0", " ").strip()
    uniqueWords.update(word.strip() for word in cleaned.split(','))

# A set has no stable order (string hashing varies between kernel sessions),
# so sort to get the same list on every run.
res = sorted(uniqueWords)
print(res)


['Abstract', 'Strategy', 'Family', 'Customizable', 'Party', 'Wargames', 'Thematic']


In [56]:
# Print the column names, non-null counts and dtypes of the predictor frame.
X.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 1 to 100
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         100 non-null    object 
 1   Subtitle     100 non-null    object 
 2   Year         100 non-null    int64  
 3   MinPlayers   100 non-null    int64  
 4   MaxPlayers   100 non-null    int64  
 5   BestPlayers  100 non-null    object 
 6   MinPlayTime  100 non-null    int64  
 7   MaxPlayTime  100 non-null    int64  
 8   MinAge       100 non-null    int64  
 9   Weight       100 non-null    float64
 10  Type         100 non-null    object 
dtypes: float64(1), int64(6), object(4)
memory usage: 13.4+ KB


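I removed columns with zero standard deviation (size_x, size_y, size_z, spacing_x, spacing_y, spacing_z).

**TODO (review):** none of these column names appear in the BGG100 data loaded above, and no column in the summary below has zero standard deviation. This note looks carried over from a different notebook; confirm whether it applies here.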

In [42]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the predictor frame.
X.describe()

Unnamed: 0,Year,MinPlayers,MaxPlayers,MinPlayTime,MaxPlayTime,MinAge,Weight
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,2012.89,1.61,4.42,67.55,131.15,12.64,3.3224
std,14.665217,0.601261,1.084137,38.055615,107.350683,1.540825,0.698233
min,1876.0,1.0,2.0,5.0,20.0,8.0,1.25
25%,2012.0,1.0,4.0,40.0,90.0,12.0,2.875
50%,2015.5,2.0,4.0,60.0,120.0,13.0,3.4
75%,2018.0,2.0,5.0,90.0,150.0,14.0,3.825
max,2020.0,3.0,8.0,240.0,1000.0,17.0,4.65
