In [355]:
import pandas as pd
import numpy as np
import seaborn as sns

# PREPROCESSING MODULES
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA

# CLASSIFIER MODULES
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# METRIC MODULES
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, confusion_matrix



ModuleNotFoundError: No module named 'xgboost'

In [353]:
df = pd.read_csv('data/combined_cleaned.csv')
df['genres'] = df['genres'].str.split(', ')
df['tags'] = df['tags'].str.split(', ')
df['platforms'] = df['platforms'].str.split(', ')
df['categories'] = df['categories'].str.split(', ')
df['supported_languages'] = df['supported_languages'].str.split(', ')



# df['target'] = (df['average_2weeks'] > 500).astype(int)

df['target'] = (df['hours_over_age'] >= 1).astype(int)


X_train, X_test, y_train, y_test = train_test_split(df[['supported_languages','tags','genres','categories','platforms',
                                                        'historic_developer_average_recommendations','historic_developer_average_owners','historic_developer_average_hours',
                                                        'historic_publisher_average_recommendations','historic_publisher_average_owners','historic_publisher_average_hours'
                                                       ]], df['target'], test_size=0.2,random_state=42)
X_train

Unnamed: 0,supported_languages,tags,genres,categories,platforms,historic_developer_average_recommendations,historic_developer_average_owners,historic_developer_average_hours,historic_publisher_average_recommendations,historic_publisher_average_owners,historic_publisher_average_hours
17474,"[English, Russian, French, German, Spanish - S...","[Action, Indie, Adventure, Singleplayer, Dark ...","[Action, Adventure, Indie]","[Single-player, Steam Achievements, Full contr...",[windows],0.0,20000.0,0.0,0.0000,20000.0,0.000
41124,"[English, German, Italian, Japanese, Korean, P...","[Local Co-Op, Physics, Cute, Puzzle, Family Fr...",[Indie],"[Single-player, Multi-player, Co-op, Shared/Sp...",[windows],0.0,0.0,0.0,0.0000,0.0,0.000
11605,[English],"[Football, Simulation, Sports, Strategy, Manag...","[Simulation, Sports]","[Single-player, Multi-player, PvP, Shared/Spli...","[windows, mac, linux]",0.0,80000.0,258.0,0.0000,72500.0,196.000
29502,[English],"[Indie, Casual, Adventure]","[Adventure, Casual, Indie]",[Single-player],[windows],0.0,0.0,0.0,0.0000,0.0,0.000
12311,[Russian],"[Casual, Indie, Memes, Psychological Horror, C...","[Casual, Indie]","[Single-player, Steam Trading Cards]",[windows],0.0,0.0,0.0,0.0000,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...
6265,"[English, French, German, Spanish - Spain, Rus...","[Sports, Indie, Bikes, BMX, Singleplayer]","[Indie, Sports]","[Single-player, Steam Achievements, Full contr...",[windows],0.0,0.0,0.0,160.2500,162500.0,81.375
11284,"[English, Japanese]","[Action, Indie, Casual, Classic]","[Action, Casual, Indie]","[Single-player, Steam Achievements, Full contr...",[windows],0.0,35000.0,326.0,0.0000,0.0,0.000
38158,[English],"[Horror, Pixel Graphics, Hand-drawn, Dog, Top-...",[Indie],"[Single-player, Steam Achievements]",[windows],0.0,0.0,0.0,0.0000,0.0,0.000
860,"[English, Russian, German]","[Strategy, Simulation, World War II, Tanks, RTS]","[Simulation, Strategy]","[Single-player, Multi-player, Steam Cloud, Inc...",[windows],1127.5,500000.0,422.0,410.1875,200000.0,255.500


In [300]:
class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """Wraps `MultiLabelBinarizer` in a form that can work with `ColumnTransformer`. Note
    that input X has to be a `pandas.DataFrame`.
    """
    def __init__(self):
        self.mlbs = list()
        self.n_columns = 0
        self.categories_ = self.classes_ = list()

    def fit(self, X:pd.DataFrame, y=None):
        for i in range(X.shape[1]): # X can be of multiple columns
            mlb = MultiLabelBinarizer()
            mlb.fit(X.iloc[:,i])
            self.mlbs.append(mlb)
            self.classes_.append(mlb.classes_)
            self.n_columns += 1
        return self

    def transform(self, X:pd.DataFrame):
        if self.n_columns == 0:
            raise ValueError('Please fit the transformer first.')
        if self.n_columns != X.shape[1]:
            raise ValueError(f'The fit transformer deals with {self.n_columns} columns '
                             f'while the input has {X.shape[1]}.'
                            )
        result = list()
        for i in range(self.n_columns):
            result.append(self.mlbs[i].transform(X.iloc[:,i]))

        result = np.concatenate(result, axis=1)
        return result

In [357]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('multihot', MultiHotEncoder())
])

tag_transformer = Pipeline(steps=[
    ('multihot', MultiHotEncoder()),
    ('pca', PCA(100))
])

numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
print('numeric_features',numeric_features)
categorical_features = X_train.select_dtypes(include=['object']).columns
print('categorical_features',categorical_features)
# try:
#     tag_features = X_train[['tags']].columns
# except:
#     tag_features = []
# print('tags',tag_features)


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
#         ('tag', tag_transformer, tag_features)
    ])

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(1000,max_features=100,class_weight={1: 10, 0: 1}))])

xg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(1000,class_weight={1: 10, 0: 1}))])



numeric_features Index(['historic_developer_average_recommendations',
       'historic_developer_average_owners', 'historic_developer_average_hours',
       'historic_publisher_average_recommendations',
       'historic_publisher_average_owners',
       'historic_publisher_average_hours'],
      dtype='object')
categorical_features Index(['supported_languages', 'tags', 'genres', 'categories', 'platforms'], dtype='object')


In [307]:
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
np.sum(y_predict)



precision:  0.9375
recall:  0.06382978723404255
f1 : 0.1195219123505976


16

In [334]:
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)

precision:  0.6923076923076923
recall:  0.03435114503816794
f1 : 0.06545454545454546
[[8073    4]
 [ 253    9]]
Index(['tags', 'genres', 'categories', 'platforms',
       'historic_developer_average_owners',
       'historic_developer_average_hours'],
      dtype='object')


In [341]:
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)

precision:  0.6666666666666666
recall:  0.007633587786259542
f1 : 0.015094339622641508
[[8076    1]
 [ 260    2]]
Index(['tags', 'genres', 'categories', 'platforms',
       'historic_developer_average_owners',
       'historic_developer_average_hours'],
      dtype='object')


In [343]:
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)

precision:  0.5
recall:  0.003816793893129771
f1 : 0.007575757575757576
[[8076    1]
 [ 261    1]]
Index(['owners', 'tags', 'genres', 'categories', 'platforms',
       'historic_developer_average_owners',
       'historic_developer_average_hours'],
      dtype='object')


In [358]:
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)



precision:  0.5526315789473685
recall:  0.08936170212765958
f1 : 0.15384615384615385
[[8081   17]
 [ 214   21]]
Index(['supported_languages', 'tags', 'genres', 'categories', 'platforms',
       'historic_developer_average_recommendations',
       'historic_developer_average_owners', 'historic_developer_average_hours',
       'historic_publisher_average_recommendations',
       'historic_publisher_average_owners',
       'historic_publisher_average_hours'],
      dtype='object')
