In [324]:
import pandas as pd
import numpy as np
import seaborn as sns

# PREPROCESSING MODULES
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.impute import SimpleImputer


# CLASSIFIER MODULES
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# METRIC MODULES
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, confusion_matrix



In [326]:
df = pd.read_csv('data/combined_cleaned.csv')
df['genres'] = df['genres'].str.split(', ')
df['tags'] = df['tags'].str.split(', ')
df['platforms'] = df['platforms'].str.split(', ')
df['categories'] = df['categories'].str.split(', ')
df['supported_languages'] = df['supported_languages'].str.split(', ')



# df['target'] = (df['average_2weeks'] > 500).astype(int)

df['target'] = (df['hours_over_age'] >= 1).astype(int)


X_train, X_test, y_train, y_test = train_test_split(df[['tags','genres','categories','platforms','owners']], df['target'], test_size=0.2,random_state=42)
X_train

Unnamed: 0,tags,genres,categories,platforms,owners
12536,"[RPG, Indie, Gore, Violent, Nudity, Sexual Con...","[Indie, RPG]","[Single-player, Steam Achievements, Steam Trad...",[windows],50000
8913,"[Indie, Casual, 4 Player Local, Local Multipla...","[Casual, Indie]","[Multi-player, PvP, Shared/Split Screen PvP, C...","[windows, linux]",20000
1318,"[Bullet Hell, Anime, Shoot 'Em Up, Action, Ind...","[Action, Indie]","[Single-player, Steam Trading Cards]",[windows],100000
22158,"[Simulation, Medical Sim, Education, Realistic...","[Indie, Simulation, Early Access]",[Single-player],[windows],50000
5068,"[Adventure, Indie, Point & Click, Sci-fi, Comedy]","[Adventure, Indie]","[Single-player, Steam Achievements, Steam Trad...",[windows],20000
...,...,...,...,...,...
6265,"[Indie, Casual, Action, Platformer, Puzzle-Pla...","[Action, Casual, Indie]","[Single-player, Steam Trading Cards]","[windows, mac]",50000
11284,"[Indie, Simulation, RPG, Casual, Anime, Capita...","[Casual, Indie, RPG, Simulation]","[Single-player, Steam Achievements]",[windows],20000
38158,"[Adventure, Casual, Platformer, Stealth, 2D Pl...","[Adventure, Casual, Indie]","[Single-player, Steam Achievements]","[windows, linux]",20000
860,"[Strategy, RPG, Space, Sci-fi]","[RPG, Strategy]",[Single-player],[windows],100000


In [300]:
class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """Wraps `MultiLabelBinarizer` in a form that can work with `ColumnTransformer`. Note
    that input X has to be a `pandas.DataFrame`.
    """
    def __init__(self):
        self.mlbs = list()
        self.n_columns = 0
        self.categories_ = self.classes_ = list()

    def fit(self, X:pd.DataFrame, y=None):
        for i in range(X.shape[1]): # X can be of multiple columns
            mlb = MultiLabelBinarizer()
            mlb.fit(X.iloc[:,i])
            self.mlbs.append(mlb)
            self.classes_.append(mlb.classes_)
            self.n_columns += 1
        return self

    def transform(self, X:pd.DataFrame):
        if self.n_columns == 0:
            raise ValueError('Please fit the transformer first.')
        if self.n_columns != X.shape[1]:
            raise ValueError(f'The fit transformer deals with {self.n_columns} columns '
                             f'while the input has {X.shape[1]}.'
                            )
        result = list()
        for i in range(self.n_columns):
            result.append(self.mlbs[i].transform(X.iloc[:,i]))

        result = np.concatenate(result, axis=1)
        return result

In [327]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('multihot', MultiHotEncoder())])

numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
print('numeric_features',numeric_features)
categorical_features = X_train.select_dtypes(include=['object']).columns
print('categorical_features',categorical_features)



preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    
    ])

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(1000,class_weight={1: 10, 0: 1}))])

numeric_features Index(['owners'], dtype='object')
categorical_features Index(['tags', 'genres', 'categories', 'platforms'], dtype='object')


In [307]:
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
np.sum(y_predict)



precision:  0.9375
recall:  0.06382978723404255
f1 : 0.1195219123505976


16

In [328]:
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
confusion_matrix(y_test, y_predict)





precision:  0.8666666666666667
recall:  0.05531914893617021
f1 : 0.10400000000000001


array([[8121,    2],
       [ 222,   13]])

In [322]:
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
np.sum(y_predict)



precision:  0.625
recall:  0.06382978723404255
f1 : 0.11583011583011582


24

In [323]:
mhe MultiHotEncoder()

MultiHotEncoder()