In [104]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings 

# PREPROCESSING MODULES
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA

# CLASSIFIER MODULES
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# METRIC MODULES
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, confusion_matrix, make_scorer

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [119]:
df = pd.read_csv('data/combined_cleaned.csv')

# CONVERT STRING INTO LISTS
df['genres'] = df['genres'].str.split(', ')
df['tags'].fillna('',inplace=True)
df['tags'] = df['tags'].str.split(', ')
df['platforms'] = df['platforms'].str.split(', ')
df['categories'] = df['categories'].str.split(', ')
df['supported_languages'] = df['supported_languages'].str.split(', ')
df['supported_languages_audio'].fillna('',inplace=True)
df['supported_languages_audio'] = df['supported_languages_audio'].str.split(', ')
df['publishers'].fillna('',inplace=True)
df['publishers'] = df['publishers'].str.split(', ')

# TARGET VARIABLE
df['target'] = (df['hours_over_age'] >= 1).astype(int)
print(df.shape)

X = df[['publishers','tags','genres','categories','platforms',
        'historic_developer_average_recommendations','historic_developer_average_owners','historic_developer_average_hours',
        'historic_publisher_average_recommendations','historic_publisher_average_owners','historic_publisher_average_hours'
        ]]

y = df['target']

sss = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.2)
train_index, test_index = next(sss.split(np.zeros(len(df['target'])), df['target']))
X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]


# X_train, X_test, y_train, y_test = train_test_split(, df['target'], test_size=0.2,random_state=42)





(41647, 41)


In [120]:
class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """Wraps `MultiLabelBinarizer` in a form that can work with `ColumnTransformer`. Note
    that input X has to be a `pandas.DataFrame`.
    """
    def __init__(self):
        self.mlbs = list()
        self.n_columns = 0
        self.categories_ = self.classes_ = list()

    def fit(self, X:pd.DataFrame, y=None):
        for i in range(X.shape[1]): # X can be of multiple columns
            mlb = MultiLabelBinarizer()
            mlb.fit(X.iloc[:,i])
            self.mlbs.append(mlb)
            self.classes_.append(mlb.classes_)
            self.n_columns += 1
        return self

    def transform(self, X:pd.DataFrame):
        if self.n_columns == 0:
            raise ValueError('Please fit the transformer first.')
        if self.n_columns != X.shape[1]:
            raise ValueError(f'The fit transformer deals with {self.n_columns} columns '
                             f'while the input has {X.shape[1]}.'
                            )
        result = list()
        for i in range(self.n_columns):
            result.append(self.mlbs[i].transform(X.iloc[:,i]))

        result = np.concatenate(result, axis=1)
        return result

In [122]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value=list(['missing']))),
    ('multihot', MultiHotEncoder())
])

tag_transformer = Pipeline(steps=[
    ('multihot', MultiHotEncoder()),
    ('pca', PCA(250))
])

numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
print('numeric_features',numeric_features)
categorical_features = X_train.select_dtypes(include=['object']).columns
print('categorical_features',categorical_features)
try:
    tag_features = X_train[['tags']].columns
except:
    tag_features = []
print('tags',tag_features)


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('tag', tag_transformer, tag_features)
    ])

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(1000,max_features=100,class_weight={1: 10, 0: 1}))])
""
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', xgb.XGBClassifier(n_estimators=200,
                                                       use_label_encoder=False,
                                                       max_depth=6,
                                                       objective='binary:logistic', #new objective
                                                       learning_rate=.05, 
                                                       subsample=.8,
                                                       min_child_weight=3,
                                                       colsample_bytree=.8,
#                                                        scale_pos_weight=40
                                                      )
                      )
                    ])



numeric_features Index(['historic_developer_average_recommendations',
       'historic_developer_average_owners', 'historic_developer_average_hours',
       'historic_publisher_average_recommendations',
       'historic_publisher_average_owners',
       'historic_publisher_average_hours'],
      dtype='object')
categorical_features Index(['publishers', 'tags', 'genres', 'categories', 'platforms'], dtype='object')
tags Index(['tags'], dtype='object')


In [46]:
preprocessor.fit_transform(X_train).shape

(33317, 501)

In [9]:
xgb_pipeline.fit(X_train,y_train)
y_predict = xgb_pipeline.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)







precision:  0.6
recall:  0.28085106382978725
f1 : 0.3826086956521739
[[8054   44]
 [ 169   66]]
Index(['supported_languages', 'tags', 'genres', 'categories', 'platforms',
       'historic_developer_average_recommendations',
       'historic_developer_average_owners', 'historic_developer_average_hours',
       'historic_publisher_average_recommendations',
       'historic_publisher_average_owners',
       'historic_publisher_average_hours'],
      dtype='object')


In [53]:
xgb_pipeline.fit(X_train,y_train,classifier__eval_metric='auc')
y_predict = xgb_pipeline.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)



precision:  0.5137614678899083
recall:  0.26046511627906976
f1 : 0.34567901234567905
[[8062   53]
 [ 159   56]]
Index(['supported_languages', 'tags', 'genres', 'categories', 'platforms',
       'historic_developer_average_recommendations',
       'historic_developer_average_owners',
       'historic_developer_average_hours'],
      dtype='object')


In [57]:
xgb_pipeline.fit(X_train,y_train,classifier__eval_metric='auc')
y_predict = xgb_pipeline.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)



precision:  0.550561797752809
recall:  0.22790697674418606
f1 : 0.3223684210526316
[[8075   40]
 [ 166   49]]
Index(['tags', 'genres', 'categories', 'platforms',
       'historic_developer_average_recommendations',
       'historic_developer_average_owners',
       'historic_developer_average_hours'],
      dtype='object')


In [60]:
xgb_pipeline.fit(X_train,y_train,classifier__eval_metric='auc')
y_predict = xgb_pipeline.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)



precision:  0.5
recall:  0.22325581395348837
f1 : 0.3086816720257235
[[8067   48]
 [ 167   48]]
Index(['languages_count', 'languages_audio_count', 'tags', 'genres',
       'categories', 'platforms', 'historic_developer_average_recommendations',
       'historic_developer_average_owners',
       'historic_developer_average_hours'],
      dtype='object')


In [66]:
# With dimensionality reduction
xgb_pipeline.fit(X_train,y_train,classifier__eval_metric='auc')
y_predict = xgb_pipeline.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)



precision:  0.6415094339622641
recall:  0.15813953488372093
f1 : 0.2537313432835821
[[8096   19]
 [ 181   34]]
Index(['languages_count', 'languages_audio_count', 'tags', 'genres',
       'categories', 'platforms', 'historic_developer_average_recommendations',
       'historic_developer_average_owners',
       'historic_developer_average_hours'],
      dtype='object')


In [63]:
# With class weights
xgb_pipeline.fit(X_train,y_train,classifier__eval_metric='auc')
y_predict = xgb_pipeline.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)



precision:  0.3417085427135678
recall:  0.31627906976744186
f1 : 0.3285024154589372
[[7984  131]
 [ 147   68]]
Index(['languages_count', 'languages_audio_count', 'tags', 'genres',
       'categories', 'platforms', 'historic_developer_average_recommendations',
       'historic_developer_average_owners',
       'historic_developer_average_hours'],
      dtype='object')


In [123]:
# With publishers
xgb_pipeline.fit(X_train,y_train,classifier__eval_metric='auc')
y_predict = xgb_pipeline.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)

KeyboardInterrupt: 

In [None]:
xgb_pipeline.fit(X_train,y_train,classifier__eval_metric='auc')
y_predict = xgb_pipeline.predict(X_test)
print('precision: ', precision_score(y_test,y_predict))
print('recall: ', recall_score(y_test,y_predict))
print('f1 :', f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
print(X_train.columns)