In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import seaborn as sns
import statsmodels.api as sm
from mpl_toolkits import mplot3d
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the games feature table and keep only the rows usable for modelling.
df = pd.read_csv('../Datasets/games-features.csv')

# Unparseable release dates are coerced to NaT and those rows are dropped.
df['ReleaseDate'] = pd.to_datetime(df['ReleaseDate'], errors='coerce')
df = df.loc[df['ReleaseDate'].notnull()]
df = df.loc[df['GenreIsNonGame'] == False]
df = df.loc[df['SteamSpyOwners'] > 0]

# Columns that are not used in the analysis below.
df = df.drop(columns=[
    'QueryID', 'ResponseID', 'QueryName', 'PackageCount',
    'AchievementHighlightedCount', 'AchievementCount', 'ControllerSupport', 'PCReqsHaveMin',
    'PCReqsHaveRec', 'LinuxReqsHaveMin', 'LinuxReqsHaveRec', 'MacReqsHaveMin',
    'MacReqsHaveRec', 'SupportEmail', 'SupportURL', 'ExtUserAcctNotice',
    'HeaderImage', 'LegalNotice', 'Website', 'PCMinReqsText',
    'PCRecReqsText', 'LinuxMinReqsText', 'LinuxRecReqsText', 'MacMinReqsText',
    'MacRecReqsText',
])

# Every remaining row has SteamSpyOwners > 0 (filtered above), so the natural
# log is always finite and can be taken in one vectorized call.
df['log_owners'] = np.log(df['SteamSpyOwners'])
df['NumPlatforms'] = (
    df['PlatformWindows'].astype(int)
    + df['PlatformLinux'].astype(int)
    + df['PlatformMac'].astype(int)
)

# Keep Windows titles only. The explicit copy makes `df` an independent frame,
# so columns assigned to it later are not written to a filtered slice.
df = df[df['PlatformWindows'] == True].copy()

print(df.shape)
df.head()

In [None]:
# Cut-off on log_owners (natural log, so 10 is roughly 22,000 owners).
threshold = 10
# A game is labelled popular when its log-owner count reaches the cut-off.
df['Popular'] = df['log_owners'] >= threshold

In [None]:
# The two platform flags used as features, plus the 'Popular' target,
# gathered into a single modelling frame.
lst = ['PlatformLinux', 'PlatformMac', 'Popular']
subset = df.loc[:, lst]

### Train/Test Split: fit each model on a 70/30 hold-out split

In [None]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle so a fresh Restart & Run All reproduces the
# same split (101 matches the seed already used for the estimators).
# The target is passed as a 1-D array: scikit-learn estimators expect y with
# shape (n_samples,), and an (n, 1) column vector triggers DataConversionWarning.
x_train, x_test, y_train, y_test = train_test_split(
    subset.drop(columns=['Popular']),
    subset['Popular'].to_numpy(),
    test_size=0.3,
    random_state=101,
)

In [None]:
# Linear-kernel support vector classifier (C = 0.025, fixed seed) and a
# logistic regression left at scikit-learn's defaults.
svm = SVC(C=0.025, kernel='linear', random_state=101)
lr = LogisticRegression()

In [None]:
# Decision tree limited to depth 10 with at least 15 samples in every leaf.
dtree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=15,
    max_features=None,
    random_state=101,
)

# Parallel lists: model_names[k] is the label for agg_models[k].
model_names = ['lr', 'dtree', 'svm']
agg_models = [lr, dtree, svm]

In [None]:
# Fit every candidate on the training split and record its
# [train score, test score] pair, in the same order as agg_models.
score_lst = []
for model in agg_models:
    # np.ravel hands the estimator a 1-D target; it is a no-op when y_train
    # is already 1-D and avoids the column-vector warning when it is (n, 1).
    model.fit(x_train, np.ravel(y_train))
    # .score() predicts internally, so separate predict() calls whose results
    # were never read are not needed.
    score_lst.append([model.score(x_train, y_train), model.score(x_test, y_test)])

In [None]:
# Build a table with one row per model and one column per split.
scores_by_model = dict(zip(model_names, score_lst))
joined_scores = pd.DataFrame(scores_by_model).transpose()
joined_scores.columns = ['train_score', 'test_score']
joined_scores.head()

In [None]:
# Show the best-scoring model on the training split, then on the test split.
for score_column in ('train_score', 'test_score'):
    display(joined_scores.loc[joined_scores[score_column].idxmax()])

### Cross-Validation: 5-fold scores for every model, including k-NN with k = 1–40

In [None]:
# Fresh 70/30 split for this section.
# NOTE(review): the cross-validation cell below scores on the full `subset`,
# not on this split — confirm whether these variables are still needed.
# random_state makes the split reproducible across kernel restarts, and the
# target is kept 1-D because scikit-learn expects y of shape (n_samples,).
x_train, x_test, y_train, y_test = train_test_split(
    subset.drop(columns=['Popular']),
    subset['Popular'].to_numpy(),
    test_size=0.3,
    random_state=101,
)

In [None]:
# Re-create the SVM and logistic-regression estimators for this section,
# with the same hyper-parameters as before.
svm = SVC(
    kernel='linear',
    C=0.025,
    random_state=101,
)
lr = LogisticRegression()

In [None]:
# Re-create the decision tree (depth <= 10, >= 15 samples per leaf) and reset
# the candidate lists to the three base models.
dtree = DecisionTreeClassifier(
    random_state=101,
    max_depth=10,
    max_features=None,
    min_samples_leaf=15,
)

# model_names and agg_models are kept in matching order.
agg_models = [lr, dtree, svm]
model_names = ['lr', 'dtree', 'svm']

In [None]:
# Add k-nearest-neighbour classifiers for k = 1..40 to the candidate pool.
# The name check keeps this cell idempotent: re-running it on its own does not
# append a second copy of every KNN model to model_names / agg_models.
for i in range(1, 41):
    knn_name = 'knn' + str(i)
    if knn_name in model_names:
        continue
    knn = KNeighborsClassifier(n_neighbors=i)
    model_names.append(knn_name)
    agg_models.append(knn)

In [None]:
# 5-fold cross-validation of every candidate on the full feature/target data.
# The feature frame and target are built once instead of on every iteration.
cv_features = subset.drop(columns=['Popular'])
cv_target = subset['Popular']

# Maps each model name to the array of its per-fold scores.
scores = {}
for model_name, estimator in zip(model_names, agg_models):
    scores[model_name] = cross_val_score(estimator, cv_features, cv_target, cv=5)

In [None]:
# One row per model, one column per fold, plus the mean across the five folds.
scores_df = pd.DataFrame(scores).transpose()
scores_df.columns = ['1st_set', '2nd_set', '3rd_set', '4th_set', '5th_set']
scores_df['mean'] = scores_df.mean(axis=1)
scores_df.head()

In [None]:
# Show the fold scores of the model with the highest mean cross-validation score.
best_model_name = scores_df['mean'].idxmax()
display(scores_df.loc[best_model_name])