## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import patsy
import statsmodels.api as sm
from mpl_toolkits import mplot3d

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

## Cleaning

In [2]:
# Load the games-features dataset; the raw frame is kept untouched so this cell
# can be re-run without reloading surprises.
games_raw = pd.read_csv('../Datasets/games-features.csv')
games_raw['ReleaseDate'] = pd.to_datetime(games_raw['ReleaseDate'], errors='coerce')

# Keep a row only if its release date parsed, it is not flagged as a non-game,
# it has a positive SteamSpy owner count, and it supports Windows.
keep_rows = (
    games_raw['ReleaseDate'].notnull()
    & games_raw['GenreIsNonGame'].eq(False)
    & (games_raw['SteamSpyOwners'] > 0)
    & games_raw['PlatformWindows'].eq(True)
)

# Columns that are not used anywhere in the analysis.
unused_columns = [
    'QueryID', 'ResponseID', 'QueryName', 'PackageCount',
    'AchievementHighlightedCount', 'AchievementCount', 'ControllerSupport', 'PCReqsHaveMin',
    'PCReqsHaveRec', 'LinuxReqsHaveMin', 'LinuxReqsHaveRec', 'MacReqsHaveMin',
    'MacReqsHaveRec', 'SupportEmail', 'SupportURL', 'ExtUserAcctNotice',
    'HeaderImage', 'LegalNotice', 'Website', 'PCMinReqsText',
    'PCRecReqsText', 'LinuxMinReqsText', 'LinuxRecReqsText', 'MacMinReqsText',
    'MacRecReqsText',
]
df = games_raw.loc[keep_rows].drop(columns=unused_columns)

# Every remaining row has SteamSpyOwners > 0, so the log needs no zero guard
# and can be taken on the whole column at once.
df['log_owners'] = np.log(df['SteamSpyOwners'])
# Number of supported platforms (Windows + Linux + Mac flags).
df['NumPlatforms'] = (df['PlatformWindows'].astype(int)
                      + df['PlatformLinux'].astype(int)
                      + df['PlatformMac'].astype(int))

print(df.shape)
df.head()

(10218, 55)


Unnamed: 0,ResponseName,ReleaseDate,RequiredAge,DemoCount,DeveloperCount,DLCCount,Metacritic,MovieCount,RecommendationCount,PublisherCount,...,PriceFinal,AboutText,Background,ShortDescrip,DetailedDescrip,DRMNotice,Reviews,SupportedLanguages,log_owners,NumPlatforms
0,Counter-Strike,2000-11-01,0,0,1,0,88,0,68991,1,...,9.99,Play the worlds number 1 online action game. E...,http://cdn.akamai.steamstatic.com/steam/apps/1...,,Play the worlds number 1 online action game. E...,,,English French German Italian Spanish Simplifi...,16.383021,3
1,Team Fortress Classic,1999-04-01,0,0,1,0,0,0,2439,1,...,4.99,One of the most popular online action games of...,http://cdn.akamai.steamstatic.com/steam/apps/2...,,One of the most popular online action games of...,,,English French German Italian Spanish,15.50175,3
2,Day of Defeat,2003-05-01,0,0,1,0,79,0,2319,1,...,4.99,Enlist in an intense brand of Axis vs. Allied ...,http://cdn.akamai.steamstatic.com/steam/apps/3...,,Enlist in an intense brand of Axis vs. Allied ...,,,English French German Italian Spanish,15.846432,3
3,Deathmatch Classic,2001-06-01,0,0,1,0,0,0,888,1,...,4.99,Enjoy fast-paced multiplayer gaming with Death...,http://cdn.akamai.steamstatic.com/steam/apps/4...,,Enjoy fast-paced multiplayer gaming with Death...,,,English French German Italian Spanish,15.830276,3
4,Half-Life: Opposing Force,1999-11-01,0,0,1,0,0,0,2934,1,...,4.99,Return to the Black Mesa Research Facility as ...,http://cdn.akamai.steamstatic.com/steam/apps/5...,,Return to the Black Mesa Research Facility as ...,,,English French German Korean,15.501817,3


In [3]:
# A game is labelled Popular when log(SteamSpyOwners) >= threshold,
# i.e. at least e**10 (roughly 22,000) estimated owners.
threshold = 10
df['Popular'] = df['log_owners'] >= threshold

## Comparison of Classification Models with Platforms

In [4]:
# Modelling frame for this section: the Linux / Mac availability flags as
# predictors, followed by the Popular target.
lst = ['PlatformLinux', 'PlatformMac'] + ['Popular']
subset = df.loc[:, lst]

### Train - Test

In [5]:
# Hold out 30% of the rows for testing.
# random_state fixes the shuffle so the scores are reproducible after a kernel
# restart; the target is passed as a 1-D array, the shape scikit-learn
# estimators expect for y (a column vector triggers a DataConversionWarning).
x_train, x_test, y_train, y_test = train_test_split(
    subset.drop(columns=['Popular']),
    np.array(subset['Popular']),
    test_size=0.3,
    random_state=101,
)

In [6]:
# Linear baselines: default logistic regression and a linear-kernel SVM (C=0.025).
lr = LogisticRegression()
svm = SVC(
    C=0.025,
    kernel='linear',
    random_state=101,
)

In [7]:
# Depth-limited decision tree; each leaf must hold at least 15 samples.
dtree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=15,
    max_features=None,
    random_state=101,
)

# Candidate classifiers paired with their display labels; the two lists
# derived below stay in matching order.
named_models = [('lr', lr), ('dtree', dtree), ('svm', svm)]
agg_models = [estimator for _, estimator in named_models]
model_names = [label for label, _ in named_models]

In [8]:
# Add k-nearest-neighbour classifiers for k = 1..40, labelled 'knn1' .. 'knn40'.
neighbor_counts = range(1, 41)
model_names += ['knn' + str(k) for k in neighbor_counts]
agg_models += [KNeighborsClassifier(n_neighbors=k) for k in neighbor_counts]
knn = agg_models[-1]  # keep the k=40 classifier bound to `knn`

In [9]:
# Fit every candidate on the training split and record
# [train accuracy, test accuracy], in the same order as `agg_models`.
# `score` runs `predict` internally, so no separate prediction pass is needed.
# np.ravel makes y 1-D whether the split produced a column vector or not.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

score_lst = []
for model in agg_models:
    model.fit(x_train, y_train_flat)
    score_lst.append([model.score(x_train, y_train_flat),
                      model.score(x_test, y_test_flat)])

In [10]:
# One row per model with its train and test accuracy side by side.
scores_by_model = dict(zip(model_names, score_lst))
joined_scores = pd.DataFrame.from_dict(
    scores_by_model,
    orient='index',
    columns=['train_score', 'test_score'],
)
joined_scores.head()

Unnamed: 0,train_score,test_score
lr,0.533837,0.539791
dtree,0.534256,0.527397
svm,0.534256,0.527397
knn1,0.471756,0.482714
knn2,0.472176,0.47032


In [11]:
# Show the top-scoring model on the training split, then on the test split.
for metric in ('train_score', 'test_score'):
    best_model = joined_scores[metric].idxmax()
    display(joined_scores.loc[best_model])

train_score    0.534256
test_score     0.527397
Name: dtree, dtype: float64

train_score    0.533837
test_score     0.539791
Name: lr, dtype: float64

DecisionTree is best on training score, LogisticRegression is best on testing score. Both are barely above chance.

### Cross Validation

In [12]:
# Seeded 70/30 split with a 1-D target (the shape scikit-learn expects for y).
# Note: the cross-validation below scores each model on the full `subset`,
# so it does not consume this split.
x_train, x_test, y_train, y_test = train_test_split(
    subset.drop(columns=['Popular']),
    np.array(subset['Popular']),
    test_size=0.3,
    random_state=101,
)

In [13]:
# Fresh, unfitted estimators for the cross-validation run.
lr = LogisticRegression()
dtree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=15,
    max_features=None,
    random_state=101,
)
svm = SVC(
    C=0.025,
    kernel='linear',
    random_state=101,
)

In [14]:
# Candidate classifiers paired with their display labels; the two lists
# derived below stay in matching order.
named_models = [('lr', lr), ('dtree', dtree), ('svm', svm)]
agg_models = [estimator for _, estimator in named_models]
model_names = [label for label, _ in named_models]

In [15]:
# Add k-nearest-neighbour classifiers for k = 1..40, labelled 'knn1' .. 'knn40'.
neighbor_counts = range(1, 41)
model_names += ['knn' + str(k) for k in neighbor_counts]
agg_models += [KNeighborsClassifier(n_neighbors=k) for k in neighbor_counts]
knn = agg_models[-1]  # keep the k=40 classifier bound to `knn`

In [16]:
# 5-fold cross-validated score for every candidate, keyed by model name.
# The feature matrix and target are identical for all models, so they are
# built once instead of on every iteration.
cv_features = subset.drop(columns=['Popular'])
cv_target = subset['Popular']

scores = {
    model_name: cross_val_score(estimator, cv_features, cv_target, cv=5)
    for model_name, estimator in zip(model_names, agg_models)
}

In [17]:
# One row per model, one column per fold, plus the mean score across folds.
# The fold columns are named first so the mean is taken over exactly those five.
scores_df = pd.DataFrame(scores).T
scores_df.columns = ['1st_set', '2nd_set', '3rd_set', '4th_set', '5th_set']
scores_df['mean'] = scores_df.mean(axis=1)
scores_df.head()

Unnamed: 0,1st_set,2nd_set,3rd_set,4th_set,5th_set,mean
lr,0.482387,0.507828,0.519569,0.558492,0.540382,0.521732
dtree,0.482387,0.507828,0.519569,0.546256,0.558492,0.522907
svm,0.482387,0.507828,0.527886,0.527655,0.527655,0.514683
knn1,0.481898,0.472114,0.480431,0.451297,0.474302,0.472008
knn2,0.519569,0.483366,0.456458,0.432697,0.461576,0.470733


In [18]:
# Show the fold scores of the model with the highest mean cross-validation score.
best_cv_model = scores_df['mean'].idxmax()
display(scores_df.loc[best_cv_model])

1st_set    0.482387
2nd_set    0.507828
3rd_set    0.519569
4th_set    0.546256
5th_set    0.558492
mean       0.522907
Name: dtree, dtype: float64

DecisionTree is best on cross-validation score, but it is still barely above chance.

## Classifier with Genre+Platform Predictors

In [19]:
# Predictors: genre flags plus Linux / Mac availability; target: Popular.
# 'GenreIsNonGame' is deliberately left out: the cleaning cell keeps only rows
# where it is False, so it is constant and cannot inform any classifier.
lst = [
    'GenreIsIndie', 'GenreIsAction', 'GenreIsAdventure', 'GenreIsCasual',
    'GenreIsStrategy', 'GenreIsRPG', 'GenreIsSimulation', 'GenreIsEarlyAccess',
    'GenreIsFreeToPlay', 'GenreIsSports', 'GenreIsRacing', 'GenreIsMassivelyMultiplayer',
    'PlatformLinux', 'PlatformMac', 'Popular',
]
subset = df[lst]

### Train-Test

In [20]:
# Hold out 30% of the rows for testing.
# random_state fixes the shuffle so the scores are reproducible after a kernel
# restart; the target is passed as a 1-D array, the shape scikit-learn
# estimators expect for y (a column vector triggers a DataConversionWarning).
x_train, x_test, y_train, y_test = train_test_split(
    subset.drop(columns=['Popular']),
    np.array(subset['Popular']),
    test_size=0.3,
    random_state=101,
)

In [21]:
# Linear baselines: default logistic regression and a linear-kernel SVM (C=0.025).
lr = LogisticRegression()
svm = SVC(
    C=0.025,
    kernel='linear',
    random_state=101,
)

In [22]:
# Depth-limited decision tree; each leaf must hold at least 15 samples.
dtree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=15,
    max_features=None,
    random_state=101,
)

# Candidate classifiers paired with their display labels; the two lists
# derived below stay in matching order.
named_models = [('lr', lr), ('dtree', dtree), ('svm', svm)]
agg_models = [estimator for _, estimator in named_models]
model_names = [label for label, _ in named_models]

In [23]:
# Add k-nearest-neighbour classifiers for k = 1..40, labelled 'knn1' .. 'knn40'.
neighbor_counts = range(1, 41)
model_names += ['knn' + str(k) for k in neighbor_counts]
agg_models += [KNeighborsClassifier(n_neighbors=k) for k in neighbor_counts]
knn = agg_models[-1]  # keep the k=40 classifier bound to `knn`

In [24]:
# Fit every candidate on the training split and record
# [train accuracy, test accuracy], in the same order as `agg_models`.
# `score` runs `predict` internally, so no separate prediction pass is needed.
# np.ravel makes y 1-D whether the split produced a column vector or not.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

score_lst = []
for model in agg_models:
    model.fit(x_train, y_train_flat)
    score_lst.append([model.score(x_train, y_train_flat),
                      model.score(x_test, y_test_flat)])

In [25]:
# One row per model with its train and test accuracy side by side.
scores_by_model = dict(zip(model_names, score_lst))
joined_scores = pd.DataFrame.from_dict(
    scores_by_model,
    orient='index',
    columns=['train_score', 'test_score'],
)
joined_scores.head()

Unnamed: 0,train_score,test_score
lr,0.645554,0.648402
dtree,0.666667,0.652642
svm,0.637025,0.646119
knn1,0.607103,0.551207
knn2,0.615352,0.58578


In [26]:
# Show the top-scoring model on the training split, then on the test split.
for metric in ('train_score', 'test_score'):
    best_model = joined_scores[metric].idxmax()
    display(joined_scores.loc[best_model])

train_score    0.666667
test_score     0.652642
Name: dtree, dtype: float64

train_score    0.666667
test_score     0.652642
Name: dtree, dtype: float64

### Cross Validation

In [27]:
# Seeded 70/30 split with a 1-D target (the shape scikit-learn expects for y).
# Note: the cross-validation below scores each model on the full `subset`,
# so it does not consume this split.
x_train, x_test, y_train, y_test = train_test_split(
    subset.drop(columns=['Popular']),
    np.array(subset['Popular']),
    test_size=0.3,
    random_state=101,
)

In [28]:
# Fresh, unfitted estimators for the cross-validation run.
lr = LogisticRegression()
dtree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=15,
    max_features=None,
    random_state=101,
)
svm = SVC(
    C=0.025,
    kernel='linear',
    random_state=101,
)

In [29]:
# Candidate classifiers paired with their display labels; the two lists
# derived below stay in matching order.
named_models = [('lr', lr), ('dtree', dtree), ('svm', svm)]
agg_models = [estimator for _, estimator in named_models]
model_names = [label for label, _ in named_models]

In [30]:
# Add k-nearest-neighbour classifiers for k = 1..40, labelled 'knn1' .. 'knn40'.
neighbor_counts = range(1, 41)
model_names += ['knn' + str(k) for k in neighbor_counts]
agg_models += [KNeighborsClassifier(n_neighbors=k) for k in neighbor_counts]
knn = agg_models[-1]  # keep the k=40 classifier bound to `knn`

In [31]:
# 5-fold cross-validated score for every candidate, keyed by model name.
# The feature matrix and target are identical for all models, so they are
# built once instead of on every iteration.
cv_features = subset.drop(columns=['Popular'])
cv_target = subset['Popular']

scores = {
    model_name: cross_val_score(estimator, cv_features, cv_target, cv=5)
    for model_name, estimator in zip(model_names, agg_models)
}

In [32]:
# One row per model, one column per fold, plus the mean score across folds.
# The fold columns are named first so the mean is taken over exactly those five.
scores_df = pd.DataFrame(scores).T
scores_df.columns = ['1st_set', '2nd_set', '3rd_set', '4th_set', '5th_set']
scores_df['mean'] = scores_df.mean(axis=1)
scores_df.head()

Unnamed: 0,1st_set,2nd_set,3rd_set,4th_set,5th_set,mean
lr,0.686888,0.641879,0.613014,0.652472,0.649046,0.64866
dtree,0.668787,0.635519,0.589041,0.651003,0.662751,0.64142
svm,0.669765,0.633072,0.599804,0.634361,0.63583,0.634567
knn1,0.597847,0.535225,0.536693,0.53255,0.522761,0.545015
knn2,0.573875,0.541585,0.549413,0.552129,0.577093,0.558819


In [33]:
# Show the fold scores of the model with the highest mean cross-validation score.
best_cv_model = scores_df['mean'].idxmax()
display(scores_df.loc[best_cv_model])

1st_set    0.686888
2nd_set    0.641879
3rd_set    0.613014
4th_set    0.652472
5th_set    0.649046
mean       0.648660
Name: lr, dtype: float64