# Running the Models

In [32]:
# Importing all necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import RidgeClassifier

In [2]:
# Loading in our combined data

In [3]:
df = pd.read_csv('KS_BGG_combined.csv', engine = 'python')

In [4]:
df

Unnamed: 0,abstract strategy,action / dexterity,adventure,age of reason,american civil war,american indian wars,american revolutionary war,american west,ancient,animals,...,state,state_changed_at,static_usd_rate,unread_messages_count,unseen_activity_count,urls,usd_pledged,usd_type,_merge,top10%
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,successful,1.443629e+09,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",3.861040e+05,,both,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,live,1.444747e+09,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",1.348585e+06,,both,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,successful,1.426633e+09,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",9.056820e+05,,both,1
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,successful,1.445029e+09,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",8.417600e+04,,both,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,live,1.488917e+09,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",2.221682e+06,,both,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,live,1.508437e+09,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",6.063800e+04,domestic,both,0
361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,successful,1.479496e+09,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",1.904500e+04,,both,0
362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,successful,1.364958e+09,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",5.049715e+04,,both,0
363,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,successful,1.481324e+09,1.000000,,,"{""web"":{""project"":""https://www.kickstarter.com...",8.988600e+04,,both,0


### Feature Selection

To begin the feature selection process, we make a correlation table.

In [5]:
corr_matrix = df.corr().abs()
print(corr_matrix['top10%'].sort_values(ascending=False).head())

top10%         1.000000
rank           0.833500
geek_rating    0.754499
owned          0.398630
num_votes      0.369392
Name: top10%, dtype: float64


We understand that rank and geek_rating are highly correlated with a game being in the top 10%, so we remove them from the features we are considering

In [6]:
df.drop(columns = ['rank', 'geek_rating'], inplace = True)

Considering our target audience of board game publishers, we'll also remove any features that in reality will come in after a Kickstarter is done, like number of people who own it, number of votes, etc. We will leave in 'weight' as a proxy for the complexity of the game.

In [7]:
df.drop(columns = ['owned', 'num_votes', 'year'], inplace = True)

Then, we drop columns that I don't think will be useful for my problem, redudant values, hyperlinks, countries, exchange rates, # of friends of the project, as well as artifacts from the data cleaning eg. _merge:


In [8]:
df.drop(columns = ['bgg_url', 'blurb', 'game_id', 'id', 'is_backing', 'is_starrable', 'is_starred', 'slug', 'urls', 'usd_type', 'category_x', 'category_y', 'created_at', 'creator',  'currency_trailing_code', 'current_currency',  'currency_symbol', 'deadline', 'disable_communication', 'friends', 'fx_rate', 'image_url', '_merge'], inplace = True)

This leaves us with a much smaller list, with many columns focused on the categories (game themes) and game mechanics, which a publisher will have more control of.

In [9]:
# I extract the list of mechanics, categories, and other remaining values using 'list(df.columns.values)''
list(df.columns.values)

['abstract strategy',
 'action / dexterity',
 'adventure',
 'age of reason',
 'american civil war',
 'american indian wars',
 'american revolutionary war',
 'american west',
 'ancient',
 'animals',
 'arabian',
 'aviation / flight',
 'bluffing',
 'book',
 'card game',
 "children's game",
 'city building',
 'civil war',
 'civilization',
 'collectible components',
 'comic book / strip',
 'deduction',
 'dice',
 'economic',
 'educational',
 'electronic',
 'environmental',
 'expansion for base-game',
 'exploration',
 'fantasy',
 'farming',
 'fighting',
 'game system',
 'horror',
 'humor',
 'industry / manufacturing',
 'korean war',
 'mafia',
 'math',
 'mature / adult',
 'maze',
 'medical',
 'medieval',
 'memory',
 'miniatures',
 'modern warfare',
 'movies / tv / radio theme',
 'murder/mystery',
 'music',
 'mythology',
 'napoleonic',
 'nautical',
 'negotiation',
 'none',
 'novel-based',
 'number',
 'party game',
 'pike and shot',
 'pirates',
 'political',
 'post-napoleonic',
 'prehistoric',
 

In [10]:
cats = df[['abstract strategy', 'action / dexterity', 'adventure', 'age of reason', 'american civil war', 'american indian wars', 'american revolutionary war',
 'american west', 'ancient', 'animals', 'arabian', 'aviation / flight', 'bluffing', 'book', 'card game', "children's game", 'city building',
 'civil war', 'civilization', 'collectible components', 'comic book / strip', 'deduction', 'dice', 'economic', 'educational', 'electronic',
 'environmental', 'expansion for base-game', 'exploration', 'fantasy', 'farming', 'fighting', 'game system', 'horror',
 'humor', 'industry / manufacturing', 'korean war', 'mafia', 'math', 'mature / adult', 'maze', 'medical', 'medieval', 'memory',
 'miniatures', 'modern warfare', 'movies / tv / radio theme', 'murder/mystery', 'music', 'mythology', 'napoleonic', 'nautical',
 'negotiation', 'none', 'novel-based', 'number', 'party game', 'pike and shot', 'pirates', 'political', 'post-napoleonic', 'prehistoric',
 'print & play', 'puzzle', 'racing', 'real-time', 'religious', 'renaissance', 'science fiction', 'space exploration', 'spies/secret agents',
 'sports', 'territory building', 'trains', 'transportation', 'travel', 'trivia', 'video game theme', 'vietnam war', 'wargame',
 'word game', 'world war i', 'world war ii', 'zombies', 'top10%']]

With this more focused list, let's look at the correlation of just the categories to being in the top 10%

In [11]:
corr_matrix = cats.corr().abs()
print(corr_matrix['top10%'].sort_values(ascending=False).head(30))

top10%                       1.000000
miniatures                   0.149615
negotiation                  0.137110
zombies                      0.127447
puzzle                       0.111067
civilization                 0.109766
murder/mystery               0.106887
fantasy                      0.106062
economic                     0.098105
bluffing                     0.096702
spies/secret agents          0.093492
memory                       0.090434
political                    0.090272
video game theme             0.089491
deduction                    0.085311
action / dexterity           0.079720
aviation / flight            0.078210
horror                       0.074847
fighting                     0.069423
modern warfare               0.069284
exploration                  0.069010
trains                       0.065118
collectible components       0.063770
movies / tv / radio theme    0.063770
mafia                        0.063296
farming                      0.063296
age of reaso

In [12]:
mechs = df[['acting', 'action / movement programming', 'action point allowance system', 'area control / area influence', 'area enclosure',
 'area movement', 'area-impulse', 'auction/bidding', 'betting/wagering', 'campaign / battle card driven', 'card drafting', 'chit-pull system',
 'co-operative play', 'commodity speculation', 'crayon rail system', 'deck / pool building', 'dice rolling', 'grid movement', 'hand management',
 'hex-and-counter', 'line drawing', 'memory.1', 'modular board', 'none.1', 'paper-and-pencil', 'partnerships', 'pattern building', 'pattern recognition',
 'pick-up and deliver', 'player elimination', 'point to point movement', 'press your luck', 'rock-paper-scissors', 'role playing', 'roll / spin and move',
 'route/network building', 'secret unit deployment', 'set collection', 'simulation', 'simultaneous action selection', 'singing',
 'stock holding', 'storytelling', 'take that', 'tile placement', 'time track', 'trading', 'trick-taking', 'variable phase order', 'variable player powers',
 'voting', 'worker placement', 'top10%']]

With this more focused list, let's look at the correlation of mechanics to being in the top 10% of BGG

In [13]:
corr_matrix = mechs.corr().abs()
print(corr_matrix['top10%'].sort_values(ascending=False).head(30))

top10%                           1.000000
variable player powers           0.227256
worker placement                 0.158571
action point allowance system    0.157275
voting                           0.127447
simulation                       0.127447
variable phase order             0.125749
area movement                    0.106156
hex-and-counter                  0.101249
pattern building                 0.096233
card drafting                    0.087170
chit-pull system                 0.078210
area enclosure                   0.078210
pattern recognition              0.078210
trick-taking                     0.078210
route/network building           0.076282
point to point movement          0.073437
area control / area influence    0.070285
co-operative play                0.069284
commodity speculation            0.067476
stock holding                    0.065149
press your luck                  0.063120
partnerships                     0.062801
dice rolling                     0

With this more focused list, let's look at the correlation of other features to being in the top 10% of BGG

In [16]:
feats = df[['min_players', 'max_players', 'avg_time', 'min_time', 'max_time', 'avg_rating', 'age', 'mechanic', 'designer', 'weight',
 'backers_count', 'converted_pledged_amount', 'country', 'currency', 'goal', 'last_update_published_at',
 'launched_at', 'location', 'permissions', 'photo', 'pledged', 'profile', 'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
 'static_usd_rate', 'unread_messages_count', 'unseen_activity_count', 'usd_pledged', 'top10%']]

In [17]:
corr_matrix = feats.corr().abs()
print(corr_matrix['top10%'].sort_values(ascending=False).head(30))

top10%                      1.000000
avg_rating                  0.337752
weight                      0.213283
age                         0.162547
usd_pledged                 0.154286
pledged                     0.152293
max_time                    0.126873
avg_time                    0.123591
goal                        0.104336
converted_pledged_amount    0.076813
min_time                    0.075545
state_changed_at            0.070641
launched_at                 0.068040
backers_count               0.051107
static_usd_rate             0.026277
max_players                 0.005590
min_players                 0.004581
spotlight                   0.001667
last_update_published_at         NaN
unread_messages_count            NaN
unseen_activity_count            NaN
Name: top10%, dtype: float64


Based on these findings, I will select the top 3 features each of the categories, mechanics, and the other KS and BGG features for our models.

In [18]:
picks = df[['miniatures', 'negotiation', 'zombies', 'variable player powers', 'worker placement', 'action point allowance system',
 'weight', 'avg_time', 'goal']]

### Final Feature Selection, Train Test Split, StandardScaler and Baseline

In [19]:
X = picks
y = df['top10%']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [20]:
# Fit and transform the StandardScaler to the train data, and transforming the test data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

##### Baseline

In [21]:
# This is the baseline score that I'm trying to beat
pd.value_counts(df['top10%'], normalize=True)

0    0.575342
1    0.424658
Name: top10%, dtype: float64

## Classifiers

#### Logistic Regression

In [22]:
logreg = LogisticRegression(C = 1.0)

In [23]:
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
print("Logistic Regression training accuracy " + str(logreg.score(X_train, y_train)))
print("Logistic Regression testing accuracy " + str(logreg.score(X_test, y_test)))

Logistic Regression training accuracy 0.673992673992674
Logistic Regression testing accuracy 0.6521739130434783


#### Ridge Classifier

In [25]:
# Instantiating the Ridge Classifier
ridgeclass = RidgeClassifier(alpha=1)

In [26]:
# Fitting the Ridge Classifier
ridgeclass.fit(X_train, y_train)

RidgeClassifier(alpha=1, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [27]:
# Evaluating our Ridge Regression model using R2.
print(ridgeclass.score(X_train, y_train))
print(ridgeclass.score(X_test, y_test))

0.673992673992674
0.6521739130434783


#### KNearest Neighbors (k = 3, 5, 10)

In [33]:
# k = 3
k3class = KNeighborsClassifier(n_neighbors = 3)
k3class.fit(X_train, y_train)
k3class.score(X_train, y_train)

0.7985347985347986

In [34]:
k3class.score(X_test, y_test)

0.6086956521739131

In [35]:
# k = 5
k5class = KNeighborsClassifier(n_neighbors = 5)
k5class.fit(X_train, y_train)
k5class.score(X_train, y_train)

0.7545787545787546

In [36]:
k5class.score(X_test, y_test)

0.5869565217391305

In [37]:
# k = 10
k10class = KNeighborsClassifier(n_neighbors = 10)
k10class.fit(X_train, y_train)
k10class.score(X_train, y_train)

0.6996336996336996

In [38]:
k10class.score(X_test, y_test)

0.6304347826086957

#### Decision Trees Classifier

In [39]:
dtclass = DecisionTreeClassifier(max_depth = 7,
                            min_samples_split = 7,
                            min_samples_leaf = 5,
                            random_state = 42)

In [40]:
dtclass.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=7,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [41]:
dtclass.score(X_train, y_train)

0.7985347985347986

In [42]:
dtclass.score(X_test, y_test)

0.5978260869565217

#### Bagged Decision Trees Classifier

In [43]:
bdtclass = BaggingClassifier(random_state=42)

In [44]:
bdtclass.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=42, verbose=0,
                  warm_start=False)

In [45]:
bdtclass.score(X_train, y_train)

0.967032967032967

In [46]:
bdtclass.score(X_test, y_test)

0.6413043478260869

### Conclusions and the Improvements to be Made

Given that I have a 'client' in mind going into this exercise, and that I want the client to have actionable knowledge, I value interpretability over a high score. It seems that our Logistic Regression does best based on the above metric. The Logistic Regression model displays what I think is the decent score (low overfitting, improvements over the baseline by around 20 percent), and as a model has high interpretability. 

In [51]:
print("Logistic Regression training accuracy " + str(logreg.score(X_train, y_train)))
print("Logistic Regression testing accuracy " + str(logreg.score(X_test, y_test)))

Logistic Regression training accuracy 0.673992673992674
Logistic Regression testing accuracy 0.6521739130434783


Still, there's much to be improved upon. I would like to get more recent BGG and KS data to get a better sample of data. I would also consider getting board game sales data to even better represent long term success.