# Modeling

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

np.random.seed(42)

  from numpy.core.umath_tests import inner1d


In [2]:
train = pd.read_csv('./data/new_train.csv')
test = pd.read_csv('./data/new_test.csv')
labels = pd.read_csv('./data/train_labels.csv')

In [3]:
acc = labels[['game_session', 'accuracy_group']]
trainlab = train.merge(acc, on = 'game_session')

In [4]:
trainlab.drop(columns=['Unnamed: 0', 
                       'installation_id',
                       'game_time_s', 
                       'step_time',
                      'total_game_sessions',
                      'total_game_time',
                      'game_correct',
                      'game_incorrect',
                      'test_correct',
                      'test_incorrect',
                      'new_game',
                      'got_instructions',
                      'instruction_time',
                      'assessment_time',
                      'game_session'], inplace = True)

## FEATURE NOTES
1. We might have data leakage with the current cumulative calculation (it shouldn't include the correct/incorrect answers for the current game session but I think it does)
2. We might want to weight certain features more (or make combination features?) Ex. prior time spent on assessments might be more meaningful; the game/assessment scores from the last game or last three games; the ratio of time spent in the last world they were in before taking an assessment; the amount of time/number of times they got extra instructions)
3. Features we don't have that might be meaningful:
    - Whether or not this is a potentially noisy device (ex. more than 300 game sessions)
    - Slope of 'improvement' (how much faster the player is in the games, how their ratio of correct-incorrect responses has changed since the last game time)

### Balancing Classes
* I tried modeling with balanced classes, model did much worse on accuracy. Reverted modeling back to unbalanced version for now.

In [5]:
# Separate classes
df_0 = trainlab[trainlab['accuracy_group']==0]
df_1 = trainlab[trainlab['accuracy_group']==1]
df_2 = trainlab[trainlab['accuracy_group']==2]
df_3 = trainlab[trainlab['accuracy_group']==3]

print("df0 shape:", df_0.shape)
print("df1 shape:", df_1.shape)
print("df2 shape:", df_2.shape)
print("df3 shape:", df_3.shape)

df0 shape: (4229, 45)
df1 shape: (2411, 45)
df2 shape: (2205, 45)
df3 shape: (8845, 45)


In [6]:
# Upsample minority classes
df0_upsampled = resample(df_0, 
                        replace=True,     
                        n_samples=8845, # match class 3
                        random_state=42) 

df1_upsampled = resample(df_1, 
                        replace=True,     
                        n_samples=8845, # match class 3
                        random_state=42) 

df2_upsampled = resample(df_2, 
                        replace=True,     
                        n_samples=8845, # match class 3
                        random_state=42) 

# Combine majority class with upsampled minority class
trainlab_bal = pd.concat([df0_upsampled, df1_upsampled, df2_upsampled, df_3])
 
# Display new class counts
trainlab_bal['accuracy_group'].value_counts()

3    8845
2    8845
1    8845
0    8845
Name: accuracy_group, dtype: int64

### Train/Test/Split

In [7]:
X = trainlab.drop(columns = ['accuracy_group'])
y = trainlab['accuracy_group']

## to use balanced classes
# X = trainlab_bal.drop(columns = ['accuracy_group'])
# y = trainlab_bal['accuracy_group']


In [8]:
y.value_counts(normalize = True)

3    0.500000
0    0.239062
1    0.136292
2    0.124647
Name: accuracy_group, dtype: float64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

In [10]:
y_train.value_counts(normalize = True)

3    0.499962
0    0.239089
1    0.136278
2    0.124670
Name: accuracy_group, dtype: float64

In [11]:
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

y_train shape: (13267,)
y_test shape: (4423,)


### Scaling

In [12]:
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

### Logistic Regression 1

In [13]:
logreg = LogisticRegression()
logreg.fit(X_train_ss, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
cv_scores = cross_val_score(logreg, X_train_ss, y_train)
cv_scores

array([0.52689873, 0.52351877, 0.5265777 ])

In [15]:
cv_scores = cross_val_score(logreg, X_test_ss, y_test)
cv_scores

array([0.52235772, 0.51967436, 0.52681602])

In [16]:
# Getting features
coefs = logreg.coef_
cols = X_train.columns
coef_df = pd.DataFrame(coefs, columns = cols)

In [17]:
coef_df

Unnamed: 0,test_correct_cuma,test_incorrect_cuma,proportion_correct_cuma,game_correct_cuma,game_incorrect_cuma,proportion_correct_game_cuma,app_time_cuma,game_sessions_cuma,got_instructions_cuma,instruction_time_cuma,...,TREETOPCITY_correctanswers_rate_perid,TREETOPCITY_correctplay_rate_perid,CRYSTALCAVES_totaltime_perid,CRYSTALCAVES_gamesessions_perid,CRYSTALCAVES_correctanswers_perid,CRYSTALCAVES_incorrectanswers_perid,CRYSTALCAVES_correctplay_perid,CRYSTALCAVES_incorrectplay_perid,CRYSTALCAVES_correctanswers_rate_perid,CRYSTALCAVES_correctplay_rate_perid
0,0.0,0.0,0.0,-1.052897,0.571572,-0.174068,0.036012,0.489182,0.134092,0.0,...,0.0,-0.269534,-0.038238,0.047587,0.0,0.0,-0.113225,0.068996,0.0,0.037986
1,0.0,0.0,0.0,-0.003084,0.112192,-0.059369,0.056199,-0.190356,-0.049928,0.0,...,0.0,0.242495,-0.085023,-0.014778,0.0,0.0,0.272949,-0.0884,0.0,-0.028832
2,0.0,0.0,0.0,0.165546,0.127195,-0.013973,-0.0357,-0.622272,0.054091,0.0,...,0.0,0.12249,-0.070664,0.142684,0.0,0.0,0.149843,-0.166156,0.0,-0.05379
3,0.0,0.0,0.0,0.614349,-0.578596,0.14588,-0.024315,-0.082194,0.074166,0.0,...,0.0,-0.005693,0.088579,-0.150437,0.0,0.0,-0.02778,0.008683,0.0,0.017223


In [18]:
# features with highest negative influence on being in group 3
coef_df.loc[3].sort_values().head(10)

game_incorrect_cuma                  -0.578596
CRYSTALCAVES_gamesessions_perid      -0.150437
TREETOPCITY_incorrectplay_perid      -0.146396
TREETOPCITY_gamesessions_perid       -0.105209
game_sessions_cuma                   -0.082194
MAGMAPEAK_gamesessions_perid         -0.066194
CRYSTALCAVES_correctplay_perid       -0.027780
app_time_cuma                        -0.024315
TREETOPCITY_correctplay_rate_perid   -0.005693
MAGMAPEAK_correctanswers_perid        0.000000
Name: 3, dtype: float64

In [19]:
# features with highest positive influence on being in group 3
coef_df.loc[3].sort_values().tail(10)

prior_assessments_cuma                 0.014280
CRYSTALCAVES_correctplay_rate_perid    0.017223
MAGMAPEAK_totaltime_perid              0.031067
assessment_time_cuma                   0.039662
got_instructions_cuma                  0.074166
CRYSTALCAVES_totaltime_perid           0.088579
TREETOPCITY_correctplay_perid          0.098108
proportion_correct_game_cuma           0.145880
MAGMAPEAK_correctplay_rate_perid       0.278579
game_correct_cuma                      0.614349
Name: 3, dtype: float64

In [20]:
# confusion matrix
y_pred = logreg.predict(X_train_ss)
y_true = y_train

confusion_matrix(y_true, y_pred)

array([[ 619,    8,    0, 2545],
       [ 126,   18,    0, 1664],
       [  73,    7,    0, 1574],
       [ 256,   14,    0, 6363]])

In [21]:
# predicted values are columns, true vals are rows
pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,3,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,619,8,2545,3172
1,126,18,1664,1808
2,73,7,1574,1654
3,256,14,6363,6633
All,1074,47,12146,13267


Before scaling features, the confusion matrix showed more predictions for group 0:

    array([[1077,   11,   12, 2072],
       [ 335,    0,    9, 1464],
       [ 229,    3,    3, 1419],
       [ 836,   15,   26, 5756]])

It looks like this model is VERY BAD at predicting whether a player will be in accuracy groups 1 or 2. It does the best at predicting whether a player will be in accuracy group 3 or not. This could be because of imbalanced classes; it could be because there's a much clearer distinction in play for players that get 3s (and 0s). 

### Random Forest

In [22]:
rf = RandomForestClassifier(max_depth = 10,
                            n_estimators = 1000, 
                            n_jobs = -2, 
                            oob_score = False, 
                            warm_start = True)
rf.fit(X_train_ss, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-2,
            oob_score=False, random_state=None, verbose=0, warm_start=True)

In [23]:
cv_scores = cross_val_score(rf, X_train_ss, y_train)
cv_scores

array([0.53887884, 0.53211217, 0.53879213])

In [24]:
cv_scores = cross_val_score(rf, X_test_ss, y_test)
cv_scores

array([0.53252033, 0.52238806, 0.53903598])

In [25]:
imps = rf.feature_importances_
cols = X_train.columns

In [26]:
feats = pd.DataFrame(list(zip(imps, cols)), columns = ['feature_importance', 'feature'])
feats[feats['feature_importance'] >0].sort_values(by='feature_importance')

Unnamed: 0,feature_importance,feature
41,0.021427,CRYSTALCAVES_incorrectplay_perid
40,0.022822,CRYSTALCAVES_correctplay_perid
32,0.023546,TREETOPCITY_correctplay_perid
25,0.024097,MAGMAPEAK_incorrectplay_perid
33,0.02469,TREETOPCITY_incorrectplay_perid
29,0.027423,TREETOPCITY_gamesessions_perid
28,0.027562,TREETOPCITY_totaltime_perid
43,0.028928,CRYSTALCAVES_correctplay_rate_perid
37,0.029937,CRYSTALCAVES_gamesessions_perid
24,0.029995,MAGMAPEAK_correctplay_perid


In [27]:
# confusion matrix
y_pred = rf.predict(X_train_ss)
y_true = y_train

confusion_matrix(y_true, y_pred)

array([[1402,    2,    0, 1768],
       [ 162,   90,    0, 1556],
       [ 129,    1,   32, 1492],
       [ 199,    3,    0, 6431]])

In [28]:
# predicted values are columns, true vals are rows
pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1402,2,0,1768,3172
1,162,90,0,1556,1808
2,129,1,32,1492,1654
3,199,3,0,6431,6633
All,1892,96,32,11247,13267


## MODEL NOTES
1. Are there other metrics we should be using?
2. How can we make our model better at predicting accuracy groups 1 and 2? 
3. As of 1/7, it looks like groups in the money are scoring 56% - 57% accuracy. The top half of submissions are above 53%