<a href="https://colab.research.google.com/github/nickwan/sliced-basics/blob/main/modeling/sliced_basics_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SLICED BASICS: Modeling  
We show off different, quick ways you can approach modeling in order to get predictive values on a holdout dataset. SLICED contestants could use this information to quickly turn around models and predictions so they aren't left hanging at the end of the show. This is only a short, small intro to modeling -- we didn't do anything heavy here -- so if you are interested in bigger or more complex modeling, come join us on Twitch! 

8:30pm ET, Monday - Thursday
https://twitch.tv/nickwan_datasci  

Check out the notebook from this video here:  
https://github.com/nickwan/sliced-basics/blob/main/modeling/sliced_basics_modeling.ipynb  

Grab the data from this video over here:  
https://drive.google.com/drive/folders/1_iDnj3e2jKODqBU-hHkH6YY4XPrAdjGH  

SLICED is a competitive data science show where 4 data scientists are given a blind dataset and 2 hours to create a predictive model, data visualizations, and new features through feature engineering, while entertaining chat -- all for points! Come check it out, June 1st through August 17th; every Tuesday on:  

https://twitch.tv/nickwan_datasci  
https://twitch.tv/nickwan_datasci  
https://twitch.tv/nickwan_datasci  
https://twitch.tv/nickwan_datasci  
https://twitch.tv/nickwan_datasci  


In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import os

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score

In [2]:
# Directory holding this episode's files (the path looks like a Colab
# Google Drive mount).
project_dir = "drive/My Drive/public-sliced/s00e01"

# Training split; the bare `.shape` on the last line is the cell's output.
df = pd.read_csv(
    f"{project_dir}/Copy of sliced-s00e01-data.csv"
)
df.shape

(5934, 120)

In [16]:
# Holdout split: the rows that predictions are produced for.
df_holdout = pd.read_csv(f"{project_dir}/Copy of sliced-s00e01-holdout.csv")

# Fill the gaps in every feature column with that column's mean, computed
# over the holdout rows themselves.
# NOTE(review): `feats` is only assigned in a cell further down the notebook,
# so on a fresh kernel that cell has to run before this one.
df_holdout = df_holdout.fillna(
    {name: df_holdout[name].mean() for name in feats}
)
df_holdout.shape

(882, 117)

In [8]:
# Base feature column names.
feats = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']
# Final feature list: every base name with an `_o` suffix first, followed by
# the unsuffixed base names.
feats = [*(f"{name}_o" for name in feats), *feats]
# Column the models are trained to predict.
target = 'match'

In [43]:
# Keep only the grouping column, the features and the target, then drop every
# row that is missing a value in any of them.
model_data = df[['wave', *feats, target]].dropna()

# `fit` hands back the estimator itself, so the fitted model can be bound in
# one step; the bare name on the last line echoes it as the cell output.
log_model = LogisticRegression().fit(model_data[feats], model_data[target])
log_model


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# One probability column per class, indexed like the holdout rows.
preds = pd.DataFrame(
    log_model.predict_proba(df_holdout[feats]),
    index=df_holdout.index,
)

# `preds` keeps the probability from column 1; `preds_int` is that probability
# rounded to a hard 0/1 label.
df_holdout = df_holdout.assign(
    preds=preds[1],
    preds_int=lambda frame: frame['preds'].round(0).astype(int),
)
df_holdout.shape

In [30]:
# Holdout rows including the target column, used to score the predictions.
# NOTE(review): this path is relative to the working directory rather than
# `project_dir` -- confirm the file really lives there.
df_targets = pd.read_csv(
    'sliced-s00e01-holdout_full.csv'
)
df_targets.shape

(882, 120)

In [32]:
# Attach the two prediction columns to the labelled holdout rows, matching
# rows purely on the index.
# NOTE(review): assumes both holdout CSVs list the rows in the same order -- confirm.
df_targets = pd.merge(
    df_targets,
    df_holdout[['preds', 'preds_int']],
    left_index=True,
    right_index=True,
)
df_targets.shape

(882, 122)

In [35]:
# Score the holdout: log loss on the predicted probabilities, accuracy on the
# rounded 0/1 predictions.
loss, acc = (
    log_loss(df_targets[target], df_targets['preds']),
    accuracy_score(df_targets[target], df_targets['preds_int']),
)
loss, acc

(0.3069897840532516, 0.873015873015873)

# cross validation with logistic regression 

In [36]:
from sklearn.model_selection import KFold, GroupKFold

In [None]:
# 10-fold cross-validation of the logistic regression, with rows shuffled
# before splitting. The fixed seed keeps the fold assignment -- and therefore
# the reported scores -- the same on every run.
kf = KFold(10, shuffle=True, random_state=42)
model_data = df.loc[:, ['wave']+feats+[target]].dropna()

# Collect each fold's scored test rows and concatenate once after the loop:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all earlier rows on every iteration.
fold_preds = []
for train_idx, test_idx in kf.split(model_data):
  train_data = model_data.iloc[train_idx]
  # Explicit copy so that adding the `preds` column does not write into a
  # slice of `model_data` (avoids SettingWithCopyWarning).
  test_data = model_data.iloc[test_idx].copy()

  log_model = LogisticRegression()
  log_model.fit(train_data.loc[:, feats], train_data[target])
  # Column 1 of `predict_proba` is the probability of the second class
  # (assumes the target is coded 0/1, so this is the probability of a 1).
  test_data['preds'] = log_model.predict_proba(test_data.loc[:, feats])[:, 1]
  fold_preds.append(test_data)

# Out-of-fold predictions for every row of `model_data`.
df_preds = pd.concat(fold_preds)
df_preds.shape

In [48]:
# Pooled out-of-fold scores: log loss on the probabilities, accuracy on the
# probabilities rounded to 0/1.
loss, acc = (
    log_loss(df_preds[target], df_preds['preds']),
    accuracy_score(df_preds[target], df_preds['preds'].round().astype(int)),
)
loss, acc

(0.3651997773259231, 0.8464052287581699)

In [None]:
# 4-fold cross-validation of the logistic regression, grouped by `wave`: all
# rows sharing a `wave` value land in the same fold, so the model is always
# scored on waves it was not trained on.
kf = GroupKFold(4)
model_data = df.loc[:, ['wave']+feats+[target]].dropna()

# Collect each fold's scored test rows and concatenate once after the loop:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all earlier rows on every iteration.
fold_preds = []
for train_idx, test_idx in kf.split(model_data, groups=model_data['wave']):
  train_data = model_data.iloc[train_idx]
  # Explicit copy so that adding the `preds` column does not write into a
  # slice of `model_data` (avoids SettingWithCopyWarning).
  test_data = model_data.iloc[test_idx].copy()

  log_model = LogisticRegression()
  log_model.fit(train_data.loc[:, feats], train_data[target])
  # Column 1 of `predict_proba` is the probability of the second class
  # (assumes the target is coded 0/1, so this is the probability of a 1).
  test_data['preds'] = log_model.predict_proba(test_data.loc[:, feats])[:, 1]
  fold_preds.append(test_data)

# Out-of-fold predictions for every row of `model_data`.
df_preds = pd.concat(fold_preds)
df_preds.shape

In [53]:
# Pooled out-of-fold scores for the wave-grouped logistic regression run.
loss = log_loss(y_true=df_preds[target], y_pred=df_preds['preds'])
# Accuracy needs hard labels, so round each probability to 0 or 1 first.
acc = accuracy_score(
    y_true=df_preds[target],
    y_pred=df_preds['preds'].round().astype(int),
)
loss, acc

(0.36557485391204025, 0.8445378151260504)

# cross val with random forest 

In [None]:
# 10-fold cross-validation of a default random forest, with rows shuffled
# before splitting. Both the fold assignment and the forest are seeded so the
# reported scores are the same on every run.
kf = KFold(10, shuffle=True, random_state=42)
model_data = df.loc[:, ['wave']+feats+[target]].dropna()

# Collect each fold's scored test rows and concatenate once after the loop:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all earlier rows on every iteration.
fold_preds = []
for train_idx, test_idx in kf.split(model_data):
  train_data = model_data.iloc[train_idx]
  # Explicit copy so that adding the `preds` column does not write into a
  # slice of `model_data` (avoids SettingWithCopyWarning).
  test_data = model_data.iloc[test_idx].copy()

  rf_model = RandomForestClassifier(random_state=42)
  rf_model.fit(train_data.loc[:, feats], train_data[target])
  # Column 1 of `predict_proba` is the probability of the second class
  # (assumes the target is coded 0/1, so this is the probability of a 1).
  test_data['preds'] = rf_model.predict_proba(test_data.loc[:, feats])[:, 1]
  fold_preds.append(test_data)

# Out-of-fold predictions for every row of `model_data`.
df_preds = pd.concat(fold_preds)
df_preds.shape

In [56]:
# Pooled out-of-fold scores for the shuffled K-fold random forest run:
# log loss uses the probabilities, accuracy uses them rounded to 0/1.
loss, acc = (
    log_loss(df_preds[target], df_preds['preds']),
    accuracy_score(df_preds[target], df_preds['preds'].round().astype(int)),
)
loss, acc

(0.38929339399420626, 0.8478057889822596)

In [None]:
# 4-fold cross-validation of a default random forest, grouped by `wave`: all
# rows sharing a `wave` value land in the same fold, so the model is always
# scored on waves it was not trained on.
kf = GroupKFold(4)
model_data = df.loc[:, ['wave']+feats+[target]].dropna()

# Collect each fold's scored test rows and concatenate once after the loop:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all earlier rows on every iteration.
fold_preds = []
for train_idx, test_idx in kf.split(model_data, groups=model_data['wave']):
  train_data = model_data.iloc[train_idx]
  # Explicit copy so that adding the `preds` column does not write into a
  # slice of `model_data` (avoids SettingWithCopyWarning).
  test_data = model_data.iloc[test_idx].copy()

  # Seeded so the forest -- and the scores derived from it -- are repeatable.
  rf_model = RandomForestClassifier(random_state=42)
  rf_model.fit(train_data.loc[:, feats], train_data[target])
  # Column 1 of `predict_proba` is the probability of the second class
  # (assumes the target is coded 0/1, so this is the probability of a 1).
  test_data['preds'] = rf_model.predict_proba(test_data.loc[:, feats])[:, 1]
  fold_preds.append(test_data)

# Out-of-fold predictions for every row of `model_data`.
df_preds = pd.concat(fold_preds)
df_preds.shape

In [58]:
# Pooled out-of-fold scores for the wave-grouped random forest run.
loss = log_loss(y_true=df_preds[target], y_pred=df_preds['preds'])
# Accuracy needs hard labels, so round each probability to 0 or 1 first.
acc = accuracy_score(
    y_true=df_preds[target],
    y_pred=df_preds['preds'].round().astype(int),
)
loss, acc

(0.4085300740805961, 0.8426704014939309)

# predict on the holdout 

In [59]:
# Refit the logistic regression on every complete training row (rows missing
# any of the selected columns are dropped) before scoring the holdout.
model_data = df[['wave', *feats, target]].dropna()

# `fit` hands back the estimator itself; echoing the name on the last line
# shows the fitted model as the cell output.
log_model = LogisticRegression().fit(
    model_data[feats],
    model_data[target],
)
log_model


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
# Re-read the holdout split so this section starts from the raw file.
df_holdout = pd.read_csv(f"{project_dir}/Copy of sliced-s00e01-holdout.csv")

# Fill the gaps in every feature column with that column's mean, computed
# over the holdout rows themselves.
df_holdout = df_holdout.fillna(
    {name: df_holdout[name].mean() for name in feats}
)
df_holdout.shape

(882, 117)

In [61]:
# One probability column per class, indexed like the holdout rows.
preds = pd.DataFrame(
    log_model.predict_proba(df_holdout[feats]),
    index=df_holdout.index,
)

# `preds` keeps the probability from column 1; `preds_int` is that probability
# rounded to a hard 0/1 label.
df_holdout = df_holdout.assign(
    preds=preds[1],
    preds_int=lambda frame: frame['preds'].round(0).astype(int),
)
df_holdout.shape

(882, 119)