# Fitting notebook.

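The data preprocessing steps are now in place, so this notebook moves on to model fitting: it trains one classifier per question, compares training and validation accuracy, and generates the submission.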

In [1]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from preprocessing_EventTypes import *

In [2]:
# Data pre-processing: unpack the five objects returned by fullProcessing
# for the raw events file and the raw labels file.
(
    train_x,
    train_labels,
    valid_x,
    valid_labels,
    eventLabels,
) = fullProcessing('./train.csv', './train_labels.csv')


In [4]:
def _level_group_for(q_no):
    """Return the level-group label used for question ``q_no``.

    Questions up to 3 map to '0-4', questions up to 13 map to '5-12' and
    every later question maps to '13-22'. There is always a result, so the
    caller can never see a stale or unbound group.
    """
    if q_no <= 3:
        return '0-4'
    if q_no <= 13:
        return '5-12'
    return '13-22'


def _question_targets(features, labels, q_no):
    """Return the label rows of question ``q_no``, indexed by 'session'.

    When possible the rows are re-ordered to follow ``features`` row by row:
    scikit-learn pairs X and y by position, not by index, so two frames that
    were filtered independently must be brought into the same order.

    NOTE(review): alignment assumes the index level of ``features`` other
    than 'level_group' holds the same session ids as ``labels['session']`` --
    confirm against fullProcessing. If that cannot be established (different
    index layout, unknown ids, duplicated label sessions) the labels are
    returned in their original order, i.e. the previous positional behaviour.
    """
    targets = labels.loc[labels.q == q_no].set_index('session')

    index = features.index
    if index.nlevels == 2 and 'level_group' in index.names:
        sessions = index.droplevel('level_group')
        if targets.index.is_unique and sessions.isin(targets.index).all():
            targets = targets.loc[sessions]
    return targets


def fitModels(train_x, train_labels, valid_x, valid_labels, random_state=None):
    """Fit one random-forest pipeline per question (1 to 18) and score it.

    Parameters
    ----------
    train_x, valid_x : DataFrame
        Feature frames whose index has a 'level_group' level; only the rows
        of the level group matching the current question are used.
    train_labels, valid_labels : DataFrame
        Label frames with the columns 'q' (question number), 'session' and
        'correct' (the target).
    random_state : int or None, optional
        Forwarded to every RandomForestClassifier. The default (None) keeps
        the original unseeded behaviour; pass an int for reproducible fits.

    Returns
    -------
    models : dict
        Fitted pipelines keyed by '<level group>_<question number>'.
    train_eval, valid_eval : dict
        Accuracy on the training / validation rows, keyed by question number.
    """
    models = {}
    train_eval = {}
    valid_eval = {}

    for q_no in range(1, 19):
        grp = _level_group_for(q_no)
        print("### q_no", q_no, "grp", grp)

        # Keep only the feature rows that belong to this question's level group.
        train_df = train_x.loc[train_x.index.isin([grp], level='level_group')]
        valid_df = valid_x.loc[valid_x.index.isin([grp], level='level_group')]

        # Labels for this question, ordered like the feature rows when possible.
        train_targets = _question_targets(train_df, train_labels, q_no)
        valid_targets = _question_targets(valid_df, valid_labels, q_no)

        # Model pipeline (a single estimator step named 'est').
        pipe = Pipeline([('est', RandomForestClassifier(random_state=random_state))])
        model = pipe.fit(train_df, train_targets['correct'])

        # The submission code looks models up with this exact key format.
        models[f'{grp}_{q_no}'] = model

        train_eval[q_no] = accuracy_score(train_targets['correct'], model.predict(train_df))
        valid_eval[q_no] = accuracy_score(valid_targets['correct'], model.predict(valid_df))

    return models, train_eval, valid_eval


In [None]:
# Fit one pipeline per question; keep the fitted models and both accuracy dicts.
models, train_eval, valid_eval = fitModels(
    train_x=train_x,
    train_labels=train_labels,
    valid_x=valid_x,
    valid_labels=valid_labels,
)

### q_no 1 grp 0-4
### q_no 2 grp 0-4


### Inspect Accuracy of Individual Models

In [None]:
# Per-question report: both dicts are keyed by question number (see fitModels).
for name, valid_acc in valid_eval.items():
    train_acc = train_eval[name]
    print(f"question {name}: val accuracy {valid_acc:.4f}, train accuracy {train_acc:.4f}")

# Mean validation accuracy. The original referenced `evaluation_dict`, which is
# never defined in this notebook, and divided by a hard-coded 18; use the
# validation dict and its actual size instead.
print("\nAverage accuracy", sum(valid_eval.values()) / len(valid_eval))

In [18]:
# Sanity check on the training rows: for every question, print the mean of the
# model's predictions next to the mean of the 'correct' labels.
for q_no in range(1, 19):

    # Level group of the question: up to 3 -> '0-4', up to 13 -> '5-12', else '13-22'.
    grp = '0-4' if q_no <= 3 else ('5-12' if q_no <= 13 else '13-22')
    print("### q_no", q_no, "grp", grp)

    # Feature rows of that level group.
    # NOTE(review): valid_df / valid_targets are not used further down in this cell.
    train_df = train_x.loc[train_x.index.isin([grp], level='level_group')]
    valid_df = valid_x.loc[valid_x.index.isin([grp], level='level_group')]

    # Label rows of this question, indexed by session.
    train_targets = train_labels.loc[train_labels['q'] == q_no].set_index('session')
    valid_targets = valid_labels.loc[valid_labels['q'] == q_no].set_index('session')

    # Models are stored under '<level group>_<question number>'.
    preds = models[f'{grp}_{q_no}'].predict(train_df)

    print('frac predicted', preds.sum() / len(preds))
    print('targets', train_targets['correct'].sum() / len(train_targets))
    
    
    
    

### q_no 1 grp 0-4
frac predicted 0.7277309141068492
targets 0.7277309141068492
### q_no 2 grp 0-4
frac predicted 0.9791500875378004
targets 0.9791500875378004
### q_no 3 grp 0-4
frac predicted 0.9332590588360126
targets 0.9332590588360126
### q_no 4 grp 5-12
frac predicted 0.7986630590482253
targets 0.7986630590482253
### q_no 5 grp 5-12
frac predicted 0.5479866305904822
targets 0.5479866305904822
### q_no 6 grp 5-12
frac predicted 0.7751074327550533
targets 0.7751074327550533
### q_no 7 grp 5-12
frac predicted 0.73277096928219
targets 0.73277096928219
### q_no 8 grp 5-12
frac predicted 0.6183882434081384
targets 0.6183882434081384
### q_no 9 grp 5-12
frac predicted 0.7361133216616266
targets 0.7361133216616266
### q_no 10 grp 5-12
frac predicted 0.5059154331794791
targets 0.5059154331794791
### q_no 11 grp 5-12
frac predicted 0.6445965303199108
targets 0.6445965303199108
### q_no 12 grp 5-12
frac predicted 0.8614250092843122
targets 0.8614250092843122
### q_no 13 grp 5-12
frac predic

In [33]:
# Inspect the low-level tree object of the first tree in one fitted forest.
# The pipeline is looked up by its key in `models` because `model` is local to
# fitModels and is not defined on a fresh kernel; the dangling trailing '.'
# (a SyntaxError) is removed. Change the key to inspect another question.
models['0-4_1'].named_steps['est'].estimators_[0].tree_

<sklearn.tree._tree.Tree at 0x131170c00>

### Generate submission

In [None]:
# jo_wilder is not part of the import cell at the top of this notebook.
# NOTE(review): presumably the competition's submission API, available only
# inside the Kaggle runtime -- confirm before running locally.
import jo_wilder
# Create the environment object; `env.predict` is called once per batch in the
# submission loop.
env = jo_wilder.make_env()
# Iterator of (test, sample_submission) pairs consumed by the submission loop.
iter_test = env.iter_test()

In [None]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

# Half-open question ranges (first, last + 1) answered after each level group;
# they mirror the question -> level-group mapping used when fitting.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    test_df = feature_engineer(test)
    # NOTE(review): assumes the engineered frame still exposes `level_group`
    # as a column -- confirm against feature_engineer.
    grp = test_df.level_group.values[0]
    q_start, q_stop = limits[grp]
    for t in range(q_start, q_stop):
        # Models are stored under '<level group>_<question number>'.
        model = models[f'{grp}_{t}']
        predictions = model.predict(test_df)
        # Match the question number exactly: a plain substring test for 'q1'
        # would also select 'q10'..'q18' rows, so reject a following digit.
        mask = sample_submission.session_id.str.contains(rf'q{t}(?!\d)')
        sample_submission.loc[mask,'correct'] = predictions.flatten()

    # Hand the completed batch back to the competition environment.
    env.predict(sample_submission)