# Submission notebook

Streamlined notebook for submission. Inherits from baselinemodel. Edit me. 

In [1]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
# data proprocession functions

eventVars = ['event_name', 'name', 'level', 'page', 'text', 'fqid', 'room_fqid', 'text_fqid' ]
eventVars.sort()

numericalVars = ['elapsed_time','room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
        'hover_duration']

def readData(fileLocation):
    dtypes={
        'elapsed_time':np.int32,
        'event_name':'category',
        'name':'category',
        'level':'category',
        'page':'category',
        'room_coor_x':np.float32,
        'room_coor_y':np.float32,
        'screen_coor_x':np.float32,
        'screen_coor_y':np.float32,
        'hover_duration':np.float32,
        'text':'category',
        'fqid':'category',
        'room_fqid':'category',
        'text_fqid':'category',
        'fullscreen':'category',
        'hq':'category',
        'music':'category',
        'level_group':'category'}
    data = pd.read_csv(fileLocation, dtype=dtypes)

    for column in eventVars:
        data[column] = data[column].cat.add_categories(['-1'])
        data[column] = data[column].fillna('-1')
    return data


def readLabels(fileLocation):
    """Read the labels dataset"""
    labels = pd.read_csv(fileLocation)
    labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]))
    labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]))
    
    return labels


def makeEventLabels(trainData):
    """Make a table containing the labels for any set of event values"""
    eventGrouping = trainData.groupby(eventVars, observed=True)
    eventLabels = pd.DataFrame( eventGrouping.size().index,
                    columns=['event_profile'])

    eventLabels['event_label'] = pd.DataFrame( 
            map(lambda i: 'e_'+ str(i), range(len(eventLabels))),
            dtype='category')
    return eventLabels


def makeEventTable(data, eventLabels):
    """makes a table grouped by event types, session_id, and level_group"""
    eventColumns = ['session_id', 'level_group', *eventVars]
    eventTable = data[ eventColumns ]

    eventTable = eventTable.groupby(eventColumns, observed=True).size().to_frame('counts')
    eventTable = eventTable.reset_index(['session_id', 'level_group'])

    eventDetails = pd.DataFrame( eventTable.index, columns=['event_profile'])
    eventDetails = eventDetails.merge( eventLabels, on='event_profile', how='left' )

    eventTable = eventTable.reset_index().drop(columns=eventVars)
    eventTable['event_label'] = eventDetails['event_label']
    

    eventCounts = eventTable.pivot(index=['session_id', 'level_group'], columns='event_label', values='counts')
    eventCounts = eventCounts.fillna(0)
    return eventCounts


def splitDataset(dataset, labels, train_ratio=0.80):
    """Random split of session_ids in the data.
       The dataset should be indexed by ['session_id', 'level_group']
    """
    
    # `session_id` and `level_group` are the indices of our feature engineered dataset
    if dataset.index.names != ['session_id', 'level_group']:
        raise Exception( 'Data must be indexed by [session_id, level_group]' )
    
    sessionIds = dataset.index.get_level_values('session_id').unique()
    trainIds, valIds = train_test_split(sessionIds, train_size=train_ratio )

    trainData = dataset.loc[trainIds]
    valData =  dataset.loc[valIds]
    
    trainLabels = labels[ labels.session.isin(trainIds) ]
    valLabels = labels[ labels.session.isin(valIds) ]
    
    return trainData, trainLabels, valData, valLabels



def fullProcessing(dataFile, labelFile):
    trainData = readData(dataFile)
    labelData = readLabels(labelFile)


    eventLabels = makeEventLabels(trainData)
    
    eventTable = makeEventTable(trainData, eventLabels)
    
    # split datasets
    trainData, trainLabels, valData, valLabels = splitDataset(eventTable, labelData, train_ratio=0.8)
    
    return trainData, trainLabels, valData, valLabels, eventLabels


In [3]:
# data pre-processing
train_x, train_labels, valid_x, valid_labels, eventLabels  = fullProcessing( './train.csv', './train_labels.csv')


In [4]:
# initialization
VAL_USER_LIST = valid_x.index.get_level_values('session_id').unique()
prediction_df = pd.DataFrame(data=np.zeros(
    (len(VAL_USER_LIST), 18)), index=VAL_USER_LIST)

models = {}

evaluation_dict = {}

In [7]:
# Iterate through questions 1 to 18 to train models for each question

for q_no in range(1, 19):

    # Select level group for the question based on the q_no.
    if q_no <= 3:
        grp = '0-4'
    elif q_no <= 13:
        grp = '5-12'
    elif q_no <= 22:
        grp = '13-22'
    print("### q_no", q_no, "grp", grp)

    # Filter the rows in the datasets based on the selected q_no and level group.
    train_df = train_x.loc[ train_x.index.isin([grp], level='level_group') ]
    valid_df = valid_x.loc[ valid_x.index.isin([grp], level='level_group') ]
    
    train_targets = train_labels.loc[train_labels.q == q_no].set_index('session')
    valid_targets = valid_labels.loc[valid_labels.q == q_no].set_index('session')

    # Model pipeline
    pipe = Pipeline([('est', RandomForestClassifier())])

    # Train
    model = pipe.fit(train_df, train_targets['correct'])

    # Store the model
    models[f'{grp}_{q_no}'] = model

    # Evaluate the trained model on the validation dataset and store the
    # evaluation accuracy in the `evaluation_dict`.
    evaluation_dict[q_no] = accuracy_score(
        valid_targets['correct'], model.predict(valid_df))

    # Use the trained model to make predictions on the validation dataset and
    # store the predicted values in the `prediction_df` dataframe.
    predict = model.predict(valid_df)
    prediction_df.loc[valid_targets.index, q_no-1] = predict.flatten()


### q_no 1 grp 0-4
### q_no 2 grp 0-4
### q_no 3 grp 0-4
### q_no 4 grp 5-12
### q_no 5 grp 5-12
### q_no 6 grp 5-12
### q_no 7 grp 5-12
### q_no 8 grp 5-12
### q_no 9 grp 5-12
### q_no 10 grp 5-12
### q_no 11 grp 5-12
### q_no 12 grp 5-12
### q_no 13 grp 5-12
### q_no 14 grp 13-22
### q_no 15 grp 13-22
### q_no 16 grp 13-22
### q_no 17 grp 13-22
### q_no 18 grp 13-22


### Inspect Accuracy of Individual Models

In [8]:
for name, value in evaluation_dict.items():
    print(f"question {name}: accuracy {value:.4f}")

print("\nAverage accuracy", sum(evaluation_dict.values())/18)

question 1: accuracy 0.7263
question 2: accuracy 0.9775
question 3: accuracy 0.9370
question 4: accuracy 0.7965
question 5: accuracy 0.5237
question 6: accuracy 0.7791
question 7: accuracy 0.7490
question 8: accuracy 0.6056
question 9: accuracy 0.7369
question 10: accuracy 0.4874
question 11: accuracy 0.6378
question 12: accuracy 0.8691
question 13: accuracy 0.7273
question 14: accuracy 0.6983
question 15: accuracy 0.4923
question 16: accuracy 0.7316
question 17: accuracy 0.6582
question 18: accuracy 0.9476

Average accuracy 0.7267251337906973


In [18]:
for q_no in range(1, 19):

    # Select level group for the question based on the q_no.
    if q_no <= 3:
        grp = '0-4'
    elif q_no <= 13:
        grp = '5-12'
    elif q_no <= 22:
        grp = '13-22'
    print("### q_no", q_no, "grp", grp)

    # Filter the rows in the datasets based on the selected q_no and level group.
    train_df = train_x.loc[ train_x.index.isin([grp], level='level_group') ]
    valid_df = valid_x.loc[ valid_x.index.isin([grp], level='level_group') ]
                                               
    train_targets = train_labels.loc[train_labels.q == q_no].set_index('session')
    valid_targets = valid_labels.loc[valid_labels.q == q_no].set_index('session')
    
    preds = models[grp+'_'+str(q_no)].predict(train_df)
    
    print('frac predicted', preds.sum() / len(preds) )
    print( 'targets', train_targets.correct.sum() / len(train_targets))
    
    
    
    

### q_no 1 grp 0-4
frac predicted 0.7277309141068492
targets 0.7277309141068492
### q_no 2 grp 0-4
frac predicted 0.9791500875378004
targets 0.9791500875378004
### q_no 3 grp 0-4
frac predicted 0.9332590588360126
targets 0.9332590588360126
### q_no 4 grp 5-12
frac predicted 0.7986630590482253
targets 0.7986630590482253
### q_no 5 grp 5-12
frac predicted 0.5479866305904822
targets 0.5479866305904822
### q_no 6 grp 5-12
frac predicted 0.7751074327550533
targets 0.7751074327550533
### q_no 7 grp 5-12
frac predicted 0.73277096928219
targets 0.73277096928219
### q_no 8 grp 5-12
frac predicted 0.6183882434081384
targets 0.6183882434081384
### q_no 9 grp 5-12
frac predicted 0.7361133216616266
targets 0.7361133216616266
### q_no 10 grp 5-12
frac predicted 0.5059154331794791
targets 0.5059154331794791
### q_no 11 grp 5-12
frac predicted 0.6445965303199108
targets 0.6445965303199108
### q_no 12 grp 5-12
frac predicted 0.8614250092843122
targets 0.8614250092843122
### q_no 13 grp 5-12
frac predic

In [33]:
model.named_steps['est'].estimators_[0].tree_.

<sklearn.tree._tree.Tree at 0x131170c00>

### Generate submission

In [None]:
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [None]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    test_df = feature_engineer(test)
    grp = test_df.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        model = models[f'{grp}_{t}']
        predictions = model.predict(test_df)
        mask = sample_submission.session_id.str.contains(f'q{t}')
        sample_submission.loc[mask,'correct'] = predictions.flatten()
    
    env.predict(sample_submission)