In [83]:
import pandas as pd
import numpy as np
import skops.io as sio

In [86]:
model_path = '../model_weights/Random Forest.skops'
output_path = '../processed/submission.csv'

In [63]:
def back_track_col(df):
    df.loc[:, 'prev_cursor_position'] = df.groupby('id').cursor_position.shift(1, fill_value=0) 
    print(df.columns)
    df['back_track'] = df['cursor_position'] - df['prev_cursor_position']
    df = df.drop('prev_cursor_position', axis=1)
    df[df['back_track']<0]['back_track'] = 0
    return df

In [64]:
def summarization_base(df):
    summarization_df = df.groupby('id').agg(
    word_count=('word_count', 'max'),
    action_time_mean=('action_time', 'mean'),
    action_time_max=('action_time', 'max'),
    action_time_total=('action_time', 'sum'),
    action_time_50=('action_time', 'median'),
    event_count = ('event_id', lambda x: np.count_nonzero(x)),
    paste_count=('activity', lambda x: np.count_nonzero(x == 'Paste')),
    move_activity=('activity', lambda x: np.count_nonzero(x.str.startswith('Move'))),
    nonproduction_count=('activity', lambda x: np.count_nonzero(x == 'Nonproduction')),
    input_count=('activity', lambda x: np.count_nonzero(x == 'Input')),
    remove_cut_count=('activity', lambda x: np.count_nonzero(x == 'Remove/Cut')),
    replace_count=('activity', lambda x: np.count_nonzero(x == 'Replace')),
    nochange_count=('text_change', lambda x: np.count_nonzero(x == 'NoChange')),
    capslock_count=('down_event', lambda x: np.count_nonzero(x == 'CapsLock')),
    paragraph_count=('down_event', lambda x: np.count_nonzero(x == 'Enter')),
    space_count=('down_event', lambda x: np.count_nonzero(x == 'Space')),
    question_count=('down_event', lambda x: np.count_nonzero(x == '?')),
    exclamation_count=('down_event', lambda x: np.count_nonzero(x == '!')),
    sub_texts_count=('down_event', lambda x: np.count_nonzero(x.isin(['(', '[', '{']))),
    amount_back_track  =('back_track', lambda x: np.count_nonzero(x)),
    mean_back_track = ('back_track', lambda x: np.sum(x)/np.count_nonzero(x)),
    max_back_track = ('back_track', 'max'),
    median_back_track = ('back_track', lambda x: np.median(x[x > 0])),
    one_back_track = ('back_track', lambda x: len(x[x == 1]))  
).reset_index()
    return summarization_df

In [65]:
testf = pd.read_csv('../data/test_logs.csv')

In [72]:
back_track_col(testf)
test_summarized = summarization_base(testf)
test_summarized.head()

Index(['id', 'event_id', 'down_time', 'up_time', 'action_time', 'activity',
       'down_event', 'up_event', 'text_change', 'cursor_position',
       'word_count', 'prev_cursor_position', 'back_track'],
      dtype='object')


Unnamed: 0,id,word_count,action_time_mean,action_time_max,action_time_total,action_time_50,event_count,paste_count,move_activity,nonproduction_count,...,paragraph_count,space_count,question_count,exclamation_count,sub_texts_count,amount_back_track,mean_back_track,max_back_track,median_back_track,one_back_track
0,0000aaaa,0,86.0,87,172,86.0,2,0,0,0,...,0,2,0,0,0,1,1.0,1,1.0,1
1,2222bbbb,1,56.5,67,113,56.5,2,0,0,0,...,0,0,0,0,0,1,1.0,1,1.0,1
2,4444cccc,3,71.2,94,356,56.0,5,0,1,0,...,0,2,0,0,0,3,1.0,1,1.0,3


In [85]:
clf = sio.load(model_path, trusted=True)
predictions = clf.predict(test_summarized.drop('id', axis=1).values)
output = test_summarized.copy()
output.loc[:, 'score'] = predictions
output[['id', 'score']].to_csv(output_path, index=False)