In [1]:
#Importing the necessary libraries:
import pandas as pd
import numpy as np
from utils import *
from scipy.stats import skew, kurtosis
from collections import Counter

In [2]:
input_data_path = '../data'

In [3]:
# Load the training data
train_logs = pd.read_csv(f'{input_data_path}/train_logs.csv')
train_scores = pd.read_csv(f'{input_data_path}/train_scores.csv')

In [4]:
# Merge logs and scores based on essay ID
train_data = pd.merge(train_logs, train_scores, on='id')

In [5]:
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 
              'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']
punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                        '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']
gaps = [1, 2, 3, 5, 10, 20, 50]

In [6]:
for gap in gaps:
    train_data[f'up_time_shift_{gap}'] = train_data.groupby('id')['up_time'].shift(gap)
    train_data[f'action_time_gap_{gap}'] = train_data['down_time'] - train_data[f'up_time_shift_{gap}']

    train_data[f'cursor_position_shift_{gap}'] = train_data.groupby('id')['cursor_position'].shift(gap)
    train_data[f'cursor_position_change_{gap}'] = train_data['cursor_position'] - train_data[f'cursor_position_shift_{gap}']
    train_data[f'cursor_position_abs_change_{gap}'] = np.abs(train_data[f'cursor_position_change_{gap}'])

    train_data[f'word_count_shift_{gap}'] = train_data.groupby('id')['word_count'].shift(gap)
    train_data[f'word_count_change_{gap}'] = train_data['word_count'] - train_data[f'word_count_shift_{gap}']
    train_data[f'word_count_abs_change_{gap}'] = np.abs(train_data[f'word_count_change_{gap}'])

In [7]:
# # Count nonproduction
# def count_nonproduction(action_list):
#     return len([action for action in action_list if action == 'Nonproduction'])

In [8]:
gap_aggregations = []
for gap in gaps:
    gap_aggregations.append((f'action_time_gap_{gap}', ['max', 'min', 'mean', 'std', 'sum', skew, kurtosis]))
    gap_aggregations.append((f'cursor_position_change_{gap}', ['max', 'min', 'mean', 'std', 'sum', skew, kurtosis]))
    gap_aggregations.append((f'cursor_position_abs_change_{gap}', ['max', 'min', 'mean', 'std', 'sum', skew, kurtosis]))
    gap_aggregations.append((f'word_count_change_{gap}', ['max', 'min', 'mean', 'std', 'sum', skew, kurtosis]))
    gap_aggregations.append((f'word_count_abs_change_{gap}', ['max', 'min', 'mean', 'std', 'sum', skew, kurtosis]))

In [9]:
def distance_of_move_of_selection(selection_vectors):
    if len(selection_vectors) > 0:
        return selection_vectors[1][0] - selection_vectors[0][0]
    else:
        return 0

# How large was the selection
def size_of_moved_selection(selection_vectors):
    if len(selection_vectors) > 0:
        return selection_vectors[0][1] - selection_vectors[0][0]
    else:
        return 0

In [10]:
agg_dict = {
    'activity': 'count',                # Total number of activities
    'action_time': ['sum', 'mean'],     # Total and average action time
    'word_count': 'max',                # Maximum word count
    'text_change': 'nunique',           # Number of unique text changes
    'cursor_position': 'mean',           # Average cursor position
    'text_change' : count_large_text_changes,
    'text_change' : count_extremely_large_text_changes,
    'text_change' : count_tiny_text_changes,
    'activity': count_nonproduction,
    'activity': count_input,
    'activity': count_remove,
    'activity': count_replace,
    'activity': count_paste,
    'distance_of_moved_selection': ['mean', 'max'],
    'size_of_moved_selection': ['mean', 'max'],
    'up_time': fraction_of_time_spent_writing, # Amount of time spent on the essay,
}

In [11]:
agg_dict.update(gap_aggregations)

In [12]:
# Feature engineering for typing behavior features
typing_features = train_data.groupby('id').agg(agg_dict)

KeyError: "Column(s) ['distance_of_moved_selection', 'size_of_moved_selection'] do not exist"

# Other feats

In [None]:
def activity_counts(df):
    tmp_df = df.groupby('id').agg({'activity': list}).reset_index()
    ret = list()
    for li in tmp_df['activity'].values:
        items = list(Counter(li).items())
        di = dict()
        for k in activities:
            di[k] = 0
        for item in items:
            k, v = item[0], item[1]
            if k in di:
                di[k] = v
        ret.append(di)
    ret = pd.DataFrame(ret)
    cols = [f'activity_{i}_count' for i in range(len(ret.columns))]
    ret.columns = cols
    return ret


def event_counts(df, colname):
    tmp_df = df.groupby('id').agg({colname: list}).reset_index()
    ret = list()
    for li in tmp_df[colname].values:
        items = list(Counter(li).items())
        di = dict()
        for k in events:
            di[k] = 0
        for item in items:
            k, v = item[0], item[1]
            if k in di:
                di[k] = v
        ret.append(di)
    ret = pd.DataFrame(ret)
    cols = [f'{colname}_{i}_count' for i in range(len(ret.columns))]
    ret.columns = cols
    return ret


def text_change_counts(df):
    tmp_df = df.groupby('id').agg({'text_change': list}).reset_index()
    ret = list()
    for li in tmp_df['text_change'].values:
        items = list(Counter(li).items())
        di = dict()
        for k in text_changes:
            di[k] = 0
        for item in items:
            k, v = item[0], item[1]
            if k in di:
                di[k] = v
        ret.append(di)
    ret = pd.DataFrame(ret)
    cols = [f'text_change_{i}_count' for i in range(len(ret.columns))]
    ret.columns = cols
    return ret

def match_punctuations(df):
    tmp_df = df.groupby('id').agg({'down_event': list}).reset_index()
    ret = list()
    for li in tmp_df['down_event'].values:
        cnt = 0
        items = list(Counter(li).items())
        for item in items:
            k, v = item[0], item[1]
            if k in punctuations:
                cnt += v
        ret.append(cnt)
    ret = pd.DataFrame({'punct_cnt': ret})
    return ret


def get_input_words(df):
    tmp_df = df[(~df['text_change'].str.contains('=>'))&(df['text_change'] != 'NoChange')].reset_index(drop=True)
    tmp_df = tmp_df.groupby('id').agg({'text_change': list}).reset_index()
    tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: ''.join(x))
    tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
    tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
    tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df.drop(['text_change'], axis=1, inplace=True)
    return tmp_df

In [41]:
# counts
print("Engineering activity counts data")
tmp_df = activity_counts(train_data)
feats = pd.concat([train_data, tmp_df], axis=1)

print("Engineering event counts data")
# tmp_df = event_counts(train_data, 'down_event')
# feats = pd.concat([feats, tmp_df], axis=1)
# tmp_df = event_counts(train_data, 'up_event')
# feats = pd.concat([feats, tmp_df], axis=1)

print("Engineering text change counts data")
tmp_df = text_change_counts(train_data)
feats = pd.concat([feats, tmp_df], axis=1)

print("Engineering punctuation counts data")
tmp_df = match_punctuations(train_data)
feats = pd.concat([feats, tmp_df], axis=1)

# input words
print("Engineering input words data")
tmp_df = get_input_words(train_data)
feats = pd.merge(feats, tmp_df, on='id', how='left')

# compare feats
print("Engineering ratios data")
feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']

Engineering activity counts data
Engineering event counts data


MemoryError: Unable to allocate 1.00 GiB for an array with shape (16, 8405898) and data type float64

In [None]:
feats.head()