## Make a large amount of features

In [1]:
#Importing the necessary libraries:
import pandas as pd
import numpy as np

In [2]:
input_data_path = '../data'

In [3]:
# Load the training data
train_logs = pd.read_csv(f'{input_data_path}/train_logs.csv')
train_scores = pd.read_csv(f'{input_data_path}/train_scores.csv')

In [4]:
train_logs

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1
...,...,...,...,...,...,...,...,...,...,...,...
8405893,fff05981,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,240
8405894,fff05981,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,240
8405895,fff05981,3617,2064657,2064765,108,Replace,q,q,q => q,1031,240
8405896,fff05981,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,240


## Merging Logs and Scores:

* Merging the two datasets based on the unique essay ID.

In [5]:
# Merge logs and scores based on essay ID
train_data = pd.merge(train_logs, train_scores, on='id')

## Check out the data

In [6]:
train_data['up_time']

0             4557
1             4962
2           106571
3           106777
4           107323
            ...   
8405893    2064440
8405894    2064497
8405895    2064765
8405896    2069259
8405897    2070133
Name: up_time, Length: 8405898, dtype: int64

In [7]:
train_data.describe()

Unnamed: 0,event_id,down_time,up_time,action_time,cursor_position,word_count,score
count,8405898.0,8405898.0,8405898.0,8405898.0,8405898.0,8405898.0,8405898.0
mean,2067.649,793560.3,793658.4,98.08498,1222.964,231.4687,3.992162
std,1588.284,514945.1,514942.8,253.3985,948.5242,175.9088,0.9918788
min,1.0,106.0,252.0,0.0,0.0,0.0,0.5
25%,852.0,373184.2,373282.0,66.0,499.0,96.0,3.5
50%,1726.0,720886.0,720980.0,93.0,1043.0,200.0,4.0
75%,2926.0,1163042.0,1163141.0,122.0,1706.0,327.0,4.5
max,12876.0,8313630.0,8313707.0,447470.0,7802.0,1326.0,6.0


In [8]:
train_data.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5


In [9]:
train_data.loc[train_data['down_event'] != train_data['up_event']]

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
5115,0042269b,105,104507,104617,110,Replace,q,m,qqqqqqq qqq qqqq qqqq qqqq qq qqqqqqqq qqqqqqq...,1,4,6.0
5232,0042269b,222,158578,158638,60,Replace,q,i,qqqqqqq => q,15,14,6.0
6276,0042269b,1266,473329,473439,110,Replace,q,o,qqq qq qqq qqqqqqq qqqqqq qqqq qq => q,194,128,6.0
6825,0042269b,1815,641821,641920,99,Replace,q,t,qqq qqqqq qq qqqqqqqq qqqq qqq qqqqqqq qq qqqq...,529,190,6.0
8223,0042269b,3213,1410198,1410293,95,Replace,q,a,qqqqqqq qqqqqqqqq qqqq qqq qq qqqq qqq qqqqqqq...,1539,282,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8323360,fde20dd8,2019,495093,495205,112,Replace,q,i,qqqq => q,530,167,4.0
8328686,fde20dd8,7345,1760415,1760572,157,Replace,q,y,qq qqq qqq'q qqqqq qqqqqq qq qqq qqqqqqq => q,3218,700,4.0
8378372,ff90a677,29,50477,50647,170,Nonproduction,a,A,NoChange,18,4,4.0
8378374,ff90a677,31,50703,50837,134,Replace,q,i,q qq qqqqqqq qqqq => q,1,1,4.0


## Thoughts  

We create a model where the events of an essay are encoded in some way and then are input into our model.  

How are the results for one essay encoded? A few ways:

* Aggregation 
    * There is a loss of information in our data when we create aggregations
* Features that take entire "time series" of a given essay and encode it to have closer to "lossless" feature creation
    * We'll look into this later

## Aggregation

### Text change

In [10]:
# Function for pandas apply that check number of large text changes
def count_large_text_changes(text_changes):
    return len([tc for tc in text_changes if len(tc) > 20])

def count_extremely_large_text_changes(text_changes):
    return len([tc for tc in text_changes if len(tc) > 100])

def count_tiny_text_changes(text_changes):
    return len([tc for tc in text_changes if len(tc) < 5])

### Activity

In [11]:
unique_activities = train_data['activity'].unique()

# There are many many different types of Move From activities, so we'll remove them all and we'll treat them differently
non_move_activities = [a for a in unique_activities if 'Move From' not in a]

In [12]:
non_move_activities

['Nonproduction', 'Input', 'Remove/Cut', 'Replace', 'Paste']

**Count of activities**

In [13]:
# For a given type of activity, count the number of times it occurs
def count_nonproduction(action_list):
    return len([action for action in action_list if action == 'Nonproduction'])

# count input
def count_input(action_list):
    return len([action for action in action_list if action == 'Input'])

# count remove/cut
def count_remove(action_list):
    return len([action for action in action_list if action == 'Remove/Cut'])

# Count Replace
def count_replace(action_list):
    return len([action for action in action_list if action == 'Replace'])

# Count Paste
def count_paste(action_list):
    return len([action for action in action_list if action == 'Paste'])

**Move From features**

In [14]:
import re

# Function to split activity column into vectors
def split_activity(activity):
    return [np.array(a[1:-1].split(', '), dtype=int) for a in re.findall(r'\[[0-9]*, [0-9]*\]', activity)]

# Extract the vectors from the activity column when Move From is in the activity
def get_move_from_vectors(activity):
    if 'Move From' in activity:
        return split_activity(activity)
    else:
        return []

train_data['selection_vectors'] = train_data['activity'].map(get_move_from_vectors)

In [15]:
def distance_of_move_of_selection(selection_vectors):
    if len(selection_vectors) > 0:
        return selection_vectors[1][0] - selection_vectors[0][0]
    else:
        return 0

# How large was the selection
def size_of_moved_selection(selection_vectors):
    if len(selection_vectors) > 0:
        return selection_vectors[0][1] - selection_vectors[0][0]
    else:
        return 0

In [16]:
train_data['distance_of_moved_selection'] = train_data['selection_vectors'].map(distance_of_move_of_selection)
train_data['size_of_moved_selection'] = train_data['selection_vectors'].map(size_of_moved_selection)
train_data.drop(['selection_vectors'], axis=1, inplace=True)

## Time features

### The total amount of time spent on the essay (as a fraction of total time allowed)

In [17]:
# The total amount of time the person spent writing the essay as a fraction of the total time
def fraction_of_time_spent_writing(writing_times):
    total_time = 1800000 # Half an hour in milliseconds
    max_time = max(writing_times)
    return max_time / total_time

### Time related features pertaining to actions

In [18]:
def normalize_activities(activity):
    if 'Move From' in activity:
        return 'Move From'
    else:
        return activity

train_data['activity'] = train_data['activity'].map(normalize_activities)

In [19]:
# Calculate average time, max time and total time of different actions
action_time_features = train_data.groupby(['id', 'activity']).agg(
    {'action_time': ['mean', 'max', 'sum', 'count']}
)

In [20]:
# Flatten multi index columns
action_time_features.columns = ['_'.join(col).strip() for col in action_time_features.columns.values]

# Unstack multi index rows
action_time_features = action_time_features.unstack('activity')

# Flatten multi index columns
action_time_features.columns = ['_'.join(col).strip() for col in action_time_features.columns.values]

In [21]:
action_time_features.fillna(0, inplace=True) # Fill na with 0s 

## Add the move to feature to the agg function

## Features to create


* Actual word count
* Length of an action
* Last word count
* Amount of paragraphs (number of times /n is used)
* Create time series features and then aggregate them
* Time it took to write
* Did they review (reread) what they wrote?
* Distance of mouse movement 
* Angle of mouse movement 
* Movement around essay with non mouse movement 
* Different common patterns of activities (Maybe take a look at X sized window of actions and determine which ones are more commmon amongst different scoring groups)
* Essay structure reconstruction
    * What is the structure of the essay
    * Average size of 
* Usage of punctuation
    * Double quotations --> How many citations are there? 
    * Square brackets --> References
* Time allocation
    * What is the average time of their pauses?
    * What is the pattern of the pauses?
    * What is the patterns/length of pauses at the beginning of the essay writing?
    * What is the patterns/length of pauses at the end of the essay writing?
* Perform clustering on the features in order to try and seperate them into 4 groups (4 prompts)
* perplexity --> LLM scores
* Words per minute --> Speed of typing
* Introduction is for the end
    * Can you isolate when the paragraphs were written in the course of the total time of the essay writing)

--> Moved this info to issue

In [22]:
# Feature engineering for typing behavior features
typing_features = train_data.groupby('id').agg({
    'activity': 'count',                # Total number of activities
    'action_time': ['sum', 'mean'],     # Total and average action time
    'word_count': 'max',                # Maximum word count
    'text_change': 'nunique',           # Number of unique text changes
    'cursor_position': 'mean',           # Average cursor position
    'text_change' : count_large_text_changes,
    'text_change' : count_extremely_large_text_changes,
    'text_change' : count_tiny_text_changes,
    'activity': count_nonproduction,
    'activity': count_input,
    'activity': count_remove,
    'activity': count_replace,
    'activity': count_paste,
    'distance_of_moved_selection': ['mean', 'max'],
    'size_of_moved_selection': ['mean', 'max'],
    'up_time': fraction_of_time_spent_writing, # Amount of time spent on the essay,
    
    
})

In [23]:
typing_features.head()

Unnamed: 0_level_0,activity,action_time,action_time,word_count,text_change,cursor_position,distance_of_moved_selection,distance_of_moved_selection,size_of_moved_selection,size_of_moved_selection,up_time
Unnamed: 0_level_1,count_paste,sum,mean,max,count_tiny_text_changes,mean,mean,max,mean,max,fraction_of_time_spent_writing
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
001519c8,0,297243,116.246774,256,2429,711.163473,0.000391,5,0.004302,8,1.001094
0022f953,1,275391,112.221271,323,2199,776.205786,0.0,0,0.0,0,0.993872
0042269b,0,421201,101.837766,404,3951,731.611702,0.0,0,0.0,0,0.984261
0059420b,1,189596,121.848329,206,1455,542.537275,0.0,0,0.0,0,0.780261
0075873a,0,313702,123.943896,252,2459,600.050968,0.0,0,0.0,0,0.923596


## Flatten the Multi-Level Column Index:

* Flattening the multi-level column index for ease of use.

In [24]:
# Flatten the multi-level column index
typing_features.columns = ['_'.join(col).strip() for col in typing_features.columns.values]

In [25]:
# Merge action time features with typing features
features = pd.merge(typing_features, action_time_features, on='id')

## Merging Typing Features with Scores:

* Merging the typing behavior features with essay scores.

In [26]:
# Merge typing features with scores
features = features.merge(train_scores, on='id')

In [27]:
features.head()

Unnamed: 0,id,activity_count_paste,action_time_sum,action_time_mean,word_count_max,text_change_count_tiny_text_changes,cursor_position_mean,distance_of_moved_selection_mean,distance_of_moved_selection_max,size_of_moved_selection_mean,...,action_time_sum_Paste,action_time_sum_Remove/Cut,action_time_sum_Replace,action_time_count_Input,action_time_count_Move From,action_time_count_Nonproduction,action_time_count_Paste,action_time_count_Remove/Cut,action_time_count_Replace,score
0,001519c8,0,297243,116.246774,256,2429,711.163473,0.000391,5,0.004302,...,0.0,34130.0,876.0,2010.0,3.0,120.0,0.0,417.0,7.0,3.5
1,0022f953,1,275391,112.221271,323,2199,776.205786,0.0,0,0.0,...,71.0,23550.0,98.0,1938.0,0.0,254.0,1.0,260.0,1.0,3.5
2,0042269b,0,421201,101.837766,404,3951,731.611702,0.0,0,0.0,...,0.0,32905.0,627.0,3515.0,0.0,175.0,0.0,439.0,7.0,6.0
3,0059420b,1,189596,121.848329,206,1455,542.537275,0.0,0,0.0,...,160.0,18410.0,174.0,1304.0,0.0,99.0,1.0,151.0,1.0,2.0
4,0075873a,0,313702,123.943896,252,2459,600.050968,0.0,0,0.0,...,0.0,40199.0,0.0,1942.0,0.0,72.0,0.0,517.0,0.0,4.0
