#### Using info from yuji96/reproduce-essay.ipynb

In [3]:
import textwrap
import re
import pandas as pd


In [4]:
# Load the event log CSV into `df`; the cells below group and filter it by `id`.
df = pd.read_csv("../data/train_logs.csv")

In [5]:
def print_essay(s, width=100):
    """Print `s` wrapped to lines of at most `width` characters.

    Words longer than `width` are kept whole rather than split, and existing
    whitespace characters (including newlines) are left as they are instead
    of being converted to spaces.
    """
    wrapper = textwrap.TextWrapper(
        width=width,
        break_long_words=False,
        replace_whitespace=False,
    )
    # fill() wraps the text and joins the resulting lines with "\n".
    print(wrapper.fill(s))

In [2]:
def extract_text(data):
    text = ''
    last_cur = 0
    for i, row in data.iterrows():
        match row.activity:
            case 'Input':
                text = text[:row.cursor_position-1] + row.text_change[:1] + text[row.cursor_position-1:]
            case 'Remove/Cut':
                text = text[:row.cursor_position] + text[row.cursor_position + len(row.text_change):]
            case 'Replace':
                t1, t2 = row.text_change.split(' => ')
                text = text[:row.cursor_position] + text[row.cursor_position+len(t1):]
            case 'Paste':
                text = text[:last_cursor] + row.text_change + text[last_cursor:]
        last_cursor = row.cursor_position

    return text

#### TODO: optimize the essay reconstruction (too slow and memory-hungry on the full dataset)

In [5]:
# test = df.groupby('id').apply(extract_text) # Run time is too long, OOM error

### Get some essay examples

In [6]:
# Distinct values of the `id` column, and how many there are.
ids = df['id'].unique()
print(len(ids))

2471


Select the id with the fewest logged events (a proxy for the shortest essay) to avoid OOM.

In [7]:
# Per-id count of non-null values in every column.
counts = df.groupby('id').count()
# The id whose 'activity' count is smallest, i.e. the shortest log.
selected_id = counts['activity'].sort_values().index[0]

# Keep only the log rows belonging to that id
selected_df = df[df['id'] == selected_id]

In [8]:
# Display the log rows kept for the selected id.
selected_df

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
2870031,5a3f0d07,1,1208,1276,68,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2870032,5a3f0d07,2,11791,11893,102,Nonproduction,CapsLock,CapsLock,NoChange,0,0
2870033,5a3f0d07,3,11899,11963,64,Input,q,q,q,1,1
2870034,5a3f0d07,4,11971,12052,81,Nonproduction,CapsLock,CapsLock,NoChange,1,1
2870035,5a3f0d07,5,12083,12132,49,Input,q,q,q,2,1
...,...,...,...,...,...,...,...,...,...,...,...
2870288,5a3f0d07,258,111669,111781,112,Input,q,q,q,205,35
2870289,5a3f0d07,259,111773,111845,72,Input,q,q,q,206,35
2870290,5a3f0d07,260,112685,112741,56,Input,.,.,.,207,35
2870291,5a3f0d07,261,1603885,1604003,118,Nonproduction,Leftclick,Leftclick,NoChange,207,35


### Reconstruct

In [9]:
# Replay the selected id's events to rebuild its text.
essay = extract_text(selected_df)

In [10]:
# Display the reconstructed text.
essay

'qqqqqq qqqqqq qq qqq qqqqqqqq qqq qqq qqqqqqq qq qqqqqqqqq qqqqqqqqqq qqq qqqqqq qq qqqqq qqqq qqq qqqqq qqqqqqqqqq qqq qqqqqqqqq. qqqqqqq qqqqq qq qqqqqq qqq qq qqqqqqq qqqq qqq qqqq qqqqq qq qqqqqqq qqqqq.'