In [1]:
import pandas as pd



In [5]:
!ls data

sample_submission.csv train_essays_02.csv   train_logs.csv.zip
test_logs.csv         train_logs.csv        train_scores.csv


In [3]:
!ls artifacts

test_agg_fe.csv            train_feats.csv
test_feats.csv             train_feats_v2.csv
test_feats_v2.csv          train_paragraph_agg_df.csv
test_paragraph_agg_df.csv  train_sent_agg_df.csv
train_agg_fe.csv


In [23]:
# Directory roots: competition inputs vs. derived feature tables.
INPUT_DIR = "./data"
ARTIFACTS_DIR = "./artifacts"

# Event-level logs (one row per logged event).
train_logs = pd.read_csv(f"{INPUT_DIR}/train_logs.csv")
test_logs = pd.read_csv(f"{INPUT_DIR}/test_logs.csv")

# Feature tables, first version.
train_feats = pd.read_csv(f"{ARTIFACTS_DIR}/train_feats.csv")
test_feats = pd.read_csv(f"{ARTIFACTS_DIR}/test_feats.csv")

# Feature tables, second version.
train_featsv2 = pd.read_csv(f"{ARTIFACTS_DIR}/train_feats_v2.csv")
test_featsv2 = pd.read_csv(f"{ARTIFACTS_DIR}/test_feats_v2.csv")

# Sentence- and paragraph-level aggregates.
# Only the train split is read for the sentence aggregates; the paragraph
# aggregates are read for both splits.
train_sent_agg_df = pd.read_csv(f"{ARTIFACTS_DIR}/train_sent_agg_df.csv")
train_paragraph_agg_df = pd.read_csv(f"{ARTIFACTS_DIR}/train_paragraph_agg_df.csv")
test_paragraph_agg_df = pd.read_csv(f"{ARTIFACTS_DIR}/test_paragraph_agg_df.csv")

# Aggregate features stored in the *_agg_fe files.
train_agg_fe = pd.read_csv(f"{ARTIFACTS_DIR}/train_agg_fe.csv")
test_agg_fe = pd.read_csv(f"{ARTIFACTS_DIR}/test_agg_fe.csv")

In [24]:
# Collect every loaded frame in one list so their shapes can be compared side by side.
data = [
    train_logs, test_logs,
    train_feats, test_feats,
    train_featsv2, test_featsv2,
    train_sent_agg_df,
    train_paragraph_agg_df, test_paragraph_agg_df,
    train_agg_fe, test_agg_fe,
]

# (rows, columns) of each frame, in the same order as `data`.
[frame.shape for frame in data]

[(8405898, 11),
 (6, 11),
 (2471, 287),
 (3, 263),
 (2471, 297),
 (3, 272),
 (2471, 26),
 (2471, 26),
 (3, 26),
 (2471, 46),
 (3, 46)]

In [9]:
# Preview the first rows of the training log loaded from train_logs.csv.
train_logs.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


In [15]:
# (rows, columns) of the training log.
train_logs.shape

(8405898, 11)

In [10]:
# Preview the first rows of the test log loaded from test_logs.csv.
test_logs.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0


In [16]:
# (rows, columns) of the test log.
test_logs.shape

(6, 11)

**Pipeline order:** `train_logs` + `test_logs` $\rightarrow$ sentence and paragraph aggregate DataFrames for train and test $\rightarrow$ apply preprocessor v1 to `train_feats`, keep `test_feats` $\rightarrow$ create `train_agg_fe_df` (aggregate features, grouped by `id`, for `down_time`, `up_time`, `action_time`, `cursor_position`, and `word_count`) and its test counterpart, `test_agg_fe`.

In the provided code, time features are constructed using a technique called "lagging". This involves creating new columns in the dataset that represent values from previous time steps. This is particularly useful in time series data or any sequential data like keystrokes, as it helps to capture the temporal dynamics of the dataset. 

Here's a detailed explanation of how time features are constructed using lag:

1. **Gap Definition**: The variable `self.gaps` contains a list of integers representing the lag steps. These are the time gaps for which features will be created.

2. **Up Time Lagging**: 
   - For each value in `self.gaps`, a new column is created in the dataframe `df` which is a lagged version of the `up_time` column. 
   - This is done using `df.groupby('id')['up_time'].shift(gap)`, where `gap` is the number of steps to lag. This shifts the `up_time` column downwards by `gap` steps within each group of 'id'.
   - This lagged `up_time` essentially represents the 'up_time' of a previous event (like a previous keystroke), depending on the size of the gap.

3. **Action Time Gap Calculation**:
   - For each gap, a new feature called `action_time_gap` is calculated. 
   - This is done by subtracting the lagged `up_time` (from the previous step) from the current `down_time` in the dataframe: `df['down_time'] - df[f'up_time_shift{gap}']`.
   - The `action_time_gap` represents the time interval between two events – the current event's `down_time` and the `up_time` of an event that occurred `gap` steps earlier.

4. **Significance of Lagging**:
   - Lagging helps to capture the temporal sequence and dependencies between events. 
   - In the context of typing data, these lagged time features can reveal patterns such as typing speed, the regularity of keystrokes, and delays between specific keystrokes.
   - Different gaps provide different temporal resolutions. For example, a gap of 1 could capture immediate sequential dependencies, while larger gaps like 10 or 20 could capture longer-term patterns or rhythms in typing.

5. **Cleaning Up**: After the necessary features are created, the intermediate lagged columns (like `up_time_shift{gap}`) are dropped from the dataframe to keep only the relevant features (like `action_time_gap{gap}`).

This lagging technique effectively transforms the sequential keystroke data into a structured format that can be used for predictive modeling, helping to uncover temporal patterns and relationships inherent in the typing behavior.

In [38]:
# Display train_feats (loaded from artifacts/train_feats.csv); the rendered view is abbreviated by pandas.
train_feats

Unnamed: 0,id,event_id_max,up_time_max,action_time_max,action_time_min,action_time_mean,action_time_std,action_time_quantile,action_time_sem,action_time_sum,...,text_change_14_count,punct_cnt,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio
0,001519c8,2557,1801969,2259,0,116.246774,91.797374,112.0,1.815369,297243,...,-inf,37,366,5.325137,20,3.487804,0.000142,0.100117,0.001419,0.832534
1,0022f953,2454,1788969,1758,0,112.221271,55.431189,115.0,1.118966,275391,...,-inf,53,385,4.410390,33,3.199496,0.000181,0.131622,0.001372,0.828944
2,0042269b,4136,1771669,3005,0,101.837766,82.383766,94.0,1.281007,421201,...,-inf,47,627,5.446571,25,3.474895,0.000228,0.097679,0.002335,0.759751
3,0059420b,1556,1404469,806,0,121.848329,113.768226,110.0,2.884139,189596,...,-inf,18,251,4.609562,19,2.949601,0.000147,0.132391,0.001108,0.835531
4,0075873a,2531,1662472,701,0,123.943896,62.082013,129.0,1.234013,313702,...,-inf,66,412,4.766990,18,2.986064,0.000152,0.099565,0.001522,0.764103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,4739,1791649,3323,0,105.437856,63.622575,113.0,0.924204,499670,...,-62.569776,88,734,4.915531,20,3.001989,0.000257,0.097278,0.002645,0.708572
2467,ffbef7e5,2604,1799174,1144,0,82.266129,36.178818,80.0,0.708980,214221,...,-inf,63,470,4.085106,13,2.231589,0.000243,0.168203,0.001447,0.868855
2468,ffccd6fd,3063,1959363,564,0,75.605615,63.494975,70.0,1.147271,231580,...,-inf,7,222,4.644144,15,2.707087,0.000103,0.065622,0.001563,0.869824
2469,ffec5b38,3242,1508504,1388,0,89.277915,54.515788,85.0,0.957449,289439,...,-inf,70,500,5.294000,24,3.541689,0.000274,0.127390,0.002149,0.794947


In [37]:
# Column labels of train_feats; the Index repr elides the middle of the list.
train_feats.columns

Index(['id', 'event_id_max', 'up_time_max', 'action_time_max',
       'action_time_min', 'action_time_mean', 'action_time_std',
       'action_time_quantile', 'action_time_sem', 'action_time_sum',
       ...
       'text_change_14_count', 'punct_cnt', 'input_word_count',
       'input_word_length_mean', 'input_word_length_max',
       'input_word_length_std', 'word_time_ratio', 'word_event_ratio',
       'event_time_ratio', 'idle_time_ratio'],
      dtype='object', length=287)

In [14]:
# Preview the first rows of test_feats (loaded from artifacts/test_feats.csv).
test_feats.head()

Unnamed: 0,id,event_id_max,up_time_max,action_time_max,action_time_min,action_time_mean,action_time_std,action_time_quantile,action_time_sem,action_time_sum,...,text_change_14_count,punct_cnt,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio
0,0000aaaa,2,760160,87,85,86.0,1.414214,86.0,1.0,172,...,-inf,0,0,0.0,0,0.0,0.0,0.0,3e-06,0.554561
1,2222bbbb,2,712023,67,46,56.5,14.849242,56.5,10.5,113,...,-inf,0,1,2.0,2,0.0,1e-06,0.5,3e-06,-0.592005
2,4444cccc,2,635641,94,56,75.0,26.870058,75.0,19.0,150,...,-inf,0,1,1.0,1,0.0,2e-06,0.5,3e-06,-0.708962


In [17]:
# (rows, columns) of test_feats.
test_feats.shape

(3, 263)

In [27]:
# Second entry of the `data` list, i.e. test_logs, displayed in full.
data[1]

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0
5,4444cccc,2,184996,185052,56,Input,q,q,q,1,1


In [31]:
# Display train_featsv2 (loaded from artifacts/train_feats_v2.csv); the rendered view is abbreviated by pandas.
train_featsv2

Unnamed: 0,id,event_id_max,up_time_max,action_time_max,action_time_min,action_time_mean,action_time_std,action_time_quantile,action_time_sem,action_time_sum,...,largest_lantency,smallest_lantency,median_lantency,initial_pause,pauses_half_sec,pauses_1_sec,pauses_1_half_sec,pauses_2_sec,pauses_3_sec,score
0,001519c8,2557,1801969,2259,0,116.246774,91.797374,112.0,1.815369,297243,...,154.136,0.0,0.062,4.526,114,51,30,21,103,3.5
1,0022f953,2454,1788969,1758,0,112.221271,55.431189,115.0,1.118966,275391,...,145.899,0.0,0.061,30.623,141,37,13,19,61,3.5
2,0042269b,4136,1771669,3005,0,101.837766,82.383766,94.0,1.281007,421201,...,153.886,0.0,0.040,4.441,83,46,25,25,52,6.0
3,0059420b,1556,1404469,806,0,121.848329,113.768226,110.0,2.884139,189596,...,101.690,0.0,0.131,41.395,178,81,34,32,55,2.0
4,0075873a,2531,1662472,701,0,123.943896,62.082013,129.0,1.234013,313702,...,110.688,0.0,0.059,78.470,65,24,11,17,71,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,4739,1791649,3323,0,105.437856,63.622575,113.0,0.924204,499670,...,128.570,0.0,0.034,22.467,117,41,18,11,30,3.5
2467,ffbef7e5,2604,1799174,1144,0,82.266129,36.178818,80.0,0.708980,214221,...,267.869,0.0,0.172,21.732,121,43,24,24,66,4.0
2468,ffccd6fd,3063,1959363,564,0,75.605615,63.494975,70.0,1.147271,231580,...,229.804,0.0,0.116,23.482,168,82,37,29,58,1.5
2469,ffec5b38,3242,1508504,1388,0,89.277915,54.515788,85.0,0.957449,289439,...,127.733,0.0,0.091,19.885,116,35,27,15,48,5.0


The research paper, "Uncovering the Writing Process through Keystroke Analyses," focuses on the analysis of keystroke data to understand the writing process. It explores how keystroke patterns, including basic and time-sensitive indices like pauses and latencies, relate to writing quality and linguistic features of essays. The paper investigates both the aggregate and temporal properties of keystroke data, utilizing measures such as the largest and smallest latency, median latency, initial pause, and various categories of pause durations to analyze the writing behavior and its correlation with essay quality.

This research aligns with the code's approach in constructing features like 'largest_latency', 'smallest_latency', 'median_latency', 'initial_pause', and various pause counts (note that in the feature tables the latency columns are actually spelled `largest_lantency`, `smallest_lantency`, and `median_lantency`). These features are designed to capture similar aspects of the writing process as discussed in the paper, focusing on the timing and rhythm of keystrokes to infer writing quality.

The features in the code, inspired by the research paper, focus on understanding the temporal dynamics of typing behavior:

1. **Largest Latency**: The longest delay between consecutive keystrokes for each writer. Indicates moments of deep thinking or hesitations.

2. **Smallest Latency**: The shortest delay, showing the fastest typing speed.

3. **Median Latency**: The median value of delays, offering insight into the typical typing rhythm.

4. **Initial Pause**: The time before the first keystroke, possibly reflecting preparation time.

5. **Pause Durations**: Counts of pauses in different duration ranges (half a second, one second, etc.), useful for understanding the frequency and length of breaks in typing, which might relate to cognitive processes like planning or revising text.

These features collectively aim to provide a comprehensive picture of the writing process, linking keystroke dynamics to writing quality.

The features extracted from the logs aim to capture various aspects of typing patterns that could correlate with writing quality. Here's an explanation of each feature:

up_time_lagged: This feature represents the 'up_time' of the previous keystroke for each 'id'. It's obtained by shifting the 'up_time' column down by one row within each group of 'id'. The .fillna(logs['down_time']) part ensures that for the first entry of each 'id', where there is no previous 'up_time', the 'down_time' is used instead. This feature helps to understand the interval between consecutive keystrokes.

time_diff: This represents the absolute time difference between the 'down_time' of the current keystroke and the 'up_time' of the previous keystroke, converted to seconds (since the original time is likely in milliseconds). It's a measure of the latency between keystrokes.

largest_latency: This is the maximum latency between keystrokes for each 'id'. It could indicate moments of longer pauses or hesitations while typing.

smallest_latency: This is the minimum latency between keystrokes for each 'id', capturing the quickest consecutive keystrokes.

median_latency: This is the median of the latencies between keystrokes for each 'id', providing a measure of the typical typing rhythm.

initial_pause: The time at which the first keystroke occurred for each 'id', indicating the initial delay or pause before starting to type.

pauses_half_sec, pauses_1_sec, pauses_1_half_sec, pauses_2_sec, pauses_3_sec: These features count the number of pauses of different durations (0.5 to 1 second, 1 to 1.5 seconds, etc.) for each 'id'. These features are important as they capture various lengths of typing breaks, which could be indicative of thinking, correcting errors, or other aspects relevant to writing quality.

By analyzing these features, the competition participants aim to build models that can predict writing quality based on the dynamics of typing patterns. The hypothesis is that certain typing behaviors, like the frequency and length of pauses, speed of typing, and consistency in rhythm, might correlate with the quality of writing.

In [29]:
# Display test_featsv2 (loaded from artifacts/test_feats_v2.csv).
test_featsv2

Unnamed: 0,id,event_id_max,up_time_max,action_time_max,action_time_min,action_time_mean,action_time_std,action_time_quantile,action_time_sem,action_time_sum,...,idle_time_ratio,largest_lantency,smallest_lantency,median_lantency,initial_pause,pauses_half_sec,pauses_1_sec,pauses_1_half_sec,pauses_2_sec,pauses_3_sec
0,0000aaaa,2,760160,87,85,86.0,1.414214,86.0,1.0,172,...,0.554561,421.555,0.0,210.7775,338.433,0,0,0,0,1
1,2222bbbb,2,712023,67,46,56.5,14.849242,56.5,10.5,113,...,-0.592005,421.521,0.0,210.7605,711.956,0,0,0,0,1
2,4444cccc,2,635641,94,56,75.0,26.870058,75.0,19.0,150,...,-0.708962,450.645,0.0,225.3225,635.547,0,0,0,0,1


In [43]:
# Column labels of train_featsv2, converted to a plain Python list.
train_featsv2.columns.tolist()

['id',
 'event_id_max',
 'up_time_max',
 'action_time_max',
 'action_time_min',
 'action_time_mean',
 'action_time_std',
 'action_time_quantile',
 'action_time_sem',
 'action_time_sum',
 'action_time_skew',
 'action_time_kurtosis',
 'activity_nunique',
 'down_event_nunique',
 'up_event_nunique',
 'text_change_nunique',
 'cursor_position_nunique',
 'cursor_position_max',
 'cursor_position_quantile',
 'cursor_position_sem',
 'cursor_position_mean',
 'word_count_nunique',
 'word_count_max',
 'word_count_quantile',
 'word_count_sem',
 'word_count_mean',
 'action_time_gap1_max',
 'action_time_gap1_min',
 'action_time_gap1_mean',
 'action_time_gap1_std',
 'action_time_gap1_quantile',
 'action_time_gap1_sem',
 'action_time_gap1_sum',
 'action_time_gap1_skew',
 'action_time_gap1_kurtosis',
 'cursor_position_change1_max',
 'cursor_position_change1_mean',
 'cursor_position_change1_std',
 'cursor_position_change1_quantile',
 'cursor_position_change1_sem',
 'cursor_position_change1_sum',
 'cursor_

In [32]:
# Display train_sent_agg_df (loaded from artifacts/train_sent_agg_df.csv); the rendered view is abbreviated by pandas.
train_sent_agg_df

Unnamed: 0,sent_count,sent_len_mean,sent_len_std,sent_len_min,sent_len_max,sent_len_first,sent_len_last,sent_len_sem,sent_len_q1,sent_len_median,...,sent_word_count_max,sent_word_count_first,sent_word_count_last,sent_word_count_sem,sent_word_count_q1,sent_word_count_median,sent_word_count_q3,sent_word_count_skew,sent_word_count_sum,id
0,14,106.142857,41.128050,31,196,31,89,10.991934,75.50,119.5,...,29,6,16,1.736577,12.25,21.0,22.00,-5.060073e-01,256,001519c8
1,15,107.666667,64.713287,19,226,19,143,16.708899,56.50,92.0,...,45,3,30,3.269872,12.00,20.0,31.00,3.918573e-01,325,0022f953
2,19,133.842105,33.480115,73,189,139,161,7.680865,108.00,139.0,...,29,21,26,1.207599,17.50,21.0,26.50,-2.425597e-01,408,0042269b
3,13,86.846154,33.195999,39,144,99,80,9.206914,62.00,80.0,...,27,17,14,1.800997,11.00,15.0,18.00,6.560551e-01,208,0059420b
4,16,86.812500,44.094170,22,182,75,22,11.023543,60.00,74.0,...,35,11,3,2.166927,11.00,12.5,18.25,1.148513e+00,255,0075873a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,13,121.076923,40.376275,55,180,79,109,11.198364,84.00,132.0,...,33,16,23,1.860521,16.00,23.0,24.00,5.872502e-03,273,ffb8c745
2467,29,78.310345,40.481127,20,175,143,52,7.517157,52.00,67.0,...,33,27,9,1.334366,10.00,13.0,19.00,7.622383e-01,443,ffbef7e5
2468,4,277.000000,77.395090,200,359,223,359,38.697545,217.25,274.5,...,61,42,61,5.492419,41.75,51.0,60.25,-1.143584e-16,204,ffccd6fd
2469,27,92.592593,33.747090,36,176,79,94,6.494631,63.00,98.0,...,29,11,16,1.086528,11.00,15.0,18.50,7.924021e-01,418,ffec5b38


In [35]:
# Display train_agg_fe (loaded from artifacts/train_agg_fe.csv); the rendered view is abbreviated by pandas.
train_agg_fe

Unnamed: 0,id,tmp_down_time_mean,tmp_down_time_std,tmp_down_time_min,tmp_down_time_max,tmp_down_time_last,tmp_down_time_first,tmp_down_time_sem,tmp_down_time_median,tmp_down_time_sum,...,tmp_cursor_position_sum,tmp_word_count_mean,tmp_word_count_std,tmp_word_count_min,tmp_word_count_max,tmp_word_count_last,tmp_word_count_first,tmp_word_count_sem,tmp_word_count_median,tmp_word_count_sum
0,001519c8,8.481808e+05,395112.665961,4526,1801877,1801877,4526,7813.679400,891716.0,2168798234,...,1818445,128.116152,76.498372,0,256,255,0,1.512819,132.0,327593
1,0022f953,5.188553e+05,384959.404177,30623,1788842,1788842,30623,7771.013336,407673.0,1273271023,...,1904809,182.714751,97.763090,0,323,320,0,1.973502,186.0,448382
2,0042269b,8.284918e+05,489500.796565,4441,1771219,1771219,4441,7611.375322,759582.0,3426641982,...,3025946,194.772727,108.935068,0,404,404,0,1.693860,193.0,805580
3,0059420b,7.854830e+05,385205.014399,41395,1404394,1404394,41395,9765.334758,848240.5,1222211589,...,844188,103.618895,61.882250,0,206,206,0,1.568777,108.5,161231
4,0075873a,7.133542e+05,405576.409034,78470,1662390,1662390,78470,8061.699636,686588.0,1805499474,...,1518729,125.082971,77.255054,0,252,252,0,1.535610,113.0,316585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,7.361019e+05,503882.020411,22467,1791581,1791581,22467,7319.568976,735095.0,3488386746,...,3667989,256.353661,118.093794,0,461,273,0,1.715472,297.0,1214860
2467,ffbef7e5,8.419662e+05,512744.745940,21732,1799124,1799124,21732,10048.025509,748404.5,2192480040,...,2661493,223.013057,126.627934,0,438,438,0,2.481470,227.5,580726
2468,ffccd6fd,1.229015e+06,514320.848199,23482,1959273,1959273,23482,9293.100430,1506525.0,3764472937,...,4009729,157.589292,61.236111,0,201,201,0,1.106456,201.0,482696
2469,ffec5b38,5.765185e+05,334477.976640,19885,1508335,1508335,19885,5874.366278,573912.0,1869073112,...,3866542,205.917027,118.473905,0,413,413,0,2.080732,205.0,667583


In [33]:
# Display train_paragraph_agg_df (loaded from artifacts/train_paragraph_agg_df.csv); the rendered view is abbreviated by pandas.
train_paragraph_agg_df

Unnamed: 0,paragraph_count,paragraph_len_mean,paragraph_len_std,paragraph_len_min,paragraph_len_max,paragraph_len_first,paragraph_len_last,paragraph_len_sem,paragraph_len_q1,paragraph_len_median,...,paragraph_word_count_max,paragraph_word_count_first,paragraph_word_count_last,paragraph_word_count_sem,paragraph_word_count_q1,paragraph_word_count_median,paragraph_word_count_q3,paragraph_word_count_skew,paragraph_word_count_sum,id
0,3,508.000000,134.208793,390,654,390,480,77.485483,435.00,480.0,...,112,71,86,11.976829,78.50,86.0,99.00,0.770543,269,001519c8
1,6,278.166667,98.554384,176,462,240,284,40.234659,228.75,261.0,...,96,53,60,8.316316,47.75,56.5,62.25,1.299614,355,0022f953
2,6,429.500000,101.087586,296,568,491,296,41.268834,356.75,444.5,...,88,79,45,6.926599,55.50,73.5,78.75,-0.502908,410,0042269b
3,3,384.000000,56.471232,347,449,347,356,32.603681,351.50,356.0,...,81,62,65,5.897269,63.50,65.0,73.00,1.565482,208,0059420b
4,5,283.400000,232.336609,23,627,351,23,103.904090,124.00,292.0,...,114,61,3,18.706683,26.00,52.0,61.00,0.686760,256,0075873a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,4,407.750000,91.790976,301,514,372,301,45.895488,354.25,408.0,...,88,71,63,6.013873,69.00,78.5,86.50,-0.344825,308,ffb8c745
2467,6,387.166667,178.941797,144,648,144,228,73.052683,274.00,424.0,...,119,27,40,13.965235,50.00,83.5,89.25,-0.319028,443,ffbef7e5
2468,3,918.333333,939.787387,327,2002,426,2002,542.586501,376.50,426.0,...,1703,83,1703,543.873862,71.50,83.0,893.00,1.730889,1846,ffccd6fd
2469,5,509.600000,122.681702,380,672,672,380,54.864925,394.00,540.0,...,111,111,62,8.992219,66.00,85.0,93.00,0.350840,417,ffec5b38


In [56]:
# Read a second copy of the feature tables, this time from INPUT_DIR, and
# discard the "Unnamed: 0" column that comes back from the CSV.
# NOTE(review): train_feats.csv / test_feats.csv are not in the `!ls data`
# listing shown near the top of the notebook -- confirm INPUT_DIR is the
# intended location for these files.
train_feats_2 = pd.read_csv(f"{INPUT_DIR}/train_feats.csv").drop(columns="Unnamed: 0")
test_feats_2 = pd.read_csv(f"{INPUT_DIR}/test_feats.csv").drop(columns="Unnamed: 0")

In [57]:
# (rows, columns) of train_feats_2.
train_feats_2.shape

(2471, 396)

In [55]:
# Columns present in train_feats_2 but absent from train_feats (one-directional difference).
# The list is built from a set, so the order of the names is arbitrary.
list(set(train_feats_2.columns) - set(train_feats.columns))

['tmp_cursor_position_max',
 'tmp_down_time_max',
 'tmp_up_time_last',
 'paragraph_len_sem',
 'tmp_down_time_sum',
 'sent_word_count_q1',
 'tmp_cursor_position_median',
 'paragraph_word_count_mean',
 'tmp_cursor_position_sum',
 'tmp_action_time_sum',
 'tmp_action_time_median',
 'sent_len_sem',
 'sent_word_count_sum',
 'score',
 'tmp_word_count_sum',
 'sent_len_min',
 'tmp_word_count_median',
 'tmp_word_count_first',
 'tmp_up_time_min',
 'tmp_down_time_last',
 'paragraph_len_median',
 'median_lantency',
 'paragraph_len_last',
 'tmp_cursor_position_sem',
 'paragraph_len_q1',
 'paragraph_word_count_sum',
 'tmp_down_time_min',
 'sent_len_mean',
 'action_time_gap20_kurt',
 'paragraph_word_count_min',
 'paragraph_len_kurt',
 'paragraph_word_count_skew',
 'sent_len_max',
 'largest_lantency',
 'tmp_up_time_sum',
 'pauses_1_sec',
 'word_count_change100_kurt',
 'paragraph_count',
 'tmp_up_time_median',
 'cursor_position_change3_kurt',
 'sent_word_count_median',
 'paragraph_len_min',
 'tmp_action