## **Check for text change and cursor position**
- Check how the number of characters changed (counted from `text_change`) correlates with the score
- The log of the number of character changes has a correlation of 0.65 with score
- The log of the average cursor position has a correlation of 0.67 with score
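- Features included: the average cursor position, the number of character changes, and the spread of the cursor position (computed below as the log of its variance)
- See the correlations at the end
- Only `ln_n_char_changes` and `ln_word_count` turn out to be good predictors; they are the two used in the final regression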


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
#from dask.distributed import Client
import dask.dataframe as dd

In [2]:
# Empty scratch set; nothing in the cells shown here reads or writes it afterwards.
notes = set() 
# Folder that holds train_logs.csv and train_scores.csv. The trailing slash is
# required because the file names are appended directly to this string.
input_folder = "kaggle/input/linking-writing-processes-to-writing-quality/"

In [3]:
# Read the event logs and the scores from the input folder.
train_logs = pd.read_csv(f"{input_folder}train_logs.csv", delimiter=",", header=0)
train_scores = pd.read_csv(f"{input_folder}train_scores.csv", delimiter=",", header=0)

# Score per id as a Series indexed by id, so it can be joined on the index later.
scores = pd.Series(
    name='score',
    index=train_scores['id'].values,
    data=train_scores['score'].values,
)

In [4]:
# Preview the first five log rows to see which columns are available.
train_logs.head(5)

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


In [5]:
# Check the distinct text_change values before relating character changes to the score.
n = train_logs["text_change"].unique()
n

array(['NoChange', 'q', ' ', ..., 'qq qqq qqqq qqqqq',
       'qq qqqqq qqqq qq qqqqq ',
       '\n qqqqq qqqqqq qqqqqqqqqq qq q qqqqqqqq qqq qqq qqqq qqqqqq q qqq. \n\nqqqq qqq qq qqqqqqq qqq:\n- \n- qqq qqqqqqq qqqq q qqqqqq qqqqqqqq qq qqqq qqqqqqqq '],
      dtype=object)

In [6]:
def charCounter(x, character):
    """Return the natural log of how many times ``character`` occurs in ``x``.

    Parameters
    ----------
    x : iterable
        Text fragments to scan (in this notebook: one id's ``text_change``
        values). Entries that are not strings, such as the NaN pandas
        produces for an empty CSV field, are skipped instead of raising
        ``TypeError``.
    character : str
        The single character to count.

    Returns
    -------
    float
        ``np.log`` of the total count. A count of zero yields ``-inf``
        (NumPy also emits a divide-by-zero RuntimeWarning), as before.
    """
    # str.count scans each fragment in C instead of a nested Python loop.
    n = sum(s.count(character) for s in x if isinstance(s, str))
    return np.log(n)



In [35]:
# Per id: log of the number of 'q' characters found across its text_change entries.
ln_n_char_changes = (
    train_logs.groupby('id')['text_change']
    .agg(charCounter, character='q')
    .rename("ln_n_char_changes")
)

In [8]:
def log_var(x):
    """Return the natural log of the sample variance (``ddof=1``) of ``x``."""
    sample_variance = np.var(x, ddof=1)
    return np.log(sample_variance)

In [9]:
# Per id: log of the sample variance of cursor_position.
# Note that the stored value is log-scaled even though the name says "var".
var_cursor_position = (
    train_logs.groupby('id')['cursor_position']
    .agg(log_var)
    .rename("var_cursor_position")
)

In [10]:
# Per id: log of the mean cursor_position (the stored value is log-scaled).
avg_cursor_position = np.log(
    train_logs.groupby('id')['cursor_position'].mean()
).rename("avg_cursor_position")

In [11]:
# Per id: skewness of cursor_position, computed with scipy.stats.skew at its default settings.
skew_cursor_position = (
    train_logs.groupby('id')['cursor_position']
    .agg(st.skew)
    .rename("skew_cursor_position")
)


In [12]:
def n_char(x, character):
    """Count how many times ``character`` occurs across the strings in ``x``.

    Parameters
    ----------
    x : iterable
        Text fragments to scan (in this notebook: one id's ``text_change``
        values). Entries that are not strings, such as the NaN pandas
        produces for an empty CSV field, are skipped instead of raising
        ``TypeError``.
    character : str
        The single character to count.

    Returns
    -------
    int
        Total number of occurrences; 0 when ``x`` is empty or has no match.
    """
    # str.count scans each fragment in C instead of a nested Python loop.
    return sum(s.count(character) for s in x if isinstance(s, str))
    

In [13]:
# Per id: number of commas found across its text_change entries.
n_char_comma = (
    train_logs.groupby('id')['text_change']
    .agg(n_char, character=',')
    .rename("n_char_comma")
)

In [14]:
# Per id: number of space characters found across its text_change entries.
n_char_space = (
    train_logs.groupby('id')['text_change']
    .agg(n_char, character=' ')
    .rename("n_char_space")
)

In [15]:
# Per id: number of full stops ('.') found across its text_change entries.
n_char_dot = (
    train_logs.groupby('id')['text_change']
    .agg(n_char, character='.')
    .rename("n_char_dot")
)

In [16]:
def max_word_count_ratio(x):
    """Return the largest value in ``x`` divided by the number of entries in ``x``."""
    peak = np.max(x)
    n_entries = len(x)
    return peak / n_entries

In [17]:
# Word count relative to activity: per id, the maximum word_count divided by
# the number of logged rows for that id.
ave_word_count = (
    train_logs.groupby('id')['word_count']
    .agg(max_word_count_ratio)
    .rename("ave_word_count")
)

In [18]:
def word_count(x):
    """Return the natural log of the largest value in ``x``."""
    highest = np.max(x)
    return np.log(highest)

In [28]:
# Per id: log of the maximum word_count reached in the log.
ln_word_count = (
    train_logs.groupby('id')['word_count']
    .agg(word_count)
    .rename("ln_word_count")
)

In [36]:
# Join the selected per-id features and the score on their shared id index.
# Each merge uses the default inner join, so only ids present in every input remain.
z = (
    pd.merge(ln_n_char_changes, avg_cursor_position, left_index=True, right_index=True)
    .merge(var_cursor_position, left_index=True, right_index=True)
    .merge(ln_word_count, left_index=True, right_index=True)
    .merge(scores, left_index=True, right_index=True)
)

In [37]:
# Pairwise correlations between every merged feature and the score.
z.corr()

Unnamed: 0,ln_n_char_changes,avg_cursor_position,var_cursor_position,ln_word_count,score
ln_n_char_changes,1.0,0.804159,0.823411,0.842259,0.654965
avg_cursor_position,0.804159,1.0,0.969796,0.942747,0.666851
var_cursor_position,0.823411,0.969796,1.0,0.9506,0.675767
ln_word_count,0.842259,0.942747,0.9506,1.0,0.691226
score,0.654965,0.666851,0.675767,0.691226,1.0


In [38]:
# Peek at the first four rows of the merged feature table.
z.head(4)

Unnamed: 0_level_0,ln_n_char_changes,avg_cursor_position,var_cursor_position,ln_word_count,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
001519c8,7.608871,6.566902,12.170637,5.545177,3.5
0022f953,7.44132,6.654418,12.214639,5.777652,3.5
0042269b,8.209308,6.59525,12.769612,6.001415,6.0
0059420b,7.063048,6.296257,11.457777,5.327876,2.0


In [39]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [46]:
# Response: the score. Design matrix: the two log-scaled predictors plus an
# explicit intercept column added by add_constant.
Y = z['score']
X = sm.add_constant(z[['ln_n_char_changes', 'ln_word_count']])


In [47]:
# Fit an ordinary least squares regression of Y on X and display the summary table.
sm.OLS(Y,X).fit().summary()

0,1,2,3
Dep. Variable:,score,R-squared:,0.496
Model:,OLS,Adj. R-squared:,0.496
Method:,Least Squares,F-statistic:,1215.0
Date:,"Tue, 26 Dec 2023",Prob (F-statistic):,0.0
Time:,10:34:18,Log-Likelihood:,-2720.0
No. Observations:,2471,AIC:,5446.0
Df Residuals:,2468,BIC:,5463.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.5244,0.241,-31.283,0.000,-7.996,-7.053
ln_n_char_changes,0.5417,0.057,9.447,0.000,0.429,0.654
ln_word_count,1.1966,0.066,18.119,0.000,1.067,1.326

0,1,2,3
Omnibus:,184.914,Durbin-Watson:,2.058
Prob(Omnibus):,0.0,Jarque-Bera (JB):,265.439
Skew:,-0.612,Prob(JB):,2.2900000000000003e-58
Kurtosis:,4.038,Cond. No.,162.0
