# **Notes**  
- productive action time is not significant.
- ln action time is not significant

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

import re
from sklearn.preprocessing import OneHotEncoder

In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
# Folder holding the competition CSV files.
input_folder = "kaggle/input/linking-writing-processes-to-writing-quality/"

# Load train_logs.csv and train_scores.csv (comma separated, header on the first row).
train_logs = pd.read_csv(f"{input_folder}train_logs.csv", delimiter=",", header=0)
train_scores = pd.read_csv(f"{input_folder}train_scores.csv", delimiter=",", header=0)

# Score lookup: values from the 'score' column, indexed by the 'id' column values.
scores = pd.Series(
    data=train_scores['score'].values,
    index=train_scores['id'].values,
    name='score',
)

In [4]:
# eda cursor position
# cursor_position feature engineering
# here, we see that we need to get how many times the person goes back and what the magnitude is,
# i.e. how far did he/she go back?

def cp_sum_backstep(x):
    """Sum of the negative steps of log(x + 1), i.e. the steps where x decreased."""
    log_steps = np.diff(np.log(x + 1))
    backward = log_steps[log_steps < 0]
    return backward.sum()

def cp_skew_backstep(x):
    """Skewness (scipy.stats.skew) of the negative steps of log(x + 1)."""
    log_steps = np.diff(np.log(x + 1))
    backward = log_steps[log_steps < 0]
    return st.skew(backward)

def cp_n_backstep(x):
    """Natural log of the number of negative steps of log(x + 1).

    With no negative step the count is zero and the result is -inf.
    """
    log_steps = np.diff(np.log(x + 1))
    n_backward = (log_steps < 0).sum()
    return np.log(n_backward)

def cp_sum_forwardstep(x):
    """Sum of the positive steps of log(x + 1), i.e. the steps where x increased."""
    log_steps = np.diff(np.log(x + 1))
    forward = log_steps[log_steps > 0]
    return forward.sum()

def cp_skew_forwardstep(x):
    """Skewness (scipy.stats.skew) of the positive steps of log(x + 1)."""
    log_steps = np.diff(np.log(x + 1))
    forward = log_steps[log_steps > 0]
    return st.skew(forward)

def cp_n_forwardstep(x):
    """Natural log of the number of positive steps of log(x + 1).

    With no positive step the count is zero and the result is -inf.
    """
    log_steps = np.diff(np.log(x + 1))
    n_forward = (log_steps > 0).sum()
    return np.log(n_forward)

def cp_change_stat(x):
    """Sample standard deviation (ddof=1) of the steps of log(x + 1)."""
    log_steps = np.diff(np.log(x + 1))
    return np.std(log_steps, ddof=1)


    
    
# One row per id: every cursor_position feature function applied to that id's values.
cursor_by_id = train_logs.groupby('id')['cursor_position']
cp = cursor_by_id.agg([
    cp_sum_backstep,
    cp_n_backstep,
    cp_sum_forwardstep,
    cp_n_forwardstep,
    cp_change_stat,
    cp_skew_backstep,
    cp_skew_forwardstep,
])
# Explicit column labels, in the same order as the functions above.
cp.columns = [
    'cp_sum_backstep',
    'cp_n_backstep',
    'cp_sum_forwardstep',
    'cp_n_forwardstep',
    'cp_change_stat',
    'cp_skew_backstep',
    'cp_skew_forwardstep',
]
cp.head(3)



Unnamed: 0_level_0,cp_sum_backstep,cp_n_backstep,cp_sum_forwardstep,cp_n_forwardstep,cp_change_stat,cp_skew_backstep,cp_skew_forwardstep
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
001519c8,-17.033012,6.118097,23.986697,7.631917,0.14782,-12.809379,16.882931
0022f953,-17.209967,5.799093,24.536433,7.612337,0.126807,-6.949562,13.822311
0042269b,-32.798866,6.177944,38.997345,8.175829,0.129279,-6.193762,16.609002


In [5]:
# eda wordcount:

# word_count feature engineering
# Based on the graph above, we can count the number of zero changes and get the mean:
# wc_non_zero_change returns the count of all non-zero steps taken by the person
def wc_non_zero_change(x):
    """Count the steps of log(x + 1) that are non-zero, i.e. where x changed."""
    return np.count_nonzero(np.diff(np.log(x + 1)))
def wc_change_stat(x):
    """Sample standard deviation (ddof=1) of the steps of log(x + 1).

    The previously computed but never used ``last_cutoff`` local was removed;
    the returned value is unchanged.
    """
    log_steps = np.diff(np.log(x + 1))
    return np.std(log_steps, ddof=1)
    
# One row per id of word_count features; each tuple is (column name, aggregation).
wc = train_logs.groupby('id')['word_count'].agg([
    ("wc_changing_nsteps", wc_non_zero_change),
    ("wc_step_count", lambda x: np.log(len(x))),
    ("wc_max", lambda x: np.log(np.max(x))),
    ("wc_change_stat", wc_change_stat),
])
# Log of (number of changing steps) times (wc_max, which is itself already a log).
wc["wc_interaction"] = np.log(wc["wc_changing_nsteps"] * wc["wc_max"])
wc.head(3)

Unnamed: 0_level_0,wc_changing_nsteps,wc_step_count,wc_max,wc_change_stat,wc_interaction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
001519c8,438,7.84659,5.545177,0.021174,7.795148
0022f953,418,7.805475,5.777652,0.022682,7.789479
0042269b,619,8.327484,6.001415,0.02241,8.220101


In [6]:
# eda text_change
# Based on the above, discriminate if the changes has "?" and "=>"
def hasChar(x, character):
    """Return 1 if any element of x contains `character`, otherwise 0.

    Iteration stops at the first element that contains it.
    """
    return int(any(character in piece for piece in x))

#test:
# statement = "The quick Brown Fox ?"
# print(hasChar(statement, "?"))
# v1 = 'qqqqqqq qqq qqqq qqqq qqqq qq qqqqqqqq qqqqqqq  => q'
# print(hasChar(v1, "=>"))

# del statement, v1
        
#execute now:
# One 0/1 flag per id and marker: does any text_change entry contain that marker?
# `marker=marker` binds the current value into each lambda (avoids late binding).
tc = train_logs.groupby(['id'])['text_change'].agg([
    (flag, lambda x, marker=marker: hasChar(x, character=marker))
    for flag, marker in (
        ("tc_1", "?"),
        ("tc_2", "=>"),
        ("tc_3", "("),
        ("tc_4", "\""),
        ("tc_5", "-"),
    )
])

tc.head(3)

Unnamed: 0_level_0,tc_1,tc_2,tc_3,tc_4,tc_5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
001519c8,0,1,0,0,0
0022f953,1,1,0,1,1
0042269b,0,1,0,1,1


In [7]:
# eda up_event
# Based on the above, check for "MediaPlayPause"
def find_clicked(x, st:str):
    """Return 1 if any element of x equals `st`, otherwise 0.

    Note: inside this function the parameter `st` hides the module-level
    `st` alias (scipy.stats); the function does not use that module.
    """
    return int(any(event == st for event in x))

# One 0/1 flag per id and key: does any up_event entry equal that key exactly?
# `key=key` binds the current value into each lambda (avoids late binding).
ue = train_logs.groupby(['id'])['up_event'].agg([
    (flag, lambda x, key=key: find_clicked(x, key))
    for flag, key in (
        ('ue_1', "|"),
        ('ue_2', "Shift"),
        ('ue_3', "Tab"),
    )
])

ue.head(3)


Unnamed: 0_level_0,ue_1,ue_2,ue_3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
001519c8,0,1,0
0022f953,0,1,1
0042269b,0,1,0


In [8]:
# eda action_time
# check how many have a log of greater than 1, 2, 3 and 4
# The below will check the action_time

def above_log_ratio(x, from_zero = 1):
    """Log of the smoothed share of steps whose |diff(log(x + 1))| falls in a band.

    For from_zero < 3 the band is the open interval (from_zero - 1, from_zero);
    otherwise it is every value >= from_zero. One is added to the count before
    dividing by the number of steps.
    """
    jumps = np.abs(np.diff(np.log(x + 1)))
    if from_zero < 3:
        in_band = (jumps > from_zero - 1) & (jumps < from_zero)
    else:
        in_band = jumps >= from_zero
    hits = int(np.count_nonzero(in_band))
    return np.log((hits + 1) / len(jumps))

# Per-id action_time features at_1..at_3: above_log_ratio with from_zero = 1, 2, 3.
# `level=level` binds the current value into each lambda (avoids late binding).
at = train_logs.groupby('id')['action_time'].agg([
    (f'at_{level}', lambda x, level=level: above_log_ratio(x, from_zero=level))
    for level in (1, 2, 3)
])

In [9]:
# AT eda 2 distribution check:
# Get the distribution for each kind of score and perform chi-square to detect the score
def above_log_count(x, from_zero = 1):
    """Count the steps whose |diff(log(x + 1))| falls in a band.

    For from_zero < 5 the band is the open interval (from_zero - 1, from_zero);
    otherwise it is every value >= from_zero.
    """
    jumps = np.abs(np.diff(np.log(x + 1)))
    if from_zero < 5:
        in_band = (jumps > from_zero - 1) & (jumps < from_zero)
    else:
        in_band = jumps >= from_zero
    return int(np.count_nonzero(in_band))


# Per-id counts of action_time log-jumps in each band ('one' .. 'five').
# `band=band` binds the current value into each lambda (avoids late binding).
at_init = train_logs.groupby('id')['action_time'].agg([
    (label, lambda x, band=band: above_log_count(x, from_zero=band))
    for band, label in enumerate(('one', 'two', 'three', 'four', 'five'), start=1)
])

# Attach each id's score, then total the band counts per score value.
at2 = pd.merge(at_init, scores, left_index=True, right_index=True)
at2 = at2.groupby(by='score').sum()

# Turn each score's row of counts into proportions (each cell / its row total).
at_proportion = at2.div(at2.sum(axis=1), axis=0)

# Perform 1 way chi-square test (goodness of fit):
# use at_init since it is the distribution
# xsum = at_init.apply(func = np.sum, axis = 1)
    
# Use chi-square to select the score of the given participant id   
def compute_score_by_chisquare(fo:pd.Series, distribution, stat_location):
    """Pick a score by chi-square goodness of fit against per-score distributions.

    Each row of `distribution` is scaled by the total of `fo` to give expected
    frequencies, and `fo` is tested against it with scipy.stats.chisquare.
    The rows are ranked by the resulting p-value (ascending); the row sitting
    at position `stat_location` of that ranking is selected (-1 = largest
    p-value, -2 = second largest, ...). When several rows share that p-value
    the first such row wins.

    Returns the entry of 0.5, 1.0, ..., 6.0 at the selected row position, so
    the rows of `distribution` are assumed to follow that order.
    """
    candidate_scores = np.arange(start=0.5, stop=6.5, step=0.5)
    expected_by_row = distribution * np.sum(fo)

    # Element [1] of the chisquare result is the p-value.
    p_values = np.array([
        st.chisquare(f_obs=fo, f_exp=expected_by_row.iloc[row])[1]
        for row in range(expected_by_row.shape[0])
    ])

    ranked_value = np.partition(p_values, stat_location)[stat_location]
    matching_rows = np.flatnonzero(p_values == ranked_value)
    return candidate_scores[matching_rows[0]]



In [10]:
# Transform: for every row of at_init (axis = 1), call compute_score_by_chisquare
# against at_proportion with stat_location = -1, -2, -3 and -4, one lambda per
# rank, then label the four resulting columns at1..at4.
# NOTE(review): compute_score_by_chisquare is defined twice in this file; this
# call needs the definition that accepts `stat_location` to be the one in scope.
at3 = at_init.apply([(lambda x: compute_score_by_chisquare(x, distribution = at_proportion,stat_location = -1)),
      (lambda x: compute_score_by_chisquare(x, distribution = at_proportion,stat_location = -2)),
      (lambda x: compute_score_by_chisquare(x, distribution = at_proportion,stat_location = -3)),
      (lambda x: compute_score_by_chisquare(x, distribution = at_proportion,stat_location = -4))],axis = 1)
at3.columns = ['at1', 'at2', 'at3', 'at4']


In [11]:
# Eda for activity 

# Transform all with move into a Move:
# apply function
# Collapse every activity label that contains "Move" into the single label "Move".
# This overwrites train_logs.activity in place.
train_logs.activity = train_logs.activity.apply(lambda x: "Move" if ("Move" in x) else x)
# One hot encode the activity labels (dense output, one column per category).
act_hot = OneHotEncoder(handle_unknown = 'ignore', categories = 'auto', sparse_output = False)
a1 = act_hot.fit_transform(train_logs.activity.values.reshape(-1,1))
# NOTE(review): the six column names are hard-coded; this presumes the encoder
# produced exactly these six categories in this order -- confirm against
# act_hot.categories_.
a2 = pd.DataFrame(data = a1, columns = ['ac_Input', 'ac_Move', 'ac_NonPro', 'ac_Paste', 'ac_RemCut', 'ac_Replace'])
# NOTE(review): this assignment aligns on index labels; presumably train_logs
# still has its default index so rows line up with a2 -- verify.
a2['id'] = train_logs.id.copy()
# Per-id sum of the one-hot columns, i.e. how often each activity occurs per id.
act = a2.groupby(by = "id").sum()

# Distribution check for the activity variable:
# build the activity distribution for each score value, to be used for a
# chi-square comparison against each id's activity counts.

# act distribution: attach scores, then total the activity counts per score.
act_dist = pd.merge(act, scores, left_index = True, right_index = True)
act_dist = act_dist.groupby('score').sum()
# Add one to every cell so that no proportion below is exactly zero.
act_dist = act_dist + 1
row_total = np.sum(act_dist, axis = 1)
# Divide each column by the per-score row totals, so every row sums to 1.
act_dist = act_dist.apply(lambda x: x / row_total)
# act_dist now holds the per-score proportions used as the reference
# distribution for the goodness-of-fit test.
    
# Use chi-square to select the score of the given participant id 
# the below, score values are calculated for efficiency.
score_values = np.arange(start = 0.5, stop = 6.5, step = 0.5)
def compute_score_by_chisquare(fo:pd.Series, distribution, stat_location = -1):
    """Pick a score by chi-square goodness of fit against per-score distributions.

    This definition replaces an earlier one in the same file that required a
    `stat_location` argument. The parameter is accepted here with a default of
    -1, so calls written for either signature work: two-argument calls behave
    as before, and calls passing `stat_location` no longer raise TypeError.

    Parameters
    ----------
    fo : observed frequencies for one participant.
    distribution : one row of proportions per score; each row is scaled by the
        total of `fo` to give the expected frequencies. Rows are assumed to
        follow the order of the module-level `score_values` (0.5, 1.0, ..., 6.0).
    stat_location : position in the ascending ranking of p-values to select
        (-1 = largest p-value, -2 = second largest, ...).

    Returns
    -------
    The entry of `score_values` at the selected row position. When several
    rows share the selected p-value the first such row wins.
    """
    expected_arrays = distribution * np.sum(fo)

    # Element [1] of the chisquare result is the p-value (not the statistic).
    p_values = np.array([
        st.chisquare(f_obs = fo, f_exp = expected_arrays.iloc[j])[1]
        for j in range(expected_arrays.shape[0])
    ])

    ranked_value = np.partition(p_values, stat_location)[stat_location]
    idx_1 = np.where(p_values == ranked_value)[0][0]
    return score_values[idx_1]


# Transform: score every row of act against act_dist and name the result.
# `distribution` is forwarded by DataFrame.apply as a keyword argument.
act2 = act.apply(compute_score_by_chisquare, axis=1, distribution=act_dist).rename("act_chisq")


In [14]:
# Join every feature table, and finally the scores, on their shared index.
z = cp
for feature_table in (wc, tc, ue, at, at3, act2, scores):
    z = pd.merge(z, feature_table, left_index=True, right_index=True)

# Preview: correlation of every column with the last five columns.
z.corr().iloc[:, -5:]

Unnamed: 0,at2,at3,at4,act_chisq,score
cp_sum_backstep,0.054762,0.039411,0.070009,0.070057,-0.018364
cp_n_backstep,-0.01781,0.100114,0.012703,0.116048,0.386854
cp_sum_forwardstep,-0.051813,-0.036957,-0.066399,-0.068852,0.032028
cp_n_forwardstep,0.099595,0.122473,0.109858,0.050783,0.684374
cp_change_stat,-0.065529,-0.022901,-0.073379,0.007465,-0.121643
cp_skew_backstep,-0.023933,-0.076421,-0.008287,-0.064164,-0.313955
cp_skew_forwardstep,0.072081,0.051607,0.031857,0.046224,0.336884
wc_changing_nsteps,0.096975,0.129752,0.112246,0.058471,0.61345
wc_step_count,0.071828,0.112833,0.088311,0.009309,0.646826
wc_max,0.149174,0.13821,0.16365,0.071204,0.691226


In [15]:
# Target: the score column. Design matrix: every other column plus a constant.
Y = z.score
X = sm.add_constant(z.drop(columns=['score']).copy())

# Ordinary least squares fit; the summary is the cell's displayed output.
sm.OLS(Y, X).fit().summary()

0,1,2,3
Dep. Variable:,score,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.562
Method:,Least Squares,F-statistic:,114.4
Date:,"Sun, 31 Dec 2023",Prob (F-statistic):,0.0
Time:,16:11:39,Log-Likelihood:,-2531.2
No. Observations:,2471,AIC:,5120.0
Df Residuals:,2442,BIC:,5289.0
Df Model:,28,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-10.3520,0.782,-13.230,0.000,-11.886,-8.818
cp_sum_backstep,0.0071,0.019,0.376,0.707,-0.030,0.044
cp_n_backstep,0.0258,0.048,0.535,0.593,-0.069,0.120
cp_sum_forwardstep,0.0047,0.019,0.251,0.802,-0.032,0.042
cp_n_forwardstep,0.8539,0.269,3.170,0.002,0.326,1.382
cp_change_stat,0.3962,0.325,1.218,0.223,-0.242,1.034
cp_skew_backstep,-0.0023,0.004,-0.582,0.560,-0.010,0.005
cp_skew_forwardstep,-0.0019,0.002,-0.974,0.330,-0.006,0.002
wc_changing_nsteps,-0.0011,0.000,-5.324,0.000,-0.001,-0.001

0,1,2,3
Omnibus:,115.589,Durbin-Watson:,2.062
Prob(Omnibus):,0.0,Jarque-Bera (JB):,147.05
Skew:,-0.474,Prob(JB):,1.1700000000000001e-32
Kurtosis:,3.727,Cond. No.,52600.0
