# **Preprocessing**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

import re
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler, SplineTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.metrics import mean_squared_error

#models
import joblib


In [2]:
# Location of the competition files (relative to the working directory).
input_folder = "kaggle/input/linking-writing-processes-to-writing-quality/"

# Keystroke logs: one row per logged event, for the train and test splits.
# The files are comma-separated with a header row, i.e. the read_csv defaults.
train_logs = pd.read_csv(input_folder + "train_logs.csv")
test_logs = pd.read_csv(input_folder + "test_logs.csv")

# Essay scores, reshaped into a Series named 'score' that is indexed by id so
# it can be index-joined against per-id aggregates later on.
train_scores = pd.read_csv(input_folder + "train_scores.csv")
scores = pd.Series(
    data=train_scores['score'].values,
    index=train_scores['id'].values,
    name='score',
)

In [3]:
# Feature-engineering transformer for the cursor position column
class CursorPositionTransformer(BaseEstimator, TransformerMixin):
    """Per-id summary of cursor movement measured on a log scale.

    Every statistic is derived from the consecutive differences of
    ``log(cursor_position + 1)``: negative differences are "back steps",
    positive differences are "forward steps".

    ``transform`` returns one row per ``id`` (in ``groupby`` order) with the
    columns listed in ``feature_names``.
    """

    def __init__(self):
        # Column names of the array returned by ``transform``, in order.
        self.feature_names = [
            'cp_sum_backstep',
            'cp_n_backstep',
            'cp_sum_forwardstep',
            'cp_n_forwardstep',
            'average_cp',
        ]

    def fit(self, X, y=None):
        """No state is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Aggregate ``X['cursor_position']`` per ``id`` into a 2-D array.

        Also stores the ids, in output row order, on ``self.index_ids``.
        """
        grouped = X.groupby('id')['cursor_position']
        summary = grouped.aggregate([
            ('cp_sum_backstep', self.cp_sum_backstep),
            ('cp_n_backstep', self.cp_n_backstep),
            ('cp_sum_forwardstep', self.cp_sum_forwardstep),
            ('cp_n_forwardstep', self.cp_n_forwardstep),
            ('average_cp', lambda x: np.log(np.mean(x))),
        ])
        # Keep the participant ids that correspond to the output rows.
        self.index_ids = summary.index.values
        return summary.values

    @staticmethod
    def _log_steps(x):
        """Consecutive differences of ``log(x + 1)``."""
        return np.diff(np.log(x + 1))

    def cp_sum_backstep(self, x):
        """Sum of the negative log-steps."""
        steps = self._log_steps(x)
        return np.sum(steps[steps < 0])

    def cp_skew_backstep(self, x):
        """Skewness of the negative log-steps."""
        steps = self._log_steps(x)
        return st.skew(steps[steps < 0])

    def cp_n_backstep(self, x):
        """``log(1 + number of negative log-steps)``."""
        steps = self._log_steps(x)
        return np.log((steps < 0).sum() + 1)

    def cp_sum_forwardstep(self, x):
        """Sum of the positive log-steps."""
        steps = self._log_steps(x)
        return np.sum(steps[steps > 0])

    def cp_skew_forwardstep(self, x):
        """Skewness of the positive log-steps."""
        steps = self._log_steps(x)
        return st.skew(steps[steps > 0])

    def cp_n_forwardstep(self, x):
        """``log(1 + number of positive log-steps)``."""
        steps = self._log_steps(x)
        return np.log((steps > 0).sum() + 1)

    def cp_change_stat(self, x):
        """Sample standard deviation (``ddof=1``) of all log-steps."""
        return np.std(self._log_steps(x), ddof=1)

    

In [4]:
# eda wordcount transformer:

# word_count feature engineering
# Based on the graph above, we can count the number of zero changes and get the mean:
# wc_non_zero_change returns the number of steps at which the participant's word count changed

class WordCountTransformer(BaseEstimator, TransformerMixin):
    """Per-id summary of the running ``word_count`` column.

    ``transform`` yields three columns per ``id``: the number of steps whose
    ``log(word_count + 1)`` differs from the previous step, the log of the
    number of logged steps, and ``log(max(word_count) + 1)``.
    """

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """No state is learned; returns ``self``."""
        return self

    def wc_non_zero_change(self, x):
        """Count the consecutive steps at which ``log(x + 1)`` changes."""
        return np.count_nonzero(np.diff(np.log(x + 1)))

    def wc_change_stat(self, x):
        """Sample standard deviation (``ddof=1``) of the ``log(x + 1)`` steps."""
        return np.std(np.diff(np.log(x + 1)), ddof = 1)

    def transform(self, X):
        """Aggregate ``X['word_count']`` per ``id`` into a 2-D array.

        Stores the column names on ``self.feature_names`` and the ids, in
        output row order, on ``self.index_ids``.
        """
        summary = X.groupby(['id'])['word_count'].aggregate([
            ("wc_changing_nsteps", self.wc_non_zero_change),
            ("wc_step_count", lambda x: np.log(len(x))),
            ("wc_max", lambda x: np.log(np.max(x) + 1)),
        ])
        self.feature_names = summary.columns.values
        self.index_ids = summary.index.values
        return summary.values

        

In [5]:
# eda textchange transformer:
# added tc
class TextChangeTransformer(BaseEstimator, TransformerMixin):
    """Per-id indicators derived from the ``text_change`` column.

    Columns ``tc_1`` .. ``tc_5`` are 0/1 flags telling whether any
    ``text_change`` entry of the id contains ``?``, ``=>``, ``(``, ``"`` or
    ``-`` respectively. ``tc_6`` is ``log(1 + n)`` where ``n`` is the number
    of non-overlapping ``" q "`` matches in the space-joined entries.
    """

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """No state is learned; returns ``self``."""
        return self

    def hasChar(self, x, character:str):
        """Return 1 if ``character`` occurs in any string of ``x``, else 0."""
        return int(any(character in entry for entry in x))

    def qCounter(self, c):
        """``log(1 + count of " q ")`` in the space-joined strings of ``c``."""
        joined = " ".join(c)
        matches = re.findall(r" q ", joined)
        return np.log(len(matches) + 1)

    def transform(self, X):
        """Aggregate ``X['text_change']`` per ``id`` into a 2-D array.

        Stores the column names on ``self.feature_names`` and the ids, in
        output row order, on ``self.index_ids``.
        """
        flag_specs = [
            ("tc_1", "?"),
            ("tc_2", "=>"),
            ("tc_3", "("),
            ("tc_4", "\""),
            ("tc_5", "-"),
        ]
        # ``needle=needle`` freezes the current substring for each lambda.
        aggregations = [
            (column, lambda x, needle=needle: self.hasChar(x, character = needle))
            for column, needle in flag_specs
        ]
        aggregations.append(("tc_6", lambda c: self.qCounter(c)))

        summary = X.groupby(['id'])['text_change'].aggregate(aggregations)
        self.feature_names = summary.columns.values
        self.index_ids = summary.index.values
        return summary.values

        

            

In [6]:
# Eda Text Change  Part 2
class TextChangeTransformer2(BaseEstimator, TransformerMixin):
    """Per-id distribution of ``q``-run lengths found in ``text_change``.

    Parameters
    ----------
    max_word_length : int
        Number of output columns; run lengths 1 .. ``max_word_length + 1``
        are counted and consecutive counts are differenced.
    """

    def __init__(self, max_word_length):
        self.max_word_length = max_word_length

    def fit(self, X, y = None):
        """No state is learned; returns ``self``."""
        return self

    def text_change_distribution(self, v:str):
        """Difference the match counts of ``q{i} `` for consecutive ``i``.

        For ``i`` in 1 .. ``max_word_length + 1`` the pattern ``q{i} ``
        (``i`` q's followed by a space) is counted in ``v``. Element ``i - 1``
        of the returned list is ``count(i) - count(i + 1)``, so the list has
        ``max_word_length`` entries.
        """
        counts = [
            len(re.findall("q{%s} " % length, v))
            for length in range(1, self.max_word_length + 2)
        ]
        return [current - following for current, following in zip(counts, counts[1:])]

    def transform(self, X):
        """Return ``log(distribution + 1)`` per ``id`` as a 2-D array."""
        per_id = X.groupby('id')['text_change'].aggregate(
            lambda r: self.text_change_distribution("".join(r))
        )
        # One list per id -> one row per id.
        stacked = pd.DataFrame(np.stack(per_id, axis = 0), index = per_id.index)
        return np.log(stacked + 1).values

        

In [7]:
# Feature Engineering Up Event Variable Transformer:
class UpEventTransformer(BaseEstimator, TransformerMixin):
    """Per-id 0/1 flags for specific values of the ``up_event`` column.

    ``ue_1``, ``ue_2`` and ``ue_3`` tell whether the id has at least one
    ``up_event`` equal to ``"|"``, ``"Shift"`` or ``"Tab"`` respectively.
    """

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """No state is learned; returns ``self``."""
        return self

    def find_clicked(self, x, st:str):
        """Return 1 if any element of ``x`` equals ``st``, else 0."""
        return int(any(event == st for event in x))

    def transform(self, X):
        """Aggregate ``X['up_event']`` per ``id`` into a 2-D array.

        Stores the column names on ``self.feature_names`` and the ids, in
        output row order, on ``self.index_ids``.
        """
        key_specs = [('ue_1', "|"), ('ue_2', "Shift"), ('ue_3', "Tab")]
        # ``key=key`` freezes the current key name for each lambda.
        aggregations = [
            (column, lambda x, key=key: self.find_clicked(x, key))
            for column, key in key_specs
        ]
        summary = X.groupby(['id'])['up_event'].aggregate(aggregations)
        self.feature_names = summary.columns.values
        self.index_ids = summary.index.values
        return summary.values




In [8]:
# EDA action_time variable transformer (AT):

class ActionTimeTransformer(BaseEstimator, TransformerMixin):
    """Features from the size of consecutive jumps in ``log(action_time + 1)``.

    ``fit`` learns, for every score present in ``scores``, the proportion of
    jumps that fall in each of five magnitude bins. ``transform`` emits, per
    ``id``, three log-ratio columns (``at_1`` .. ``at_3``) plus ``at_chisq``:
    the mean of the four candidate scores whose learned bin distribution gets
    the highest chi-square p-values against the id's own bin counts.

    Parameters
    ----------
    scores : pd.Series
        Essay scores indexed by ``id`` and named ``'score'`` (it is
        index-joined with the per-id counts and grouped by ``'score'``).
    """

    # Column labels of the five jump-size bins, selected with from_zero=1..5.
    _BIN_NAMES = ('one', 'two', 'three', 'four', 'five')

    def __init__(self, scores):
        self.scores = scores
        self.score_values = np.arange(start = 0.5, stop = 6.5, step = 0.5)

    @staticmethod
    def _jump_sizes(x):
        """Absolute consecutive differences of ``log(x + 1)``."""
        return np.abs(np.diff(np.log(x + 1)))

    def _bin_counts(self, X):
        """Count, per id, the action-time jumps that land in each bin."""
        # ``k=k`` freezes the bin number for each lambda.
        return X.groupby('id')['action_time'].aggregate([
            (name, lambda x, k=k: self.above_log_count(x, from_zero = k))
            for k, name in enumerate(self._BIN_NAMES, start = 1)
        ])

    def fit(self, X, y = None):
        """Learn the per-score bin proportions (``self.at_proportion``)."""
        counts = pd.merge(self._bin_counts(X), self.scores,
                          left_index = True, right_index = True)
        per_score = counts.groupby(by = 'score').sum()
        # Normalise each score's row of counts so that it sums to one.
        self.at_proportion = per_score.div(per_score.sum(axis = 1), axis = 0)
        return self

    def above_log_count(self, x, from_zero = 1):
        """Number of jumps in the bin selected by ``from_zero``.

        For ``from_zero < 5`` the bin is the open interval
        ``(from_zero - 1, from_zero)``; otherwise it is ``>= from_zero``.
        """
        # NOTE(review): with from_zero in 1..5 the bins skip [4, 5) and every
        # exact integer boundary (including 0) -- confirm this is intended.
        z = self._jump_sizes(x)
        if from_zero < 5:
            in_bin = (z > from_zero - 1) & (z < from_zero)
        else:
            in_bin = z >= from_zero
        return int(np.count_nonzero(in_bin))

    def above_log_ratio(self, x, from_zero = 1):
        """``log((count + 1) / n_jumps)`` for the bin selected by ``from_zero``.

        For ``from_zero < 3`` the bin is the open interval
        ``(from_zero - 1, from_zero)``; otherwise it is ``>= from_zero``.
        Raises ``ZeroDivisionError`` when ``x`` has fewer than two elements
        (there are no jumps to divide by).
        """
        z = self._jump_sizes(x)
        if from_zero < 3:
            in_bin = (z > from_zero - 1) & (z < from_zero)
        else:
            in_bin = z >= from_zero
        count = int(np.count_nonzero(in_bin))
        return np.log((count + 1) / len(z))

    def compute_score_by_chisquare(self, fo:pd.Series, distribution):
        """Mean of the four scores whose distribution best matches ``fo``.

        Parameters
        ----------
        fo : pd.Series
            Observed bin counts of one id.
        distribution : pd.DataFrame
            One row of bin proportions per candidate score; row ``j`` is
            mapped to ``self.score_values[j]``.

        Returns
        -------
        float
            Average of the scores ranked 1st to 4th by chi-square p-value.
        """
        # NOTE(review): the row -> score mapping assumes ``distribution`` has
        # one row for every value in ``self.score_values``, in that order.
        fo = fo + 1  # add-one smoothing so that no observed count is zero
        expected = distribution * np.sum(fo)

        p_values = np.array([
            st.chisquare(f_obs = fo, f_exp = expected.iloc[row])[1]
            for row in range(expected.shape[0])
        ])

        # Take the candidates with the largest, 2nd, 3rd and 4th largest
        # p-value. (The second-largest used to be skipped and the third
        # counted twice.) Ties resolve to the first matching row.
        best_scores = []
        for rank in (1, 2, 3, 4):
            kth_largest = np.partition(p_values, -rank)[-rank]
            first_match = np.where(p_values == kth_largest)[0][0]
            best_scores.append(self.score_values[first_match])

        return np.mean(best_scores)

    def transform(self, X):
        """Build the ``at_1``, ``at_2``, ``at_3`` and ``at_chisq`` columns.

        Stores the column names on ``self.feature_names`` and the ids, in
        output row order, on ``self.index_ids``.
        """
        ratios = X.groupby("id")['action_time'].aggregate([
            ('at_1', lambda x: self.above_log_ratio(x, from_zero = 1)),
            ('at_2', lambda x: self.above_log_ratio(x, from_zero = 2)),
            ('at_3', lambda x: self.above_log_ratio(x, from_zero = 3)),
        ])

        chisq_score = self._bin_counts(X).apply(
            lambda row: self.compute_score_by_chisquare(row, distribution = self.at_proportion),
            axis = 1)
        chisq_score.name = "at_chisq"

        output = pd.merge(ratios, chisq_score, left_index = True, right_index = True)
        self.feature_names = output.columns.values
        self.index_ids = output.index.values
        return output.values


        
        


In [9]:
# Transformer for Activity, act:
class ActivityTransformer(BaseEstimator, TransformerMixin):
    """Features from how often each ``activity`` label occurs per id.

    Every label containing ``"Move"`` is collapsed to ``"Move"`` and the
    labels are one-hot encoded and summed per ``id``. ``fit`` learns the
    label proportions for each score; ``transform`` returns, per id, the
    score whose proportions give the highest chi-square p-value
    (``act_chisq``) followed by the raw per-label counts.

    Parameters
    ----------
    scores : pd.Series
        Essay scores indexed by ``id`` and named ``'score'`` (it is
        index-joined with the per-id counts and grouped by ``'score'``).
    """

    oneHot: OneHotEncoder
    scores: pd.Series
    act_dist: pd.DataFrame
    feature_names: str
    initial_features: np.ndarray

    def __init__(self, scores:pd.Series):
        self.oneHot = OneHotEncoder(handle_unknown = 'ignore', categories = 'auto', sparse_output = False)
        self.scores = scores
        self.score_values = np.arange(start = 0.5, stop = 6.5, step = 0.5)
        # Names given to the encoder's output columns.
        # NOTE(review): assumes the fitted encoder finds exactly six
        # categories in this order -- confirm against the training data.
        self.initial_features = np.array(['ac_Input', 'ac_Move', 'ac_NonPro', 'ac_Paste', 'ac_RemCut', 'ac_Replace'])

    @staticmethod
    def _collapse_moves(activity):
        """Return a new Series with every ``*Move*`` label set to ``"Move"``."""
        return activity.apply(lambda label: "Move" if ("Move" in label) else label)

    def _counts_per_id(self, encoded, ids):
        """Sum the one-hot rows of ``encoded`` per id."""
        frame = pd.DataFrame(data = encoded, columns = self.initial_features)
        # Assign positionally: ``ids`` may carry an index that is not 0..n-1.
        frame['id'] = np.asarray(ids)
        return frame.groupby('id').sum()

    def fit(self, X, y = None):
        """Fit the encoder and learn the per-score label proportions.

        ``X`` is left untouched; the label collapsing works on a copy.
        """
        activity = self._collapse_moves(X['activity'])
        encoded = self.oneHot.fit_transform(activity.to_numpy().reshape(-1, 1))

        act = self._counts_per_id(encoded, X['id'])
        act = act + 1  # to avoid an expected value of zero
        self.act = act

        # Pool the counts of all ids that share a score, then normalise each
        # score's row so that it sums to one.
        act_dist = pd.merge(act, self.scores, left_index = True, right_index = True)
        act_dist = act_dist.groupby('score').sum()
        self.act_dist = act_dist.div(act_dist.sum(axis = 1), axis = 0)

        return self

    def compute_score_by_chisquare(self, fo:pd.Series, distribution):
        """Score whose label distribution best matches the counts ``fo``.

        Parameters
        ----------
        fo : pd.Series
            Observed label counts of one id.
        distribution : pd.DataFrame
            One row of label proportions per candidate score; row ``j`` is
            mapped to ``self.score_values[j]``.

        Returns
        -------
        The entry of ``self.score_values`` with the largest chi-square
        p-value (the first one on ties).
        """
        fo = fo + 1  # add-one smoothing so that no observed count is zero
        expected = distribution * np.sum(fo)

        p_values = np.array([
            st.chisquare(f_obs = fo, f_exp = expected.iloc[row])[1]
            for row in range(expected.shape[0])
        ])

        return self.score_values[np.argmax(p_values)]

    def transform(self, X):
        """Return ``[act_chisq, per-label counts...]`` per id as a 2-D array.

        ``X`` is left untouched. Stores the ids, in output row order, on
        ``self.index_ids``.
        """
        activity = self._collapse_moves(X['activity'])
        encoded = self.oneHot.transform(activity.to_numpy().reshape(-1, 1))
        act = self._counts_per_id(encoded, X['id'])

        output = act.apply(lambda row: self.compute_score_by_chisquare(row, self.act_dist), axis = 1)
        output.name = "act_chisq"
        # NOTE(review): kept as the single name "act_chisq" for backward
        # compatibility, although the returned array also holds the counts.
        self.feature_names = output.name
        self.index_ids = act.index.values

        return np.c_[output.values, act.values]

        


In [10]:
class ComboActivityActionTime(BaseEstimator, TransformerMixin):
    """Log of the total ``action_time`` spent on each activity, per id.

    Every label containing ``"Move"`` is collapsed to ``"Move"``. For each id
    and each of the six activities the ``action_time`` values are summed; a
    missing or non-positive total is replaced by 1 so that its log is 0.
    """

    # Output columns, in order.
    _ACTIVITIES = ('Input', 'Move', 'Nonproduction', 'Paste', 'Remove/Cut', 'Replace')

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """No state is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a DataFrame of log action-time totals (ids x activities).

        Rows are ordered by sorted ``id``, the same order ``groupby('id')``
        produces, so the output lines up with other per-id aggregates when
        they are stacked side by side. ``X`` is not modified. The ids, in
        output row order, are stored on ``self.index_ids``.
        """
        activity = X['activity'].apply(lambda label: "Move" if ("Move" in label) else label)

        # Sum action_time for every (id, activity) pair, then pivot the
        # activities into columns.
        totals = (
            X['action_time']
            .groupby([X['id'], activity])
            .sum()
            .unstack()
            .reindex(index = np.sort(X['id'].unique()), columns = list(self._ACTIVITIES))
            .rename_axis(index = None, columns = None)
        )

        # Pairs that never occur (NaN) or have a non-positive total fall
        # back to 1, i.e. 0 after the log.
        totals = totals.where(totals > 0, 1)

        self.index_ids = totals.index.values
        return np.log(totals)

        

In [11]:
# One single-step pipeline per feature transformer.
cp_pipe = Pipeline(steps=[('cp_tx', CursorPositionTransformer())])
wc_pipe = Pipeline(steps=[('wc_tx', WordCountTransformer())])
tc_pipe = Pipeline(steps=[('tc_tx', TextChangeTransformer())])
tc2_pipe = Pipeline(steps=[('tc2_tx', TextChangeTransformer2(max_word_length=12))])
ue_pipe = Pipeline(steps=[('ue_tx', UpEventTransformer())])
at_pipe = Pipeline(steps=[('at_tx', ActionTimeTransformer(scores=scores))])
act_pipe = Pipeline(steps=[("act_tx", ActivityTransformer(scores=scores))])
combo_act_pipe = Pipeline(steps=[("comboact_tx", ComboActivityActionTime())])

# Stack the per-id feature blocks of all transformers side by side.
main_pipe = FeatureUnion(
    transformer_list=[
        ('cp_pipe', cp_pipe),
        ('wc_pipe', wc_pipe),
        ('tc_pipe', tc_pipe),
        ('tc2_pipe', tc2_pipe),
        ('ue_pipe', ue_pipe),
        ('at_pipe', at_pipe),
        ('act_pipe', act_pipe),
        ('combo_act_pipe', combo_act_pipe),
    ]
)

# Spline-expand the stacked features, then standardise them.
# (The step is named 'poly' although it holds a SplineTransformer.)
final_pipe = Pipeline(
    steps=[
        ('main_pipe', main_pipe),
        ('poly', SplineTransformer(n_knots=3, degree=2)),
        ('scaler', StandardScaler()),
    ]
)


In [12]:
# Fit every step of the pipeline on the training logs and keep the resulting
# (spline-expanded, standardised) feature matrix.
X = final_pipe.fit_transform(train_logs)

In [13]:
# Participant ids in the row order recorded by the cursor-position transformer
# during its last transform call.
train_ids = main_pipe.named_transformers['cp_pipe'].named_steps['cp_tx'].index_ids

In [14]:
# Dump the transformed training matrix and the matching participant ids to disk.
joblib.dump(X, "transformed_train.pkl")
joblib.dump(train_ids, "train_ids.pkl")

['train_ids.pkl']

In [15]:
# Dimensions of the transformed training matrix.
X.shape

(2471, 184)