# **EDA for combo activity_actiontime**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
# feature_engineering
import re

# plotting
import matplotlib.pyplot as plt
# initial modelling:
import statsmodels.api as sm

#sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Set the input folder: the relative path the CSV files below are read from.
# The trailing slash matters because file names are appended by plain string
# concatenation.
input_folder = "kaggle/input/linking-writing-processes-to-writing-quality/"

In [3]:
# Read the train logs: comma-separated, column names taken from the first row.
train_logs = pd.read_csv(f"{input_folder}train_logs.csv", sep=",", header=0)
# Second view of the same logs, keyed by the (id, event_id) pair.
train_logs_indexed = train_logs.set_index(['id', 'event_id'])
# Read the train scores with the same CSV settings.
train_scores = pd.read_csv(f"{input_folder}train_scores.csv", sep=",", header=0)
# Scores as a Series named 'score', indexed by the id column's values.
scores = pd.Series(
    train_scores['score'].values,
    index=train_scores['id'].values,
    name='score',
)
# Number of distinct score values.
len(scores.unique())

12

In [4]:
# Preview the first six rows of the raw logs.
train_logs.head(6)

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1
5,001519c8,6,107296,107400,104,Input,q,q,q,3,1


In [5]:
# Collapse every activity label that contains "Move" into the single label
# "Move"; all other labels are kept unchanged. Done on both copies of the logs.
train_logs["activity"] = train_logs["activity"].map(
    lambda label: label if "Move" not in label else "Move"
)
train_logs_indexed["activity"] = train_logs_indexed["activity"].map(
    lambda label: label if "Move" not in label else "Move"
)
    

In [6]:
# Total action_time per id (rows) and activity (columns). Combinations that
# never occur (NaN) and totals equal to 0 are both set to 1.
activity_time = (
    train_logs
    .pivot_table(index="id", columns="activity", values="action_time", aggfunc="sum")
    .fillna(1)
    .replace(0, 1)
)



In [7]:
# Preview the first four rows of the per-id activity totals.
activity_time.head(4)

activity,Input,Move,Nonproduction,Paste,Remove/Cut,Replace
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
001519c8,243731.0,1.0,18506.0,1.0,34130.0,876.0
0022f953,237891.0,1.0,13781.0,71.0,23550.0,98.0
0042269b,353718.0,1.0,33951.0,1.0,32905.0,627.0
0059420b,167790.0,1.0,3062.0,160.0,18410.0,174.0


In [8]:
# Summary statistics of the per-id totals, one column per activity.
activity_time.describe()

activity,Input,Move,Nonproduction,Paste,Remove/Cut,Replace
count,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0
mean,286004.01376,19.13881,17909.078106,25.614731,29519.403885,192.670174
std,137018.057407,632.370323,32568.430834,102.068333,22881.089918,386.569604
min,6886.0,1.0,6.0,1.0,1.0,1.0
25%,182101.5,1.0,3993.0,1.0,13527.0,1.0
50%,262703.0,1.0,9308.0,1.0,23375.0,1.0
75%,364105.5,1.0,19685.5,1.0,39545.5,219.0
max,921077.0,29693.0,482115.0,1931.0,326589.0,3557.0


In [9]:
# Correlation matrix of the log activity totals together with the score;
# rows are matched on the index of both operands (the ids).
z = (
    np.log(activity_time)
    .merge(scores, left_index=True, right_index=True)
    .corr()
)
z

Unnamed: 0,Input,Move,Nonproduction,Paste,Remove/Cut,Replace,score
Input,1.0,0.008696,0.137472,0.050931,0.587012,0.25104,0.57499
Move,0.008696,1.0,0.048067,0.015846,-0.000104,0.030972,-0.007452
Nonproduction,0.137472,0.048067,1.0,0.180287,0.240139,0.412372,0.131174
Paste,0.050931,0.015846,0.180287,1.0,0.078484,0.133584,0.042597
Remove/Cut,0.587012,-0.000104,0.240139,0.078484,1.0,0.199118,0.366995
Replace,0.25104,0.030972,0.412372,0.133584,0.199118,1.0,0.26432
score,0.57499,-0.007452,0.131174,0.042597,0.366995,0.26432,1.0


In [34]:
# EDA: combining activity type with action time.
# Based on the correlations above, only the Input, Nonproduction, Remove/Cut and Replace
# action times are intended to be used (the transformer below still returns all six columns):


class ComboActivityActionTime(BaseEstimator, TransformerMixin):
    """Natural log of the total ``action_time`` per id and activity type.

    The transformer is stateless: ``fit`` learns nothing and returns ``self``.
    ``transform`` reads the ``id``, ``activity`` and ``action_time`` columns
    of its input and returns one row per id and one column per entry of
    ``_ACTIVITIES``.
    """

    # Output columns, in the order they are returned. Activity labels that
    # are not listed here are ignored.
    _ACTIVITIES = ('Input', 'Move', 'Nonproduction', 'Paste', 'Remove/Cut', 'Replace')

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """Do nothing and return ``self`` (there is nothing to learn)."""
        return self

    def transform(self, X):
        """Aggregate ``action_time`` by id and activity and take the log.

        Parameters
        ----------
        X : pandas.DataFrame
            Must provide the columns ``id``, ``activity`` and ``action_time``.
            ``X`` is not modified.

        Returns
        -------
        pandas.DataFrame
            Indexed by id, in order of first appearance in ``X``, with the
            columns listed in ``_ACTIVITIES``. Each cell is
            ``log(sum of action_time)`` for that id and activity. A sum that
            is missing or not strictly positive is replaced by 1 before the
            log, so the cell is 0.
        """
        # Row order of the result: ids in order of first appearance.
        ids = X['id'].unique()

        # Any label containing "Move" is folded into the single label "Move".
        # Working on a derived Series means X itself needs no copy.
        # na=False leaves a missing label untouched instead of raising.
        is_move = X['activity'].str.contains('Move', regex=False, na=False)
        activity = X['activity'].mask(is_move, 'Move')

        # One pass over the data: sum per (id, activity), activities as columns.
        totals = (
            X['action_time']
            .groupby([X['id'], activity])
            .sum()
            .unstack()
        )

        # Fix the output shape: every id as a row, exactly the known
        # activities as columns; combinations never observed become NaN.
        totals = totals.reindex(index=ids, columns=list(self._ACTIVITIES))
        totals = totals.rename_axis(index=None, columns=None)

        # NaN > 0 is False, so missing totals and non-positive totals are
        # both replaced by 1 here.
        totals = totals.where(totals > 0, 1)
        return np.log(totals)
        

In [35]:
# Build the per-id log action-time features from the logs with the transformer.
l = ComboActivityActionTime().fit_transform(train_logs)

In [36]:
# Correlation matrix of the transformer output together with the score;
# rows are matched on the index of both operands (the ids).
z1 = (
    l
    .merge(scores, left_index=True, right_index=True)
    .corr()
)
z1

Unnamed: 0,Input,Move,Nonproduction,Paste,Remove/Cut,Replace,score
Input,1.0,0.008696,0.137472,0.050931,0.587012,0.25104,0.57499
Move,0.008696,1.0,0.048067,0.015846,-0.000104,0.030972,-0.007452
Nonproduction,0.137472,0.048067,1.0,0.180287,0.240139,0.412372,0.131174
Paste,0.050931,0.015846,0.180287,1.0,0.078484,0.133584,0.042597
Remove/Cut,0.587012,-0.000104,0.240139,0.078484,1.0,0.199118,0.366995
Replace,0.25104,0.030972,0.412372,0.133584,0.199118,1.0,0.26432
score,0.57499,-0.007452,0.131174,0.042597,0.366995,0.26432,1.0


In [37]:
# Show the pivot-table based correlation matrix again, to compare with z1.
z

Unnamed: 0,Input,Move,Nonproduction,Paste,Remove/Cut,Replace,score
Input,1.0,0.008696,0.137472,0.050931,0.587012,0.25104,0.57499
Move,0.008696,1.0,0.048067,0.015846,-0.000104,0.030972,-0.007452
Nonproduction,0.137472,0.048067,1.0,0.180287,0.240139,0.412372,0.131174
Paste,0.050931,0.015846,0.180287,1.0,0.078484,0.133584,0.042597
Remove/Cut,0.587012,-0.000104,0.240139,0.078484,1.0,0.199118,0.366995
Replace,0.25104,0.030972,0.412372,0.133584,0.199118,1.0,0.26432
score,0.57499,-0.007452,0.131174,0.042597,0.366995,0.26432,1.0


In [38]:
# Check that the transformer-based matrix z1 equals the pivot-table based z.
z.equals(z1)

True