In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Flag to run locally
run_local = True

# Choose the data directory once so each file name is spelled a single time
# and the local and Kaggle branches cannot drift apart.
if run_local:
    data_dir = '../data'
else:
    data_dir = '/kaggle/input/linking-writing-processes-to-writing-quality'

df = pd.read_csv(f'{data_dir}/train_logs.csv')
scores = pd.read_csv(f'{data_dir}/train_scores.csv')
df_test = pd.read_csv(f'{data_dir}/test_logs.csv')

# Attach the score to every log row of the same id. `validate` makes pandas
# raise if `scores` ever holds the same id twice, which would otherwise
# silently duplicate log rows and inflate every count/sum feature.
df = df.merge(scores, on='id', validate='many_to_one')

In [3]:
# Peek at the first two merged rows to confirm the log columns and the attached `score`.
df.head(2)

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5


In [14]:
def raw_aggregation_functions(df):
    """
    Summarise the raw log rows into one feature row per `id`.

    Returns a DataFrame indexed by `id` with these columns, in order:
    activity_count, action_time_sum, action_time_mean, word_count_max,
    text_change_nunique, cursor_position_mean.
    """
    per_id = df.groupby('id')

    # Named aggregation produces the flat `<column>_<statistic>` names directly.
    return per_id.agg(
        activity_count=('activity', 'count'),              # non-null activity rows
        action_time_sum=('action_time', 'sum'),            # total action time
        action_time_mean=('action_time', 'mean'),          # average action time
        word_count_max=('word_count', 'max'),              # largest word count seen
        text_change_nunique=('text_change', 'nunique'),    # distinct text changes
        cursor_position_mean=('cursor_position', 'mean'),  # average cursor position
    )

In [5]:
def preprocessing(df, df_test, scale_needed=True, aggregation_functions=None):
    """
    Preprocesses the input dataframes for training and testing.

    Parameters
    ----------
    df : pandas.DataFrame
        Training log rows; must contain an 'id' and a 'score' column.
    df_test : pandas.DataFrame
        Test log rows; must contain an 'id' column.
    scale_needed : bool, default True
        When True, features are standardised with a StandardScaler fitted on
        the training features only and then applied to the test features.
    aggregation_functions : list of callables, optional
        Each callable takes a log dataframe and returns a feature dataframe
        whose index is named 'id'. Defaults to [raw_aggregation_functions].

    Returns
    -------
    X_train, X_train_ids, y_train, X_test, X_test_ids
        Feature rows, ids and targets all share the same row order (the
        order in which each id first appears in the logs). The feature
        objects are DataFrames when scale_needed is False; otherwise they are
        whatever StandardScaler returns (a NumPy array by default).

    Raises
    ------
    ValueError
        If no aggregation function is given, or if one returns a frame whose
        index is not named 'id'.
    """
    if aggregation_functions is None:
        aggregation_functions = [raw_aggregation_functions]
    if len(aggregation_functions) == 0:
        raise ValueError("At least one aggregation function is required.")

    X_train = pd.DataFrame({'id': df['id'].unique()})
    X_test = pd.DataFrame({'id': df_test['id'].unique()})

    # groupby() sorts its keys while unique() keeps first-appearance order, so
    # the scores are explicitly re-ordered to match the rows of X_train.
    score_by_id = df.groupby('id')['score'].first()
    y_train = score_by_id.reindex(X_train['id']).reset_index(drop=True)

    # Feature engineering: collect every aggregated frame, merge once below.
    agg_train_list = []
    agg_test_list = []

    for func in aggregation_functions:
        agg_train = func(df)
        agg_test = func(df_test)

        # The single merge below relies on every frame being keyed by 'id'.
        if agg_train.index.name != 'id' or agg_test.index.name != 'id':
            raise ValueError("The index must be 'id' for aggregation functions.")

        agg_train_list.append(agg_train)
        agg_test_list.append(agg_test)

    # Concatenate all aggregated features horizontally (aligned by index),
    # then turn the 'id' index back into a column for the merge.
    agg_train = pd.concat(agg_train_list, axis=1).reset_index()
    agg_test = pd.concat(agg_test_list, axis=1).reset_index()

    # Left merges keep the id order established in X_train / X_test.
    X_train = X_train.merge(agg_train, on='id', how='left')
    X_test = X_test.merge(agg_test, on='id', how='left')

    # Keep the ids aside; they must not be scaled or used as a feature.
    X_train_ids = X_train['id']
    X_train = X_train.drop(columns=['id'])
    X_test_ids = X_test['id']
    X_test = X_test.drop(columns=['id'])

    # Standardize features (statistics come from the training set only).
    if scale_needed:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_train_ids, y_train, X_test, X_test_ids

In [6]:
def train(X_train, y_train, alpha=0.1, test_size=0.2, random_state=42):
    """
    Trains a Lasso regression model and reports its hold-out error.

    Parameters
    ----------
    X_train : array-like
        Feature matrix, one row per sample.
    y_train : array-like
        Targets aligned row-for-row with X_train.
    alpha : float, default 0.1
        Value passed to Lasso(alpha=...).
    test_size : float, default 0.2
        Fraction of rows held out for validation.
    random_state : int, default 42
        Seed for the train/validation split.

    Returns
    -------
    Lasso
        Model fitted on the training portion of the split only.
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=test_size, random_state=random_state
    )

    model = Lasso(alpha=alpha)
    model.fit(X_fit, y_fit)

    # Score the held-out rows so the split actually serves a purpose.
    val_errors = model.predict(X_val) - np.asarray(y_val)
    val_rmse = np.sqrt(np.mean(val_errors ** 2))
    print(f'Validation RMSE: {val_rmse:.4f}')

    return model

In [7]:
# Prediction
def predict(model, X_test):
    """
    Returns the result of `model.predict` on the given test features.
    """
    return model.predict(X_test)

In [8]:
def submit(X_test_ids, preds, path='./submission.csv'):
    """
    Writes the submission CSV with one `id,score` row per prediction.

    Parameters
    ----------
    X_test_ids : sequence
        Ids, in the same order as `preds`.
    preds : sequence
        Predicted scores, one per id.
    path : str, default './submission.csv'
        Where the CSV is written.

    A ValueError (for example, mismatched lengths) is reported with a printed
    "Error: ..." message instead of being raised, and no file is written.
    """
    try:
        # Check if X_test_ids and preds have the same length
        if len(X_test_ids) != len(preds):
            raise ValueError("The lengths of X_test_ids and preds must match.")

        # Pair ids and scores by position. Passing pandas objects straight to
        # the DataFrame constructor aligns them on their index instead, which
        # mispairs rows (and inserts NaNs) when the two indexes differ.
        submission = pd.DataFrame({
            'id': np.asarray(X_test_ids),
            'score': np.asarray(preds),
        })
        submission.to_csv(path, index=False)
        print('Submitted')

    except ValueError as e:
        print(f"Error: {e}")

In [9]:
# Run the pipeline end to end: build scaled features, fit the model,
# predict on the test set and write the submission file.
X_train, X_train_ids, y_train, X_test, X_test_ids = preprocessing(df, df_test, scale_needed=True)
model = train(X_train, y_train)
preds = predict(model, X_test)
submit(X_test_ids, preds)

Submitted
