In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.fft import fft
import re
import random, os

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

In [2]:
def seed_everything(seed=1234):
    """Seed every random source this notebook touches directly.

    Sets the PYTHONHASHSEED environment variable, NumPy's global RNG and
    Python's built-in `random` module to the same value.

    Args:
        seed: Integer seed applied to all three sources.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


# One notebook-wide seed; it is also handed to get_k_models further down.
SEED = 1234
seed_everything(seed=SEED)

In [4]:
# Flag to run locally: True reads from ../data, False reads the Kaggle input directory
run_local = True

if not run_local:
    # Kaggle runtime: train logs, train scores and a separate test log file
    df = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
    scores = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
    df_test = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')
else:
    df = pd.read_csv('../data/train_logs.csv')
    scores = pd.read_csv('../data/train_scores.csv')
    # No test file is read locally: the raw training logs double as the "test" set
    df_test = df

# Join the scores onto the logs by id. merge returns a new frame, so df_test
# (when it aliases the raw logs) keeps pointing at the un-merged data.
df = pd.merge(df, scores, on='id')

In [5]:
# ------------- Count text changes by size ------------ #
# Note: Should be applied to column "text_change"

# Functions for pandas agg/apply that count the text changes above or below a length threshold
def count_large_text_changes(text_changes, size=20):
    """Count the entries of `text_changes` whose length is strictly greater than `size`.

    Args:
        text_changes: Iterable of sized values (e.g. the strings of one group's
            "text_change" column).
        size: Length threshold; an entry counts only if len(entry) > size.

    Returns:
        Number of qualifying entries as an int.
    """
    return sum(1 for change in text_changes if len(change) > size)

def count_extremely_large_text_changes(text_changes, size=100):
    """Count the entries of `text_changes` whose length is strictly greater than `size`.

    Same rule as count_large_text_changes, with a default threshold of 100.

    Args:
        text_changes: Iterable of sized values.
        size: Length threshold; an entry counts only if len(entry) > size.

    Returns:
        Number of qualifying entries as an int.
    """
    total = 0
    for change in text_changes:
        if len(change) > size:
            total += 1
    return total

def count_tiny_text_changes(text_changes, size=5):
    """Count the entries of `text_changes` whose length is strictly less than `size`.

    Args:
        text_changes: Iterable of sized values.
        size: Length threshold; an entry counts only if len(entry) < size.

    Returns:
        Number of qualifying entries as an int.
    """
    return sum(1 for change in text_changes if len(change) < size)

# ------------- For a given type of activity, count the number of times it occurs ------------ #
# Note: Should be applied to column "activity"

def count_nonproduction(action_list):
    """Return how many entries of `action_list` equal 'Nonproduction'."""
    return sum(1 for action in action_list if action == 'Nonproduction')

def count_input(action_list):
    """Return how many entries of `action_list` equal 'Input'."""
    total = 0
    for action in action_list:
        if action == 'Input':
            total += 1
    return total

def count_remove(action_list):
    """Return how many entries of `action_list` equal 'Remove/Cut'."""
    return sum(1 for action in action_list if action == 'Remove/Cut')

def count_replace(action_list):
    """Return how many entries of `action_list` equal 'Replace'."""
    total = 0
    for action in action_list:
        if action == 'Replace':
            total += 1
    return total

def count_paste(action_list):
    """Return how many entries of `action_list` equal 'Paste'."""
    return sum(1 for action in action_list if action == 'Paste')

# ------------- For a given chunk of text that was moved, determine the size and the distance moved ------------ #
# Create move vectors features uses the four helpers below
def create_move_vectors_features(df):
    """Add per-row move distance and move size columns derived from `activity`.

    For every row, the "[a, b]" pairs of a 'Move From ...' activity are parsed
    (rows with any other activity yield no pairs) and two columns are written
    onto `df` in place:

    * 'distance_of_moved_selection' - via distance_of_move_of_selection
    * 'size_of_moved_selection'     - via size_of_moved_selection

    Args:
        df: DataFrame with an 'activity' column holding the raw activity
            strings. The parsing only works while the coordinates are still
            present, i.e. before 'activity' has been normalized.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # Keep the parsed vectors in a standalone Series instead of a scratch
    # column: nothing temporary is written to (and then dropped from) the
    # frame, and an existing 'selection_vectors' column is left alone.
    selection_vectors = df['activity'].map(get_move_from_vectors)

    df['distance_of_moved_selection'] = selection_vectors.map(distance_of_move_of_selection)
    df['size_of_moved_selection'] = selection_vectors.map(size_of_moved_selection)

    return df

def split_activity(activity):
    """Extract every "[a, b]" pair found in `activity`.

    Args:
        activity: Activity string, e.g. one containing "[284, 292]".

    Returns:
        A list with one integer NumPy array of two elements per matched pair,
        in order of appearance; an empty list when nothing matches.
    """
    vectors = []
    for bracketed in re.findall(r'\[[0-9]*, [0-9]*\]', activity):
        # Strip the surrounding brackets, then split on the ", " separator
        tokens = bracketed[1:-1].split(', ')
        vectors.append(np.array(tokens, dtype=int))
    return vectors

def get_move_from_vectors(activity):
    """Return the "[a, b]" vectors of a 'Move From' activity, else an empty list.

    Args:
        activity: Activity string.

    Returns:
        split_activity(activity) when the string contains 'Move From',
        otherwise [].
    """
    if 'Move From' not in activity:
        return []
    return split_activity(activity)
    
def distance_of_move_of_selection(selection_vectors):
    """Return how far a moved selection travelled.

    The distance is the first element of the second vector minus the first
    element of the first vector (destination start minus source start).

    Args:
        selection_vectors: Sequence of two-element vectors as produced by
            get_move_from_vectors; may be empty.

    Returns:
        The signed distance, or 0 when fewer than two vectors are available
        (no move on this row, or a move string with only one parsable pair).
    """
    # Both a source and a destination vector are required; a lone vector used
    # to raise IndexError here.
    if len(selection_vectors) < 2:
        return 0
    source, destination = selection_vectors[0], selection_vectors[1]
    return destination[0] - source[0]

def size_of_moved_selection(selection_vectors):
    """Return the size of the moved selection.

    Args:
        selection_vectors: Sequence of two-element vectors as produced by
            get_move_from_vectors; may be empty.

    Returns:
        Second element minus first element of the first vector, or 0 when
        `selection_vectors` is empty.
    """
    if len(selection_vectors) == 0:
        return 0
    source = selection_vectors[0]
    return source[1] - source[0]

## Time features
### The total amount of time spent on the essay (as a fraction of total time allowed)
# The total amount of time the person spent writing the essay as a fraction of the total time
def fraction_of_time_spent_writing(writing_times, total_time=1800000):
    """Return the largest timestamp in `writing_times` as a fraction of `total_time`.

    Args:
        writing_times: Non-empty iterable of timestamps in milliseconds
            (applied to the "up_time" column in this notebook).
        total_time: Time budget in the same unit. Defaults to 1,800,000 ms,
            i.e. half an hour.

    Returns:
        max(writing_times) / total_time. The value exceeds 1 when the largest
        timestamp is beyond the budget.

    Raises:
        ValueError: If `writing_times` is empty (raised by max()).
    """
    return max(writing_times) / total_time

def normalize_move_from(activity):
    """Collapse any activity containing 'Move From' to the bare label 'Move From'.

    This discards the coordinates embedded in the string, so it should only be
    used once the Move From features have already been created.

    Args:
        activity: Activity string.

    Returns:
        'Move From' if that substring occurs in `activity`, otherwise
        `activity` unchanged.
    """
    return 'Move From' if 'Move From' in activity else activity
    
def create_action_time_features(df):
    """Aggregate `action_time` per (id, activity) into one wide row per id.

    Every 'Move From ...' activity is first collapsed to the single label
    'Move From'. For each (id, activity) pair the mean, max, sum and count of
    'action_time' are computed and pivoted so that each id has one row with
    columns named 'action_time_<stat>_<activity>'. Combinations that never
    occur for an id are filled with 0.

    The input frame is NOT modified: the normalized activity labels live in a
    separate Series, so the caller's 'activity' column keeps its Move From
    coordinates for any feature that still needs to parse them.

    Args:
        df: DataFrame with 'id', 'activity' and 'action_time' columns.

    Returns:
        DataFrame indexed by 'id' with the flattened statistic columns.
    """
    # Normalized labels, kept off the caller's frame (map preserves the
    # Series name 'activity', which names the second group level below)
    activity = df['activity'].map(normalize_move_from)

    action_time_features = (
        df['action_time']
        .groupby([df['id'], activity])
        .agg(['mean', 'max', 'sum', 'count'])
        .add_prefix('action_time_')   # -> action_time_mean, action_time_max, ...
        .unstack('activity')          # one column per (stat, activity) pair
    )

    # Flatten the (stat, activity) column MultiIndex into plain strings
    action_time_features.columns = ['_'.join(col).strip() for col in action_time_features.columns.values]

    # An activity that never occurred for an id contributes 0 to every stat
    return action_time_features.fillna(0)

def raw_aggregation_functions(df):
    """Build the per-id typing-behaviour feature table from raw log rows.

    Combines two groups of features, joined on 'id':

    * typing features: one aggregate per (column, function) pair listed below,
      named '<column>_<function name>' (e.g. 'activity_count_paste',
      'text_change_count_tiny_text_changes', 'word_count_max');
    * action-time features from create_action_time_features.

    Args:
        df: Log DataFrame with the columns 'id', 'activity', 'action_time',
            'word_count', 'text_change', 'cursor_position' and 'up_time'.
            It is not modified.

    Returns:
        DataFrame with one row per id holding all features.
    """
    # Work on a private copy: the helpers below add columns and may rewrite
    # 'activity', which would otherwise leak into the caller's frame and make
    # a second run on the same frame produce different features.
    df = df.copy()

    # Parse the moved-selection vectors FIRST. They are read out of the raw
    # 'Move From [a, b] To [c, d]' strings, which the action-time step
    # collapses to the bare label 'Move From'; running that step first left
    # every move distance/size at 0.
    df = create_move_vectors_features(df)

    # Create features related to individual action time
    action_time_features = create_action_time_features(df)

    # Feature engineering for typing behavior features.
    # Each column appears exactly once, with ALL of its aggregators in a list:
    # a dict literal keeps only the last value for a repeated key, so listing
    # 'activity' / 'text_change' several times silently discarded every
    # aggregator but the final one.
    typing_features = df.groupby('id').agg({
        'activity': [
            'count',                # Total number of activities
            count_nonproduction,
            count_input,
            count_remove,
            count_replace,
            count_paste,
        ],
        'action_time': ['sum', 'mean'],     # Total and average action time
        'word_count': ['max'],              # Maximum word count
        'text_change': [
            'nunique',              # Number of unique text changes
            count_large_text_changes,
            count_extremely_large_text_changes,
            count_tiny_text_changes,
        ],
        'cursor_position': ['mean'],        # Average cursor position
        'distance_of_moved_selection': ['mean', 'max'],
        'size_of_moved_selection': ['mean', 'max'],
        'up_time': [fraction_of_time_spent_writing],  # Amount of time spent on the essay
    })

    # Flatten the (column, function) MultiIndex into '<column>_<function>'
    typing_features.columns = ['_'.join(col).strip() for col in typing_features.columns.values]

    # Merge action time features with typing features
    features = pd.merge(typing_features, action_time_features, on='id')

    return features

def calculate_fft_features(group, line_width=30):
    """Compute spectral summary features of the cursor position for one group.

    The signal is `cursor_position % line_width`, taken in row order with a
    sample spacing of 1. Its FFT is restricted to the strictly positive
    frequency bins (the zero-frequency bin and the negative half are dropped)
    and summarized into 14 scalar features.

    Args:
        group: DataFrame with a 'cursor_position' column (one id's rows when
            used through groupby.apply). It is not modified.
        line_width: Modulus applied to the cursor position. Defaults to 30,
            presumably an approximate number of characters per line - TODO
            confirm the intent of this constant.

    Returns:
        pd.Series indexed by feature name. All features are 0.0 when the group
        has fewer than 3 rows (no positive frequency bin exists) or when the
        positive-frequency spectrum is identically zero.
    """
    feature_names = [
        "Peak Frequency",
        "Mean Frequency",
        "Median Frequency",
        "Bandwidth",
        "Frequency Skewness",
        "Frequency Kurtosis",
        "Total Energy",
        "Spectral Entropy",
        "Spectral Flatness",
        "Spectral Roll-off",
        "Mean Amplitude",
        "Std Amplitude",
        "Skew Amplitude",
        "Kurtosis Amplitude",
    ]

    # Local signal only: nothing is written back onto the group
    pos = (group['cursor_position'] % line_width).to_numpy()
    n_samples = len(pos)

    # With fewer than 3 samples there is no strictly positive frequency bin
    if n_samples < 3:
        return pd.Series(0.0, index=feature_names)

    # The frequency axis must be built for the full signal length so that
    # bin k maps to k / n_samples; spectrum and axis are then masked together.
    spectrum = fft(pos)
    all_frequencies = np.fft.fftfreq(n_samples, d=1)
    positive = all_frequencies > 0

    frequencies = all_frequencies[positive]
    magnitudes = np.abs(spectrum[positive])

    # A flat signal has no spectral content; the ratios below would be 0/0
    if not np.any(magnitudes):
        return pd.Series(0.0, index=feature_names)

    eps = np.finfo(float).eps

    # Frequency Domain Features
    peak_freq = frequencies[np.argmax(magnitudes)]
    mean_freq = np.average(frequencies, weights=magnitudes)
    # Frequency of the middle positive bin (not a magnitude-weighted median)
    median_freq = frequencies[len(magnitudes) // 2]
    bandwidth = np.ptp(frequencies)

    # Skewness / kurtosis of the magnitude distribution. They are undefined
    # (NaN) for constant magnitudes, e.g. a single bin; report 0 in that case.
    magnitude_skew = float(np.nan_to_num(scipy.stats.skew(magnitudes)))
    magnitude_kurtosis = float(np.nan_to_num(scipy.stats.kurtosis(magnitudes)))

    # Other Features
    total_energy = np.sum(magnitudes ** 2)

    # Spectral Entropy (eps keeps log2 finite for zero-magnitude bins)
    psd_norm = magnitudes / np.sum(magnitudes)
    spectral_entropy = -np.sum(psd_norm * np.log2(psd_norm + eps))

    # Spectral Flatness: geometric mean over arithmetic mean of the magnitudes
    spectral_flatness = np.exp(np.mean(np.log(magnitudes + eps))) / np.mean(magnitudes)

    # Spectral Roll-off: first frequency at which the cumulative magnitude
    # reaches 85% of the total
    cumulative = np.cumsum(magnitudes)
    rolloff_index = int(np.searchsorted(cumulative, 0.85 * cumulative[-1]))
    spectral_rolloff = frequencies[min(rolloff_index, len(frequencies) - 1)]

    # Statistical Features
    mean_amplitude = np.mean(magnitudes)
    std_amplitude = np.std(magnitudes)

    features = {
        "Peak Frequency": peak_freq,
        "Mean Frequency": mean_freq,
        "Median Frequency": median_freq,
        "Bandwidth": bandwidth,
        # The "Frequency" and "Amplitude" skew/kurtosis pairs are computed
        # from the same magnitudes and are therefore identical; both names
        # are kept because downstream feature lists reference them.
        "Frequency Skewness": magnitude_skew,
        "Frequency Kurtosis": magnitude_kurtosis,
        "Total Energy": total_energy,
        "Spectral Entropy": spectral_entropy,
        "Spectral Flatness": spectral_flatness,
        "Spectral Roll-off": spectral_rolloff,
        "Mean Amplitude": mean_amplitude,
        "Std Amplitude": std_amplitude,
        "Skew Amplitude": magnitude_skew,
        "Kurtosis Amplitude": magnitude_kurtosis,
    }

    return pd.Series(features)

def apply_fft_feats(df):
    """Compute the FFT feature row of every id.

    Args:
        df: Log DataFrame with 'id' and 'cursor_position' columns.

    Returns:
        DataFrame indexed by 'id' with one column per feature returned by
        calculate_fft_features.
    """
    # Hand each group only the column the feature function reads. This keeps
    # the grouping key out of the applied frame (applying over it is
    # deprecated in recent pandas) and avoids slicing every column per id.
    return df.groupby('id')[['cursor_position']].apply(calculate_fft_features)

##### MAKE SURE FUNC LIST IS UPDATED BEFORE RUNNING/PASTING TO NOTEBOOK #####
# Feature builders run by preprocessing(), in this order, on both the train and
# the test logs. Each one must return a DataFrame whose index is named 'id';
# preprocessing() raises ValueError otherwise.
func_list = [raw_aggregation_functions, apply_fft_feats]

In [6]:
def preprocessing(df, df_test, func_list, scale_needed=True, run_local=True):
    """
    Preprocesses the input dataframes for training and testing.

    Every function in `func_list` is applied to both `df` and `df_test`; the
    resulting per-id feature tables are concatenated column-wise and attached
    to the list of ids of each set.

    Args:
        df: Training logs with at least 'id' and 'score' columns.
        df_test: Test logs with at least an 'id' column.
        func_list: Feature builders; each takes a log DataFrame and returns a
            DataFrame whose index is named 'id'.
        scale_needed: If True, features are standardized with a StandardScaler
            fitted on the training features. NOTE: X_train / X_test are then
            returned as the scaler's array output rather than DataFrames, so
            selecting columns by name afterwards is not possible.
        run_local: If True, the unscaled training features and the targets are
            written to 'X_train.csv' and 'y_train.csv' in the working directory.

    Returns:
        Tuple (X_train, X_train_ids, y_train, X_test, X_test_ids). Row i of
        X_train, X_train_ids and y_train refer to the same id; X_test has the
        same columns, in the same order, as X_train.

    Raises:
        ValueError: If a feature builder returns a frame not indexed by 'id'.
    """

    X_train = pd.DataFrame({'id': df['id'].unique()})
    X_test = pd.DataFrame({'id': df_test['id'].unique()})

    # groupby orders its result by sorted id, whereas X_train follows the order
    # in which ids first appear in the logs. Re-order the targets explicitly so
    # that labels and feature rows line up even when the logs are not id-sorted.
    y_train = (
        df.groupby('id')['score']
        .first()
        .reindex(index=X_train['id'])
        .reset_index(drop=True)
    )

    # Feature engineering
    agg_train_list = []
    agg_test_list = []

    # Aggregate and collect features without merging
    for func in func_list:
        agg_train = func(df)
        agg_test = func(df_test)

        # Check if the index name is 'id'
        if agg_train.index.name != 'id' or agg_test.index.name != 'id':
            raise ValueError("The index must be 'id' for aggregation functions.")

        agg_train_list.append(agg_train)
        agg_test_list.append(agg_test)

    # Concatenate all aggregated features horizontally, aligned by index
    agg_train = pd.concat(agg_train_list, axis=1)
    agg_test = pd.concat(agg_test_list, axis=1)

    # Reset index before merge
    agg_train.reset_index(inplace=True)
    agg_test.reset_index(inplace=True)

    # Perform a single merge operation
    X_train = X_train.merge(agg_train, on='id', how='left')
    X_test = X_test.merge(agg_test, on='id', how='left')

    # Remove 'id' column before scaling
    X_train_ids = X_train['id']
    X_train = X_train.drop(columns=['id'])
    X_test_ids = X_test['id']
    X_test = X_test.drop(columns=['id'])

    # Data-dependent columns (e.g. one per observed activity type) may differ
    # between the two sets. Give the test matrix exactly the training columns:
    # columns absent from the test data are created filled with 0, columns
    # unknown to the training data are dropped.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Save X_train before scaling if run_local=True
    if run_local:
        X_train.to_csv('X_train.csv', index=False)
        y_train.to_csv('y_train.csv', index=False)

    # Standardize features (statistics come from the training set only)
    if scale_needed:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_train_ids, y_train, X_test, X_test_ids

In [7]:
def train(X_train, y_train, model_name, feat_list):
    """
    Trains a regression model.

    Args:
        X_train: Training features; must support selection by `feat_list`
            (e.g. a DataFrame).
        y_train: Training targets.
        model_name: Which model to build. Only 'shap_model' is supported: a
            RandomForestRegressor with 25 trees.
        feat_list: Names of the feature columns the model is fitted on.

    Returns:
        The fitted model.

    Raises:
        ValueError: If `model_name` is not a supported model.
    """
    # Reject unknown names up front; previously they fell through to the
    # return statement and failed with an UnboundLocalError.
    if model_name != 'shap_model':
        raise ValueError(f"Unknown model_name {model_name!r}; supported: 'shap_model'")

    model = RandomForestRegressor(n_estimators=25)
    model.fit(X_train[feat_list], y_train)

    return model

In [8]:
def get_k_models(X_train, y_train, model_name, feat_lists, seed):
    """Train one model per stratified fold, each on its own feature list.

    A stratified 10% hold-out is split off first and is not used inside this
    function. The remaining 90% is divided into 5 shuffled, stratified folds;
    for fold i a model is trained on that fold's training rows using
    feat_lists[i]. The validation rows of each fold are not used here.

    Args:
        X_train: Feature DataFrame (positional row selection via .iloc is used).
        y_train: Target Series aligned with X_train; also used as the
            stratification variable.
        model_name: Forwarded to train().
        feat_lists: Sequence with one feature-name list per fold (at least 5).
        seed: Random state for both the hold-out split and the fold shuffling.

    Returns:
        List of the 5 fitted models, in fold order.
    """
    X_fit, _, y_fit, _ = train_test_split(
        X_train, y_train, test_size=0.10, stratify=y_train, random_state=seed
    )
    folds = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

    # The targets are encoded as integer class labels to drive the stratification
    fold_labels = LabelEncoder().fit_transform(y_fit)

    return [
        train(X_fit.iloc[fit_idx], y_fit.iloc[fit_idx], model_name, feat_lists[fold])
        for fold, (fit_idx, _) in enumerate(folds.split(X_fit, fold_labels))
    ]

In [9]:
# Prediction
def predict(model, X_test):
    """
    Makes predictions using the trained model.

    Args:
        model: Any object exposing a predict(X) method.
        X_test: Features handed unchanged to model.predict.

    Returns:
        Whatever model.predict(X_test) returns.
    """
    return model.predict(X_test)

In [10]:
def submit(X_test_ids, preds, path='./submission.csv'):
    """Write the submission CSV with the columns 'id' and 'score'.

    A ValueError (mismatched lengths, or one raised while building or writing
    the table) is reported on stdout instead of being propagated, so the call
    never raises ValueError; the return value tells the caller what happened.

    Args:
        X_test_ids: Ids, one per prediction.
        preds: Predicted scores, same length as `X_test_ids`.
        path: Destination of the CSV file. Defaults to './submission.csv'.

    Returns:
        True if the file was written, False if a ValueError was reported.
    """
    try:
        # Check if X_test_ids and preds have the same length
        if len(X_test_ids) != len(preds):
            raise ValueError("The lengths of X_test_ids and preds must match.")

        # Create a submission file
        submission = pd.DataFrame({'id': X_test_ids, 'score': preds})
        submission.to_csv(path, index=False)

    except ValueError as e:
        print(f"Error: {e}")
        return False

    print('Submitted')
    return True

In [11]:
# scale_needed=False skips the StandardScaler step inside preprocessing(), so
# X_train / X_test stay the DataFrames built there; the by-name column
# selection used for training and prediction below relies on that.
X_train, X_train_ids, y_train, X_test, X_test_ids = preprocessing(df, df_test, func_list, scale_needed=False, run_local=run_local)

'''
Shap value feat sel
'''
# One feature-name list per fold: get_k_models() trains the model of fold i on
# shap_feat_lists[i], and the prediction loop below pairs each model with the
# list at the same position. Every name must be a column of X_train / X_test.
shap_feat_lists = [
    ['word_count_max', 'distance_of_moved_selection_mean', 'distance_of_moved_selection_max', 'size_of_moved_selection_mean', 'size_of_moved_selection_max', 'action_time_mean_Move From', 'action_time_mean_Replace', 'action_time_max_Input', 'action_time_max_Replace', 'action_time_sum_Move From', 'action_time_sum_Nonproduction', 'action_time_count_Input', 'action_time_count_Move From', 'action_time_count_Nonproduction', 'action_time_count_Remove/Cut'],
    ['activity_count_paste', 'action_time_sum', 'action_time_mean', 'word_count_max', 'text_change_count_tiny_text_changes', 'distance_of_moved_selection_mean', 'distance_of_moved_selection_max', 'size_of_moved_selection_mean', 'size_of_moved_selection_max', 'action_time_mean_Input', 'action_time_mean_Move From', 'action_time_mean_Nonproduction', 'action_time_mean_Paste', 'action_time_mean_Remove/Cut', 'action_time_mean_Replace', 'action_time_max_Input', 'action_time_max_Move From', 'action_time_max_Nonproduction', 'action_time_max_Paste', 'action_time_max_Replace', 'action_time_sum_Input', 'action_time_sum_Move From', 'action_time_sum_Nonproduction', 'action_time_sum_Paste', 'action_time_sum_Remove/Cut', 'action_time_sum_Replace', 'action_time_count_Input', 'action_time_count_Move From', 'action_time_count_Nonproduction', 'action_time_count_Paste', 'action_time_count_Replace', 'Mean Frequency', 'Median Frequency', 'Frequency Skewness', 'Frequency Kurtosis', 'Spectral Roll-off', 'Skew Amplitude', 'Kurtosis Amplitude'],
    ['activity_count_paste', 'action_time_mean', 'word_count_max', 'text_change_count_tiny_text_changes', 'distance_of_moved_selection_mean', 'distance_of_moved_selection_max', 'size_of_moved_selection_mean', 'size_of_moved_selection_max', 'up_time_fraction_of_time_spent_writing', 'action_time_mean_Input', 'action_time_mean_Move From', 'action_time_mean_Paste', 'action_time_mean_Replace', 'action_time_max_Input', 'action_time_max_Move From', 'action_time_max_Paste', 'action_time_max_Replace', 'action_time_sum_Move From', 'action_time_sum_Paste', 'action_time_sum_Replace', 'action_time_count_Input', 'action_time_count_Move From', 'action_time_count_Nonproduction', 'action_time_count_Paste', 'action_time_count_Replace', 'Peak Frequency', 'Median Frequency', 'Bandwidth'],
    ['activity_count_paste', 'action_time_sum', 'word_count_max', 'text_change_count_tiny_text_changes', 'cursor_position_mean', 'distance_of_moved_selection_mean', 'distance_of_moved_selection_max', 'size_of_moved_selection_mean', 'size_of_moved_selection_max', 'up_time_fraction_of_time_spent_writing', 'action_time_mean_Move From', 'action_time_mean_Nonproduction', 'action_time_mean_Paste', 'action_time_mean_Remove/Cut', 'action_time_mean_Replace', 'action_time_max_Input', 'action_time_max_Move From', 'action_time_max_Nonproduction', 'action_time_max_Paste', 'action_time_max_Remove/Cut', 'action_time_max_Replace', 'action_time_sum_Input', 'action_time_sum_Move From', 'action_time_sum_Nonproduction', 'action_time_sum_Paste', 'action_time_sum_Remove/Cut', 'action_time_sum_Replace', 'action_time_count_Input', 'action_time_count_Move From', 'action_time_count_Paste', 'action_time_count_Remove/Cut', 'action_time_count_Replace', 'Peak Frequency', 'Mean Frequency', 'Bandwidth', 'Frequency Skewness', 'Frequency Kurtosis', 'Spectral Roll-off', 'Std Amplitude', 'Skew Amplitude', 'Kurtosis Amplitude'],
    ['activity_count_paste', 'action_time_mean', 'word_count_max', 'text_change_count_tiny_text_changes', 'cursor_position_mean', 'distance_of_moved_selection_mean', 'distance_of_moved_selection_max', 'size_of_moved_selection_mean', 'size_of_moved_selection_max', 'up_time_fraction_of_time_spent_writing', 'action_time_mean_Input', 'action_time_mean_Move From', 'action_time_mean_Paste', 'action_time_mean_Remove/Cut', 'action_time_mean_Replace', 'action_time_max_Input', 'action_time_max_Move From', 'action_time_max_Nonproduction', 'action_time_max_Paste', 'action_time_max_Remove/Cut', 'action_time_max_Replace', 'action_time_sum_Input', 'action_time_sum_Move From', 'action_time_sum_Nonproduction', 'action_time_sum_Paste', 'action_time_sum_Remove/Cut', 'action_time_sum_Replace', 'action_time_count_Input', 'action_time_count_Move From', 'action_time_count_Nonproduction', 'action_time_count_Paste', 'action_time_count_Remove/Cut', 'action_time_count_Replace', 'Peak Frequency', 'Mean Frequency', 'Median Frequency', 'Bandwidth', 'Frequency Skewness', 'Frequency Kurtosis', 'Total Energy', 'Spectral Entropy', 'Spectral Flatness', 'Spectral Roll-off', 'Mean Amplitude', 'Std Amplitude', 'Skew Amplitude', 'Kurtosis Amplitude']
]

# Train one model per fold; fold i is fitted on the columns in shap_feat_lists[i]
models = get_k_models(X_train, y_train, model_name='shap_model', feat_lists=shap_feat_lists, seed=SEED)

# Collect one prediction vector per fold model, each computed on the same
# feature subset that model was trained with
preds = []
for (model, shap_feat_list) in zip(models, shap_feat_lists):
    preds.append(predict(model, X_test[shap_feat_list]))

# Take mean over K-folds
preds = np.mean(preds, axis=0)


'''
Submit
'''

# Writes ./submission.csv; a length mismatch is printed by submit(), not raised
submit(X_test_ids, preds)

Submitted
