In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.fft import fft

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Switch between a local checkout (True) and the Kaggle runtime (False).
run_local = True

if not run_local:
    # Kaggle competition input directory.
    df = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
    scores = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
    df_test = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')
else:
    # Data directory relative to the notebook's working directory.
    df = pd.read_csv('../data/train_logs.csv')
    scores = pd.read_csv('../data/train_scores.csv')
    df_test = pd.read_csv('../data/test_logs.csv')

# Join the score table onto the training log rows via the shared 'id' column.
df = df.merge(scores, on='id')

In [3]:
# Aggregator for pandas: number of text changes longer than 20 characters.
def count_large_text_changes(text_changes):
    """Return how many items of ``text_changes`` have a length above 20."""
    return sum(1 for change in text_changes if len(change) > 20)

def count_extremely_large_text_changes(text_changes):
    """Return how many items of ``text_changes`` have a length above 100."""
    total = 0
    for change in text_changes:
        if len(change) > 100:
            total += 1
    return total

# Tally of 'Nonproduction' entries in a sequence of activity labels.
def count_nonproduction(action_list):
    """Return the number of items in ``action_list`` equal to 'Nonproduction'."""
    wanted = 'Nonproduction'
    return sum(1 for label in action_list if label == wanted)

# Tally of 'Input' entries in a sequence of activity labels.
def count_input(action_list):
    """Return the number of items in ``action_list`` equal to 'Input'."""
    wanted = 'Input'
    hits = 0
    for label in action_list:
        if label == wanted:
            hits += 1
    return hits

# Tally of 'Remove/Cut' entries in a sequence of activity labels.
def count_remove(action_list):
    """Return the number of items in ``action_list`` equal to 'Remove/Cut'."""
    wanted = 'Remove/Cut'
    return sum(1 for label in action_list if label == wanted)

# Tally of 'Replace' entries in a sequence of activity labels.
def count_replace(action_list):
    """Return the number of items in ``action_list`` equal to 'Replace'."""
    wanted = 'Replace'
    hits = 0
    for label in action_list:
        if label == wanted:
            hits += 1
    return hits

# Tally of 'Paste' entries in a sequence of activity labels.
def count_paste(action_list):
    """Return the number of items in ``action_list`` equal to 'Paste'."""
    wanted = 'Paste'
    return sum(1 for label in action_list if label == wanted)

In [4]:
def raw_aggregation_functions(df):
    """
    Aggregate the raw log columns into one feature row per ``id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Log rows containing the columns 'id', 'activity', 'action_time',
        'word_count', 'text_change' and 'cursor_position'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'id'. Column names are '<source column>_<aggregator name>',
        e.g. 'activity_count', 'activity_count_paste', 'action_time_sum',
        'text_change_count_extremely_large_text_changes'.
    """
    # Every aggregator for a given column must live in ONE list. The previous
    # dict literal repeated the 'activity' and 'text_change' keys; Python keeps
    # only the last value of a repeated key, so all but the final aggregator of
    # each of those columns was silently discarded.
    aggregations = {
        'activity': [
            'count',                # Total number of logged activities
            count_nonproduction,
            count_input,
            count_remove,
            count_replace,
            count_paste,
        ],
        'action_time': ['sum', 'mean'],     # Total and average action time
        'word_count': ['max'],              # Maximum word count
        'text_change': [
            'nunique',              # Number of distinct text changes
            count_large_text_changes,
            count_extremely_large_text_changes,
        ],
        'cursor_position': ['mean'],        # Average cursor position
    }

    typing_features = df.groupby('id').agg(aggregations)

    # List-valued aggregations produce (column, aggregator) MultiIndex columns;
    # flatten them into single '<column>_<aggregator>' strings.
    typing_features.columns = ['_'.join(col).strip() for col in typing_features.columns.values]

    return typing_features

In [11]:
# Frequency-domain summary of the cursor position (modulo a line width) for one 'id' group.
def calculate_fft_features(group, line_width=30):
    """
    Compute spectral features of ``cursor_position % line_width`` for one group.

    Only the positive-frequency half of the spectrum is used; the zero
    frequency (DC) bin is excluded.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single 'id'; only the 'cursor_position' column is read.
        The frame is not modified.
    line_width : int, default 30
        Modulus applied to the cursor position before the transform.

    Returns
    -------
    pandas.Series
        Fourteen float features keyed by name ("Peak Frequency", ...,
        "Kurtosis Amplitude"). Groups with fewer than 3 rows have no
        positive-frequency bin and yield 0.0 for every feature. Ratios that are
        undefined for an all-zero spectrum, and non-finite skew/kurtosis values,
        are also reported as 0.0 so the output never contains NaN/inf produced
        by this function's own divisions.

    Raises
    ------
    ValueError
        If ``line_width`` is not positive.
    """
    if line_width <= 0:
        raise ValueError("line_width must be positive.")

    feature_names = (
        "Peak Frequency",
        "Mean Frequency",
        "Median Frequency",
        "Bandwidth",
        "Frequency Skewness",
        "Frequency Kurtosis",
        "Total Energy",
        "Spectral Entropy",
        "Spectral Flatness",
        "Spectral Roll-off",
        "Mean Amplitude",
        "Std Amplitude",
        "Skew Amplitude",
        "Kurtosis Amplitude",
    )

    def finite_or_zero(value):
        # skew/kurtosis are NaN for a flat or single-bin spectrum
        value = float(value)
        return value if np.isfinite(value) else 0.0

    # Build the signal as a standalone array: adding columns to ``group`` would
    # mutate the frame handed over by groupby.apply.
    pos = np.asarray(group['cursor_position'], dtype=float) % line_width
    n_samples = pos.size

    # With fewer than 3 samples fftfreq has no strictly positive entry.
    if n_samples < 3:
        return pd.Series(dict.fromkeys(feature_names, 0.0))

    # Frequencies must be generated for the SAME length as the transform so
    # that frequency i belongs to spectrum bin i.
    spectrum = fft(pos)
    bin_frequencies = np.fft.fftfreq(n_samples, d=1)

    # Keep strictly positive frequencies (drops DC and the negative half).
    positive = bin_frequencies > 0
    frequencies = bin_frequencies[positive]
    magnitudes = np.abs(spectrum[positive])

    magnitude_sum = np.sum(magnitudes)
    no_energy = magnitude_sum == 0
    eps = np.finfo(float).eps

    # Frequency Domain Features
    peak_freq = frequencies[np.argmax(magnitudes)]
    mean_freq = 0.0 if no_energy else np.average(frequencies, weights=magnitudes)
    # NOTE: this is the middle positive bin by position, not an
    # energy-weighted median; kept as originally defined.
    median_freq = frequencies[len(magnitudes) // 2]
    bandwidth = np.ptp(frequencies)

    # Shape statistics of the magnitude spectrum. They back both the
    # "Frequency ..." and the "... Amplitude" keys, which were always computed
    # from the same array.
    magnitude_skew = finite_or_zero(scipy.stats.skew(magnitudes))
    magnitude_kurtosis = finite_or_zero(scipy.stats.kurtosis(magnitudes))

    # Other Features
    total_energy = np.sum(magnitudes ** 2)

    if no_energy:
        # Normalising by a zero sum / zero mean is undefined; use 0.0,
        # matching the default already used for the mean frequency.
        spectral_entropy = 0.0
        spectral_flatness = 0.0
    else:
        # Spectral Entropy of the magnitude distribution
        psd_norm = magnitudes / magnitude_sum
        spectral_entropy = -np.sum(psd_norm * np.log2(psd_norm + eps))
        # Spectral Flatness: geometric mean over arithmetic mean
        spectral_flatness = np.exp(np.mean(np.log(magnitudes + eps))) / np.mean(magnitudes)

    # Spectral Roll-off: first frequency where the cumulative magnitude reaches 85%
    spectral_sum = np.cumsum(magnitudes)
    rolloff_index = np.searchsorted(spectral_sum, 0.85 * spectral_sum[-1])
    rolloff_index = min(int(rolloff_index), len(frequencies) - 1)
    spectral_rolloff = frequencies[rolloff_index]

    # Statistical Features
    mean_amplitude = np.mean(magnitudes)
    std_amplitude = np.std(magnitudes)

    features = {
        "Peak Frequency": peak_freq,
        "Mean Frequency": mean_freq,
        "Median Frequency": median_freq,
        "Bandwidth": bandwidth,
        "Frequency Skewness": magnitude_skew,
        "Frequency Kurtosis": magnitude_kurtosis,
        "Total Energy": total_energy,
        "Spectral Entropy": spectral_entropy,
        "Spectral Flatness": spectral_flatness,
        "Spectral Roll-off": spectral_rolloff,
        "Mean Amplitude": mean_amplitude,
        "Std Amplitude": std_amplitude,
        "Skew Amplitude": magnitude_skew,
        "Kurtosis Amplitude": magnitude_kurtosis,
    }

    return pd.Series(features)

def apply_fft_feats(df):
    """Run ``calculate_fft_features`` on every 'id' group of ``df`` and return the combined result."""
    per_id = df.groupby('id')
    fft_feats = per_id.apply(calculate_fft_features)
    return fft_feats

In [12]:
def preprocessing(df, df_test, scale_needed=True, feature_funcs=None):
    """
    Build per-id feature matrices for training and testing.

    Parameters
    ----------
    df : pandas.DataFrame
        Training log rows; must contain 'id' and 'score' plus whatever columns
        the feature functions read.
    df_test : pandas.DataFrame
        Test log rows; must contain 'id' plus the feature input columns.
    scale_needed : bool, default True
        When True, fit a ``StandardScaler`` on the training features and apply
        it to both sets; the feature outputs are then whatever the scaler
        returns instead of DataFrames.
    feature_funcs : iterable of callables, optional
        Each callable takes a log DataFrame and returns a DataFrame whose index
        is named 'id'. Defaults to ``raw_aggregation_functions`` and
        ``apply_fft_feats``.

    Returns
    -------
    tuple
        ``(X_train, X_train_ids, y_train, X_test, X_test_ids)``. Row ``i`` of
        ``X_train`` and element ``i`` of ``y_train`` both belong to
        ``X_train_ids[i]``.

    Raises
    ------
    ValueError
        If no feature function is given, or one returns a frame whose index is
        not named 'id'.
    """
    if feature_funcs is None:
        feature_funcs = (raw_aggregation_functions, apply_fft_feats)
    feature_funcs = list(feature_funcs)
    if not feature_funcs:
        raise ValueError("At least one feature function is required.")

    # Ids in order of first appearance; this order defines the output rows.
    X_train = pd.DataFrame({'id': df['id'].unique()})
    X_test = pd.DataFrame({'id': df_test['id'].unique()})

    # groupby() returns ids in sorted order, which differs from the
    # first-appearance order above when the log is not sorted by id. Reindex to
    # the feature row order so labels cannot be paired with the wrong rows.
    y_train = (
        df.groupby('id')['score']
        .first()
        .reindex(X_train['id'])
        .reset_index(drop=True)
    )

    # Feature engineering: collect each function's output without merging yet
    agg_train_list = []
    agg_test_list = []
    for func in feature_funcs:
        agg_train = func(df)
        agg_test = func(df_test)

        # Check if the index name is 'id'
        if agg_train.index.name != 'id' or agg_test.index.name != 'id':
            raise ValueError("The index must be 'id' for aggregation functions.")

        agg_train_list.append(agg_train)
        agg_test_list.append(agg_test)

    # Concatenate all aggregated features horizontally, aligned by index,
    # then expose 'id' as a regular column for the merge.
    agg_train = pd.concat(agg_train_list, axis=1).reset_index()
    agg_test = pd.concat(agg_test_list, axis=1).reset_index()

    # Perform a single merge operation; the left join keeps X_* row order.
    X_train = X_train.merge(agg_train, on='id', how='left')
    X_test = X_test.merge(agg_test, on='id', how='left')

    # Remove 'id' column before scaling
    X_train_ids = X_train['id']
    X_train = X_train.drop(columns=['id'])
    X_test_ids = X_test['id']
    X_test = X_test.drop(columns=['id'])

    # Standardize features using statistics from the training set only
    if scale_needed:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_train_ids, y_train, X_test, X_test_ids

In [7]:
def train(X_train, y_train):
    """
    Fit a Lasso regressor (alpha=0.1) on part of the supplied data.

    The data is split with ``test_size=0.2`` and ``random_state=42``; only the
    training partition is used for fitting. The held-out partition is produced
    but not evaluated here.

    Returns the fitted model.
    """
    fit_X, _held_out_X, fit_y, _held_out_y = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    regressor = Lasso(alpha=0.1)
    regressor.fit(fit_X, fit_y)
    return regressor

In [8]:
# Inference step
def predict(model, X_test):
    """Return the result of ``model.predict`` applied to ``X_test``."""
    return model.predict(X_test)

In [9]:
def submit(X_test_ids, preds):
    """
    Write the predictions to './submission.csv' as 'id' / 'score' columns.

    Prints 'Submitted' after the file is written. Any ValueError raised along
    the way, including the explicit length-mismatch check, is caught and
    reported on stdout rather than propagated.
    """
    try:
        # Ids and predictions must pair up one to one.
        lengths_agree = len(X_test_ids) == len(preds)
        if not lengths_agree:
            raise ValueError("The lengths of X_test_ids and preds must match.")

        # Build the two-column table and write it without the index column.
        table = pd.DataFrame({'id': X_test_ids, 'score': preds})
        table.to_csv('./submission.csv', index=False)
        print('Submitted')

    except ValueError as err:
        print(f"Error: {err}")

In [None]:
# End-to-end run: build scaled features, fit the model, predict for the test ids.
X_train, X_train_ids, y_train, X_test, X_test_ids = preprocessing(df, df_test, scale_needed=True)
model = train(X_train, y_train)
preds = predict(model, X_test)
# Writing the submission file is left disabled; uncomment the call below to run it.
# submit(X_test_ids, preds)

In [None]:
# Display the predictions computed in the previous cell.
preds