# **Library Imports and Google Drive Mount**

In [3]:
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Mount Google Drive so that the paths under /content/drive used by the
# following cells resolve; a remount is requested on every run of this cell.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


# **Dataset Loading and Preprocessing**

In [4]:
# Load each user's raw export from Google Drive.
DATASET_DIR = '/content/drive/MyDrive/DS_LAB/Dataset'
df_user1 = pd.read_csv(os.path.join(DATASET_DIR, 'user1.csv'))
df_user2 = pd.read_csv(os.path.join(DATASET_DIR, 'user2.csv'))


def _mode2_equals(df, value=30):
    """Return a boolean mask of the rows of `df` whose 'mode2' equals `value`.

    The comparison is done numerically: 'mode2' is coerced with
    `pd.to_numeric(errors='coerce')`, so it matches whether the column was
    parsed as numbers or as strings such as '30' (the two exports were
    previously compared against 30 and '30' respectively). Entries that are
    not numeric become NaN and never match. The column itself is not modified.
    """
    return pd.to_numeric(df['mode2'], errors='coerce') == value


# Keep only the mode2 == 30 rows. Explicit copies are taken so later column
# assignments operate on independent frames instead of slices of the raw data.
filtered_user2 = df_user2[_mode2_equals(df_user2)].copy()

# user1 is truncated to user2's record count (645 in the recorded run) so that
# both users contribute the same number of rows.
filtered_user1 = df_user1[_mode2_equals(df_user1)].head(len(filtered_user2)).copy()

print(f"The number of records for user1: {len(filtered_user1)}")
print(f"The number of records for user2: {len(filtered_user2)}")

The number of records for user1: 645
The number of records for user2: 645


In [5]:
# Take explicit copies of BOTH filtered frames before assigning columns, so the
# assignments below never write into a slice of the raw data (this is what
# pandas reports as SettingWithCopyWarning).
filtered_user1 = filtered_user1.copy()
filtered_user2 = filtered_user2.copy()

# user2's 'mode2' was filtered against the string '30'; store it as an integer.
filtered_user2['mode2'] = filtered_user2['mode2'].astype(int)

# Missing quoteLength values are replaced by 0 before the integer cast. The
# same rule is applied to both users so a NaN in user2 cannot make the cast fail.
filtered_user1['quoteLength'] = filtered_user1['quoteLength'].fillna(0).astype(int)
filtered_user2['quoteLength'] = filtered_user2['quoteLength'].fillna(0).astype(int)

In [6]:
# Number of distinct values in every column of user1's filtered frame.
unique_counts = filtered_user1.nunique()
print(unique_counts)

# Columns with at most one distinct value are listed as candidates for dropping.
constant_cols = [column for column, n_unique in unique_counts.items() if n_unique <= 1]
print(f'Columns with 1 or fewer unique values (candidates for dropping): {constant_cols}')

_id                      645
isPb                       1
wpm                      259
acc                      360
rawWpm                   249
consistency              494
charStats                406
mode                       1
mode2                      1
quoteLength                2
restartCount              11
testDuration               3
afkDuration                4
incompleteTestSeconds    237
punctuation                1
numbers                    1
language                   1
funbox                     0
difficulty                 1
lazyMode                   1
blindMode                  1
bailedOut                  1
tags                       0
timestamp                645
dtype: int64
Columns with 1 or fewer unique values (candidates for dropping): ['isPb', 'mode', 'mode2', 'punctuation', 'numbers', 'language', 'funbox', 'difficulty', 'lazyMode', 'blindMode', 'bailedOut', 'tags']


In [7]:
# Columns removed from both users' frames before further processing.
cols_to_drop = [
    '_id',
    'mode', 'mode2', 'quoteLength',
    'punctuation', 'numbers', 'language', 'funbox', 'difficulty',
    'lazyMode', 'tags', 'blindMode', 'bailedOut',
    'isPb',
]

# drop() returns a new frame; every listed column must exist or it raises KeyError.
filtered_user1 = filtered_user1.drop(cols_to_drop, axis='columns')
filtered_user2 = filtered_user2.drop(cols_to_drop, axis='columns')

In [8]:
# Smallest 'timestamp' value per user, taken from the unfiltered frames.
min_timestamp_user1 = df_user1['timestamp'].min()
min_timestamp_user2 = df_user2['timestamp'].min()

print(
    f"Minimum timestamp in user1 dataset: {min_timestamp_user1}",
    f"Minimum timestamp in user2 dataset: {min_timestamp_user2}",
    sep="\n",
)

min_ts_user1, min_ts_user2 = min_timestamp_user1, min_timestamp_user2

# The raw values are interpreted as milliseconds since the Unix epoch.
start_date_user1, start_date_user2 = (
    pd.to_datetime(raw_ms, unit='ms') for raw_ms in (min_ts_user1, min_ts_user2)
)

print(
    f"\nUser1 data collection start date: {start_date_user1}",
    f"User2 data collection start date: {start_date_user2}",
    sep="\n",
)

Minimum timestamp in user1 dataset: 1617051568259
Minimum timestamp in user2 dataset: 1624348421344

User1 data collection start date: 2021-03-29 20:59:28.259000
User2 data collection start date: 2021-06-22 07:53:41.344000


In [9]:
def categorize_time_of_day(hour):
    """Map an hour of the day to a coarse label.

    Half-open ranges are used: [5, 12) -> 'morning', [12, 17) -> 'afternoon',
    [17, 21) -> 'evening'. Any other value (including hours 21-23 and 0-4, and
    values that compare false against every range such as NaN) -> 'night'.
    """
    day_parts = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    return 'night'

In [10]:
def _with_time_features(frame, user_id):
    """Return `frame` with 'timestamp' replaced by 'datetime' and 'time_of_day', plus a 'user_id' column.

    'timestamp' is parsed as milliseconds since the Unix epoch; the hour used
    for 'time_of_day' is read from that parsed value as-is.
    NOTE(review): no timezone conversion is applied here - confirm the parsed
    hour is the one the time-of-day labels are meant to describe.
    """
    stamped = frame.assign(datetime=pd.to_datetime(frame['timestamp'], unit='ms'))
    stamped = stamped.assign(
        time_of_day=stamped['datetime'].dt.hour.apply(categorize_time_of_day)
    )
    return stamped.drop(columns=["timestamp"]).assign(user_id=user_id)


filtered_user1 = _with_time_features(filtered_user1, 1)
filtered_user2 = _with_time_features(filtered_user2, 2)

# Distribution of tests over the day for each user.
print(filtered_user1['time_of_day'].value_counts())
print()
print(filtered_user2['time_of_day'].value_counts())

time_of_day
afternoon    285
morning      231
evening      117
night         12
Name: count, dtype: int64

time_of_day
morning      304
afternoon    163
night        132
evening       46
Name: count, dtype: int64


In [11]:
def split_char_stats(df):
    """Expand the ';'-separated 'charStats' column into four integer columns.

    The fields are assigned, in order, to 'correct_characters',
    'incorrect_characters', 'extra_characters' and 'missed_characters'.
    Fields that are missing or not numeric become 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named 'charStats'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'charStats', with the four count columns appended.

    Raises
    ------
    ValueError
        If any row of 'charStats' contains more than four ';'-separated fields.
    """
    stat_names = [
        'correct_characters', 'incorrect_characters', 'extra_characters', 'missed_characters'
    ]

    char_stats_split = df['charStats'].str.split(';', expand=True)
    if char_stats_split.shape[1] > len(stat_names):
        raise ValueError(
            f"charStats has {char_stats_split.shape[1]} ';'-separated fields, "
            f"expected at most {len(stat_names)}"
        )

    # An empty frame, or one where every row has fewer than four fields, expands
    # to fewer than four columns; pad with NaN columns so naming always succeeds.
    char_stats_split = char_stats_split.reindex(columns=range(len(stat_names)))
    char_stats_split.columns = stat_names

    for col in stat_names:
        # Non-numeric or missing pieces are coerced to NaN and then counted as 0.
        char_stats_split[col] = pd.to_numeric(char_stats_split[col], errors='coerce').fillna(0).astype(int)

    # Both frames share df's index, so the column-wise concat lines rows up 1:1.
    return pd.concat([df.drop(columns=['charStats']), char_stats_split], axis=1)

# Replace 'charStats' in each user's frame with the columns split_char_stats produces.
filtered_user1 = filtered_user1.pipe(split_char_stats)
filtered_user2 = filtered_user2.pipe(split_char_stats)

In [12]:
# Compare the column sets of the two processed frames.
cols1 = filtered_user1.columns.tolist()
cols2 = filtered_user2.columns.tolist()

# Set intersection: the order of common_cols is not meaningful.
common_cols = [*set(cols1).intersection(cols2)]

print(
    f"Columns in df_user1: {len(cols1)} | Column: {cols1}",
    f"Columns in df_user2: {len(cols2)} | Columns: {cols2}",
    f"Number of common columns: {len(common_cols)}",
    sep="\n",
)

Columns in df_user1: 15 | Column: ['wpm', 'acc', 'rawWpm', 'consistency', 'restartCount', 'testDuration', 'afkDuration', 'incompleteTestSeconds', 'datetime', 'time_of_day', 'user_id', 'correct_characters', 'incorrect_characters', 'extra_characters', 'missed_characters']
Columns in df_user2: 15 | Columns: ['wpm', 'acc', 'rawWpm', 'consistency', 'restartCount', 'testDuration', 'afkDuration', 'incompleteTestSeconds', 'datetime', 'time_of_day', 'user_id', 'correct_characters', 'incorrect_characters', 'extra_characters', 'missed_characters']
Number of common columns: 15


In [13]:
# Shared column order applied to both users' frames.
ordered_cols = (
    ['user_id', 'datetime', 'time_of_day']
    + ['wpm', 'rawWpm', 'acc', 'consistency']
    + ['restartCount', 'testDuration', 'afkDuration', 'incompleteTestSeconds']
    + ['correct_characters', 'incorrect_characters', 'extra_characters', 'missed_characters']
)

# Label-based selection; raises KeyError if a listed column is missing.
filtered_user1 = filtered_user1.loc[:, ordered_cols]
filtered_user2 = filtered_user2.loc[:, ordered_cols]

In [14]:
# Stack user1's rows on top of user2's and renumber the index from 0.
combined_df = pd.concat(
    objs=[filtered_user1, filtered_user2],
    axis=0,
    ignore_index=True,
)
print(f'Combined dataset shape: {combined_df.shape}')

Combined dataset shape: (1290, 15)


In [15]:
# Pairwise Pearson correlation between the 'wpm' and 'rawWpm' columns.
combined_df.loc[:, ['wpm', 'rawWpm']].corr(method='pearson')

Unnamed: 0,wpm,rawWpm
wpm,1.0,0.986886
rawWpm,0.986886,1.0


In [16]:
# Output directory for the processed frames. The trailing slash is kept so any
# code that still builds paths as `save_path + name` continues to work.
save_path = "/content/drive/MyDrive/DS_LAB/Processed/"
os.makedirs(save_path, exist_ok=True)

# Persist each frame as a pickle; os.path.join avoids hand-built separators.
# NOTE: pickle files should only ever be loaded back from trusted storage.
filtered_user1.to_pickle(os.path.join(save_path, "filtered_user1.pkl"))
filtered_user2.to_pickle(os.path.join(save_path, "filtered_user2.pkl"))
combined_df.to_pickle(os.path.join(save_path, "combined_df.pkl"))
print("Data Saved")

Data Saved
