In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('data/Train_keystroke.csv')

In [3]:
df.head()

Unnamed: 0,user,press-0,release-0,press-1,release-1,press-2,release-2,press-3,release-3,press-4,...,press-8,release-8,press-9,release-9,press-10,release-10,press-11,release-11,press-12,release-12
0,1,0,120,216,312,424,496,592,664,808,...,1712,1760,1992,2064,2376,2448,2584,2632,2752,2824
1,1,0,95,168,265,360,455,527,599,736,...,1423,1471,1664,1711,1880,1952,2039,2111,2231,2279
2,1,0,71,143,231,783,903,1087,1159,1351,...,2039,2111,2271,2343,2487,2559,2679,2751,2871,2926
3,1,0,95,144,263,353,431,760,832,1159,...,3151,3223,3415,3463,3631,3703,3815,3887,3983,4055
4,1,0,70,166,238,310,406,526,598,710,...,1310,1382,1543,1605,1734,1806,1926,1998,2086,2182


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880 entries, 0 to 879
Data columns (total 27 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   user        880 non-null    int64
 1   press-0     880 non-null    int64
 2   release-0   880 non-null    int64
 3   press-1     880 non-null    int64
 4   release-1   880 non-null    int64
 5   press-2     880 non-null    int64
 6   release-2   880 non-null    int64
 7   press-3     880 non-null    int64
 8   release-3   880 non-null    int64
 9   press-4     880 non-null    int64
 10  release-4   880 non-null    int64
 11  press-5     880 non-null    int64
 12  release-5   880 non-null    int64
 13  press-6     880 non-null    int64
 14  release-6   880 non-null    int64
 15  press-7     880 non-null    int64
 16  release-7   880 non-null    int64
 17  press-8     880 non-null    int64
 18  release-8   880 non-null    int64
 19  press-9     880 non-null    int64
 20  release-9   880 non-null    int6

### Calculate Features

In [5]:
df.iloc[:, 2:-2:2]

Unnamed: 0,release-0,release-1,release-2,release-3,release-4,release-5,release-6,release-7,release-8,release-9,release-10,release-11
0,120,312,496,664,856,1072,1400,1544,1760,2064,2448,2632
1,95,265,455,599,807,999,1095,1271,1471,1711,1952,2111
2,71,231,903,1159,1454,1631,1799,1902,2111,2343,2559,2751
3,95,263,431,832,1207,1377,1591,3015,3223,3463,3703,3887
4,70,238,406,598,758,950,1022,1166,1382,1605,1806,1998
...,...,...,...,...,...,...,...,...,...,...,...,...
875,80,306,465,672,784,1040,1123,1576,1735,1884,2040,2186
876,81,289,473,1057,1189,1419,1465,1698,1866,2019,2265,2457
877,79,300,519,807,894,1204,1279,1512,1724,1866,2046,2192
878,84,290,476,930,978,1240,1310,1484,1658,1848,1940,2082


In [6]:
ht = df.iloc[:, 2::2].values - df.iloc[:, 1::2].values
rrt = df.iloc[:, 4::2].values - df.iloc[:, 2:-2:2].values
ppt = df.iloc[:, 3::2].values - df.iloc[:, 1:-2:2].values
rpt = df.iloc[:, 3::2].values - df.iloc[:, 2:-2:2].values

In [7]:
ht_mean = np.mean(ht, axis=1)
ht_std = np.std(ht, axis=1)

rrt_mean = np.mean(rrt, axis=1)
rrt_std = np.std(rrt, axis=1)

ppt_mean = np.mean(ppt, axis=1)
ppt_std = np.std(ppt, axis=1)

rpt_mean = np.mean(rpt, axis=1)
rpt_std = np.std(rpt, axis=1)

In [8]:
data = pd.DataFrame({'ht_mean': ht_mean, 'ht_std': ht_std, 'rrt_mean': rrt_mean, 'rrt_std': rrt_std, 'ppt_mean': ppt_mean, 'ppt_std': ppt_std, 'rpt_mean': rpt_mean, 'rpt_std': rpt_std})

In [9]:
data.head()

Unnamed: 0,ht_mean,ht_std,rrt_mean,rrt_std,ppt_mean,ppt_std,rpt_mean,rpt_std
0,72.0,21.049392,225.333333,69.958718,229.333333,60.428102,157.333333,65.019655
1,70.384615,16.836492,182.0,38.464269,185.916667,34.862727,113.666667,44.548351
2,80.307692,16.498879,237.916667,139.1357,239.25,130.555943,156.833333,127.846805
3,72.0,20.373437,330.0,338.389568,331.916667,352.649604,259.916667,347.130249
4,74.769231,13.325244,176.0,38.177218,173.833333,45.487788,100.833333,45.586609


In [10]:
data['UserID'] = df['user']

### Split data in training and test

In [12]:
X = data.drop('UserID', axis=1)
y = data['UserID']

In [13]:
# Mapping class labels to integers
user_to_int = {user: i for i, user in enumerate(np.unique(y))}
y = y.map(user_to_int)

# Converting class labels to integers for XGBoost
y = y.astype('int')

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training and Evaluate Models

In [15]:
svc = SVC()
svc.fit(X_train, y_train)
y_pred1 = svc.predict(X_test)
print('SVC Accuracy: ', accuracy_score(y_test, y_pred1))

SVC Accuracy:  0.011363636363636364


In [16]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred2 = rfc.predict(X_test)
print('Random Forest Accuracy: ', accuracy_score(y_test, y_pred2))

Random Forest Accuracy:  0.1875


In [17]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred3 = xgb.predict(X_test)
print('XGBoost Accuracy: ', accuracy_score(y_test, y_pred3))

XGBoost Accuracy:  0.16477272727272727


### Storing the models

In [18]:
import joblib

In [21]:
joblib.dump(svc, 'models/svc.joblib')
joblib.dump(rfc, 'models/rfc.joblib')
joblib.dump(xgb, 'models/xgb.joblib')

['models/xgb.joblib']

### Testing a loaded Model

In [22]:
loaded_rfc = joblib.load('models/rfc.joblib')

In [23]:
y_pred4 = loaded_rfc.predict(X_test)
print('Loaded RFC Accuracy: ', accuracy_score(y_test, y_pred4))

Loaded RFC Accuracy:  0.1875
