In [1]:
import os
from typing import Any, Dict, Literal, Sequence, Tuple

import inflection
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import schema
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [2]:
DATASET_DIR = "./Train_keystroke.csv"
MODELS_DIR = "models"  # To save the trained model in it

In [3]:
raw_df = pd.read_csv(DATASET_DIR)

In [4]:
raw_df.head()

Unnamed: 0,user,press-0,release-0,press-1,release-1,press-2,release-2,press-3,release-3,press-4,...,press-8,release-8,press-9,release-9,press-10,release-10,press-11,release-11,press-12,release-12
0,1,0,120,216,312,424,496,592,664,808,...,1712,1760,1992,2064,2376,2448,2584,2632,2752,2824
1,1,0,95,168,265,360,455,527,599,736,...,1423,1471,1664,1711,1880,1952,2039,2111,2231,2279
2,1,0,71,143,231,783,903,1087,1159,1351,...,2039,2111,2271,2343,2487,2559,2679,2751,2871,2926
3,1,0,95,144,263,353,431,760,832,1159,...,3151,3223,3415,3463,3631,3703,3815,3887,3983,4055
4,1,0,70,166,238,310,406,526,598,710,...,1310,1382,1543,1605,1734,1806,1926,1998,2086,2182


In [5]:
raw_df.isnull().values.any()

False

In [6]:
raw_df.describe()

Unnamed: 0,user,press-0,release-0,press-1,release-1,press-2,release-2,press-3,release-3,press-4,...,press-8,release-8,press-9,release-9,press-10,release-10,press-11,release-11,press-12,release-12
count,880.0,880.0,880.0,880.0,880.0,880.0,880.0,880.0,880.0,880.0,...,880.0,880.0,880.0,880.0,880.0,880.0,880.0,880.0,880.0,880.0
mean,55.5,0.0,69.740909,192.504545,280.730682,378.078409,460.992045,625.425,719.445455,792.982955,...,1683.110227,1773.718182,1874.559091,1973.45,2093.876136,2194.747727,2272.895455,2366.973864,2491.186364,2587.061364
std,31.771009,0.0,35.830579,118.281523,128.337586,243.485675,246.432639,359.327185,362.141729,403.595652,...,654.99947,657.806567,724.772838,725.310795,793.45893,792.913889,856.416953,856.341509,905.936922,905.291522
min,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,4.0,14.0,...,667.0,763.0,793.0,924.0,891.0,1028.0,955.0,1055.0,1124.0,1188.0
25%,28.0,0.0,50.0,152.0,232.0,281.0,374.0,463.0,561.0,592.75,...,1288.5,1375.0,1433.5,1532.5,1589.75,1708.0,1741.75,1834.25,1930.5,2034.5
50%,55.5,0.0,72.0,192.0,279.0,358.0,436.0,569.0,663.0,721.5,...,1526.0,1618.5,1695.0,1791.5,1896.0,1994.0,2059.5,2151.0,2255.5,2344.0
75%,83.0,0.0,94.0,231.25,328.0,424.0,501.25,706.5,800.0,888.5,...,1921.25,2004.0,2145.0,2256.0,2390.25,2474.5,2585.75,2687.25,2823.25,2905.75
max,110.0,0.0,241.0,1287.0,1343.0,4333.0,4453.0,4823.0,4890.0,5191.0,...,7377.0,7471.0,7663.0,7753.0,8006.0,8063.0,8189.0,8259.0,8422.0,8487.0


In [7]:
raw_df.dtypes

user          int64
press-0       int64
release-0     int64
press-1       int64
release-1     int64
press-2       int64
release-2     int64
press-3       int64
release-3     int64
press-4       int64
release-4     int64
press-5       int64
release-5     int64
press-6       int64
release-6     int64
press-7       int64
release-7     int64
press-8       int64
release-8     int64
press-9       int64
release-9     int64
press-10      int64
release-10    int64
press-11      int64
release-11    int64
press-12      int64
release-12    int64
dtype: object

### Note
Each feature is a relation between two "key states" by calculating the difference between them. In the "HT" feature the two states belong to the same character so the displacement here is 0, while The displacement is 1 in features which describe the relation between two adjacent characters (all others).

In [8]:
features_config = {
    "HT": {"start": "press", "end": "release", "displacment": 0},
    "PPT": {"start": "press", "end": "press", "displacment": 1},
    "RPT": {"start": "release", "end": "press", "displacment": 1},
    "RRT": {"start": "release", "end": "release", "displacment": 1},
}

In [9]:
def get_model_name(model: Any) -> str:
    return inflection.underscore(model.__class__.__name__)

In [10]:
def generate_pairs(feature: str, config: Dict, length: int = 13) -> Sequence[Tuple]:
    """
    Generate all the possible pairs between the keys for each feature based on the feature config.
    """
    return [
        (
            f"{feature}_{idx}",
            f"{config['end']}-{x[1]}",
            f"{config['start']}-{x[0]}",
        )
        for idx, x in enumerate(
            zip(range(length), range(length)[features_config[feature]["displacment"] :])
        )
    ]

### Examples

In [11]:
generate_pairs("HT", features_config["HT"])[:3]

[('HT_0', 'release-0', 'press-0'),
 ('HT_1', 'release-1', 'press-1'),
 ('HT_2', 'release-2', 'press-2')]

In [12]:
generate_pairs("RRT", features_config["RRT"])[:3]

[('RRT_0', 'release-1', 'release-0'),
 ('RRT_1', 'release-2', 'release-1'),
 ('RRT_2', 'release-3', 'release-2')]

In [13]:
def generate_features(df: pd.DataFrame):
    """
    Generate new features based on the configuration in `features_config`.

    Parameters:
    - df: pd.DataFrame: The input DataFrame.

    Returns:
    - pd.DataFrame: The DataFrame with all generated features (base feautres + mean and std features + label).

    The new features generated are the mean and standard deviation of the difference between pairs of columns (base features),
      as specified in `features_config`.
    """
    df = df.copy()
    for feature, config in features_config.items():
        feature_cols = []
        for name, end, start in generate_pairs(feature, config):
            df[name] = df[end] - df[start]
            feature_cols.append(name)

        # calculate the mean and standard deviation for each feature
        feature_cols = [col for col in df.columns if col.startswith(feature)]
        df[f"{feature}_mean"] = df[feature_cols].mean(axis=1)
        df[f"{feature}_std"] = df[feature_cols].std(axis=1)
    return df

In [14]:
def preprocess_data(df: pd.DataFrame) -> Tuple:
    df = generate_features(raw_df)

    # Keep only the final 8 features
    features_df = df[[col for col in df.columns if col.endswith(("mean", "std"))]]
    data, label = (
        features_df,
        df[["user"]],
    )
    X_train, X_test, y_train, y_test = train_test_split(
        data.values,
        label.values,
        test_size=0.125,
        random_state=42,
        stratify=label,  # To make sure that we have all the classess in both train and test
    )
    return X_train, X_test, np.ravel(y_train), np.ravel(y_test)

In [15]:
def save_model(model: Any) -> None:
    if os.path.exists(MODELS_DIR) is None:
        os.mkdir(MODELS_DIR)
    joblib.dump(model, open(f"{MODELS_DIR}/{get_model_name(model)}.pkl", "wb"))

In [16]:
def load_model(model_name: str):
    return joblib.load(open(model_name, "rb"))

In [17]:
def evaluate(pipe, X, y, state: Literal["train", "pred"]) -> None:
    pred = pipe.predict(X)
    accuracy = accuracy_score(y, pred)
    f1 = f1_score(y, pred, average="weighted")
    print(get_model_name(model))
    print(f"{state} Accuracy: ", "%.2f" % (accuracy * 100))
    print(f"{state} F1-Score: ", "%.2f" % (f1 * 100))
    print("=" * 100)

In [18]:
def run(
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    # scaler=MinMaxScaler(), # (Max/Standard)Scaler leads to bad results with SVC
) -> None:
    # pipe = pipe = Pipeline([("scaler", MinMaxScaler()), (get_model_name(model), model)])
    model.fit(X_train, y_train)
    evaluate(model, X_train, y_train, state="train")
    evaluate(model, X_test, y_test, state="pred")
    save_model(model)
    return model

In [19]:
X_train, X_test, y_train, y_test = preprocess_data(raw_df)

In [20]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((770, 8), (110, 8), (770,), (110,))

In [21]:
num_classes = np.unique(y_train)
num_features = X_train.shape[1]

In [22]:
import random

def random_sample(X: pd.DataFrame, y: pd.DataFrame) -> Tuple:
    idx = random.randint(0, len(X))
    return X[idx].reshape(1, num_features), y[idx]

In [23]:
models_to_train = [SVC(kernel="linear"), XGBClassifier(), RandomForestClassifier()]
trained_models = []
for model in models_to_train:
    model = run(model, X_train, X_test, y_train, y_test)
    trained_models.append(model)

svc
train Accuracy:  96.75
train F1-Score:  96.72
svc
pred Accuracy:  30.91
pred F1-Score:  24.39
xgb_classifier
train Accuracy:  100.00
train F1-Score:  100.00
xgb_classifier
pred Accuracy:  20.91
pred F1-Score:  16.97
random_forest_classifier
train Accuracy:  100.00
train F1-Score:  100.00
random_forest_classifier
pred Accuracy:  28.18
pred F1-Score:  21.52


### Compare the loaded models with the fitted to make sure that the saving & loading are working!

In [24]:
sample, label = random_sample(X_train, y_train)
print(sample, label, sep="\n")

[[ 88.92307692  30.78273742 165.91666667  58.0539848   75.5
   77.2110449  158.          63.7495098 ]]
11


In [25]:
evaluate(trained_models[0], X_train, y_train, state="train")
evaluate(trained_models[1], X_train, y_train, state="train")
evaluate(trained_models[2], X_train, y_train, state="train")
evaluate(load_model("svc.pkl"), X_train, y_train, state="train")
evaluate(load_model("xgb_classifier.pkl"), X_train, y_train, state="train")
evaluate(load_model("random_forest_classifier.pkl"), X_train, y_train, state="train")

random_forest_classifier
train Accuracy:  96.75
train F1-Score:  96.72
random_forest_classifier
train Accuracy:  100.00
train F1-Score:  100.00
random_forest_classifier
train Accuracy:  100.00
train F1-Score:  100.00
random_forest_classifier
train Accuracy:  96.75
train F1-Score:  96.72
random_forest_classifier
train Accuracy:  100.00
train F1-Score:  100.00
random_forest_classifier
train Accuracy:  100.00
train F1-Score:  100.00


### Test and prepare needed funcations for the score.py

In [26]:
sample_svm = """
{
    "Model": "SVM",
    "HT": {
        "Mean": 48.43,
        "STD": 23.34
    },
    "PPT": {
        "Mean": 120.43,
        "STD": 37.41
    },
    "RRT": {
        "Mean": 124.43,
        "STD": 45.34
    },
    "RPT": {
        "Mean": 132.56,
        "STD": 47.12
    }
}
"""
sample_xgboost = """
{
    "Model": "XGBoost",
    "HT": {
        "Mean": 48.43,
        "STD": 23.34
    },
    "PPT": {
        "Mean": 120.43,
        "STD": 37.41
    },
    "RRT": {
        "Mean": 124.43,
        "STD": 45.34
    },
    "RPT": {
        "Mean": 132.56,
        "STD": 47.12
    }
}
"""
sample_rf = """
{
    "Model": "RF",
    "HT": {
        "Mean": 48.43,
        "STD": 23.34
    },
    "PPT": {
        "Mean": 120.43,
        "STD": 37.41
    },
    "RRT": {
        "Mean": 124.43,
        "STD": 45.34
    },
    "RPT": {
        "Mean": 132.56,
        "STD": 47.12
    }
}
"""

In [27]:
import json
import joblib
import numpy as np
from schema import And, Or, Schema, Use


# This will be overridden in the score.py
def init():
    global svm, xgboost, rf, input_schema
    svm = load_model(f"{MODELS_DIR}/svc.pkl")
    xgboost = load_model(f"{MODELS_DIR}/xgb_classifier.pkl")
    rf = load_model(f"{MODELS_DIR}/random_forest_classifier.pkl")
    
    # For input validation!
    input_schema = Schema(
        {
            "Model": Or("SVM", "XGBoost", "RF"),
            "HT": {"Mean": Use(float), "STD": Use(float)},
            "PPT": {"Mean": Use(float), "STD": Use(float)},
            "RRT": {"Mean": Use(float), "STD": Use(float)},
            "RPT": {"Mean": Use(float), "STD": Use(float)},
        }
    )


# This will be the same in the score.py
def run(raw_data):
    data = json.loads(raw_data)
    if input_schema.is_valid(data) is False:
        return {"message": "failed"}
    try:
        if data["Model"] == "RF":
            model = rf
        elif data["Model"] == "XGBoost":
            model = xgboost
        else:
            model = svm
        # The Correct order for the features:
        # HT_mean, HT_std, PPT_mean, PPT_std, PRT_mean, RPT_std, RRT_mean, RRT_std
        sample = np.array(
            [
                data["HT"]["Mean"],
                data["HT"]["STD"],
                data["PPT"]["Mean"],
                data["PPT"]["STD"],
                data["RPT"]["Mean"],
                data["RPT"]["STD"],
                data["HT"]["Mean"],
                data["HT"]["STD"],
            ]
        ).reshape(
            1, 8
        )  # 1 sample & 8 features
        result = model.predict(sample)
        return {
            "message": "success",
            "user_id": json.dumps(result.tolist()[0]),
            "used_model": model.__class__.__name__,
        }
    except Exception as e:
        return {"message": str(e)}

In [28]:
init()
print(run(sample_svm))
print(run(sample_xgboost))
print(run(sample_rf))

{'message': 'success', 'user_id': '84', 'used_model': 'SVC'}
{'message': 'success', 'user_id': '84', 'used_model': 'XGBClassifier'}
{'message': 'success', 'user_id': '84', 'used_model': 'RandomForestClassifier'}
