In [48]:
import numpy as np
import pandas as pd
import time
import random
import itertools

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

from behalearn.preprocessing import columns
from behalearn.authentication import authentication_metrics
from behalearn.authentication import authentication_results
from behalearn.estimators import VotingClassifier
from behalearn.features import FeatureExtractor
from behalearn.features import temporal
from behalearn.metrics import fmr_score
from behalearn.metrics import fnmr_score
from behalearn.metrics import hter_score
from behalearn.preprocessing import StartEndSegmentSplitter
from behalearn.preprocessing import SegmentSplitter
from behalearn.preprocessing.segment import criteria
from behalearn.visualization import initialize_notebook_output
from behalearn.visualization import label_touches
from behalearn.visualization import plot_fmr_fnmr
from behalearn.visualization import visualize_mobile_data
from behalearn.visualization import visualize_mouse_data
from behalearn.visualization import visualize_custom_data


In [95]:
# Names of the columns shared by the raw data and the feature tables.
segment_column = 'segment'
user_column = 'user'
user_name_column = 'username'
# True: features are computed per segment; False: per whole pattern.
compute_features_for_segment = True

# Directory holding the raw touch / accelerometer / gyroscope CSV files.
path_to_raw_data = '../login_datasets/2019-01-08_FIIT_-2-poschodie_po_skuske_KPAIS_correct_patterns_only/'
# Directory for the computed feature CSVs; the sub-directory follows the flag above.
path_to_featutes = "../login_features/{}/".format(
    "segments" if compute_features_for_segment else "paterns"
)

# Columns that identify a feature row; the segment column only takes part
# when features are computed per segment.
columns_to_identificate_features = (
    ['id', 'pattern_id', 'device', 'scenario', user_name_column, user_column]
    + ([segment_column] if compute_features_for_segment else [])
)

In [3]:
def _load_sensor_data(file_name):
    """Read one raw sensor CSV and add the composite ``id`` column.

    The ``id`` is ``pattern_id`` (as text) followed by ``device`` and, when
    ``compute_features_for_segment`` is set, by ``segment``. When features are
    computed per pattern the ``segment`` column is dropped instead.

    Parameters
    ----------
    file_name : str
        File name relative to ``path_to_raw_data``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the ``id`` column added.
    """
    data = pd.read_csv(path_to_raw_data + file_name, sep=',')
    data["id"] = data["pattern_id"].astype(str) + data["device"]

    if compute_features_for_segment:
        data["id"] += data['segment'].astype(str)
    else:
        data = data.drop('segment', axis=1)

    return data


# The three sensors share one schema for the id columns, so they are all
# loaded through the same helper.
touch_data = _load_sensor_data('touch.csv')
acc_data = _load_sensor_data('linear_accelerometer.csv')
gyro_data = _load_sensor_data('gyroscope.csv')

In [None]:
# Preview the first rows of the touch data.
touch_data.head()

In [4]:
# Encode the composite `id` strings as integers. The encoder is fitted on the
# ids of all three sensors so that the same id maps to the same code in every
# frame. NOTE: the resulting `user` column therefore identifies an `id` value
# (pattern/device and, per the flag, segment), not a person.
le = LabelEncoder().fit(
    pd.concat([touch_data['id'], acc_data['id'], gyro_data['id']])
)
for _sensor_frame in (touch_data, acc_data, gyro_data):
    _sensor_frame[user_column] = le.transform(_sensor_frame['id'])

In [None]:
# Preview the first rows of the touch data again.
touch_data.head()

In [5]:
def get_columns_combinations(col_names, combinations=None):
    """Return the column combinations that contain more than one column.

    The candidates come from ``columns._get_column_combinations``; only those
    with a length greater than one are kept.
    """
    return [
        candidate
        for candidate in columns._get_column_combinations(col_names, combinations)
        if len(candidate) > 1
    ]

In [6]:
def calculate_maginute_to_df(df, columns_name):
    """Add a magnitude column to ``df`` for every multi-column combination.

    For each combination returned by ``get_columns_combinations`` the row-wise
    square root of the sum of the squared columns is stored in ``df`` under
    the combination's column names joined with ``_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns; it is modified in place.
    columns_name : list of str
        Names of the columns to combine.

    Returns
    -------
    list of str
        Names of the magnitude columns that were added.
    """
    magnitude_columns = []
    for combination in get_columns_combinations(columns_name):
        squared_total = sum(df[dimension] ** 2 for dimension in combination)
        column_label = '_'.join(combination)
        df[column_label] = squared_total ** (1 / 2)
        magnitude_columns.append(column_label)

    return magnitude_columns

In [7]:
def compute_features(df, features, prefix=None):
    """Extract ``features`` from ``df`` grouped by ``user_column``.

    Rows containing infinite or missing values are removed from the result.
    When ``prefix`` is given, every occurrence of ``velocity`` in the column
    names is replaced by it.

    Returns the cleaned feature frame.
    """
    extracted = FeatureExtractor(features, [user_column]).fit_transform(df)
    # Infinite values are turned into NaN first so that dropna removes them too.
    cleaned = extracted.replace([np.inf, -np.inf], np.nan).dropna()

    if prefix is None:
        return cleaned

    cleaned.columns = cleaned.columns.str.replace(r"velocity", prefix)
    return cleaned

In [8]:
def renaming_condition(x, columns_name, prefix):
    """Return ``prefix + x`` when ``x`` is listed in ``columns_name``, else ``x``."""
    return prefix + x if x in columns_name else x


def add_prefix_to_columns(df, columns_name, prefix):
    """Prefix the columns of ``df`` that are listed in ``columns_name``.

    ``df`` is renamed in place; columns not listed keep their name.

    Returns the prefixed form of every entry of ``columns_name``, in order.
    """
    renamed = []
    for current_name in df.columns:
        renamed.append(renaming_condition(current_name, columns_name, prefix))
    df.columns = renamed

    return [prefix + name for name in columns_name]

In [9]:
def compute_statistics(df, columns_to_compute_statistic, prefix):
    """Compute per-``user_column`` descriptive statistics of the given columns.

    Magnitude columns for every multi-column combination are added to ``df``
    first, then the selected columns are prefixed with ``prefix``. Both steps
    modify ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sensor frame; gains magnitude columns and has the selected
        columns renamed.
    columns_to_compute_statistic : list of str
        Base column names. The list itself is left untouched.
    prefix : str
        Prefix put in front of every selected column name.

    Returns
    -------
    pandas.DataFrame
        One row per ``user_column`` value with flattened
        ``<column>_<statistic>`` column names; the ``count`` statistics are
        removed.
    """
    # Work on a copy: extending the argument in place would leak the added
    # magnitude column names into the caller's list.
    stat_columns = list(columns_to_compute_statistic)
    stat_columns += calculate_maginute_to_df(df, stat_columns)
    stat_columns = add_prefix_to_columns(df, stat_columns, prefix)

    statistics = df.groupby([user_column])[stat_columns].describe()
    # describe() yields (column, statistic) pairs; flatten them to "column_statistic".
    statistics.columns = statistics.columns.to_flat_index()
    statistics.rename(columns='_'.join, inplace=True)
    statistics = statistics[statistics.columns.drop(list(statistics.filter(regex='count')))]

    return statistics

In [10]:
def normalize_columns_names(df):
    """Rename the quartile suffixes in the column names of ``df`` in place.

    ``25%``, ``50%`` and ``75%`` become ``_lower_q``, ``_median`` and
    ``_upper_q``; afterwards every double underscore is collapsed to one.
    """
    replacements = (
        (r"25%", "_lower_q"),
        (r"50%", "_median"),
        (r"75%", "_upper_q"),
        # Must stay last: the suffixes above introduce the double underscores.
        (r"__", "_"),
    )
    for pattern, replacement in replacements:
        df.columns = df.columns.str.replace(pattern, replacement)

In [11]:
# Compute the touch, accelerometer and gyroscope feature tables and join them
# into `all_features`. A timestamp is printed before/after each stage so the
# duration of every stage can be read from the output.

# Feature specification for the touch data (x/y coordinates).
features = [
    'duration',
    ('length', {
        'columns': ['x', 'y'],
    }),
    ('start', {
        'columns': ['x', 'y'],
    }),
    ('velocity', {
        'columns': ['x', 'y'],
    }),
    ('acceleration', {
        'columns': ['x', 'y'],
    }),
    ('jerk', {
        'columns': ['x', 'y'],
    }),
    ('angular_velocity', {
        'columns': ['x', 'y'],
    }),
    ('angular_acceleration', {
        'columns': ['x', 'y'],
    }),
]
print(time.strftime("%Y-%m-%d %H:%M"))
touch_features = compute_features(touch_data, features)
# Attach the identifying columns to every feature row. Presumably touch_data
# holds several rows per `user` value, so the merge repeats each feature row
# and drop_duplicates collapses the repeats again (TODO confirm).
touch_features = touch_features.merge(touch_data[columns_to_identificate_features], on=[user_column]).drop_duplicates()
# NOTE: `features` is rebound here. From this point on it holds the x/y/z
# velocity specification shared by the accelerometer and the gyroscope.
features = [
    ('velocity', {
        'columns': ['x', 'y', 'z'],
    }),
]
print(time.strftime("%Y-%m-%d %H:%M"))

# The prefix argument replaces "velocity" in the resulting column names.
acc_features = compute_features(acc_data, features, "accelerometer_jerk")
# compute_statistics adds magnitude columns to acc_data and renames its x/y/z
# columns in place, so it has to stay after the compute_features call above,
# whose specification refers to the unprefixed x/y/z columns.
acc_statistics_from_raw_data = compute_statistics(acc_data,['x','y','z'],"accelerometer_")
acc_features = acc_features.merge(acc_statistics_from_raw_data, on='user', how='inner').drop_duplicates()
print(time.strftime("%Y-%m-%d %H:%M"))

# Same two steps for the gyroscope; gyro_data is modified in place as well.
gyro_features = compute_features(gyro_data, features, "gyro_jerk")
gyro_statistics_from_raw_data = compute_statistics(gyro_data,['x','y','z'],"gyro_")
gyro_features = gyro_features.merge(gyro_statistics_from_raw_data, on='user', how='inner').drop_duplicates()
print(time.strftime("%Y-%m-%d %H:%M"))

# Inner joins: only `user` values present in all three tables are kept.
all_features = touch_features.merge(acc_features, on='user', how='inner').merge(gyro_features, on='user', how='inner')
# Renames the quartile suffixes of the column names in place.
normalize_columns_names(all_features)

2020-10-12 13:22


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


2020-10-12 14:14


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


2020-10-12 14:46


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


2020-10-12 15:18


In [None]:
# Text dump of the four feature tables, one after another.
print(touch_features, acc_features, gyro_features, all_features, sep="\n")

In [14]:
# Persist every feature table as a UTF-8 CSV without the index column.
for _file_name, _feature_table in (
    ("touch_feautures.csv", touch_features),
    ("acc_feautures.csv", acc_features),
    ("gyro_feautures.csv", gyro_features),
    ("all_feautures.csv", all_features),
):
    _feature_table.to_csv(path_to_featutes + _file_name, encoding='utf-8', index=False)

In [13]:
# Number of distinct encoded ids in the touch data (missing values counted too).
touch_data['user'].nunique(dropna=False)

18228

In [12]:
# Display the joined feature table.
all_features

Unnamed: 0,user,duration,length,start_x,start_y,velocity_x_mean,velocity_x_std,velocity_x_min,velocity_x_max,velocity_x_abs_min,...,gyro_y_z_median,gyro_y_z_upper_q,gyro_y_z_max,gyro_x_y_z_mean,gyro_x_y_z_std,gyro_x_y_z_min,gyro_x_y_z_lower_q,gyro_x_y_z_median,gyro_x_y_z_upper_q,gyro_x_y_z_max
0,0,184999936.0,206.726860,291.484070,273.423280,7.034850e-07,5.918383e-07,0.000000e+00,1.455914e-06,0.000000e+00,...,0.280382,0.382203,0.764908,0.448234,0.187035,0.100277,0.311417,0.402196,0.599749,0.770899
1,1,208000000.0,277.265785,427.942170,461.313480,3.643308e-07,3.580648e-07,-2.515809e-07,7.649364e-07,-4.197011e-08,...,0.237045,0.479407,0.592870,0.406079,0.218763,0.098148,0.216095,0.354227,0.610894,0.781745
2,2,68999936.0,117.059255,501.915220,258.519230,-2.633276e-07,5.866631e-08,-3.489159e-07,-2.178355e-07,-2.178355e-07,...,0.469289,0.514789,0.616602,0.615999,0.046920,0.550602,0.577071,0.620979,0.648476,0.675760
3,3,416999936.0,461.659216,483.217770,133.391480,-4.913717e-07,4.591894e-07,-1.611801e-06,-4.011417e-08,-4.011417e-08,...,0.270333,0.404044,0.642192,0.459463,0.185875,0.053937,0.301894,0.468513,0.606242,0.816882
4,4,278000128.0,188.740335,276.939100,450.126460,-4.738293e-07,3.137901e-07,-1.016403e-06,-2.438351e-08,-2.438351e-08,...,0.371707,0.507209,0.615344,0.463616,0.272550,0.029498,0.231071,0.529131,0.641455,0.929948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14583,18262,115000064.0,163.039593,160.665740,78.558655,-3.353735e-07,1.338002e-07,-4.123821e-07,-9.830668e-08,-9.830668e-08,...,0.376625,0.488743,0.652517,0.638690,0.407682,0.094401,0.410524,0.522512,0.775925,1.544065
14584,18264,74999808.0,105.359947,149.248230,446.219850,1.182830e-06,4.618389e-07,6.661992e-07,1.747868e-06,6.661992e-07,...,0.200401,0.252776,0.345602,0.344062,0.139910,0.163125,0.226129,0.349722,0.449297,0.563844
14585,18267,107000064.0,176.989896,85.769775,77.559326,1.248288e-07,3.839744e-07,-4.444082e-07,4.993943e-07,-7.511793e-10,...,0.183370,0.284651,0.582454,0.392348,0.193583,0.120158,0.235234,0.371714,0.533132,0.743876
14586,18269,122000128.0,210.571564,90.535720,457.870420,1.416593e-06,1.218048e-06,2.925796e-07,3.882524e-06,2.925796e-07,...,0.144914,0.158979,0.193934,0.207518,0.072254,0.081686,0.156262,0.200473,0.271823,0.314249


In [93]:
# Reload the joined feature table from the CSV written by the export cell, so
# the classification part can run without recomputing the features.
all_features = pd.read_csv(path_to_featutes + "all_feautures.csv")

In [96]:
# Keep `username` (the target) and drop every other identifying column.
# A new list is built instead of removing from the shared configuration list,
# and the drop is non-in-place with errors='ignore', so this cell can be
# executed repeatedly without raising on columns that are already gone.
identifier_columns_to_drop = [
    col for col in columns_to_identificate_features if col != user_name_column
]
all_features = all_features.drop(columns=identifier_columns_to_drop, errors='ignore')

# Every remaining column except the target is a model input.
train_x_columns = [col for col in all_features.columns if col != user_name_column]
train_y_columns = user_name_column

In [42]:
# x_train, x_test, y_train, y_test = train_test_split(all_features[train_x_columns], all_features[train_y_columns], random_state=42)

In [98]:
def use_knn(df_x_train, df_x_test):
    """Find the single nearest training row for every test row.

    Parameters
    ----------
    df_x_train : array-like
        Feature rows the ``NearestNeighbors`` model is fitted on.
    df_x_test : array-like
        Feature rows to look up.

    Returns
    -------
    The result of ``NearestNeighbors.kneighbors`` for ``df_x_test``; callers
    in this notebook read element ``[0]`` as the distances.
    """
    neigh = NearestNeighbors(n_neighbors=1)
    neigh.fit(df_x_train)
    return neigh.kneighbors(df_x_test)

In [99]:
# Distinct user names in the feature table: first how many, then the names.
users = all_features[user_name_column].unique()
print(len(users), users, sep="\n")

33
['mino' 'Jozef Schneider' 'PeterB' 'Martin' 'lukasb' 'Filip'
 'Matej Adamov' 'Martina' 'michal' 'TomasM' 'vargaf' 'AdamT' 'baxos'
 'mimo' 'kitti' 'Matúš Cuper' 'Rado' 'denisgr' 'fero' 'IM' 'Miki' 'blinky'
 'tany' 'tom990' 'martin' 'hawkie94' 'banas' 'Sojkam' 'dominika' 'luci'
 'Ivi' 'Jakub.m' 'Stevo']


In [64]:
def get_threshold_for_knn(n_users=10, distance_percentile=25):
    """Estimate the distance threshold separating an owner from other users.

    A random sample of users is drawn from ``all_features``. For each sampled
    user a nearest-neighbour model is fitted on that user's rows and queried
    with the rows of all other users; the ``distance_percentile``-th
    percentile of those distances is recorded. The threshold is the median of
    the recorded percentiles.

    Reads the module-level ``all_features``, ``user_name_column`` and
    ``train_x_columns``. The sample is drawn with ``random.sample``, so the
    result varies between calls unless the ``random`` module is seeded.

    Parameters
    ----------
    n_users : int, default 10
        Number of users to sample; capped at the number of available users.
    distance_percentile : float, default 25
        Percentile taken over each sampled user's distances.

    Returns
    -------
    float
        Median of the per-user distance percentiles.
    """
    all_users = list(all_features[user_name_column].unique())
    # random.sample raises ValueError when asked for more items than exist.
    sampled_users = random.sample(all_users, min(n_users, len(all_users)))

    per_user_percentiles = []
    for sampled_user in sampled_users:
        is_sampled_user = all_features[user_name_column] == sampled_user
        owner_rows = all_features.loc[is_sampled_user, train_x_columns]
        other_rows = all_features.loc[~is_sampled_user, train_x_columns]
        # Element 0 of the kneighbors result holds the distances.
        distances = use_knn(owner_rows, other_rows)[0]
        per_user_percentiles.append(np.percentile(distances, distance_percentile))

    return np.percentile(per_user_percentiles, 50)
# Module-level threshold read by unify_y_column_format: distances up to this
# value are labelled "known", larger ones "unknown".
treshold_for_knn = get_threshold_for_knn()
# 33.69773240289701  (presumably the value from one earlier run; the users are
# sampled randomly, so the value can differ between runs)

In [133]:
def split_to_train_test(df, users_to_train, train_fraction=0.8, random_state=None):
    """Split ``df`` into an owner-only training set and a balanced test set.

    The rows are shuffled first. Rows whose ``user_name_column`` value is in
    ``users_to_train`` are the owner rows: the first ``train_fraction`` of
    them form the training set, the rest go to the test set. The test set is
    then extended by the same number of rows taken from the other users.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing ``user_name_column``.
    users_to_train : list-like of str
        User names treated as owners. Pass a list, not a bare string.
    train_fraction : float, default 0.8
        Share of the owner rows used for training.
    random_state : int or None, default None
        Seed passed to ``DataFrame.sample``; ``None`` keeps the shuffle random.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``.
    """
    df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # One vectorized membership test instead of two row-by-row iterrows passes.
    is_owner = df_shuffled[user_name_column].isin(users_to_train)
    df_owner = df_shuffled[is_owner]
    df_others = df_shuffled[~is_owner]

    n_train = int(len(df_owner) * train_fraction)
    df_train = df_owner[:n_train]
    df_owner_test = df_owner[n_train:]
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df_test = pd.concat([df_owner_test, df_others[:len(df_owner_test)]])

    return df_train, df_test

In [134]:
def unify_y_column_format(test_y, predicted, selected_owners):
    """Convert true labels and KNN distances to "known" / "unknown" labels.

    A true label is "known" when it is one of ``selected_owners``. The nested
    distances in ``predicted[0]`` are flattened, and a distance is "known"
    when it does not exceed the module-level ``treshold_for_knn``.

    Returns ``(test_converted, predict_converted)`` as two lists.
    """
    test_converted = []
    for true_label in test_y:
        test_converted.append("known" if true_label in selected_owners else "unknown")

    distances = [distance for row in predicted[0] for distance in row]
    predict_converted = []
    for distance in distances:
        predict_converted.append("known" if distance <= treshold_for_knn else "unknown")

    return test_converted, predict_converted

In [138]:
def show_results(test_y, predicted_y):
    """Print the evaluation metrics for the given true and predicted labels.

    Accuracy comes first, then F1, recall and precision with "known" as the
    positive label, followed by the classification report and the confusion
    matrix.
    """
    print('Accuracy:', accuracy_score(test_y, predicted_y))

    known_label_metrics = (
        ('F1 score:', f1_score),
        ('Recall:', recall_score),
        ('Precision:', precision_score),
    )
    for caption, metric in known_label_metrics:
        print(caption, metric(test_y, predicted_y, pos_label='known'))

    print('\n clasification report:\n', classification_report(test_y, predicted_y))
    print('\n confussion matrix:\n', confusion_matrix(test_y, predicted_y))

In [139]:
# Evaluate authentication for a single owner.
selected_owners = ['mino']
# df_train holds owner rows only; df_test holds the remaining owner rows plus
# the same number of rows from other users.
df_train, df_test = split_to_train_test(all_features, selected_owners)

# Nearest-neighbour lookup of every test row among the owner's training rows.
predicted = use_knn(df_train[train_x_columns],df_test[train_x_columns])

# Map user names and distances to "known" / "unknown" labels.
test_y, predicted_y = unify_y_column_format(df_test[train_y_columns], predicted, selected_owners)

show_results(test_y,predicted_y)

Accuracy: 0.6616161616161617
F1 score: 0.5939393939393939
Recall: 0.494949494949495
Precision: 0.7424242424242424

 clasification report:
               precision    recall  f1-score   support

       known       0.74      0.49      0.59        99
     unknown       0.62      0.83      0.71        99

    accuracy                           0.66       198
   macro avg       0.68      0.66      0.65       198
weighted avg       0.68      0.66      0.65       198


 confussion matrix:
 [[49 50]
 [17 82]]
