# Biometric Authentication of Smartphone Users with Support Vector Machines
**Math189R - Midterm Project**  
Nico Espinosa Dice  
*April, 2020*

## Importing Data

In [1]:
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the four competition CSV files, in order, into their own DataFrames.
train_data, test_data, questions, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/train.csv",
        "../Data/test.csv",
        "../Data/questions.csv",
        "../Data/sampleSubmission.csv",
    )
)

## Data Exploration

Verifies that the set of device IDs in the `Device` column of train.csv is identical to the set of IDs in the `QuizDevice` column of questions.csv (every training device appears in the questions file and vice versa):

In [3]:
# Sorted list of the distinct device IDs seen in the training data.
train_devices = sorted(set(train_data["Device"].tolist()))

# Sorted list of the distinct device IDs referenced by the quiz questions.
quiz_devices = sorted(set(questions["QuizDevice"].tolist()))

# True when both files mention exactly the same device IDs.
train_devices == quiz_devices

True

Shape of datasets:

In [4]:
# Report (rows, columns) for the training and test frames.
for label, frame in (("train_data shape:", train_data), ("test_data shape:", test_data)):
    print(label, frame.shape)

train_data shape: (29563983, 5)
test_data shape: (27007200, 5)


**Features of training data:**  
T = time (Unix time: milliseconds since 1/1/1970)  
X = acceleration measured in g on x co-ordinate  
Y = acceleration measured in g on y co-ordinate  
Z = acceleration measured in g on z co-ordinate  
Device = Unique Id of the device that generated the samples

In [5]:
# Show the column labels of the training frame (the device identifier column is named 'Device').
print(train_data.columns)

Index(['T', 'X', 'Y', 'Z', 'Device'], dtype='object')


## Data Preparation

The cell below is commented out because the entire dataset is too large and slow to work with for general testing purposes.

In [6]:
# Creates a dataset of only the X values: time, x-acceleration, y-acceleration, and z-acceleration
# X_train = train_data[["T", "X", "Y", "Z"]]
# X_test = test_data[["T", "X", "Y", "Z"]]

# # Creates a dataset of the target
# y_train = train_data[["Device"]]

In the cell below, we take a random 1.7% subset of the entire training dataset so that experiments run faster.

In [7]:
# Draw a random 1.7% subset of the training rows.
# The seed is fixed so the same rows are selected on every Restart & Run All;
# without it each run samples a different subset and results are not reproducible.
smaller_data = train_data.sample(frac = 0.017, random_state = 1)

In [8]:
# Restrict the training data to the rows recorded by three chosen devices.
selected_devices = [7, 8, 9]
device_mask = train_data['Device'].isin(selected_devices)
three_devices = train_data[device_mask]
print(three_devices)

                   T         X         Y         Z  Device
0       1.336645e+12  0.340509  8.308413  4.140585       7
1       1.336645e+12  0.381370  8.390134  4.249548       7
2       1.336645e+12  0.272407  8.471856  4.018002       7
3       1.336645e+12  0.149824  8.430995  4.290409       7
4       1.336645e+12  0.272407  8.430995  4.481094       7
...              ...       ...       ...       ...     ...
997106  1.337962e+12  2.914754  4.018003  8.049625       9
997107  1.337962e+12  2.833032  3.868179  8.499097       9
997108  1.337962e+12  2.683209  3.718355  8.390134       9
997109  1.337962e+12  1.947710  3.377846  8.281172       9
997110  1.337962e+12  3.173541  3.595772  9.615966       9

[997111 rows x 5 columns]


In [9]:
# new_three_devices = three_devices.sample(frac = 0.5)
# print(new_three_devices.shape)

In [10]:
# Disabled experiment (kept for reference): groups accelerometer readings into
# fixed-size windows. For each device in `selected_devices` it walks the rows of
# `new_three_devices` (created in the commented-out cell above), collects
# [T, X, Y, Z] readings, and after every 11th reading emits one
# [device, list-of-readings] row; the rows become `new_df` with columns
# 'Device' and 'Array'.
# NOTE(review): `count`, `new_list` and `new_row` are not reset when the outer
# loop moves to the next device, so a partially filled window can mix readings
# from two devices -- confirm/fix before re-enabling.
# count = 0
# new_list = []
# new_row = []
# new_data = []
# for device in selected_devices:
#     for index, row in (new_three_devices.loc[new_three_devices['Device'] == device]).iterrows():
#         new_list.append([row['T'], row['X'], row['Y'], row['Z']])
#         if (count == 0):
#             new_row.append(device)
#             count += 1
#         elif (count == 10):
#             new_row.append(new_list)
#             new_data.append(new_row)
#             new_list = []
#             new_row = []
#             count = 0
#         else:
#             count += 1

# new_df = pd.DataFrame(new_data, columns = ['Device', 'Array']) 

In [59]:
# print(new_df)

In [18]:
# num_devices = 10
# devices = [7, 8, 9, 12, 23, 25, 26, 27, 33, 37, 39, 45, 47, 51, 52, 57, 58, 65, 67, 68, 70, 71, 73, 74, 75, 78, 79, 81, 87, 89, 90, 91, 92, 94, 95, 96, 99, 104, 105, 108, 110, 111, 116, 117, 120, 122, 124, 126, 127, 129, 134, 137, 142, 145, 148, 149, 152, 156, 157, 158, 159, 162, 163, 168, 169, 174, 175, 177, 183, 187, 188, 189, 190, 194, 196, 204, 206, 207, 211, 213, 216, 219, 222, 224, 229, 232, 233, 234, 236, 237, 239, 240, 261, 263, 268, 269, 270, 271, 273, 274, 275, 277, 281, 282, 283, 284, 285, 289, 290, 291, 294, 296, 297, 298, 299, 302, 306, 309, 312, 313, 314, 323, 325, 333, 335, 338, 341, 343, 344, 345, 350, 360, 361, 366, 369, 370, 371, 376, 378, 381, 390, 394, 398, 399, 401, 404, 411, 412, 413, 415, 417, 421, 422, 423, 425, 433, 438, 447, 448, 455, 461, 463, 466, 471, 473, 477, 478, 479, 482, 485, 486, 487, 491, 492, 494, 501, 503, 505, 507, 509, 514, 515, 518, 520, 523, 524, 528, 531, 533, 534, 536, 537, 539, 547, 550, 552, 553, 554, 556, 557, 562, 568, 571, 573, 574, 575, 577, 579, 580, 581, 583, 589, 593, 594, 595, 596, 600, 601, 607, 610, 611, 612, 613, 614, 617, 621, 622, 626, 627, 629, 632, 634, 638, 640, 642, 643, 646, 647, 650, 653, 656, 658, 660, 661, 663, 664, 665, 666, 667, 669, 670, 671, 674, 675, 676, 678, 679, 680, 681, 682, 683, 684, 687, 688, 690, 691, 692, 694, 696, 698, 699, 700, 703, 705, 706, 709, 710, 711, 713, 714, 720, 721, 722, 727, 728, 729, 730, 732, 735, 736, 738, 739, 745, 746, 750, 751, 754, 755, 757, 761, 762, 763, 764, 768, 770, 774, 776, 781, 782, 784, 789, 792, 793, 795, 801, 802, 804, 805, 806, 810, 812, 814, 818, 820, 823, 824, 827, 834, 836, 838, 839, 841, 842, 846, 847, 848, 854, 857, 859, 860, 862, 864, 868, 870, 871, 877, 880, 882, 883, 887, 890, 895, 897, 900, 911, 912, 913, 919, 933, 941, 943, 945, 953, 955, 956, 967, 973, 977, 979, 983, 987, 991, 992, 996, 997, 998, 1000, 1006, 1015, 1017, 1027, 1029, 1031, 1033, 1035, 1036, 1037]
# selected_devices = []

# for i in range(num_devices):
#     selected_devices.append(devices[i])
    
# print(selected_devices)

[7, 8, 9, 12, 23, 25, 26, 27, 33, 37]
387


In [60]:
# Hold out 30% of the three-device rows for evaluation; the fixed seed keeps
# the split identical between runs.
feature_columns = ["T", "X", "Y", "Z"]
X_train, X_test, y_train, y_test = train_test_split(
    three_devices[feature_columns],
    three_devices["Device"],
    test_size=0.3,
    random_state=1,
)



In [61]:
# new shape
X_train.shape

(697977, 4)

Variance of features:

In [62]:
# Per-column variance of the training features; note in the output that T is
# many orders of magnitude larger than X, Y and Z.
X_train.var()

T    9.527624e+18
X    7.083804e+00
Y    1.307122e+01
Z    1.911240e+01
dtype: float64

In [72]:
# Preview the first rows of the held-out feature split
X_test.head()

Unnamed: 0,T,X,Y,Z
194197,1338545000000.0,-0.272407,5.897611,6.932756
23097,1337093000000.0,0.231546,8.853226,3.377846
722138,1344085000000.0,0.612916,3.990762,8.812365
430222,1342111000000.0,-0.340509,7.858941,5.053148
2081,1336646000000.0,4.671779,8.008764,-0.190686


## Model Implementation

Create a classification SVM with a Radial Basis Function kernel

In [63]:
# An RBF kernel is driven by Euclidean distances between samples, so features on
# very different scales distort it: the raw timestamp T has a variance of ~1e18
# while X/Y/Z are ~1e1, which lets T drown out the accelerometer axes.
# Standardising inside a pipeline rescales every feature to zero mean and unit
# variance using statistics learned from the data passed to fit() only, and the
# resulting object exposes the same fit/predict interface as the bare SVC.
svm_model = make_pipeline(StandardScaler(), svm.SVC(kernel='rbf'))

Trains the SVM on the training split (`X_train`, `y_train`)

In [None]:
# Fit the classifier on the training split (features X_train, labels y_train).
# NOTE(review): X_train has ~700k rows and kernel SVC training time grows at
# least quadratically with the number of samples, so this cell may run for a
# very long time -- consider fitting on a subsample first.
svm_model.fit(X_train, y_train)

## Model Prediction and Evaluation

Outputs the SVM's predictions for the held-out test split (`X_test`)

In [None]:
# Predict a device label for every row of the held-out feature split
y_pred = svm_model.predict(X_test)

Evaluates the model's accuracy (i.e., the fraction of test samples for which the model predicts the correct class)

In [None]:
# Fraction of held-out rows whose predicted device matches the true device.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## Saving Model

Saves model to file

In [None]:
# Serialise `svm_model` to 'svm_2.joblib' in the current working directory so it
# can be reloaded later without retraining.
dump(svm_model, 'svm_2.joblib')

In [None]:
# Reload the persisted model from 'svm_2.joblib'.
# NOTE: joblib.load relies on pickle, which can execute arbitrary code while
# loading -- only load files you created yourself or otherwise trust.
new_svm = load('svm_2.joblib')