In [3]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import seaborn as sns
import dask.config
from sklearn.preprocessing import LabelEncoder
import random


data = dd.read_csv('../data/rba-dataset.csv')
print(data.head(10))

   index          Login Timestamp              User ID  Round-Trip Time [ms]  \
0      0  2020-02-03 12:43:30.772 -4324475583306591935                   NaN   
1      1  2020-02-03 12:43:43.549 -4324475583306591935                   NaN   
2      2  2020-02-03 12:43:55.873 -3284137479262433373                   NaN   
3      3  2020-02-03 12:43:56.180 -4324475583306591935                   NaN   
4      4  2020-02-03 12:43:59.396 -4618854071942621186                   NaN   
5      5  2020-02-03 12:44:05.160 -4324475583306591935                   NaN   
6      6  2020-02-03 12:44:07.893  7246533443898239661                   NaN   
7      7  2020-02-03 12:44:12.799 -3243978724802435038                   NaN   
8      8  2020-02-03 12:44:17.474  8076000552587369902                   NaN   
9      9  2020-02-03 12:44:19.071 -3065936140549856249                   NaN   

        IP Address Country    Region       City     ASN  \
0      10.0.65.171      NO         -          -   29695   
1

In [None]:
pd.set_option('display.max_columns', None) # Sets the number of columns visible
print(data.head())

# Data Investigation

We will need to clean the data. Let's first see how many NaNs there are in each column

In [None]:
nans_in_cols = data.isna().sum().compute()

print("Number of NaNs in each column:\n", nans_in_cols)

In [None]:
country_counts = data['Country'].value_counts().compute()
pd.set_option('max_rows', None)
print(country_counts)
print(len(data['Country']))


In [None]:
len(country_counts)

I think country and region and city might be redundant. Let's see what's inside region and city.

In [None]:
region_counts = data['Region'].value_counts().compute()
print(region_counts)

In [None]:
city_counts = data['City'].value_counts().compute()
print(city_counts)

# Data Engineering

### Create a balanced data frame

We have significantly more valid logins than account takeovers. We will try to balance the dataset by taking a random sample from examples where account takeover did not happen and combine that will all of the samples for the account takeover

In [4]:
# Filter dataframe to only accound takeovers
takeover_df = data[data['Is Account Takeover'] == True].compute()
print(takeover_df.shape[0])

141


In [5]:
# Create a dataframe with no account takeovers
data_sliced = data.head(100000)
data_sliced = data_sliced.reset_index(drop=True)
no_takeover_df = data_sliced[data_sliced['Is Account Takeover'] == False]
print(data_sliced.shape[0])

100000


In [6]:
# Take a random sample from the no_takeover_df
num_obs = no_takeover_df.shape[0]
n = takeover_df.shape[0]
print("n: " + str(n) + "\n")
random.seed(42)
index_list = random.sample(range(num_obs), n)
index_list.sort()
random_df = no_takeover_df.loc[no_takeover_df.index[index_list]]
random_df = random_df.reset_index(drop=True)
print(random_df.shape[0])

n: 141

141


In [11]:
final_df = pd.concat([takeover_df, random_df], ignore_index=True, axis=0)
print(final_df.shape[0])
final_df.to_csv("balanced_data.csv")

282


### Clean the data

In [8]:
# Convert True and False to 0 and 1
final_df['Is Account Takeover'] = final_df['Is Account Takeover'].astype(int)
final_df['Login Successful'] = final_df['Login Successful'].astype(int)
final_df['Is Attack IP'] = final_df['Is Attack IP'].astype(int)


In [9]:
# Deal with strings
print(final_df["OS Name and Version"].value_counts())
final_df["OS Name"] = final_df['OS Name and Version'].str.extract(r'(Mac|Windows Phone|Linux|iOS|Android|Chrome OS|Chromecast|Prosonic|Other)')
print(final_df["OS Name"].value_counts())

Mac OS X 10.14.6          135
iOS 11.2.6                 40
iOS 7.1                    18
Android 4.1                14
Android 5.5.1               8
Android 2.2                 7
iOS 14.2.1                  6
Windows Phone 8.1           5
iOS 13.4                    5
iOS 8.2                     5
Chrome OS 11316.123.0       4
Android 6.0.99              4
Chrome OS 5978.98.0         3
Chromecast 1.52.251747      3
Chrome OS 12499.66.0        2
Android 10.0.99             2
Android 9.7                 2
Chrome OS 13505.73.0        2
Mac OS X 11.6.3             2
Android 4.0.3               1
Android 13.0                1
Other                       1
Chrome OS 13099.30.0        1
Chromecast 1.49.230269      1
Chrome OS 13729.29.0        1
Chromecast 1.54.250118      1
iOS 4.2.1                   1
Android 8.1                 1
Chrome OS 12607.82.0        1
Windows Phone 7.5           1
Android 4.2.3               1
Mac OS X 10.13.3            1
Mac OS X 10.12.3            1
Mac OS X 1

In [171]:
# Catagorical variables
le = LabelEncoder()
country = le.fit_transform(final_df['Country'])
final_df['Country'] = country

le2 = LabelEncoder()
final_df["OS Name"] = le2.fit_transform(final_df['OS Name'])
final_df.head(20)

le3 = LabelEncoder()
final_df["Device Type"] = le3.fit_transform(final_df['Device Type'])
final_df.head(20)


Unnamed: 0,index,Login Timestamp,User ID,Round-Trip Time [ms],IP Address,Country,Region,City,ASN,User Agent String,Browser Name and Version,OS Name and Version,Device Type,Login Successful,Is Attack IP,Is Account Takeover,OS Name
0,82873,2020-02-04 13:45:50.280,5519106287451092780,,10.4.1.162,14,Provincia di Treviso,Treviso,503109,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,Chrome 79.0.3945.192.218,Mac OS X 10.14.6,1,1,0,1,3
1,82947,2020-02-04 13:46:45.241,-7654599524478640403,,10.4.1.162,14,Provincia di Treviso,Treviso,503109,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,Chrome 79.0.3945.192.218,Mac OS X 10.14.6,1,1,0,1,3
2,100085,2020-02-04 17:15:42.743,-6380256063165146454,,31.131.16.24,22,-,-,56851,Mozilla/5.0 (X11; CrOS armv7l 5978.98.0) Appl...,Chrome 71.0.3578.40.50,Chrome OS 5978.98.0,1,1,1,1,1
3,202905,2020-02-06 05:19:28.841,4130074439166519892,,185.170.136.4,14,Veneto,Sospirolo,206801,Mozilla/5.0 (iPad; CPU OS 8_2 like Mac OS X) ...,Android 2.3.6,iOS 8.2,2,1,0,1,6
4,273968,2020-02-07 01:25:57.399,-136955930917892295,,10.0.85.13,19,Vestland,Vassenden,197475,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,Chrome 79.0.3945.192.203,Mac OS X 10.14.6,1,1,0,1,3
5,482034,2020-02-10 05:51:00.938,-5783801028078876142,,2.56.166.10,22,-,-,3280,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,Chrome 79.0.3945.192.218,Mac OS X 10.14.6,1,1,1,1,3
6,482102,2020-02-10 05:52:45.031,6969491805167028251,,2.56.166.10,22,-,-,3280,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,Chrome 79.0.3945.192.218,Mac OS X 10.14.6,1,1,1,1,3
7,482456,2020-02-10 06:01:54.380,-2200491188712463133,,2.56.166.10,22,-,-,3280,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,Chrome 79.0.3945.192.218,Mac OS X 10.14.6,1,1,1,1,3
8,482566,2020-02-10 06:05:11.464,5780471454460598558,,2.56.166.10,22,-,-,3280,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,Chrome 79.0.3945.192.218,Mac OS X 10.14.6,1,1,1,1,3
9,483111,2020-02-10 06:17:40.877,-4181075837585773799,,91.240.236.235,22,Ilfov,Petrachioaia,62350,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,Chrome 79.0.3945.192.218,Mac OS X 10.14.6,1,1,1,1,3


In [177]:
# Drop columns
to_drop = ['index', 'Login Timestamp', 'User ID', "Round-Trip Ts
          'Region', 'City', 'ASN', 'User Agent String', 'Browser Name and Version', 'OS Name and Version']

ml_ready_df = final_df.drop(columns=to_drop)
ml_ready_df.head(10)

Unnamed: 0,Country,Device Type,Login Successful,Is Attack IP,Is Account Takeover,OS Name
0,14,1,1,0,1,3
1,14,1,1,0,1,3
2,22,1,1,1,1,1
3,14,2,1,0,1,6
4,19,1,1,0,1,3
5,22,1,1,1,1,3
6,22,1,1,1,1,3
7,22,1,1,1,1,3
8,22,1,1,1,1,3
9,22,1,1,1,1,3


# Train Model

In [198]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.ensemble import GradientBoostingClassifier

# create feature and target dataframes
y = ml_ready_df['Is Account Takeover']
X = ml_ready_df.drop(['Is Account Takeover'], axis=1)

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# initialize the models
bagging_clf = BaggingClassifier()
random_forest_clf = RandomForestClassifier()
decision_tree_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()
gbdt_clf = GradientBoostingClassifier()
svm_clf = SVC()
nb_clf = GaussianNB()

# train the models
bagging_clf.fit(X_train, y_train)
random_forest_clf.fit(X_train, y_train)
decision_tree_clf.fit(X_train, y_train)
knn_clf.fit(X_train, y_train)
gbdt_clf.fit(X_train, y_train)
svm_clf.fit(X_train, y_train)
nb_clf.fit(X_train, y_train)

# predict using the trained models
bagging_preds = bagging_clf.predict(X_test)
random_forest_preds = random_forest_clf.predict(X_test)
decision_tree_preds = decision_tree_clf.predict(X_test)
knn_preds = knn_clf.predict(X_test)
gbdt_preds = gbdt_clf.predict(X_test)
svm_preds = svm_clf.predict(X_test)
nb_preds = nb_clf.predict(X_test)

# calculate f1 scores
bagging_f1 = f1_score(y_test, bagging_preds)
random_forest_f1 = f1_score(y_test, random_forest_preds)
decision_tree_f1 = f1_score(y_test, decision_tree_preds)
knn_f1 = f1_score(y_test, knn_preds)
gbdt_f1 = f1_score(y_test, gbdt_preds)
svm_f1 = f1_score(y_test, svm_preds)
nb_f1 = f1_score(y_test, nb_preds)

# calculate accuracy
print("Bagging accuracy: ", round(accuracy_score(y_test, bagging_preds), 4))
print("Random Forest accuracy: ", round(accuracy_score(y_test, random_forest_preds), 4))
print("Decision Tree accuracy: ", round(accuracy_score(y_test, decision_tree_preds), 4))
print("KNN accuracy: ", round(accuracy_score(y_test, knn_preds),4 ))
print("GBDT accuracy: ", round(accuracy_score(y_test, gbdt_preds), 4))
print("SVM accuracy: ", round(accuracy_score(y_test, svm_preds),4 ))
print("Naive Bayes accuracy: ", round(accuracy_score(y_test, nb_preds), 4), "\n")

# print f1 scores
print("Bagging f1 score: ", round(bagging_f1, 4))
print("Random Forest f1 score: ", round(random_forest_f1, 4))
print("Decision Tree f1 score: ", round(decision_tree_f1, 4))
print("KNN f1 score: ", round(knn_f1, 4))
print("GBDT f1 score: ", round(gbdt_f1, 4))
print("SVM f1 score: ", round(svm_f1, 4))
print("Naive Bayes f1 score: ", round(nb_f1, 4), '\n')

# We care more about false positives
print("Bagging recall: ", round(recall_score(y_test, bagging_preds), 5))
print("Random Forest recall: ", round(recall_score(y_test, random_forest_preds), 5))
print("Decision Tree recall: ", round(recall_score(y_test, decision_tree_preds), 5))
print("KNN recall: ", round(recall_score(y_test, knn_preds), 5))
print("GBDT recall: ", round(recall_score(y_test, gbdt_preds), 5))
print("SVM recall: ", round(recall_score(y_test, svm_preds), 5))
print("Naive Bayes recall: ", round(recall_score(y_test, nb_preds), 5))

Bagging accuracy:  0.9474
Random Forest accuracy:  0.9649
Decision Tree accuracy:  0.9474
KNN accuracy:  0.8246
GBDT accuracy:  0.9474
SVM accuracy:  0.6842
Naive Bayes accuracy:  0.807 

Bagging f1 score:  0.9412
Random Forest f1 score:  0.96
Decision Tree f1 score:  0.9388
KNN f1 score:  0.8214
GBDT f1 score:  0.9388
SVM f1 score:  0.7188
Naive Bayes f1 score:  0.807 

Bagging recall:  0.92308
Random Forest recall:  0.92308
Decision Tree recall:  0.88462
KNN recall:  0.88462
GBDT recall:  0.88462
SVM recall:  0.88462
Naive Bayes recall:  0.88462


In [180]:
# Leave One Out Cross Validation
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load data into X and y
X = ml_ready_df.drop(columns=['Is Account Takeover'])
y = ml_ready_df['Is Account Takeover']

# Define classifiers to train and test
classifiers = [
    BaggingClassifier(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    GaussianNB(),
    SVC()
]

# Train and test each classifier with LOOCV
loo = LeaveOneOut()
for classifier in classifiers:
    print("Training and testing: ", type(classifier).__name__)
    scores = cross_val_score(classifier, X, y, cv=loo)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Training and testing:  BaggingClassifier
Accuracy: 0.92 (+/- 0.55)
Training and testing:  DecisionTreeClassifier
Accuracy: 0.92 (+/- 0.55)
Training and testing:  GradientBoostingClassifier
Accuracy: 0.93 (+/- 0.50)
Training and testing:  KNeighborsClassifier
Accuracy: 0.89 (+/- 0.62)
Training and testing:  RandomForestClassifier
Accuracy: 0.92 (+/- 0.54)
Training and testing:  GaussianNB
Accuracy: 0.84 (+/- 0.73)
Training and testing:  SVC
Accuracy: 0.70 (+/- 0.91)


In [199]:
# Feature importance analysis
rf_importances = random_forest_clf.feature_importances_
gbdt_importances = gbdt_clf.feature_importances_

print(rf_importances)
print(gbdt_importances)

print(X)

[0.33855878 0.20085507 0.21797132 0.09237608 0.15023875]
[0.3569454  0.29807242 0.1969369  0.06916104 0.07888424]
     Country  Device Type  Login Successful  Is Attack IP  OS Name
0         14            1                 1             0        3
1         14            1                 1             0        3
2         22            1                 1             1        1
3         14            2                 1             0        6
4         19            1                 1             0        3
..       ...          ...               ...           ...      ...
277       20            2                 0             0        6
278       19            2                 1             0        6
279       19            1                 1             0        3
280       19            2                 1             0        6
281       19            2                 1             0        6

[282 rows x 5 columns]


In [1]:
final_df["IP Address"].value_counts()

NameError: name 'final_df' is not defined