In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os 
from joblib import dump, load

from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

### Community data model

Because of the absolute time issue, this model's performance is neither representative nor meaningful. It is included mainly for completeness.

In [4]:
# Community data model: train on days 1 + 2, evaluate on day 3.
# Make sure `path` points at a directory containing the parsed community data.
path="../../p2/parsed/community/"
day_one="day_1"
day_two="day_2"
day_three="day_3"
file="_parsed.csv"
col_names = ["website-index", "time", "direction", "packet size"]

# on_bad_lines='skip' silently drops malformed rows. It replaces the
# error_bad_lines=False / warn_bad_lines=False pair, which was removed in pandas 2.0.
df_one = pd.read_csv(path+day_one+file, header=0, names=col_names, on_bad_lines='skip')
df_two = pd.read_csv(path+day_two+file, header=0, names=col_names, on_bad_lines='skip')
df_three = pd.read_csv(path+day_three+file, header=0, names=col_names, on_bad_lines='skip')

# Preprocess training data.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); like append, it
# keeps each frame's original index.
df = pd.concat([df_one, df_two])
X = df.loc[:, df.columns != 'website-index']
y = df['website-index']

# Preprocess testing data
X_test = df_three.loc[:, df_three.columns != 'website-index']
y_test = df_three['website-index']

# Model training.
# The forest is seeded so the reported accuracy is reproducible across runs.
clf = RandomForestClassifier(random_state=0)
clf.fit(X,y)

svm_model = svm.SVC()
svm_model.fit(X, y)

# Evaluate: mean accuracy on the held-out third day.
rf_acc = clf.score(X_test, y_test)
svm_acc = svm_model.score(X_test, y_test)

print("RF acc: " + str(rf_acc))
print("SVM acc: " + str(svm_acc))

RF acc: 0.9708700651590648
SVM acc: 0.9862016098121886


### Original data (all possible protocol formats)

In [None]:
# Train and evaluate one set of models per folder in `path`.
#
# For every immediate sub-folder of `path` (except "raw"):
#   1. load the three per-day parsed captures,
#   2. train on days 1 + 2 and evaluate on day 3,
#   3. dump each fitted model under ./models/<folder>/,
#   4. record each accuracy (as a string) in results[folder].

path = '../../p2/collection'
col_names = ["website-index", "time", "direction", "packet size"]

# Only immediate sub-folders are used: the CSVs below are read from
# path/<folder>/, so names of deeper directories (which a full recursive
# os.walk would also collect) could never resolve to a valid file path.
# The default tuple yields an empty list when `path` does not exist.
folders = sorted(next(os.walk(path), (path, [], []))[1])

results = {}

for folder in folders:
    if folder == "raw":
        continue
    results[folder] = {}  # the inner dict must exist before per-model keys are assigned
    print(folder)

    # on_bad_lines='skip' silently drops malformed rows. It replaces the
    # error_bad_lines=False / warn_bad_lines=False pair removed in pandas 2.0.
    read_opts = dict(header=0, names=col_names, on_bad_lines='skip', quotechar='^')
    folder_path = path + "/" + folder + "/"
    df_one = pd.read_csv(folder_path + "Day1-parsed-ondevice.csv", **read_opts)
    df_two = pd.read_csv(folder_path + "Day2-parsed-ondevice.csv", **read_opts)
    df_three = pd.read_csv(folder_path + "Day3-parsed-ondevice.csv", **read_opts)

    # Preprocess training data.
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df = pd.concat([df_one, df_two])
    X = df.loc[:, df.columns != 'website-index']
    y = df['website-index']

    # Preprocess testing data
    X_test = df_three.loc[:, df_three.columns != 'website-index']
    y_test = df_three['website-index']

    # makedirs also creates the parent ./models directory on the first run and
    # does not fail when the folder is already there.
    outpath = './models/' + folder + "/"
    os.makedirs(outpath, exist_ok=True)

    # Random forest (seeded so the reported accuracy is reproducible).
    clf = RandomForestClassifier(random_state=0)
    clf.fit(X,y)
    rf_acc = clf.score(X_test, y_test)
    print("RF acc: " + str(rf_acc))
    results[folder]["rf"] = str(rf_acc)
    dump(clf, outpath + 'rf_model.joblib')
    print(results)

    # Support vector machine; verbose=True prints LibSVM progress output.
    svm_model = svm.SVC(verbose=True)
    svm_model.fit(X, y)
    svm_acc = svm_model.score(X_test, y_test)
    print("SVM acc: " + str(svm_acc))
    results[folder]["svm"] = str(svm_acc)
    dump(svm_model, outpath + 'svm_model.joblib')
    print(results)

    # Logistic regression. max_iter must be an integer: the float literal 10e5
    # is rejected by scikit-learn's parameter validation in recent releases.
    lr = LogisticRegression(random_state=0, max_iter=1_000_000)
    lr.fit(X, y)
    lr_acc = lr.score(X_test, y_test)
    print("Logistic Reg acc: " + str(lr_acc))
    results[folder]["lr"] = str(lr_acc)
    dump(lr, outpath + "lr_model.joblib")
    print(results)

all




RF acc: 0.20582571253629342
{'all': {'rf': '0.20582571253629342'}}




[LibSVM]

**TODO:** Record the output for all of the protocol-format combinations here.

### Newly collected data

After receiving the community scripts, we wanted to compare our original, locally collected data against the data we would collect using the community scripts. We therefore corrected and adapted the scripts (because of the absolute time issue). As the results below show, the performance of every model (RF, LR and SVM) is much worse on this data: it is basically equal to random guessing.



In [17]:
# Load the data newly collected with the adapted community scripts (one CSV per day).
path = "./community_script/aggregate_data/"
day_one="day_1"
day_two="day_2"
day_three="day_3"
file="_parsed.csv"
col_names = ["website-index", "time", "direction", "packet size"]


# on_bad_lines='skip' silently drops malformed rows. It replaces the
# error_bad_lines=False / warn_bad_lines=False pair, which was removed in pandas 2.0.
df_one = pd.read_csv(path+day_one+file, header=0, names=col_names, on_bad_lines='skip', quotechar='^')
df_two = pd.read_csv(path+day_two+file, header=0, names=col_names, on_bad_lines='skip', quotechar='^')
df_three = pd.read_csv(path+day_three+file, header=0, names=col_names, on_bad_lines='skip', quotechar='^')

In [18]:
# Verify data correctness: show the day-1 frame as the cell output so the
# parsed columns can be checked by eye.
df_one

Unnamed: 0,website-index,time,direction,packet size
0,youtube,4.241315,1,368
1,youtube,4.254662,0,450
2,youtube,4.275345,1,373
3,youtube,4.289213,0,450
4,yahoo,3.834408,1,368
...,...,...,...,...
804,imgur,3.842293,0,450
805,dropbox,3.527019,1,368
806,dropbox,3.537343,0,450
807,dropbox,3.556095,1,373


In [8]:
# Newly collected data: train on days 1 + 2, evaluate on day 3.

# Preprocess training data.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); like append, it
# keeps each frame's original index.
df = pd.concat([df_one, df_two])
X = df.loc[:, df.columns != 'website-index']
y = df['website-index']

# Preprocess testing data
X_test = df_three.loc[:, df_three.columns != 'website-index']
y_test = df_three['website-index']

# Model training.
# The forest is seeded so the reported accuracy is reproducible across runs.
clf = RandomForestClassifier(random_state=0)
clf.fit(X,y)

svm_model = svm.SVC()
svm_model.fit(X, y)

# max_iter must be an integer: the float literal 10e5 is rejected by
# scikit-learn's parameter validation in recent releases.
lr = LogisticRegression(random_state=0, max_iter=1_000_000)
lr.fit(X, y)

# Evaluate: mean accuracy on the held-out third day.
rf_acc = clf.score(X_test, y_test)
svm_acc = svm_model.score(X_test, y_test)
lr_acc = lr.score(X_test, y_test)

print("RF acc: " + str(rf_acc))
print("LR acc: " + str(lr_acc))
print("SVM acc: " + str(svm_acc))

RF acc: 0.051597051597051594
LR acc: 0.056511056511056514
SVM acc: 0.05405405405405406


Because the performance on the newly collected data using the community scripts is much worse, we continue to model on the originally collected data.