In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn import tree
#from sklearn.model_selection import cross_val_score
#from sklearn import svm
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the nine raw capture CSVs, in order, into data1 .. data9.
(data1, data2, data3,
 data4, data5, data6,
 data7, data8, data9) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/raw/20220210T003523_300-500-true-300-5000-iperf.csv',
        'data/raw/20220210T003523_300-5000-true-300-500-iperf.csv',
        'data/raw/20220210T010808_400-2000-true-400-2000-iperf.csv',
        'data/raw/20220210T011704_500-3000-true-500-3000-iperf.csv',
        'data/raw/20220210T015628_600-4000-true-600-4000-iperf.csv',
        'data/raw/20220210T031531_700-4000-true-700-4000-iperf.csv',
        'data/raw/20220210T032446_900-6000-true-900-6000-iperf.csv',
        'data/raw/20220210T033058_1000-7000-true-1000-7000-iperf.csv',
        'data/raw/20220210T063725_1100-2500-true-1100-2500-iperf.csv',
    )
)

In [None]:
#optional - take out the first 25 rows of each dataset
# (the original note calls these the first 25 seconds - presumably one row per second; TODO confirm)
# .iloc makes the positional slice explicit, and .copy() detaches each trimmed frame
# from the frame it was sliced from, so columns assigned to it afterwards are ordinary
# assignments rather than writes into a slice (SettingWithCopyWarning).
# NOTE: this cell is not idempotent - every re-run drops another 25 rows.
data1 = data1.iloc[25:].copy()
data2 = data2.iloc[25:].copy()
data3 = data3.iloc[25:].copy()
data4 = data4.iloc[25:].copy()
data5 = data5.iloc[25:].copy()
data6 = data6.iloc[25:].copy()
data7 = data7.iloc[25:].copy()
data8 = data8.iloc[25:].copy()
data9 = data9.iloc[25:].copy()

In [None]:
#reset "Time" to arbitrary values
# Overwrite 'time' on every frame with a simple 0..len-1 counter.
for frame in (data1, data2, data3, data4, data5, data6, data7, data8, data9):
    frame['time'] = np.arange(len(frame))

In [None]:
#Adding the packet loss ratio and latency labels for each dataset
# Every row of a dataset gets the same two constants. Both arrays are float64
# (the latency fill value is written as a float so the column stays floating point).

# dataset 1: loss 1/5000, latency 300
packet_ratio1 = np.full(len(data1), 1/5000)
latency1 = np.full(len(data1), 300.0)
data1['packet_loss_ratio'], data1["latency"] = packet_ratio1, latency1

# dataset 2: loss 1/500, latency 300
packet_ratio2 = np.full(len(data2), 1/500)
latency2 = np.full(len(data2), 300.0)
data2['packet_loss_ratio'], data2["latency"] = packet_ratio2, latency2

# dataset 3: loss 1/2000, latency 400
packet_ratio3 = np.full(len(data3), 1/2000)
latency3 = np.full(len(data3), 400.0)
data3['packet_loss_ratio'], data3["latency"] = packet_ratio3, latency3

# dataset 4: loss 1/3000, latency 500
packet_ratio4 = np.full(len(data4), 1/3000)
latency4 = np.full(len(data4), 500.0)
data4['packet_loss_ratio'], data4["latency"] = packet_ratio4, latency4

# dataset 5: loss 1/4000, latency 600
packet_ratio5 = np.full(len(data5), 1/4000)
latency5 = np.full(len(data5), 600.0)
data5['packet_loss_ratio'], data5["latency"] = packet_ratio5, latency5

# dataset 6: loss 1/4000, latency 700
packet_ratio6 = np.full(len(data6), 1/4000)
latency6 = np.full(len(data6), 700.0)
data6['packet_loss_ratio'], data6["latency"] = packet_ratio6, latency6

# dataset 7: loss 1/6000, latency 900
packet_ratio7 = np.full(len(data7), 1/6000)
latency7 = np.full(len(data7), 900.0)
data7['packet_loss_ratio'], data7["latency"] = packet_ratio7, latency7

# dataset 8: loss 1/7000, latency 1000
packet_ratio8 = np.full(len(data8), 1/7000)
latency8 = np.full(len(data8), 1000.0)
data8['packet_loss_ratio'], data8["latency"] = packet_ratio8, latency8

# dataset 9: loss 1/2500, latency 1100
packet_ratio9 = np.full(len(data9), 1/2500)
latency9 = np.full(len(data9), 1100.0)
data9['packet_loss_ratio'], data9["latency"] = packet_ratio9, latency9

In [None]:
#combine datasets
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each
# source frame keeps its own row labels, so the same label appears once per source
# frame and label-based lookups on `data` return several rows.
data = pd.concat(
    [data1, data2, data3, data4, data5, data6, data7, data8, data9],
    ignore_index=True,
)
data

# Feature Engineering

In [None]:
# Feature 1: Maximum Packet Size for each interaction
def max_size(x):
    """
    Return the largest packet size found in a ';'-separated string.

    The piece after the final ';' is always discarded, so the input is
    expected to end with a ';'. Every remaining piece must parse as an int.
    Raises ValueError when nothing is left after discarding the last piece.
    """
    pieces = x.split(';')
    pieces.pop()  # drop whatever follows the last separator
    return max(int(piece) for piece in pieces)

In [None]:
# Feature 2: Range of Packet Size for each interaction
def range_size(x):
    """
    Return (largest - smallest) packet size from a ';'-separated string.

    The piece after the final ';' is discarded before parsing, so the input
    is expected to end with a ';'.
    """
    sizes = [int(piece) for piece in x.split(';')[:-1]]
    largest = max(sizes)
    smallest = min(sizes)
    return largest - smallest

In [None]:
# Feature 3: Average of Packet Size for each interaction
def avg_size(x):
    """
    Return the mean packet size from a ';'-separated string.

    The piece after the final ';' is discarded before parsing, so the input
    is expected to end with a ';'. The mean is computed with numpy.
    """
    sizes = [int(piece) for piece in x.split(';')[:-1]]
    return np.mean(sizes)

In [None]:
# Feature 4: Average Packet Duration
def packet_dur(x):
    """
    Helper function used to add average packet duration feature

    Parses a ';'-separated string of integer packet timestamps (the piece
    after the final ';' is discarded, as in the other helpers) and returns
    the mean gap between consecutive timestamps.

    Returns 0.0 when fewer than two timestamps are present, because there is
    no gap to average; this keeps NaN out of the feature column.
    """
    # Slice [:-1] keeps every timestamp. Indexing with [-2] would pick a single
    # token, and int-mapping a string iterates over its individual digits.
    stamps = [int(piece) for piece in x.split(';')[:-1]]
    if len(stamps) < 2:
        return 0.0
    return np.mean(np.diff(stamps))

In [None]:
# Feature 5: Total packet Direction
def total_packet_dir(x):
    """
    Return the net packet direction for a ';'-separated direction string.

    Each '1' entry counts +1 and each '2' entry counts -1; any other entry is
    ignored. The piece after the final ';' is discarded before counting.
    """
    entries = x.split(';')[:-1]
    return entries.count('1') - entries.count('2')

In [None]:
# Feature 6: total packets -> Done in apply_features()
# Feature 7: total bytes -> Done in apply_features()

In [None]:
# Feature 8: Interaction length
def interaction_length(x):
    """
    Return (latest - earliest) timestamp from a ';'-separated string.

    The piece after the final ';' is discarded before parsing, so the input
    is expected to end with a ';'. The entries do not need to be sorted.
    """
    stamps = [int(piece) for piece in x.split(';')[:-1]]
    first_seen = min(stamps)
    last_seen = max(stamps)
    return last_seen - first_seen

In [None]:
# Feature 9: total packets over time ratio -> Done in apply_features()
# Feature 10: total bytes over time ratio -> Done in apply_features()

In [None]:
# Convert packet loss ratio into categorical values
def ratio_to_category(x):
    """
    Turn a numeric value into a categorical label by taking its string form.
    """
    label = str(x)
    return label

In [None]:
def apply_features(df):
    """
    Add every engineered feature column to `df` in place.

    The original note describes `df` as a raw dataframe from etl.py. The
    columns read here are 'packet_sizes', 'packet_times', 'packet_dirs',
    '1->2Pkts', '2->1Pkts', '1->2Bytes', '2->1Bytes', 'packet_loss_ratio'
    and 'latency'. Nothing is returned; the new columns are written onto
    `df` itself.
    """
    # Packet-size statistics
    sizes = df['packet_sizes']
    for column, extractor in (
        ('max_packet_size', max_size),
        ('range_packet_size', range_size),
        ('avg_packet_size', avg_size),
    ):
        df[column] = sizes.apply(extractor)

    # Timing and direction
    times = df['packet_times']
    df['avg_packet_dur'] = times.apply(packet_dur)
    df['total_packet_dir'] = df['packet_dirs'].apply(total_packet_dir)

    # Totals over both directions
    df['total_packets'] = df['1->2Pkts'] + df['2->1Pkts']
    df['total_bytes'] = df['1->2Bytes'] + df['2->1Bytes']

    # Rates; a zero interaction length makes these divisions non-finite
    duration = times.apply(interaction_length)
    df['interaction_length'] = duration
    df['packets_time_ratio'] = df['total_packets'] / duration
    df['bytes_time_ratio'] = df['total_bytes'] / duration

    # String-valued copies of the numeric labels
    for column, source in (
        ('packet_loss_ratio_class', 'packet_loss_ratio'),
        ('latency_class', 'latency'),
    ):
        df[column] = df[source].apply(ratio_to_category)

In [None]:
#apply feature engineering
# apply_features writes the new columns onto `data` itself and returns nothing
apply_features(data)

In [None]:
def modify(x):
    """
    Replace a non-finite value with 0 and return anything else unchanged.

    Positive infinity, negative infinity and NaN all become 0. NaN is what a
    0/0 ratio produces, and it would otherwise flow into the model inputs.
    """
    # `x != x` is true only for NaN, which never compares equal to itself.
    if x != x or x in (float('inf'), float('-inf')):
        return 0
    return x

In [None]:
# Pass both ratio columns through modify(), element by element.
for ratio_col in ('packets_time_ratio', 'bytes_time_ratio'):
    data[ratio_col] = data[ratio_col].apply(modify)

# EDA

In [None]:
# show the column labels currently on `data`
data.columns

# Features List

In [None]:
# Columns used as model inputs, in the order they are passed to the estimators.
list1 = [
    'time',
    'total_packets',
    'total_bytes',
    'total_packet_dir',
    'interaction_length',
    'packets_time_ratio',
    'bytes_time_ratio',
    'avg_packet_size',
    'avg_packet_dur',
    'max_packet_size',
    'range_packet_size',
]

In [None]:
# features_list is a second name for the same list object as list1 (no copy is made)
features_list = list1

# Linear Regression Model

In [None]:
#loss
def linear_regression(df, features_list, y = "packet_loss_ratio"):
    """
    Fit a LinearRegression on `features_list` and score it on a test split.

    The rows are split 80/20 with random_state=42, so the training rows are
    the same on every call. The 20% remainder is then halved into validation
    and test parts with no seed, so the test rows (and the score) can differ
    between calls. The validation half is not used.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    features_list : list of column names used as model inputs.
    y : name of the target column (defaults to "packet_loss_ratio").

    Returns
    -------
    tuple of (R^2 on the test split, test targets with a fresh 0..n-1 index,
    predictions for the test split).
    """
    inputs = df[features_list]
    target = df[y]

    # Seeded 80/20 split, then an unseeded 50/50 split of the remainder
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        inputs, target, train_size=0.8, random_state=42
    )
    _, X_eval, _, y_eval = train_test_split(X_holdout, y_holdout, test_size=0.5)

    model = linear_model.LinearRegression()
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    y_true = y_eval.reset_index(drop=True)
    return r2_score(y_true, predictions), y_true, predictions

In [None]:
# run model: 100 independent calls, each using the function's default target
results = [linear_regression(data, features_list) for _ in range(100)]

In [None]:
# R^2 (coefficient of determination) on the test split, one point per run
scores = [run[0] for run in results]
plt.plot(scores)

In [None]:
# predicted values vs true values
#result = linear_regression(data, features_list)
#plt.plot(result[1])
#plt.plot(result[2])
#plt.legend(["y_test", "y_pred"])

In [None]:
#latency
def linear_regression(df, features_list, y = "latency"):
    """
    Fit a LinearRegression on `features_list` and score it on a test split.

    The rows are split 80/20 with random_state=42, so the training rows are
    the same on every call. The 20% remainder is then halved into validation
    and test parts with no seed, so the test rows (and the score) can differ
    between calls. The validation half is not used.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    features_list : list of column names used as model inputs.
    y : name of the target column (defaults to "latency").

    Returns
    -------
    tuple of (R^2 on the test split, test targets with a fresh 0..n-1 index,
    predictions for the test split).
    """
    inputs = df[features_list]
    target = df[y]

    # Seeded 80/20 split, then an unseeded 50/50 split of the remainder
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        inputs, target, train_size=0.8, random_state=42
    )
    _, X_eval, _, y_eval = train_test_split(X_holdout, y_holdout, test_size=0.5)

    model = linear_model.LinearRegression()
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    y_true = y_eval.reset_index(drop=True)
    return r2_score(y_true, predictions), y_true, predictions

In [None]:
# run model: 100 independent calls, each using the function's default target
results = [linear_regression(data, features_list) for _ in range(100)]

In [None]:
# R^2 (coefficient of determination) on the test split, one point per run
scores = [run[0] for run in results]
plt.plot(scores)

In [None]:
# predicted values vs true values
#result = linear_regression(data, features_list)
#plt.plot(result[1])
#plt.plot(result[2])
#plt.legend(["y_test", "y_pred"])

# Decision Trees Model

In [None]:
#loss
def decision_tree(df, features_list, y = "packet_loss_ratio_class"):
    """
    Fit a DecisionTreeClassifier on `features_list` and score it on a test split.

    The rows are split 80/20 with random_state=42, so the training rows are
    the same on every call. The 20% remainder is then halved into validation
    and test parts with no seed, so the test rows (and the score) can differ
    between calls. The validation half is not used.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    features_list : list of column names used as model inputs.
    y : name of the target column (defaults to "packet_loss_ratio_class").

    Returns
    -------
    tuple of (classifier score on the test split, test targets with a fresh
    0..n-1 index, predicted labels for the test split).
    """
    inputs = df[features_list]
    target = df[y]

    # Seeded 80/20 split, then an unseeded 50/50 split of the remainder
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        inputs, target, train_size=0.8, random_state=42
    )
    _, X_eval, _, y_eval = train_test_split(X_holdout, y_holdout, test_size=0.5)

    model = tree.DecisionTreeClassifier()
    model = model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    y_true = y_eval.reset_index(drop=True)
    return model.score(X_eval, y_true), y_true, predictions

In [None]:
# run model: 100 independent calls, each using the function's default target
results = [decision_tree(data, features_list) for _ in range(100)]

In [None]:
# classifier score on the held-out test split, one point per run
scores = [run[0] for run in results]
plt.plot(scores)

In [None]:
# predicted values vs true values
#result = decision_tree(data, features_list)
#plt.plot(result[1])
#plt.plot(result[2])
#plt.legend(["y_test", "y_pred"])

In [None]:
#latency
def decision_tree(df, features_list, y = "latency"):
    """
    Fit a DecisionTreeClassifier on `features_list` and score it on a test split.

    Because a classifier is used, the values in the target column are treated
    as class labels.

    The rows are split 80/20 with random_state=42, so the training rows are
    the same on every call. The 20% remainder is then halved into validation
    and test parts with no seed, so the test rows (and the score) can differ
    between calls. The validation half is not used.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    features_list : list of column names used as model inputs.
    y : name of the target column (defaults to "latency").

    Returns
    -------
    tuple of (classifier score on the test split, test targets with a fresh
    0..n-1 index, predicted labels for the test split).
    """
    inputs = df[features_list]
    target = df[y]

    # Seeded 80/20 split, then an unseeded 50/50 split of the remainder
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        inputs, target, train_size=0.8, random_state=42
    )
    _, X_eval, _, y_eval = train_test_split(X_holdout, y_holdout, test_size=0.5)

    model = tree.DecisionTreeClassifier()
    model = model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    y_true = y_eval.reset_index(drop=True)
    return model.score(X_eval, y_true), y_true, predictions

In [None]:
# run model: 100 independent calls, each using the function's default target
results = [decision_tree(data, features_list) for _ in range(100)]

In [None]:
# classifier score on the held-out test split, one point per run
scores = [run[0] for run in results]
plt.plot(scores)

In [None]:
# predicted values vs true values
#result = decision_tree(data, features_list)
#plt.plot(result[1])
#plt.plot(result[2])
#plt.legend(["y_test", "y_pred"])

In [None]:
#latency class
def decision_tree(df, features_list, y = "latency_class"):
    """
    Fit a DecisionTreeClassifier on `features_list` and score it on a test split.

    The rows are split 80/20 with random_state=42, so the training rows are
    the same on every call. The 20% remainder is then halved into validation
    and test parts with no seed, so the test rows (and the score) can differ
    between calls. The validation half is not used.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    features_list : list of column names used as model inputs.
    y : name of the target column (defaults to "latency_class").

    Returns
    -------
    tuple of (classifier score on the test split, test targets with a fresh
    0..n-1 index, predicted labels for the test split).
    """
    inputs = df[features_list]
    target = df[y]

    # Seeded 80/20 split, then an unseeded 50/50 split of the remainder
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        inputs, target, train_size=0.8, random_state=42
    )
    _, X_eval, _, y_eval = train_test_split(X_holdout, y_holdout, test_size=0.5)

    model = tree.DecisionTreeClassifier()
    model = model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    y_true = y_eval.reset_index(drop=True)
    return model.score(X_eval, y_true), y_true, predictions

In [None]:
# run model: 100 independent calls, each using the function's default target
results = [decision_tree(data, features_list) for _ in range(100)]

In [None]:
# classifier score on the held-out test split, one point per run
scores = [run[0] for run in results]
plt.plot(scores)

In [None]:
# predicted values vs true values
#result = decision_tree(data, features_list)
#plt.plot(result[1])
#plt.plot(result[2])
#plt.legend(["y_test", "y_pred"])

# SVM Model

In [None]:
#loss
def svm(df, features_list, y = "packet_loss_ratio_class"):
    """
    Fit a StandardScaler + SVC(gamma='auto') pipeline and score it on a test split.

    The rows are split 80/20 with random_state=42, so the training rows are
    the same on every call. The 20% remainder is then halved into validation
    and test parts with no seed, so the test rows (and the score) can differ
    between calls. The validation half is not used.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    features_list : list of column names used as model inputs.
    y : name of the target column (defaults to "packet_loss_ratio_class").

    Returns
    -------
    tuple of (pipeline score on the test split, test targets with a fresh
    0..n-1 index, predicted labels for the test split).
    """
    inputs = df[features_list]
    target = df[y]

    # Seeded 80/20 split, then an unseeded 50/50 split of the remainder
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        inputs, target, train_size=0.8, random_state=42
    )
    _, X_eval, _, y_eval = train_test_split(X_holdout, y_holdout, test_size=0.5)

    # Features are standardised before they reach the support vector classifier
    model = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    model = model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    y_true = y_eval.reset_index(drop=True)
    return model.score(X_eval, y_true), y_true, predictions

In [None]:
# run model: 100 independent calls, each using the function's default target
results = [svm(data, features_list) for _ in range(100)]

In [None]:
# classifier score on the held-out test split, one point per run
scores = [run[0] for run in results]
plt.plot(scores)

In [None]:
# predicted values vs true values
#result = svm(data, features_list)
#plt.plot(result[1])
#plt.plot(result[2])
#plt.legend(["y_test", "y_pred"])

In [None]:
#latency
def svm(df, features_list, y = "latency"):
    """
    Fit a StandardScaler + SVC(gamma='auto') pipeline and score it on a test split.

    Because a classifier is used, the values in the target column are treated
    as class labels.

    The rows are split 80/20 with random_state=42, so the training rows are
    the same on every call. The 20% remainder is then halved into validation
    and test parts with no seed, so the test rows (and the score) can differ
    between calls. The validation half is not used.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    features_list : list of column names used as model inputs.
    y : name of the target column (defaults to "latency").

    Returns
    -------
    tuple of (pipeline score on the test split, test targets with a fresh
    0..n-1 index, predicted labels for the test split).
    """
    inputs = df[features_list]
    target = df[y]

    # Seeded 80/20 split, then an unseeded 50/50 split of the remainder
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        inputs, target, train_size=0.8, random_state=42
    )
    _, X_eval, _, y_eval = train_test_split(X_holdout, y_holdout, test_size=0.5)

    # Features are standardised before they reach the support vector classifier
    model = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    model = model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    y_true = y_eval.reset_index(drop=True)
    return model.score(X_eval, y_true), y_true, predictions

In [None]:
# run model: 100 independent calls, each using the function's default target
results = [svm(data, features_list) for _ in range(100)]

In [None]:
# classifier score on the held-out test split, one point per run
scores = [run[0] for run in results]
plt.plot(scores)

In [None]:
# predicted values vs true values
#result = svm(data, features_list)
#plt.plot(result[1])
#plt.plot(result[2])
#plt.legend(["y_test", "y_pred"])

In [None]:
#latency class
def svm(df, features_list, y = "latency_class"):
    """
    Fit a StandardScaler + SVC(gamma='auto') pipeline and score it on a test split.

    The rows are split 80/20 with random_state=42, so the training rows are
    the same on every call. The 20% remainder is then halved into validation
    and test parts with no seed, so the test rows (and the score) can differ
    between calls. The validation half is not used.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    features_list : list of column names used as model inputs.
    y : name of the target column (defaults to "latency_class").

    Returns
    -------
    tuple of (pipeline score on the test split, test targets with a fresh
    0..n-1 index, predicted labels for the test split).
    """
    inputs = df[features_list]
    target = df[y]

    # Seeded 80/20 split, then an unseeded 50/50 split of the remainder
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        inputs, target, train_size=0.8, random_state=42
    )
    _, X_eval, _, y_eval = train_test_split(X_holdout, y_holdout, test_size=0.5)

    # Features are standardised before they reach the support vector classifier
    model = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    model = model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    y_true = y_eval.reset_index(drop=True)
    return model.score(X_eval, y_true), y_true, predictions

In [None]:
# run model: 100 independent calls, each using the function's default target
results = [svm(data, features_list) for _ in range(100)]

In [None]:
# classifier score on the held-out test split, one point per run
scores = [run[0] for run in results]
plt.plot(scores)

In [None]:
# predicted values vs true values
##result = svm(data, features_list)
#plt.plot(result[1])
#plt.plot(result[2])
#plt.legend(["y_test", "y_pred"])