In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import keras
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = np.asarray(df[name], dtype = np.float).mean()

    if sd is None:
        sd = np.asarray(df[name], dtype = np.float).std()

    df[name] = (np.asarray(df[name], dtype = np.float) - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [4]:

# This file is a CSV, just no CSV extension or headers

df_ = pd.read_csv("./train_50%_v4.csv")
df_test = pd.read_csv("./test_50%_v4.csv")
print("Read df_ {} rows.".format(len(df_)))
print("Read df_test {} rows.".format(len(df_test)))
#print("Read {} rows.".format(len(df1)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
df_ = df_.drop([df_.columns[0], df_.columns[8], df_.columns[9], df_.columns[10]], axis=1)
df_test = df_test.drop([df_test.columns[0], df_test.columns[8], df_test.columns[9], df_test.columns[10]], axis=1)

df_.dropna(inplace=True,axis=1) # For now, just drop NA's (rows with missing values)
df_test.dropna(inplace=True,axis=1)

Read df_ 999 rows.
Read df_test 999 rows.


In [5]:
df_.head(5)

Unnamed: 0,avg(pkt_len),stddev(pkt_len),fb_ratio,inter_arrival_time,pkt_count,duration,is_tcp,outcome
0,73.2,20.571825,1.859375,0.017486,5,0.069943,1,download
1,70.92381,11.51228,-1.0,0.002947,105,0.306517,1,game
2,193.84,364.80996,0.429077,0.0075,25,0.179999,1,download
3,347.0,488.311807,0.454012,0.266814,20,5.069462,1,voip
4,195.933333,371.472834,0.271744,0.093039,15,1.30254,1,voip


In [6]:
df_test.head(5)

Unnamed: 0,avg(pkt_len),stddev(pkt_len),fb_ratio,inter_arrival_time,pkt_count,duration,is_tcp,outcome
0,95.0,70.710678,0.310345,0.22253,2,0.22253,0,streaming
1,355.071429,519.826105,0.192086,1.134777,28,30.638989,1,game
2,73.2,20.571825,1.859375,0.000913,5,0.003653,1,streaming
3,195.933333,371.472834,0.271744,0.012241,15,0.171377,1,voip
4,1352.0,0.0,-1.0,9.3e-05,775,0.071769,1,download


In [7]:
encode_numeric_zscore(df_, 'fb_ratio')
encode_numeric_zscore(df_, 'pkt_count')
encode_numeric_zscore(df_, 'inter_arrival_time')
encode_numeric_zscore(df_, 'stddev(pkt_len)')
encode_numeric_zscore(df_, 'avg(pkt_len)')
# encode_numeric_zscore(df_, 'pkt_len')
encode_numeric_zscore(df_, 'duration')
encode_text_index(df_, 'outcome')

encode_numeric_zscore(df_test, 'fb_ratio')
encode_numeric_zscore(df_test, 'pkt_count')
encode_numeric_zscore(df_test, 'inter_arrival_time')
encode_numeric_zscore(df_test, 'stddev(pkt_len)')
encode_numeric_zscore(df_test, 'avg(pkt_len)')
# encode_numeric_zscore(df_test, 'pkt_len')
encode_numeric_zscore(df_test, 'duration')
encode_text_index(df_test, 'outcome')

array(['download', 'game', 'streaming', 'voip'], dtype=object)

In [8]:
x_train, y_train = to_xy(df_, 'outcome')
x_test, y_test = to_xy(df_test, 'outcome')
# x, y = to_xy(df_, ' Label')

In [9]:
y_test_eval = np.argmax(y_test,axis=1)

In [10]:
y_train_eval = np.argmax(y_train,axis=1)

In [11]:
df_.head(5)

Unnamed: 0,avg(pkt_len),stddev(pkt_len),fb_ratio,inter_arrival_time,pkt_count,duration,is_tcp,outcome
0,-0.842605,-0.857108,0.297072,-0.09354,-0.381792,-0.211227,1,0
1,-0.847032,-0.900791,-0.252081,-0.094069,-0.273562,-0.209695,1,1
2,-0.607948,0.802716,0.022378,-0.093903,-0.360146,-0.210514,1,0
3,-0.310037,1.39821,0.027167,-0.084476,-0.365558,-0.178858,1,3
4,-0.603877,0.834843,-0.007838,-0.090794,-0.370969,-0.203246,1,3


In [43]:
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=3)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(x_train, y_train_eval)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=3, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [44]:
y_train_pred = clf.predict(x_train)
print("training score: {}".format(metrics.accuracy_score(y_train_eval, y_train_pred)))

training score: 0.990990990990991


In [45]:
y_pred = clf.predict(x_test)

In [46]:
# y_pred = np.argmax(y_pred,axis=1)

In [47]:
score = metrics.accuracy_score(y_test_eval, y_pred)
print("Validation score: {}".format(score))

Validation score: 0.43843843843843844


In [48]:
y_prob_test = clf.predict_proba(x_test)[0:3]

In [49]:
from sklearn.metrics import classification_report,confusion_matrix
Cm_rf = confusion_matrix(y_test_eval,y_pred)
C_rf = np.sum(Cm_rf)
Cm_rf = Cm_rf/C_rf
print('Random Forest Confusion Matrix:')
print(np.array_str(Cm_rf, precision=4, suppress_small=True))

Random Forest Confusion Matrix:
[[ 0.1552  0.028   0.0641  0.001 ]
 [ 0.027   0.1852  0.0501  0.005 ]
 [ 0.0521  0.1081  0.0701  0.006 ]
 [ 0.      0.0821  0.1381  0.028 ]]


In [19]:
print("Random Forest")
# print(Cm[0])
nd = [None]*4
for i,nd in enumerate (Cm_rf):
    for j in range (len(nd)):
        
        print(i,j,"%0.2f"%(nd[j]/np.sum(nd)))
    print("---")

Random Forest
0 0 0.58
0 1 0.08
0 2 0.32
0 3 0.02
---
1 0 0.01
1 1 0.66
1 2 0.31
1 3 0.01
---
2 0 0.21
2 1 0.31
2 2 0.44
2 3 0.04
---
3 0 0.00
3 1 0.21
3 2 0.73
3 3 0.06
---


In [20]:
from sklearn import svm

In [21]:
clf_svm = svm.SVC()

In [22]:
clf_svm.fit(x_train, y_train_eval)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
y_pred = clf_svm.predict(x_test)

In [24]:
acc = metrics.accuracy_score(y_test_eval, y_pred)
print(acc)

0.633633633634


In [25]:
y_train_pred = clf_svm.predict(x_train)
print("training score: {}".format(metrics.accuracy_score(y_train_eval, y_train_pred)))

training score: 0.6696696696696697


In [26]:
from sklearn.metrics import classification_report,confusion_matrix
Cm_svm = confusion_matrix(y_test_eval,y_pred)
C_svm = np.sum(Cm_svm)
Cm_svm = Cm_svm/C_svm
print('SVM Confusion Matrix:')
print(np.array_str(Cm_svm, precision=4, suppress_small=True))

SVM Confusion Matrix:
[[ 0.1441  0.023   0.015   0.0661]
 [ 0.002   0.1792  0.046   0.04  ]
 [ 0.028   0.0541  0.1331  0.021 ]
 [ 0.001   0.043   0.027   0.1772]]


In [27]:
print("SVM")
# print(Cm[0])
nd = [None]*4
for i,nd in enumerate (Cm_svm):
    for j in range (len(nd)):
        
        print(i,j,"%0.2f"%(nd[j]/np.sum(nd)))
    print("---")

SVM
0 0 0.58
0 1 0.09
0 2 0.06
0 3 0.27
---
1 0 0.01
1 1 0.67
1 2 0.17
1 3 0.15
---
2 0 0.12
2 1 0.23
2 2 0.56
2 3 0.09
---
3 0 0.00
3 1 0.17
3 2 0.11
3 3 0.71
---
