In [None]:
import sys
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor
import evaluation

%matplotlib inline

In [None]:
# Columns selected for the model: three non-temporal columns followed by the
# '-euclidean' aggregation of each temporal metric. The '-manhattan', '-max',
# '-mean' and '-std' aggregations of the same metrics are intentionally not
# enabled; add them to the list to experiment
# (presumably matching columns exist in the CSV - verify before enabling).
temporal_metrics = [
    'temporal_difference',
    'temporal_cross_correlation',
    'temporal_dct',
    'temporal_canny',
    'temporal_gaussian',
    'temporal_histogram_distance',
]

feat_labels = ['dimension', 'size', 'fps'] + [
    metric + '-euclidean' for metric in temporal_metrics
]


# CSV with the metrics, relative to this notebook
path = '../../machine_learning/cloud_functions/data-large.csv'
# Maximum number of rows kept for this experiment
N = 10000

# NOTE(review): later cells drop 'title', 'attack' and 'attack_ID' from
# df[feat_labels], which suggests MetricProcessor extends feat_labels in
# place - confirm against metric_processor.py
metric_processor = MetricProcessor(feat_labels, 'UL', path, reduced=True)

# Read + process, then keep only the first N rows
df = metric_processor.read_and_process_data()[:N]
df.shape

In [None]:
# Preview the first rows of the processed (and truncated) frame
df.head()

In [None]:
# Show which columns are about to be selected from df
print(feat_labels)

# Identifier / label columns that must never be used as model inputs
non_feature_cols = ['title', 'attack', 'attack_ID']

# Create X from the features.
# errors='ignore' lets the drop succeed whether or not feat_labels currently
# contains the non-feature columns, so this cell no longer raises KeyError
# when feat_labels has not been extended with them elsewhere.
X = df[feat_labels].drop(non_feature_cols, axis=1, errors='ignore').values

# Create y from output
y = df['attack_ID'].values

In [None]:
# View the first five rows of the feature matrix
X[0:5]

In [None]:
# View the target data (values of the 'attack_ID' column)
y

In [None]:
# Hold out 40% of the rows for testing and train on the remaining 60%.
# random_state is fixed so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [None]:
# Create a random forest classifier
# (10000 trees, all cores, fixed seed; verbose=1 makes fitting report progress)
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1, verbose=1)

# Train the classifier
clf.fit(X_train, y_train)

# feat_labels can also hold the identifier / label columns that are dropped
# when X is built. Filter them out before pairing names with importances so a
# name can never be matched with the importance of a different column.
model_feature_names = [
    name for name in feat_labels
    if name not in ('title', 'attack', 'attack_ID')
]

# Print the name and gini importance of each feature
for feature in zip(model_feature_names, clf.feature_importances_):
    print(feature)

In [None]:
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of at least 0.05 (the threshold passed below)
sfm = SelectFromModel(clf, threshold=0.05)

# Train the selector
# NOTE(review): without prefit=True this presumably refits a copy of clf on the
# training data instead of reusing the already-fitted forest - confirm
sfm.fit(X_train, y_train)

In [None]:
# Names of the model inputs, taken from the same column selection used to
# build X. errors='ignore' keeps this working whether or not feat_labels
# currently contains the identifier / label columns.
feature_names = list(
    df[feat_labels].drop(['title', 'attack', 'attack_ID'], axis=1, errors='ignore')
)

# One row per feature: its importance in the fitted forest and its name.
# Built in a single constructor call; `columns` pins the column order.
features_df = pd.DataFrame(
    {'importance': clf.feature_importances_, 'feature_name': feature_names},
    columns=['importance', 'feature_name'],
)

# Most important features first
features_df.sort_values(by=['importance'], ascending=False)

In [None]:
# Transform the data to create a new dataset containing only the features kept by the selector
# Note: the same transform must be applied to both the training X and the test X,
# so that both matrices end up with the same columns.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [None]:
# Second random forest, fitted only on the columns kept by the selector.
# Same tree count, seed and parallelism as the full model (no verbose output).
clf_important = RandomForestClassifier(
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)

# Fit on the reduced training matrix
clf_important.fit(X_important_train, y_train)

In [None]:
# Apply the full-feature classifier (trained on every column of X) to the test data
y_pred = clf.predict(X_test)

# View the accuracy of the full-feature model on the held-out test set
accuracy_score(y_test, y_pred)

In [None]:
# Apply the limited-feature classifier to the test data reduced to the selected features
y_important_pred = clf_important.predict(X_important_test)

# View the accuracy of the limited-feature model
# (only the features kept by the selector at threshold 0.05)
accuracy_score(y_test, y_important_pred)