In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

width = 12
height = 7
plt.rcParams["figure.figsize"] = (width, height)

from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import tflscripts
import json
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn import tree
import pickle
from sklearn.metrics import confusion_matrix

configuration = tflscripts.read_configuration()
df = pd.read_pickle('results_filtered.p')

In [2]:
without_transfer = df.loc[df.type_of_transfer == 'No transfer']

def apply_accuracy_without_transfer(x):
    queried = without_transfer.query('source_device_name == "{}" & source_dataset == "{}" & features == "{}" & label == "{}" & classifier == "{}"'.format(
        x['source_device_name'],
        x['source_dataset'],
        x['features'],
        x['label'],
        x['classifier']
    ))

    if len(queried) > 0:
        return queried['accuracy_positive'].iloc[0]

    return -1

df['accuracy_without_transfer'] = df.apply(apply_accuracy_without_transfer, axis=1)

In [3]:
df['accuracy_without_transfer_r'] = df['accuracy_without_transfer'].round(1)
df['accuracy_negative_r'] = df['accuracy_negative'].round(1)

In [4]:
def to_accuracy_bin(acc):
    bins = [
        [0.0, 0.75],
        [0.75, 1.0]
    ]
    return [i for i, b in enumerate(bins) if b[0] <= acc and b[1] >= acc][0]

df['accuracy_bin'] = [to_accuracy_bin(a) for a in df['accuracy_positive']]

In [5]:
filtered = df.loc[df.type_of_transfer != 'No transfer']

test_split = 0.33
msk = np.random.rand(len(filtered)) <= test_split
train = filtered[msk]
test = filtered[~msk]

In [6]:
columns = [
    'features',
    'classifier',
    'label',
    'samples',
    'type_of_transfer',
    'accuracy_without_transfer_r',
    'accuracy_negative_r'
]

def apply_avg(x):
    queried = df.query('features == "{}" & classifier == "{}" & label == "{}" & samples == {} & type_of_transfer == "{}" & accuracy_without_transfer_r == "{}" & accuracy_negative_r == "{}"'.format(
        x['features'],
        x['classifier'],
        x['label'],
        x['samples'],
        x['type_of_transfer'],
        x['accuracy_without_transfer_r'],
        x['accuracy_negative_r']
    ))

    if len(queried) > 0:
        median = queried['accuracy_positive'].median()
        return median

    return -1

aggregated = train[columns]
aggregated = aggregated.drop_duplicates()
aggregated['accuracy_positive'] = aggregated.apply(apply_avg, axis=1)
aggregated['accuracy_bin'] = [to_accuracy_bin(a) for a in aggregated['accuracy_positive']]

In [7]:
def to_x_and_y(filtered):
    X = filtered[columns]
    y = filtered['accuracy_bin']

    X = [dict(r.iteritems()) for _, r in X.iterrows()]
    y = y.values

    return X, y

X_train, y_train = to_x_and_y(aggregated)
X_test, y_test = to_x_and_y(test)

ppl = Pipeline([
    ('vect', DictVectorizer()),
    ('impute', Imputer()),
    ('clf', RandomForestClassifier())
])

ppl.fit(X_train, y_train)
predicted = ppl.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print(accuracy)

0.937396469983


In [8]:
confusion_matrix(y_test, predicted)

array([[15208,   262],
       [  834,  1203]])

In [9]:
predictions = pd.Series(predicted)
predictions.index = test.index
df['predictions'] = predictions

df.to_pickle('results_with_accuracy_classified.p')

In [10]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.95      0.98      0.97     15470
          1       0.82      0.59      0.69      2037

avg / total       0.93      0.94      0.93     17507

