In [71]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

width = 12
height = 7
plt.rcParams["figure.figsize"] = (width, height)

import pandas as pd
import numpy as np
import tflscripts
import json
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn import tree

configuration = tflscripts.read_configuration()
df = pd.read_pickle('results.p')

In [72]:
df['key'] = df['label'] + df['features'] + df['source_dataset'] + df['source_device'] + df['target_dataset'] + df['target_device']

df_ = df[['key', 'classifier', 'accuracy_positive']]
results = pd.pivot_table(df_, values=['accuracy_positive'], columns=['classifier'], index=['key']).idxmax(axis=1)

In [73]:
df['results'] = [results[key] for key in df['key']]
df_ = df.drop_duplicates(subset='key', keep='first')
df_['results'] = [r[1] for r in df_['results']]
df_ = df_[['label', 'features', 'type_of_transfer', 'results']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [74]:
df_no_transfer = df_.loc[df_.type_of_transfer == 'No transfer']
df_transfer_within_spaces = df_.loc[df_.type_of_transfer.isin(['Same device type in same place',
       'Same device in different place',
       'Same device type in different place'])]

In [75]:
def test_decision_tree(X_train, X_test, y_train, y_test):
    ppl = Pipeline([
        ('vect', DictVectorizer()),
        ('impute', Imputer()),
        ('clf', tree.DecisionTreeClassifier())
    ])

    ppl.fit(X_train, y_train)
    predicted = ppl.predict(X_test)
    return accuracy_score(y_test, predicted)

# training and testing on data without transfer

X = df_no_transfer[['label', 'features']]
y = df_no_transfer['results']

X = [dict(r.iteritems()) for _, r in X.iterrows()]
y = y.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

accuracy_no_transfer = test_decision_tree(X_train, X_test, y_train, y_test)

# training and testing on data with transfer within spaces

X = df_transfer_within_spaces[['label', 'features']]
y = df_transfer_within_spaces['results']

X = [dict(r.iteritems()) for _, r in X.iterrows()]
y = y.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

accuracy_transfer_within_space = test_decision_tree(X_train, X_test, y_train, y_test)

# training on no transfer and testing on data with transfer within spaces

X_train = df_no_transfer[['label', 'features']]
y_train = df_no_transfer['results']

X_train = [dict(r.iteritems()) for _, r in X_train.iterrows()]
y_train = y_train.values

X_test = df_transfer_within_spaces[['label', 'features']]
y_test = df_transfer_within_spaces['results']

X_test = [dict(r.iteritems()) for _, r in X_test.iterrows()]
y_test = y_test.values

accuracy_train_no_transfer_test_within_space = test_decision_tree(X_train, X_test, y_train, y_test)

print('Predicting best performing classifier')
print('Training and testing on data without transfer', accuracy_no_transfer)
print('Training and testing on transfer within spaces', accuracy_transfer_within_space)
print('Training without transfer and testing on transfer within spaces', accuracy_train_no_transfer_test_within_space)

Predicting best performing classifier
Training and testing on data without transfer 0.741935483871
Training and testing on transfer within spaces 0.716586151369
Training without transfer and testing on transfer within spaces 0.529004789782
