# Visual Data

## Recreation of Table 2
Since every .csv file contains two rows for each movie, we will try different aggregation method in order to recreacte table 2 from the paper. We will later use the same aggregation method to recreate table 1 using Weka.

In [1]:
# suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import pandas as pd
import numpy as np
import csv
import glob
import random
import ntpath
import os

import sklearn
from sklearn import preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

## Load Training Data

In [2]:
def load_visuals_data(path, agg = {'none', 'sum', 'mean', 'max', 'min', 'first', 'last'}):
    """ 
    Load all visuals data files and combine them into a single Pandas DataFrame.
    
    Parameter agg:
    'none': keep both rows as single row
    'sum': take the columnwise sum of the rows
    'mean': take the columnwise mean of the rows
    'max': take the columnwise max of the rows
    'min': take the columnwise min of the rows
    'first': keep only the first row
    'last': keep only the last row
    
    Returns
    --------
    visuals_data: data frame containing the visuals data
    """
    
    # create a list of all csv files
    all_files = glob.glob(path + "/*.csv")

    data = []
    df = pd.DataFrame()

    if agg is 'none':
        for filename in all_files:
            li = pd.read_csv(filename, index_col = None, header = None)
            df = pd.DataFrame(li.values.flatten()).transpose()
            #df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df['file_name'] = ntpath.basename(filename).split('.', 1)[0]
            data.append(df)
    
    elif agg is 'sum':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df.loc['file_name',:]= df.sum(axis=0)
            #df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df['file_name'] = ntpath.basename(filename).split('.', 1)[0]
            df = pd.DataFrame(df.drop([0,1]))
            data.append(df)
            
    elif agg is 'mean':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df.loc['file_name',:]= df.mean(axis=0)
            #df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df['file_name'] = ntpath.basename(filename).split('.', 1)[0]
            df = pd.DataFrame(df.drop([0,1]))
            data.append(df)
       
    elif agg is 'max':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df.loc['file_name',:]= df.max(axis=0)
            #df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df['file_name'] = ntpath.basename(filename).split('.', 1)[0]
            df = pd.DataFrame(df.drop([0,1]))
            data.append(df)
            
    elif agg is 'min':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df.loc['file_name',:]= df.min(axis=0)
            #df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df['file_name'] = ntpath.basename(filename).split('.', 1)[0]
            df = pd.DataFrame(df.drop([0,1]))
            data.append(df)
            
    elif agg is 'first':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            #df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df['file_name'] = ntpath.basename(filename).split('.', 1)[0]
            df = pd.DataFrame(df.drop([1]))
            data.append(df)
            
    elif agg is 'last':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            #df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df['file_name'] = ntpath.basename(filename).split('.', 1)[0]
            df = pd.DataFrame(df.drop([0]))
            data.append(df)
            
    else:
        print('Parameter \'agg\' needs to be one of the following strings:' )
        print('\'none\', \'sum\', \'mean\', \'max\', \'min\', \'first\', \'last\'')
        return
            

    visuals_data = pd.concat(data, axis=0, ignore_index=True)
    visuals_data.set_index(['file_name'], inplace = True)
    
    return visuals_data

# load visuals files
path_vis_train = r'./data/Dev_Set/vis_descriptors'

data_visuals = load_visuals_data(path_vis_train, agg = 'none')
data_visuals.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Words_and_Pictures,0.26733,0.31877,0.30814,0.25255,0.30517,0.32915,0.33434,0.33807,0.27418,0.24486,...,767.16,477.38,22.941,19.609,17.061,19.442,132420.0,19124.0,36269.0,19072.0
Welcome_to_Me,0.30717,0.33422,0.33112,0.33124,0.31114,0.33644,0.33616,0.34479,0.16983,0.27379,...,397.26,168.23,20.426,21.3,18.608,21.182,83171.0,13714.0,32774.0,13780.0
Wild_Tales,0.30768,0.3376,0.34787,0.33882,0.3162,0.33023,0.33942,0.36068,0.11968,0.2602,...,368.64,177.21,22.276,22.18,19.554,21.984,83572.0,14376.0,30306.0,14468.0
V_for_Vendetta,0.038715,0.13579,0.15695,0.052015,0.1034,0.24479,0.32119,0.080424,0.11021,0.18988,...,6684.4,2339.7,0.40354,0.49571,0.38531,0.48153,10770.0,3543.4,9500.8,3256.9
Transformers__Age_of_Extinction,0.19996,0.26934,0.27986,0.23725,0.30844,0.33242,0.32998,0.325,0.30735,0.33431,...,1120.6,669.56,15.086,14.7,14.859,14.723,211630.0,24019.0,48339.0,24090.0


In [3]:
data_visuals.shape

(95, 1652)

In [4]:
def load_train_test_data(path):
    """ 
    Load all training or testing data files and combine them into a single Pandas DataFrame.
    
    Returns
    --------
    train_test_data: data frame containing the training or testing data
    """
    
    train_test_data = pd.read_csv(path, index_col = None, header = 0)
    train_test_data = train_test_data.drop("movie_name", axis=1)
    train_test_data.set_index(['file_name'], inplace = True)
    
    return train_test_data


# load training data
path_train = r'./data/Dev_Set/CoeDevelopmentTrainingdata.csv'

data_training = load_train_test_data(path_train)
data_training.head()

Unnamed: 0_level_0,goodforairplanes
file_name,Unnamed: 1_level_1
Seventh_Son,1
2_States,0
Welcome_to_Me,0
The_Judge,0
Transformers__Age_of_Extinction,0


In [5]:
# merge training data with visuals data
train = pd.merge(data_visuals, data_training, left_index = True, right_index=True)
train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1643,1644,1645,1646,1647,1648,1649,1650,1651,goodforairplanes
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Words_and_Pictures,0.26733,0.31877,0.30814,0.25255,0.30517,0.32915,0.33434,0.33807,0.27418,0.24486,...,477.38,22.941,19.609,17.061,19.442,132420.0,19124.0,36269.0,19072.0,1
Welcome_to_Me,0.30717,0.33422,0.33112,0.33124,0.31114,0.33644,0.33616,0.34479,0.16983,0.27379,...,168.23,20.426,21.3,18.608,21.182,83171.0,13714.0,32774.0,13780.0,0
Wild_Tales,0.30768,0.3376,0.34787,0.33882,0.3162,0.33023,0.33942,0.36068,0.11968,0.2602,...,177.21,22.276,22.18,19.554,21.984,83572.0,14376.0,30306.0,14468.0,0
V_for_Vendetta,0.038715,0.13579,0.15695,0.052015,0.1034,0.24479,0.32119,0.080424,0.11021,0.18988,...,2339.7,0.40354,0.49571,0.38531,0.48153,10770.0,3543.4,9500.8,3256.9,0
Transformers__Age_of_Extinction,0.19996,0.26934,0.27986,0.23725,0.30844,0.33242,0.32998,0.325,0.30735,0.33431,...,669.56,15.086,14.7,14.859,14.723,211630.0,24019.0,48339.0,24090.0,0


In [6]:
train.shape

(93, 1653)

## Load Testing Data

In [7]:
# load visuals data for testing
path_vis_test = r'./data/Test_Set/vis_descriptors'

data_visuals_test = load_visuals_data(path_vis_test, agg = 'none')
# rename first index
data_visuals_test = data_visuals_test.reset_index()
data_visuals_test['file_name'].iloc[0] = "10.000_Km"
data_visuals_test.set_index(['file_name'], inplace = True)

data_visuals_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.000_Km,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,518400.0,269700.0,6.1035e-07,0.001645,2e-06,0.001645,1638400.0,269700.0,518400.0,269700.0
12_Years_a_Slave,0.0,0.193,0.19299,0.0,0.0,0.2711,0.2711,0.0,0.0,0.27024,...,30830.0,15454.0,15.294,13.422,13.418,13.307,71633.0,21855.0,48314.0,24434.0
21_Jump_Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,230400.0,119790.0,1.3776e-06,0.00247,4e-06,0.00247,725900.0,119790.0,230400.0,119790.0
2_States,0.033181,0.08976,0.073469,0.012184,0.10137,0.28969,0.26856,0.02602,0.071337,0.3302,...,629.31,476.71,4.4431,3.5334,2.7159,3.7869,203660.0,20777.0,29696.0,19740.0
A_Bug's_Life,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,81.909,43.269,2.9645,3.7378,2.4635,3.9526,4444.8,2405.4,5602.6,2611.1


In [8]:
# load testing data
path_test = r'./data/CoeTestLabels.csv'

data_testing = load_train_test_data(path_test)
data_testing.head()

Unnamed: 0_level_0,goodforairplanes
file_name,Unnamed: 1_level_1
10.000_Km,1
12_Years_a_Slave,1
2_States,1
21_Jump_Street,1
A_Bug's_Life,1


In [9]:
# merge testing data with visuals data
test = pd.merge(data_visuals_test, data_testing, left_index = True, right_index=True)
test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1643,1644,1645,1646,1647,1648,1649,1650,1651,goodforairplanes
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.000_Km,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,269700.0,6.1035e-07,0.001645,2e-06,0.001645,1638400.0,269700.0,518400.0,269700.0,1
12_Years_a_Slave,0.0,0.193,0.19299,0.0,0.0,0.2711,0.2711,0.0,0.0,0.27024,...,15454.0,15.294,13.422,13.418,13.307,71633.0,21855.0,48314.0,24434.0,1
21_Jump_Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,119790.0,1.3776e-06,0.00247,4e-06,0.00247,725900.0,119790.0,230400.0,119790.0,1
2_States,0.033181,0.08976,0.073469,0.012184,0.10137,0.28969,0.26856,0.02602,0.071337,0.3302,...,476.71,4.4431,3.5334,2.7159,3.7869,203660.0,20777.0,29696.0,19740.0,1
A_Bug's_Life,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,43.269,2.9645,3.7378,2.4635,3.9526,4444.8,2405.4,5602.6,2611.1,1


# Prediction Models

In [10]:
def Classifier(tX, ty, vX, vy, CV = 10,
              classifier = {'knn', 'decision_tree', 'logisticregression', 'svm',
                            'random_forest', 'adaboost', 'gradient_boost'}, test = False):
    
    if classifier is 'knn':
        clf = KNeighborsClassifier()
        
    elif classifier is 'decision_tree':
        clf = DecisionTreeClassifier()
        
    elif classifier is 'logisticregression':
        clf = LogisticRegression()
        
    elif classifier is 'svm':
        clf = SVC(kernel = 'rbf')
        
    elif classifier is 'random_forest':
        clf = RandomForestClassifier()
        
    elif classifier is 'adaboost':
        clf = AdaBoostClassifier()
        
    elif classifier is 'gradient_boost':
        clf = GradientBoostingClassifier()
        
    else:
        print('Parameter \'classifier\' needs to be one of the following strings:' )
        print('\'knn\', \'decision_tree\', \'logisticregression\', \'svm\', \'random_forest\', \'adaboost\', \'gradient_boost\'')
        return

    
    if test:
        clf.fit(tX, ty)

        # test set
        predictions = np.array(clf.predict(vX))

        return predictions
    
    else:
        # CV
        precision = cross_val_score(clf, tX, ty, cv = CV, scoring = 'precision')
        recall = cross_val_score(clf, tX, ty, cv = CV, scoring = 'recall')
        f1 = cross_val_score(clf, tX, ty, cv = CV, scoring = 'f1')

        # predictions
        predictions = cross_val_predict(clf, tX, ty, cv = CV)

        scores = dict()
        scores['Precision'] = np.mean(precision)
        scores['Recall'] = np.mean(recall)
        scores['F1'] = np.mean(f1)
        scores['Predictions'] = np.array(predictions)
    
        return scores

# Las Vegas Wrapper - Feature Selection

In [11]:
def LVW(tX, ty, vX, vy, K, original_features,
       classifier = {'knn', 'decision_tree', 'logisticregression', 'svm',
                     'random_forest', 'adaboost', 'gradient_boost'}):
    
    
    acc = 0
    k = 0
    C = len(original_features)
    
    while k < K:
        #print('k: ', k)
        ran_choice = range(1,len(original_features)-1)
        S1 = random.sample(original_features, random.choice(ran_choice))
        C1 = len(S1)
        
        x_train = tX[tX.columns.intersection(S1)]
        x_test = vX[vX.columns.intersection(S1)]
        
        acc1 = Classifier(x_train, ty, x_test, vy, 10, classifier)['F1']
        
        if (acc1 > acc) or (acc1 == acc and C1 < C):
            k = 0
            acc = acc1
            C = C1
            S = S1
        
        else:
            k += 1
            
    return S

# Load Data and Perform Train/Test Split

In [12]:
def load_train_test_split(classifier = {'knn', 'decision_tree', 'logisticregression', 'svm',
                                        'random_forest', 'adaboost', 'gradient_boost'},
                          agg = {'none', 'sum', 'mean', 'max', 'min', 'first', 'last'}):   

    
    # laod visuals data
    data_visuals = load_visuals_data(path_vis_train, agg = agg)
    
    # merge training data with visuals data
    train = pd.merge(data_visuals, data_training, left_index = True, right_index=True)

    # same with testing set
    data_visuals_test = load_visuals_data(path_vis_test, agg = agg)

    # merge training data with visuals data
    test = pd.merge(data_visuals_test, data_testing, left_index = True, right_index=True)
    
    # train/test split
    X_train = train.iloc[:, :-1]
    Y_train = train.iloc[:,-1]
    
    X_test = test.iloc[:, :-1]
    Y_test = test.iloc[:,-1]
       
    # LVW
    if classifier is not 'random_forest':
        
        features = LVW(X_train, Y_train, X_test, Y_test, 5, range(0, X_train.shape[1]-1), classifier)

        # train/test split
        X_train = X_train[X_train.columns.intersection(features)]
        X_test = X_test[X_test.columns.intersection(features)]
  
        return X_train, Y_train, X_test, Y_test, features

    else:
        return X_train, Y_train, X_test, Y_test

## KNN
Perform Feature Selection (LVW) for all models

In [13]:
# set seed
np.random.seed(1)

# Keep both Rows
tX, ty, vX, vy, knn_features = load_train_test_split(classifier = 'knn', agg = 'none')
knn_scores = Classifier(tX, ty, vX, vy, classifier = 'knn')

# Sum
tX_sum, ty_sum, vX_sum, vy_sum, knn_features_sum = load_train_test_split(classifier = 'knn', agg = 'sum')
knn_scores_sum = Classifier(tX_sum, ty_sum, vX_sum, vy_sum, classifier = 'knn')

# Mean
tX_mean, ty_mean, vX_mean, vy_mean, knn_features_mean = load_train_test_split(classifier = 'knn', agg = 'mean')
knn_scores_mean = Classifier(tX_mean, ty_mean, vX_mean, vy_mean, classifier = 'knn')

# Max
tX_max, ty_max, vX_max, vy_max, knn_features_max = load_train_test_split(classifier = 'knn', agg = 'max')
knn_scores_max = Classifier(tX_max, ty_max, vX_max, vy_max, classifier = 'knn')

# Min
tX_min, ty_min, vX_min, vy_min, knn_features_min = load_train_test_split(classifier = 'knn', agg = 'min')
knn_scores_min = Classifier(tX_min, ty_min, vX_min, vy_min, classifier = 'knn')

# Keep only the first row
tX_first, ty_first, vX_first, vy_first, knn_features_first = load_train_test_split(classifier = 'knn', agg = 'first')
knn_scores_first = Classifier(tX_first, ty_first, vX_first, vy_first, classifier = 'knn')

# Keep only the second row
tX_last, ty_last, vX_last, vy_last, knn_features_last = load_train_test_split(classifier = 'knn', agg = 'last')
knn_scores_last = Classifier(tX_last, ty_last, vX_last, vy_last, classifier = 'knn')

#### Print Precision, Recall, F1

In [14]:
def print_scores(scores_dict):
    
    print("Precision: ", scores_dict["Precision"])
    print("Recall: ", scores_dict["Recall"])
    print("F1: ", scores_dict["F1"])
    print("----------------------------------------------------------------")

In [15]:
print("KNN using LVW Feature Selection")
print("----------------------------------------------------------------")

# none
print("KNN - Keep both rows")
print_scores(knn_scores)

# sum
print("KNN - Keep only columnwise sum")
print_scores(knn_scores_sum)

# mean
print("KNN - Keep only columnwise mean")
print_scores(knn_scores_mean)

# max
print("KNN - Keep only columnwise max")
print_scores(knn_scores_max)

# min
print("KNN - Keep only columnwise min")
print_scores(knn_scores_min)

# first
print("KNN - Keep only the first row")
print_scores(knn_scores_first)

# last
print("KNN - Keep only the last row")
print_scores(knn_scores_last)

KNN using LVW Feature Selection
----------------------------------------------------------------
KNN - Keep both rows
Precision:  0.5825396825396826
Recall:  0.8400000000000001
F1:  0.6816683316683316
----------------------------------------------------------------
KNN - Keep only columnwise sum
Precision:  0.596984126984127
Recall:  0.7
F1:  0.6383982683982683
----------------------------------------------------------------
KNN - Keep only columnwise mean
Precision:  0.6725
Recall:  0.7
F1:  0.6758119658119657
----------------------------------------------------------------
KNN - Keep only columnwise max
Precision:  0.5776984126984127
Recall:  0.7
F1:  0.6275901875901876
----------------------------------------------------------------
KNN - Keep only columnwise min
Precision:  0.6634523809523809
Recall:  0.7000000000000001
F1:  0.6601265401265402
----------------------------------------------------------------
KNN - Keep only the first row
Precision:  0.5699603174603174
Recall:  0.74


## Decision Tree
Perform Feature Selection (LVW) for all models

In [16]:
# set seed
np.random.seed(1)

# Keep both Rows
tX, ty, vX, vy, tree_features = load_train_test_split(classifier = 'decision_tree', agg = 'none')
tree_scores = Classifier(tX, ty, vX, vy, classifier = 'decision_tree')

# Sum
tX_sum, ty_sum, vX_sum, vy_sum, tree_features_sum = load_train_test_split(classifier = 'decision_tree', agg = 'sum')
tree_scores_sum = Classifier(tX_sum, ty_sum, vX_sum, vy_sum, classifier = 'decision_tree')

# Mean
tX_mean, ty_mean, vX_mean, vy_mean, tree_features_mean = load_train_test_split(classifier = 'decision_tree', agg = 'mean')
tree_scores_mean = Classifier(tX_mean, ty_mean, vX_mean, vy_mean, classifier = 'decision_tree')

# Max
tX_max, ty_max, vX_max, vy_max, tree_features_max = load_train_test_split(classifier = 'decision_tree', agg = 'max')
tree_scores_max = Classifier(tX_max, ty_max, vX_max, vy_max, classifier = 'decision_tree')

# Min
tX_min, ty_min, vX_min, vy_min, tree_features_min = load_train_test_split(classifier = 'decision_tree', agg = 'min')
tree_scores_min = Classifier(tX_min, ty_min, vX_min, vy_min, classifier = 'decision_tree')

# Keep only the first row
tX_first, ty_first, vX_first, vy_first, tree_features_first = load_train_test_split(classifier = 'decision_tree', agg = 'first')
tree_scores_first = Classifier(tX_first, ty_first, vX_first, vy_first, classifier = 'decision_tree')

# Keep only the second row
tX_last, ty_last, vX_last, vy_last, tree_features_last = load_train_test_split(classifier = 'decision_tree', agg = 'last')
tree_scores_last = Classifier(tX_last, ty_last, vX_last, vy_last, classifier = 'decision_tree')

#### Print Precision, Recall, F1

In [17]:
print("Decision Tree using LVW Feature Selection")
print("----------------------------------------------------------------")

# none
print("Decision Tree - Keep both rows")
print_scores(tree_scores)

# sum
print("Decision Tree - Keep only columnwise sum")
print_scores(tree_scores_sum)

# mean
print("Decision Tree - Keep only columnwise mean")
print_scores(tree_scores_mean)

# max
print("Decision Tree - Keep only columnwise max")
print_scores(tree_scores_max)

# last
print("Decision Tree - Keep only columnwise min")
print_scores(tree_scores_min)

# first
print("Decision Tree - Keep only the first row")
print_scores(tree_scores_first)

# last
print("Decision Tree - Keep only the last row")
print_scores(tree_scores_last)

Decision Tree using LVW Feature Selection
----------------------------------------------------------------
Decision Tree - Keep both rows
Precision:  0.6144047619047619
Recall:  0.6199999999999999
F1:  0.6193073593073593
----------------------------------------------------------------
Decision Tree - Keep only columnwise sum
Precision:  0.6059523809523809
Recall:  0.64
F1:  0.5993939393939394
----------------------------------------------------------------
Decision Tree - Keep only columnwise mean
Precision:  0.696031746031746
Recall:  0.6399999999999999
F1:  0.6314285714285715
----------------------------------------------------------------
Decision Tree - Keep only columnwise max
Precision:  0.638968253968254
Recall:  0.72
F1:  0.6624875124875125
----------------------------------------------------------------
Decision Tree - Keep only columnwise min
Precision:  0.6465079365079365
Recall:  0.64
F1:  0.6401082251082252
----------------------------------------------------------------
D

## Logistic Regression
Perform Feature Selection (LVW) for all models

In [18]:
# set seed
np.random.seed(1)

# Keep both Rows
tX, ty, vX, vy, reg_features = load_train_test_split(classifier = 'logisticregression', agg = 'none')
reg_scores = Classifier(tX, ty, vX, vy, classifier = 'logisticregression')

# Sum
tX_sum, ty_sum, vX_sum, vy_sum, reg_features_sum = load_train_test_split(classifier = 'logisticregression', agg = 'sum')
reg_scores_sum = Classifier(tX_sum, ty_sum, vX_sum, vy_sum, classifier = 'logisticregression')

# Mean
tX_mean, ty_mean, vX_mean, vy_mean, reg_features_mean = load_train_test_split(classifier = 'logisticregression', agg = 'mean')
reg_scores_mean = Classifier(tX_mean, ty_mean, vX_mean, vy_mean, classifier = 'logisticregression')

# Max
tX_max, ty_max, vX_max, vy_max, reg_features_max = load_train_test_split(classifier = 'logisticregression', agg = 'max')
reg_scores_max = Classifier(tX_max, ty_max, vX_max, vy_max, classifier = 'logisticregression')

# Min
tX_min, ty_min, vX_min, vy_min, reg_features_min = load_train_test_split(classifier = 'logisticregression', agg = 'min')
reg_scores_min = Classifier(tX_min, ty_min, vX_min, vy_min, classifier = 'logisticregression')

# Keep only the first row
tX_first, ty_first, vX_first, vy_first, reg_features_first = load_train_test_split(classifier = 'logisticregression', agg = 'first')
reg_scores_first = Classifier(tX_first, ty_first, vX_first, vy_first, classifier = 'logisticregression')

# Keep only the second row
tX_last, ty_last, vX_last, vy_last, reg_features_last = load_train_test_split(classifier = 'logisticregression', agg = 'last')
reg_scores_last = Classifier(tX_last, ty_last, vX_last, vy_last, classifier = 'logisticregression')







  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### Print Precision, Recall, F1

In [19]:
print("Logistic Regression using LVW Feature Selection")
print("----------------------------------------------------------------")

# none
print("Logistic Regression - Keep both rows")
print_scores(reg_scores)

# sum
print("Logistic Regression - Keep only columnwise sum")
print_scores(reg_scores_sum)

# mean
print("Logistic Regression - Keep only columnwise mean")
print_scores(reg_scores_mean)

# max
print("Logistic Regression - Keep only columnwise max")
print_scores(reg_scores_max)

# last
print("Logistic Regression - Keep only columnwise min")
print_scores(reg_scores_min)

# first
print("Logistic Regression - Keep only the first row")
print_scores(reg_scores_first)

# last
print("Logistic Regression - Keep only the last row")
print_scores(reg_scores_last)

Logistic Regression using LVW Feature Selection
----------------------------------------------------------------
Logistic Regression - Keep both rows
Precision:  0.6485714285714286
Recall:  0.72
F1:  0.6739627039627039
----------------------------------------------------------------
Logistic Regression - Keep only columnwise sum
Precision:  0.6185714285714285
Recall:  0.78
F1:  0.6848717948717948
----------------------------------------------------------------
Logistic Regression - Keep only columnwise mean
Precision:  0.5926190476190476
Recall:  0.7599999999999999
F1:  0.6617249417249418
----------------------------------------------------------------
Logistic Regression - Keep only columnwise max
Precision:  0.5333333333333333
Recall:  0.9800000000000001
F1:  0.6901098901098901
----------------------------------------------------------------
Logistic Regression - Keep only columnwise min
Precision:  0.5418650793650793
Recall:  0.8800000000000001
F1:  0.6657342657342656
--------------

## SVM (Gaussian Kernel)
Perform Feature Selection (LVW) for all models

In [20]:
# set seed
np.random.seed(1)

# Keep both Rows
tX, ty, vX, vy, svm_features = load_train_test_split(classifier = 'svm', agg = 'none')
svm_scores = Classifier(tX, ty, vX, vy, classifier = 'svm')

# Sum
tX_sum, ty_sum, vX_sum, vy_sum, svm_features_sum = load_train_test_split(classifier = 'svm', agg = 'sum')
svm_scores_sum = Classifier(tX_sum, ty_sum, vX_sum, vy_sum, classifier = 'svm')

# Mean
tX_mean, ty_mean, vX_mean, vy_mean, svm_features_mean = load_train_test_split(classifier = 'svm', agg = 'mean')
svm_scores_mean = Classifier(tX_mean, ty_mean, vX_mean, vy_mean, classifier = 'svm')

# Max
tX_max, ty_max, vX_max, vy_max, svm_features_max = load_train_test_split(classifier = 'svm', agg = 'max')
svm_scores_max = Classifier(tX_max, ty_max, vX_max, vy_max, classifier = 'svm')

# Min
tX_min, ty_min, vX_min, vy_min, svm_features_min = load_train_test_split(classifier = 'svm', agg = 'min')
svm_scores_min = Classifier(tX_min, ty_min, vX_min, vy_min, classifier = 'svm')

# Keep only the first row
tX_first, ty_first, vX_first, vy_first, svm_features_first = load_train_test_split(classifier = 'svm', agg = 'first')
svm_scores_first = Classifier(tX_first, ty_first, vX_first, vy_first, classifier = 'svm')

# Keep only the second row
tX_last, ty_last, vX_last, vy_last, svm_features_last = load_train_test_split(classifier = 'svm', agg = 'last')
svm_scores_last = Classifier(tX_last, ty_last, vX_last, vy_last, classifier = 'svm')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### Print Precision, Recall, F1

In [21]:
print("SVM using LVW Feature Selection")
print("----------------------------------------------------------------")

# none
print("SVM - Keep both rows")
print_scores(svm_scores)

# sum
print("SVM - Keep only columnwise sum")
print_scores(svm_scores_sum)

# mean
print("SVM - Keep only columnwise mean")
print_scores(svm_scores_mean)

# max
print("SVM - Keep only columnwise max")
print_scores(svm_scores_max)

# min
print("SVM - Keep only columnwise min")
print_scores(svm_scores_min)

# first
print("SVM - Keep only the first row")
print_scores(svm_scores_first)

# last
print("SVM - Keep only the last row")
print_scores(svm_scores_last)

SVM using LVW Feature Selection
----------------------------------------------------------------
SVM - Keep both rows
Precision:  0.5388888888888889
Recall:  1.0
F1:  0.7
----------------------------------------------------------------
SVM - Keep only columnwise sum
Precision:  0.4666666666666666
Recall:  0.64
F1:  0.47619047619047616
----------------------------------------------------------------
SVM - Keep only columnwise mean
Precision:  0.4666666666666666
Recall:  0.64
F1:  0.47619047619047616
----------------------------------------------------------------
SVM - Keep only columnwise max
Precision:  0.4666666666666666
Recall:  0.56
F1:  0.43809523809523804
----------------------------------------------------------------
SVM - Keep only columnwise min
Precision:  0.5388888888888889
Recall:  1.0
F1:  0.7
----------------------------------------------------------------
SVM - Keep only the first row
Precision:  0.5363095238095238
Recall:  0.9199999999999999
F1:  0.6739593739593739
---

## Random Forest
Use all the features and perform feature selection in the model

In [22]:
# set seed
np.random.seed(1)

# Keep both Rows
tX, ty, vX, vy = load_train_test_split(classifier = 'random_forest', agg = 'none')
forest_scores = Classifier(tX, ty, vX, vy, classifier = 'random_forest')

# Sum
tX_sum, ty_sum, vX_sum, vy_sum = load_train_test_split(classifier = 'random_forest', agg = 'sum')
forest_scores_sum = Classifier(tX_sum, ty_sum, vX_sum, vy_sum, classifier = 'random_forest')

# Mean
tX_mean, ty_mean, vX_mean, vy_mean = load_train_test_split(classifier = 'random_forest', agg = 'mean')
forest_scores_mean = Classifier(tX_mean, ty_mean, vX_mean, vy_mean, classifier = 'random_forest')

# Max
tX_max, ty_max, vX_max, vy_max = load_train_test_split(classifier = 'random_forest', agg = 'max')
forest_scores_max = Classifier(tX_max, ty_max, vX_max, vy_max, classifier = 'random_forest')

# Min
tX_min, ty_min, vX_min, vy_min = load_train_test_split(classifier = 'random_forest', agg = 'min')
forest_scores_min = Classifier(tX_min, ty_min, vX_min, vy_min, classifier = 'random_forest')

# Keep only the first row
tX_first, ty_first, vX_first, vy_first = load_train_test_split(classifier = 'random_forest', agg = 'first')
forest_scores_first = Classifier(tX_first, ty_first, vX_first, vy_first, classifier = 'random_forest')

# Keep only the second row
tX_last, ty_last, vX_last, vy_last = load_train_test_split(classifier = 'random_forest', agg = 'last')
forest_scores_last = Classifier(tX_last, ty_last, vX_last, vy_last, classifier = 'random_forest')

#### Print Precision, Recall, F1

In [23]:
print("Random Forest using all features")
print("----------------------------------------------------------------")

# none
print("Random Forest - Keep both rows")
print_scores(forest_scores)

# sum
print("Random Forest - Keep only columnwise sum")
print_scores(forest_scores_sum)

# mean
print("Random Forest - Keep only columnwise mean")
print_scores(forest_scores_mean)

# max
print("Random Forest - Keep only columnwise max")
print_scores(forest_scores_max)

# min
print("Random Forest - Keep only columnwise min")
print_scores(forest_scores_min)

# first
print("Random Forest - Keep only the first row")
print_scores(forest_scores_first)

# last
print("Random Forest - Keep only the last row")
print_scores(forest_scores_last)

Random Forest using all features
----------------------------------------------------------------
Random Forest - Keep both rows
Precision:  0.5292857142857144
Recall:  0.5999999999999999
F1:  0.5912393162393162
----------------------------------------------------------------
Random Forest - Keep only columnwise sum
Precision:  0.5663095238095237
Recall:  0.5599999999999999
F1:  0.5924891774891774
----------------------------------------------------------------
Random Forest - Keep only columnwise mean
Precision:  0.6227777777777778
Recall:  0.6399999999999999
F1:  0.5347513597513598
----------------------------------------------------------------
Random Forest - Keep only columnwise max
Precision:  0.575
Recall:  0.5800000000000001
F1:  0.5458441558441557
----------------------------------------------------------------
Random Forest - Keep only columnwise min
Precision:  0.6029761904761906
Recall:  0.58
F1:  0.5548717948717948
----------------------------------------------------------

## AdaBoost
Perform Feature Selection (LVW) for all models  

In [24]:
# set seed
np.random.seed(1)

# Keep both Rows
tX, ty, vX, vy, ada_features = load_train_test_split(classifier = 'adaboost', agg = 'none')
ada_scores = Classifier(tX, ty, vX, vy, classifier = 'adaboost')

# Sum
tX_sum, ty_sum, vX_sum, vy_sum, ada_features_sum = load_train_test_split(classifier = 'adaboost', agg = 'sum')
ada_scores_sum = Classifier(tX_sum, ty_sum, vX_sum, vy_sum, classifier = 'adaboost')

# Mean
tX_mean, ty_mean, vX_mean, vy_mean, ada_features_mean = load_train_test_split(classifier = 'adaboost', agg = 'mean')
ada_scores_mean = Classifier(tX_mean, ty_mean, vX_mean, vy_mean, classifier = 'adaboost')

# Max
tX_max, ty_max, vX_max, vy_max, ada_features_max = load_train_test_split(classifier = 'adaboost', agg = 'max')
ada_scores_max = Classifier(tX_max, ty_max, vX_max, vy_max, classifier = 'adaboost')

# Min
tX_min, ty_min, vX_min, vy_min, ada_features_min = load_train_test_split(classifier = 'adaboost', agg = 'min')
ada_scores_min = Classifier(tX_min, ty_min, vX_min, vy_min, classifier = 'adaboost')

# Keep only the first row
tX_first, ty_first, vX_first, vy_first, ada_features_first = load_train_test_split(classifier = 'adaboost', agg = 'first')
ada_scores_first = Classifier(tX_first, ty_first, vX_first, vy_first, classifier = 'adaboost')

# Keep only the second row
tX_last, ty_last, vX_last, vy_last, ada_features_last = load_train_test_split(classifier = 'adaboost', agg = 'last')
ada_scores_last = Classifier(tX_last, ty_last, vX_last, vy_last, classifier = 'adaboost')

#### Print Precision, Recall, F1

In [25]:
print("AdaBoost using LVW Feature Selection")
print("----------------------------------------------------------------")

# none
print("AdaBoost - Keep both rows")
print_scores(ada_scores)

# sum
print("AdaBoost - Keep only columnwise sum")
print_scores(ada_scores_sum)

# mean
print("AdaBoost - Keep only columnwise mean")
print_scores(ada_scores_mean)

# max
print("AdaBoost - Keep only columnwise max")
print_scores(ada_scores_max)

# min
print("AdaBoost - Keep only columnwise min")
print_scores(ada_scores_min)

# first
print("AdaBoost - Keep only the first row")
print_scores(ada_scores_first)

# last
print("AdaBoost - Keep only the last row")
print_scores(ada_scores_last)

AdaBoost using LVW Feature Selection
----------------------------------------------------------------
AdaBoost - Keep both rows
Precision:  0.6141666666666667
Recall:  0.6799999999999999
F1:  0.6266666666666667
----------------------------------------------------------------
AdaBoost - Keep only columnwise sum
Precision:  0.6134126984126984
Recall:  0.6399999999999999
F1:  0.5948201798201798
----------------------------------------------------------------
AdaBoost - Keep only columnwise mean
Precision:  0.6438095238095237
Recall:  0.6399999999999999
F1:  0.6317171717171716
----------------------------------------------------------------
AdaBoost - Keep only columnwise max
Precision:  0.6190476190476191
Recall:  0.72
F1:  0.6598484848484848
----------------------------------------------------------------
AdaBoost - Keep only columnwise min
Precision:  0.5993650793650793
Recall:  0.6799999999999999
F1:  0.6158041958041959
----------------------------------------------------------------
A

## Gradient Boosting Tree
Perform Feature Selection (LVW) for all models  

In [26]:
# set seed
np.random.seed(1)

# Keep both Rows
tX, ty, vX, vy, grad_features = load_train_test_split(classifier = 'gradient_boost', agg = 'none')
grad_scores = Classifier(tX, ty, vX, vy, classifier = 'gradient_boost')

# Sum
tX_sum, ty_sum, vX_sum, vy_sum, grad_features_sum = load_train_test_split(classifier = 'gradient_boost', agg = 'sum')
grad_scores_sum = Classifier(tX_sum, ty_sum, vX_sum, vy_sum, classifier = 'gradient_boost')

# Mean
tX_mean, ty_mean, vX_mean, vy_mean, grad_features_mean = load_train_test_split(classifier = 'gradient_boost', agg = 'mean')
grad_scores_mean = Classifier(tX_mean, ty_mean, vX_mean, vy_mean, classifier = 'gradient_boost')

# Max
tX_max, ty_max, vX_max, vy_max, grad_features_max = load_train_test_split(classifier = 'gradient_boost', agg = 'max')
grad_scores_max = Classifier(tX_max, ty_max, vX_max, vy_max, classifier = 'gradient_boost')

# Min
tX_min, ty_min, vX_min, vy_min, grad_features_min = load_train_test_split(classifier = 'gradient_boost', agg = 'min')
grad_scores_min = Classifier(tX_min, ty_min, vX_min, vy_min, classifier = 'gradient_boost')

# Keep only the first row
tX_first, ty_first, vX_first, vy_first, grad_features_first = load_train_test_split(classifier = 'gradient_boost', agg = 'first')
grad_scores_first = Classifier(tX_first, ty_first, vX_first, vy_first, classifier = 'gradient_boost')

# Keep only the second row
tX_last, ty_last, vX_last, vy_last, grad_features_last = load_train_test_split(classifier = 'gradient_boost', agg = 'last')
grad_scores_last = Classifier(tX_last, ty_last, vX_last, vy_last, classifier = 'gradient_boost')

  'precision', 'predicted', average, warn_for)


#### Print Precision, Recall, F1

In [27]:
print("Gradient Boosting Tree using LVW Feature Selection")
print("----------------------------------------------------------------")

# none
print("Gradient Boosting Tree - Keep both rows")
print_scores(grad_scores)

# sum
print("Gradient Boosting Tree - Keep only columnwise sum")
print_scores(grad_scores_sum)

# mean
print("Gradient Boosting Tree - Keep only columnwise mean")
print_scores(grad_scores_mean)

# max
print("Gradient Boosting Tree - Keep only columnwise max")
print_scores(grad_scores_max)

# min
print("Gradient Boosting Tree - Keep only columnwise min")
print_scores(grad_scores_min)

# first
print("Gradient Boosting Tree - Keep only the first row")
print_scores(grad_scores_first)

# last
print("Gradient Boosting Tree - Keep only the last row")
print_scores(grad_scores_last)

Gradient Boosting Tree using LVW Feature Selection
----------------------------------------------------------------
Gradient Boosting Tree - Keep both rows
Precision:  0.633095238095238
Recall:  0.7
F1:  0.6365734265734264
----------------------------------------------------------------
Gradient Boosting Tree - Keep only columnwise sum
Precision:  0.5911111111111111
Recall:  0.68
F1:  0.6381274281274281
----------------------------------------------------------------
Gradient Boosting Tree - Keep only columnwise mean
Precision:  0.6083333333333333
Recall:  0.6599999999999999
F1:  0.6135120435120435
----------------------------------------------------------------
Gradient Boosting Tree - Keep only columnwise max
Precision:  0.5561904761904761
Recall:  0.68
F1:  0.6061616161616162
----------------------------------------------------------------
Gradient Boosting Tree - Keep only columnwise min
Precision:  0.5530952380952381
Recall:  0.6599999999999999
F1:  0.5683549783549784
------------

# Decide which Aggregation to keep

In [28]:
def get_min_difference_aggregation(dict_scores, dict_none, dict_sum, dict_mean, dict_max, dict_min,
                                   dict_first, dict_last):
        
    # subtract the keys
    diff_none = {key: abs(dict_scores[key] - dict_none.get(key, 0)) for key in dict_scores.keys()}
    diff_sum = {key: abs(dict_scores[key] - dict_sum.get(key, 0)) for key in dict_scores.keys()}
    diff_mean = {key: abs(dict_scores[key] - dict_mean.get(key, 0)) for key in dict_scores.keys()}
    diff_max = {key: abs(dict_scores[key] - dict_max.get(key, 0)) for key in dict_scores.keys()}
    diff_min = {key: abs(dict_scores[key] - dict_min.get(key, 0)) for key in dict_scores.keys()}
    diff_first = {key: abs(dict_scores[key] - dict_first.get(key, 0)) for key in dict_scores.keys()}
    diff_last = {key: abs(dict_scores[key] - dict_last.get(key, 0)) for key in dict_scores.keys()}

    # get min of differences
    min_diff = {'None': sum(diff_none.values()),
                'Sum': sum(diff_sum.values()),
                'Mean': sum(diff_mean.values()),
                'Max': sum(diff_max.values()),
                'Min': sum(diff_min.values()),
                'First': sum(diff_first.values()),
                'Last': sum(diff_last.values())}
    
    print('Min difference of aggregations: ' ,min(min_diff, key = min_diff.get))

In [29]:
# dictionary with values from table 2
knn_table2 = {'Precision': 0.582, 'Recall': 0.636, 'F1': 0.608}
tree_table2 = {'Precision': 0.521, 'Recall': 0.550, 'F1': 0.535}
reg_table2 = {'Precision': 0.616, 'Recall': 0.600, 'F1': 0.608}
svm_table2 = {'Precision': 0.511, 'Recall': 0.670, 'F1': 0.580}
forest_table2 = {'Precision': 0.614, 'Recall': 0.664, 'F1': 0.638}
ada_table2 = {'Precision': 0.601, 'Recall': 0.717, 'F1': 0.654}
grad_table2 = {'Precision': 0.561, 'Recall': 0.616, 'F1': 0.587}

print("KNN")
get_min_difference_aggregation(knn_table2, knn_scores, knn_scores_sum, knn_scores_mean, knn_scores_max,
                               knn_scores_min, knn_scores_first, knn_scores_last)
print("Decision Tree")
get_min_difference_aggregation(tree_table2, tree_scores, tree_scores_sum, tree_scores_mean, tree_scores_max,
                               tree_scores_min, tree_scores_first, tree_scores_last)
print("Logistic Regression")
get_min_difference_aggregation(reg_table2, reg_scores, reg_scores_sum, reg_scores_mean, reg_scores_max,
                               reg_scores_min, reg_scores_first, reg_scores_last)
print("SVM")
get_min_difference_aggregation(svm_table2, svm_scores, svm_scores_sum, svm_scores_mean, svm_scores_max,
                               svm_scores_min, svm_scores_first, svm_scores_last)
print("Random Forest (not stable!)")
get_min_difference_aggregation(forest_table2, forest_scores, forest_scores_sum, forest_scores_mean, forest_scores_max,
                               forest_scores_min, forest_scores_first, forest_scores_last)
print("AdaBoost")
get_min_difference_aggregation(ada_table2, ada_scores, ada_scores_sum, ada_scores_mean, ada_scores_max,
                               ada_scores_min, ada_scores_first, ada_scores_last)
print("Gradient Boosting Tree")
get_min_difference_aggregation(grad_table2, grad_scores, grad_scores_sum, grad_scores_mean, grad_scores_max,
                               grad_scores_min, grad_scores_first, grad_scores_last)

KNN
Min difference of aggregations:  Max
Decision Tree
Min difference of aggregations:  Sum
Logistic Regression
Min difference of aggregations:  Last
SVM
Min difference of aggregations:  Sum
Random Forest (not stable!)
Min difference of aggregations:  Last
AdaBoost
Min difference of aggregations:  Max
Gradient Boosting Tree
Min difference of aggregations:  Min


Since keeping only the first row of each column works best for two of the models we are going to use this aggregation method for the classifier stacking.  
But first, we have to check if the Precision, Recall and F1 scores are all > 0.5:

In [30]:
# get subdict that does not contain predictions
def subdict_scores(full_dict):
    
    score_dict = dict((k, full_dict[k]) for k in ('Precision', 'Recall', 'F1'))
    return score_dict

# check if Precision, Recall and F1 are > 0.5
print(sum(value > 0.5 for value in subdict_scores(knn_scores_first).values()) == 3 and
      sum(value > 0.5 for value in subdict_scores(tree_scores_first).values()) == 3 and
      sum(value > 0.5 for value in subdict_scores(reg_scores_first).values()) == 3 and
      sum(value > 0.5 for value in subdict_scores(svm_scores_first).values()) == 3 and
      sum(value > 0.5 for value in subdict_scores(forest_scores_first).values()) == 3 and
      sum(value > 0.5 for value in subdict_scores(ada_scores_first).values()) == 3 and
      sum(value > 0.5 for value in subdict_scores(grad_scores_first).values()) == 3)

True


We compare the scores of our models with the ones from the paper:

In [31]:
print("KNN")
print("Scores from the paper: ", knn_table2)
print("Scores from our models:")
print_scores(knn_scores_first)
print("---------------------------------------------------------------------------------------------------")

print("Decision Tree")
print("Scores from the paper: ", tree_table2)
print("Scores from our models:")
print_scores(tree_scores_first)
print("---------------------------------------------------------------------------------------------------")

print("Logistic Regression")
print("Scores from the paper: ", reg_table2)
print("Scores from our models:")
print_scores(reg_scores_first)
print("---------------------------------------------------------------------------------------------------")

print("SVM")
print("Scores from the paper: ", svm_table2)
print("Scores from our models:")
print_scores(svm_scores_first)
print("---------------------------------------------------------------------------------------------------")

print("Random Forest (not stable!)")
print("Scores from the paper: ", forest_table2)
print("Scores from our models:")
print_scores(forest_scores_first)
print("---------------------------------------------------------------------------------------------------")

print("AdaBoost")
print("Scores from the paper: ", ada_table2)
print("Scores from our models:")
print_scores(ada_scores_first)
print("---------------------------------------------------------------------------------------------------")

print("Gradient Boosting Tree")
print("Scores from the paper: ", grad_table2)
print("Scores from our models:")
print_scores(grad_scores_first)
print("---------------------------------------------------------------------------------------------------")

KNN
Scores from the paper:  {'Precision': 0.582, 'Recall': 0.636, 'F1': 0.608}
Scores from our models:
Precision:  0.5699603174603174
Recall:  0.74
F1:  0.6383516483516483
----------------------------------------------------------------
---------------------------------------------------------------------------------------------------
Decision Tree
Scores from the paper:  {'Precision': 0.521, 'Recall': 0.55, 'F1': 0.535}
Scores from our models:
Precision:  0.673611111111111
Recall:  0.76
F1:  0.6901875901875902
----------------------------------------------------------------
---------------------------------------------------------------------------------------------------
Logistic Regression
Scores from the paper:  {'Precision': 0.616, 'Recall': 0.6, 'F1': 0.608}
Scores from our models:
Precision:  0.5913492063492063
Recall:  0.8600000000000001
F1:  0.6960972360972362
----------------------------------------------------------------
-----------------------------------------------------

## Recreation of Table 1

Save the train data in a new CSV:

In [32]:
data_visuals = load_visuals_data(path_vis_train, agg = 'first')
train = pd.merge(data_visuals, data_training, left_index = True, right_index=True)

train = train.reset_index()
train["file_name"] = train["file_name"].str.replace("\'", "")
train.set_index(['file_name'], inplace = True)

# output directory path
OUT_PATH = os.path.join("out")

# save as csv
train.to_csv(os.path.join(OUT_PATH, "visual_training_first.csv"), index=True)

In [33]:
data_visuals_test = load_visuals_data(path_vis_test, agg = 'first')

# rename first index
data_visuals_test = data_visuals_test.reset_index()
data_visuals_test['file_name'].iloc[0] = "10.000_Km"
data_visuals_test.set_index(['file_name'], inplace = True)

test = pd.merge(data_visuals_test, data_testing, left_index = True, right_index=True)

test = test.reset_index()
test["file_name"] = test["file_name"].str.replace("\'", "")
test.set_index(['file_name'], inplace = True)

# save as csv
test.to_csv(os.path.join(OUT_PATH, "visual_test_first.csv"), index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Merge Visuals with Metadata

In [34]:
# training data
meta_dev_path = r'out/metadata_dev.csv'
meta_train = pd.read_csv(meta_dev_path, index_col = 0, header = 0)
meta_train.index.names = ['movie']

# test data
meta_test_path = r'out/metadata_test.csv'
meta_test = pd.read_csv(meta_test_path, index_col = 0, header = 0)
meta_test.index.names = ['movie']

In [35]:
meta_test.head()

Unnamed: 0_level_0,country_Australia,country_Bahamas,country_Belgium,country_Bosnia and Herzegovina,country_Bulgaria,country_Canada,country_China,country_Czech Republic,country_Denmark,country_Egypt,...,language_Vietnamese,language_Yiddish,rated_G,rated_N/A,rated_PG,rated_PG-13,rated_R,rated_TV-MA,runtime,year
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.000 Km,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,99.0,2014
12 Years a Slave,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,134.0,2013
21 Jump Street,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,109.0,2012
2 States,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,149.0,2014
Aanmodderfakker,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,100.0,2014


In [36]:
# prepare visuals train set
train_vis = train.reset_index()
train_vis["file_name"] = train_vis["file_name"].str.replace("_", " ")
train_vis.set_index(['file_name'], inplace = True)
train_vis.index.names = ['movie']

# prepare visuals test set
test_vis = test.reset_index()
test_vis["file_name"] = test_vis["file_name"].str.replace("_", " ")
test_vis.set_index(['file_name'], inplace = True)
test_vis.index.names = ['movie']

# merge train and test sets
meta_vis_train = pd.merge(meta_train, train_vis, left_index = True, right_index=True)
meta_vis_test = pd.merge(meta_test, test_vis, left_index = True, right_index=True)

In [37]:
meta_vis_train.head()

Unnamed: 0_level_0,country_Australia,country_Bahamas,country_Belgium,country_Bosnia and Herzegovina,country_Bulgaria,country_Canada,country_China,country_Czech Republic,country_Denmark,country_Egypt,...,817,818,819,820,821,822,823,824,825,goodforairplanes
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
American Gangster,0,0,0,0,0,0,0,0,0,0,...,109.62,17.237,15.543,13.671,15.231,53559.0,8637.4,18597.0,8679.0,1
American Pie,0,0,0,0,0,0,0,0,0,0,...,121.15,1.4326,1.9853,1.6303,2.0991,2724.9,1431.5,4602.2,1330.1,1
Andaz Apna Apna,0,0,0,0,0,0,0,0,0,0,...,20058.0,9.7304,10.051,7.5919,11.133,66358.0,27000.0,60473.0,20441.0,1
Anna Karenina,0,0,0,0,0,0,0,0,0,0,...,119950.0,1e-06,0.002466,4e-06,0.002466,729320.0,119950.0,230400.0,119950.0,1
A Fish Called Wanda,0,0,0,0,0,0,0,0,0,0,...,363.66,8.5763,9.1327,8.3996,8.7881,1485.4,418.65,893.5,436.22,1


In [38]:
# save as csv files
meta_vis_train.to_csv(os.path.join(OUT_PATH, "metadata_visual_train_first.csv"), index=True)
meta_vis_test.to_csv(os.path.join(OUT_PATH, "metadata_visual_test_first.csv"), index=True)

In [39]:
# save metadata as csv files
meta_train.to_csv(os.path.join(OUT_PATH, "meta_train.csv"), index=True)
meta_test.to_csv(os.path.join(OUT_PATH, "meta_test.csv"), index=True)

Load metadata + ratings files and save as csv

In [40]:
# training data
meta_ratings_dev_path = r'out/metadata_ratings_dev.csv'
meta_ratings_train = pd.read_csv(meta_ratings_dev_path, index_col = 0, header = 0)
meta_ratings_train.index.names = ['movie']

# test data
meta_ratings_test_path = r'out/metadata_ratings_test.csv'
meta_ratings_test = pd.read_csv(meta_ratings_test_path, index_col = 0, header = 0)
meta_ratings_test.index.names = ['movie']

In [41]:
# save metadata and ratings as csv files
meta_ratings_train.to_csv(os.path.join(OUT_PATH, "meta_ratings_train.csv"), index=True)
meta_ratings_test.to_csv(os.path.join(OUT_PATH, "meta_ratings_test.csv"), index=True)

Load ratings files and save as csv

In [42]:
# training data
ratings_dev_path = r'out/ratings_dev.csv'
ratings_train = pd.read_csv(ratings_dev_path, index_col = 0, header = 0)
ratings_train.index.names = ['movie']

# test data
ratings_test_path = r'out/ratings_test.csv'
ratings_test = pd.read_csv(ratings_test_path, index_col = 0, header = 0)
ratings_test.index.names = ['movie']

In [43]:
# save metadata and ratings as csv files
ratings_train.to_csv(os.path.join(OUT_PATH, "userratings_train.csv"), index=True)
ratings_test.to_csv(os.path.join(OUT_PATH, "userratings_test.csv"), index=True)

## Recreation of Table 3
The classifier stacking is performed in a different .ipynp file, but we need to prepare the necessary data and save it as .csv

In [44]:
visual_predictions_train = train.copy()
visual_predictions_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,817,818,819,820,821,822,823,824,825,goodforairplanes
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Words_and_Pictures,0.26733,0.31877,0.30814,0.25255,0.30517,0.32915,0.33434,0.33807,0.27418,0.24486,...,500.12,22.886,20.237,17.771,20.043,132130.0,19884.0,37972.0,19805.0,1
Welcome_to_Me,0.30717,0.33422,0.33112,0.33124,0.31114,0.33644,0.33616,0.34479,0.16983,0.27379,...,167.91,20.337,21.276,18.527,21.189,81665.0,13672.0,32531.0,13753.0,0
Wild_Tales,0.30768,0.3376,0.34787,0.33882,0.3162,0.33023,0.33942,0.36068,0.11968,0.2602,...,177.12,22.268,22.143,19.507,21.992,83112.0,14388.0,30180.0,14458.0,0
V_for_Vendetta,0.038715,0.13579,0.15695,0.052015,0.1034,0.24479,0.32119,0.080424,0.11021,0.18988,...,18953.0,0.29655,0.35476,0.32137,0.37115,85196.0,18074.0,40052.0,19017.0,0
Transformers__Age_of_Extinction,0.19996,0.26934,0.27986,0.23725,0.30844,0.33242,0.32998,0.325,0.30735,0.33431,...,668.67,15.79,14.923,15.017,14.779,208630.0,23968.0,47979.0,24059.0,0


In [45]:
# drop unecessary columns
visual_predictions_train.drop(visual_predictions_train.iloc[:, 0:826], inplace=True, axis=1)

# add predictions
visual_predictions_train["KNN"] = knn_scores_first["Predictions"]
visual_predictions_train["Decision Tree"] = tree_scores_first["Predictions"]
visual_predictions_train["Logistic Regression"] = reg_scores_first["Predictions"]
visual_predictions_train["SVM"] = svm_scores_first["Predictions"]
visual_predictions_train["Random Forest"] = forest_scores_first["Predictions"]
visual_predictions_train["AdaBoost"] = ada_scores_first["Predictions"]
visual_predictions_train["Gradient Boosting Tree"] = grad_scores_first["Predictions"]

# index corrections
visual_predictions_train = visual_predictions_train.reset_index()
visual_predictions_train["file_name"] = visual_predictions_train["file_name"].str.replace("_", " ")
visual_predictions_train.set_index(['file_name'], inplace = True)
visual_predictions_train.index.names = ['movie']

visual_predictions_train.head()

Unnamed: 0_level_0,goodforairplanes,KNN,Decision Tree,Logistic Regression,SVM,Random Forest,AdaBoost,Gradient Boosting Tree
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Words and Pictures,1,0,0,1,1,0,0,0
Welcome to Me,0,0,1,0,1,1,1,0
Wild Tales,0,0,1,1,1,0,1,0
V for Vendetta,0,0,0,1,1,0,0,0
Transformers Age of Extinction,0,0,0,0,1,0,0,0


In [46]:
# save as csv
visual_predictions_train.to_csv(os.path.join(OUT_PATH, "visual_predictions_train.csv"), index=True)

Prepare the test set

In [47]:
# train/test sets
tX = train.iloc[:, :-1]
ty = train.iloc[:,-1]
    
vX = test.iloc[:, :-1]
vy = test.iloc[:,-1]

# test predictions
knn_test_predictions = Classifier(tX, ty, vX, vy, classifier = 'knn', test = True)
tree_test_predictions = Classifier(tX, ty, vX, vy, classifier = 'decision_tree', test = True)
reg_test_predictions = Classifier(tX, ty, vX, vy, classifier = 'logisticregression', test = True)
svm_test_predictions = Classifier(tX, ty, vX, vy, classifier = 'svm', test = True)
forest_test_predictions = Classifier(tX, ty, vX, vy, classifier = 'random_forest', test = True)
ada_test_predictions = Classifier(tX, ty, vX, vy, classifier = 'adaboost', test = True)
grad_test_predictions = Classifier(tX, ty, vX, vy, classifier = 'gradient_boost', test = True)

In [48]:
visual_predictions_test = test.copy()
visual_predictions_test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,817,818,819,820,821,822,823,824,825,goodforairplanes
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.000_Km,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,269700.0,6.1035e-07,0.001645,2e-06,0.001645,1638400.0,269700.0,518400.0,269700.0,1
12_Years_a_Slave,0.0,0.193,0.19299,0.0,0.0,0.2711,0.2711,0.0,0.0,0.27024,...,12811.0,8.6933,8.4239,7.5465,8.4304,46471.0,13339.0,30212.0,14583.0,1
21_Jump_Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,119790.0,1.3776e-06,0.00247,4e-06,0.00247,725900.0,119790.0,230400.0,119790.0,1
2_States,0.033181,0.08976,0.073469,0.012184,0.10137,0.28969,0.26856,0.02602,0.071337,0.3302,...,333.43,8.9366,6.9124,5.5794,7.197,273490.0,33778.0,48676.0,34515.0,1
A_Bugs_Life,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,37959.0,4.3403e-06,0.004386,1.4e-05,0.004386,230400.0,37959.0,72900.0,37959.0,1


In [49]:
# drop unecessary columns
visual_predictions_test.drop(visual_predictions_test.iloc[:, 0:826], inplace=True, axis=1)

# add predictions
visual_predictions_test["KNN"] = knn_test_predictions
visual_predictions_test["Decision Tree"] = tree_test_predictions
visual_predictions_test["Logistic Regression"] = reg_test_predictions
visual_predictions_test["SVM"] = svm_test_predictions
visual_predictions_test["Random Forest"] = forest_test_predictions
visual_predictions_test["AdaBoost"] = ada_test_predictions
visual_predictions_test["Gradient Boosting Tree"] = grad_test_predictions

# index corrections
visual_predictions_test = visual_predictions_test.reset_index()
visual_predictions_test["file_name"] = visual_predictions_test["file_name"].str.replace("_", " ")
visual_predictions_test.set_index(['file_name'], inplace = True)
visual_predictions_test.index.names = ['movie']

visual_predictions_test.head()

Unnamed: 0_level_0,goodforairplanes,KNN,Decision Tree,Logistic Regression,SVM,Random Forest,AdaBoost,Gradient Boosting Tree
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10.000 Km,1,1,1,1,1,1,1,1
12 Years a Slave,1,0,0,1,1,1,0,0
21 Jump Street,1,0,0,1,1,0,0,0
2 States,1,0,1,0,1,1,1,0
A Bugs Life,1,1,0,1,0,1,0,1


In [50]:
# save as csv
visual_predictions_test.to_csv(os.path.join(OUT_PATH, "visual_predictions_test.csv"), index=True)

#### Prepare Dataframes for Label-Feature Stacking

In [51]:
#knn_features = train.iloc[:, knn_features_first]
#tree_features = train.iloc[:, tree_features_first]
#reg_features = train.iloc[:, reg_features_first]
#svm_features = train.iloc[:, svm_features_first]
#forest_features = train.iloc[:, forest_features_first]
#ada_features = train.iloc[:, ada_features_first]
#boost_features = train.iloc[:, grad_features_first]

In [57]:
train_features = train.copy()
train_features.iloc[:, :-1]

# index corrections
train_features = train_features.reset_index()
train_features["file_name"] = train_features["file_name"].str.replace("_", " ")
train_features.set_index(['file_name'], inplace = True)
train_features.index.names = ['movie']
train_features = train_features.drop('goodforairplanes', axis='columns')

visual_predictions_labels_train = pd.merge(train_features, visual_predictions_train, left_index = True, right_index=True)

In [58]:
visual_predictions_labels_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,824,825,goodforairplanes,KNN,Decision Tree,Logistic Regression,SVM,Random Forest,AdaBoost,Gradient Boosting Tree
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Words and Pictures,0.26733,0.31877,0.30814,0.25255,0.30517,0.32915,0.33434,0.33807,0.27418,0.24486,...,37972.0,19805.0,1,0,0,1,1,0,0,0
Welcome to Me,0.30717,0.33422,0.33112,0.33124,0.31114,0.33644,0.33616,0.34479,0.16983,0.27379,...,32531.0,13753.0,0,0,1,0,1,1,1,0
Wild Tales,0.30768,0.3376,0.34787,0.33882,0.3162,0.33023,0.33942,0.36068,0.11968,0.2602,...,30180.0,14458.0,0,0,1,1,1,0,1,0
V for Vendetta,0.038715,0.13579,0.15695,0.052015,0.1034,0.24479,0.32119,0.080424,0.11021,0.18988,...,40052.0,19017.0,0,0,0,1,1,0,0,0
Transformers Age of Extinction,0.19996,0.26934,0.27986,0.23725,0.30844,0.33242,0.32998,0.325,0.30735,0.33431,...,47979.0,24059.0,0,0,0,0,1,0,0,0


In [59]:
# save as csv
visual_predictions_labels_train.to_csv(os.path.join(OUT_PATH, "visual_predictions_labels_train.csv"), index=True)

Prepare test set

In [60]:
test_features = test.copy()
test_features.iloc[:, :-1]

# index corrections
test_features = test_features.reset_index()
test_features["file_name"] = test_features["file_name"].str.replace("_", " ")
test_features.set_index(['file_name'], inplace = True)
test_features.index.names = ['movie']
test_features = test_features.drop('goodforairplanes', axis='columns')

visual_predictions_labels_test = pd.merge(test_features, visual_predictions_test, left_index = True, right_index=True)

In [61]:
# save as csv
visual_predictions_labels_test.to_csv(os.path.join(OUT_PATH, "visual_predictions_labels_test.csv"), index=True)