In [1]:
# suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import pandas as pd
import numpy as np
import csv
import glob
import random

import sklearn
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

## Load Training Data

In [2]:
def load_visuals_data(path, agg = {'none', 'sum', 'mean', 'max', 'min', 'first', 'last'}):
    """ 
    Load all visuals data files and combine them into a single Pandas DataFrame.
    
    Parameter agg:
    'none': keep both rows as single row
    'sum': take the columnwise sum of the rows
    'mean': take the columnwise mean of the rows
    'max': take the columnwise max of the rows
    'min': take the columnwise min of the rows
    'first': keep only the first row
    'last': keep only the last row
    
    Returns
    --------
    visuals_data: data frame containing the visuals data
    """
    
    # create a list of all csv files
    all_files = glob.glob(path + "/*.csv")

    data = []
    df = pd.DataFrame()

    if agg is 'none':
        for filename in all_files:
            li = pd.read_csv(filename, index_col = None, header = None)
            df = pd.DataFrame(li.values.flatten()).transpose()
            df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            data.append(df)
    
    elif agg is 'sum':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df.loc['file_name',:]= df.sum(axis=0)
            df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df = pd.DataFrame(df.drop([0,1]))
            data.append(df)
            
    elif agg is 'mean':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df.loc['file_name',:]= df.mean(axis=0)
            df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df = pd.DataFrame(df.drop([0,1]))
            data.append(df)
       
    elif agg is 'max':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df.loc['file_name',:]= df.max(axis=0)
            df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df = pd.DataFrame(df.drop([0,1]))
            data.append(df)
            
    elif agg is 'min':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df.loc['file_name',:]= df.min(axis=0)
            df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df = pd.DataFrame(df.drop([0,1]))
            data.append(df)
            
    elif agg is 'first':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df = pd.DataFrame(df.drop([1]))
            data.append(df)
            
    elif agg is 'last':
        for filename in all_files:
            df = pd.read_csv(filename, index_col = None, header = None)
            df['file_name'] = filename.rsplit('\\', 1)[1].rsplit('.', 1)[0]
            df = pd.DataFrame(df.drop([0]))
            data.append(df)
            
    else:
        print('Parameter \'agg\' needs to be one of the following strings:' )
        print('\'none\', \'sum\', \'mean\', \'max\', \'min\', \'first\', \'last\'')
        return
            

    visuals_data = pd.concat(data, axis=0, ignore_index=True)
    visuals_data.set_index(['file_name'], inplace = True)
    
    return visuals_data

# load visuals files
path_vis_train = r'./data/Dev_Set/vis_descriptors'

data_visuals = load_visuals_data(path_vis_train, agg = 'none')
data_visuals.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
American_Gangster,0.23164,0.28629,0.30068,0.28118,0.24489,0.27849,0.29076,0.2938,0.16104,0.16685,...,232.17,109.62,17.237,15.543,13.671,15.231,53559.0,8637.4,18597.0,8679.0
American_Pie,0.23736,0.25019,0.25513,0.26025,0.19318,0.2084,0.29547,0.36341,0.24142,0.25817,...,389.28,112.03,1.4815,2.0824,1.6756,2.1408,2639.4,1377.0,4321.3,1299.1
Andaz_Apna_Apna,0.0,0.29416,0.29007,0.011351,0.10093,0.35576,0.36437,0.23632,0.10078,0.34758,...,57486.0,20076.0,9.7158,10.036,7.6307,11.156,66445.0,27016.0,60427.0,20459.0
Anna_Karenina,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,230400.0,119950.0,1e-06,0.002466,4e-06,0.002466,729320.0,119950.0,230400.0,119950.0
A_Fish_Called_Wanda,0.43031,0.38101,0.34082,0.31642,0.41465,0.38599,0.32938,0.31212,0.35067,0.34246,...,765.82,362.83,8.5923,9.1427,8.4101,8.7924,1483.3,417.21,892.59,435.28


In [3]:
data_visuals.shape

(95, 1652)

In [4]:
def load_train_test_data(path):
    """ 
    Load all training or testing data files and combine them into a single Pandas DataFrame.
    
    Returns
    --------
    train_test_data: data frame containing the training or testing data
    """
    
    train_test_data = pd.read_csv(path, index_col = None, header = 0)
    train_test_data = train_test_data.drop("movie_name", axis=1)
    train_test_data.set_index(['file_name'], inplace = True)
    
    return train_test_data


# load training data
path_train = r'./data/Dev_Set/CoeDevelopmentTrainingdata.csv'

data_training = load_train_test_data(path_train)
data_training.head()

Unnamed: 0_level_0,goodforairplanes
file_name,Unnamed: 1_level_1
Seventh_Son,1
2_States,0
Welcome_to_Me,0
The_Judge,0
Transformers__Age_of_Extinction,0


In [5]:
data_training.shape

(96, 1)

In [6]:
# merge training data with visuals data
train = pd.merge(data_visuals, data_training, left_index = True, right_index=True)
train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1643,1644,1645,1646,1647,1648,1649,1650,1651,goodforairplanes
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
American_Gangster,0.23164,0.28629,0.30068,0.28118,0.24489,0.27849,0.29076,0.2938,0.16104,0.16685,...,109.62,17.237,15.543,13.671,15.231,53559.0,8637.4,18597.0,8679.0,1
American_Pie,0.23736,0.25019,0.25513,0.26025,0.19318,0.2084,0.29547,0.36341,0.24142,0.25817,...,112.03,1.4815,2.0824,1.6756,2.1408,2639.4,1377.0,4321.3,1299.1,1
Andaz_Apna_Apna,0.0,0.29416,0.29007,0.011351,0.10093,0.35576,0.36437,0.23632,0.10078,0.34758,...,20076.0,9.7158,10.036,7.6307,11.156,66445.0,27016.0,60427.0,20459.0,1
Anna_Karenina,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,119950.0,1e-06,0.002466,4e-06,0.002466,729320.0,119950.0,230400.0,119950.0,1
A_Fish_Called_Wanda,0.43031,0.38101,0.34082,0.31642,0.41465,0.38599,0.32938,0.31212,0.35067,0.34246,...,362.83,8.5923,9.1427,8.4101,8.7924,1483.3,417.21,892.59,435.28,1


In [7]:
train.shape

(94, 1653)

## Load Testing Data

In [8]:
# load visuals data for testing
path_vis_test = r'./data/Test_Set/vis_descriptors'

data_visuals_test = load_visuals_data(path_vis_test, agg = 'none')
data_visuals_test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.000_Km,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,518400.0,269700.0,6.1035e-07,0.001645,2e-06,0.001645,1638400.0,269700.0,518400.0,269700.0
12_Years_a_Slave,0.0,0.193,0.19299,0.0,0.0,0.2711,0.2711,0.0,0.0,0.27024,...,30830.0,15454.0,15.294,13.422,13.418,13.307,71633.0,21855.0,48314.0,24434.0
21_Jump_Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,230400.0,119790.0,1.3776e-06,0.00247,4e-06,0.00247,725900.0,119790.0,230400.0,119790.0
2_States,0.033181,0.08976,0.073469,0.012184,0.10137,0.28969,0.26856,0.02602,0.071337,0.3302,...,629.31,476.71,4.4431,3.5334,2.7159,3.7869,203660.0,20777.0,29696.0,19740.0
Aanmodderfakker,0.27197,0.33421,0.31689,0.30772,0.26515,0.27671,0.26563,0.2759,0.24006,0.14513,...,126.59,86.86,6.8199,7.1846,6.1967,7.1607,5318.3,1022.9,3030.9,1012.1


In [9]:
data_visuals_test.shape

(223, 1652)

In [10]:
# load testing data
path_test = r'./data/Dev_Set/CoeDevelopmentTestdata.csv'

data_testing = load_train_test_data(path_test)
data_testing.head()

Unnamed: 0_level_0,goodforairplanes
file_name,Unnamed: 1_level_1
Belle_de_Jour,1
Big_Game,0
Birdman__Or_(The_Unexpected_Virtue_of_Ignorance).mp4,1
Dances_with_Wolves,0
Dilwale_Dulhania_Le_Jayenge,0


In [11]:
data_testing.shape

(50, 1)

In [12]:
# merge testing data with visuals data
test = pd.merge(data_visuals_test, data_testing, left_index = True, right_index=True)
test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1643,1644,1645,1646,1647,1648,1649,1650,1651,goodforairplanes
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Belle_de_Jour,0.005963,0.023951,0.02851,0.061558,0.044613,0.049806,0.032554,0.070085,0.040795,0.04645,...,121.67,2.1137,1.9939,1.0164,2.1264,6797.7,2722.4,8510.8,2390.5,1
Big_Game,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,269700.0,6.1035e-07,0.001645,2e-06,0.001645,1638400.0,269700.0,518400.0,269700.0,0
Dances_with_Wolves,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,428.99,1.4983,1.6961,1.4805,1.8075,10665.0,3592.4,7577.9,3450.6,0
Dilwale_Dulhania_Le_Jayenge,0.40341,0.23464,0.26453,0.116,0.27926,0.20721,0.2538,0.11875,0.055308,0.14537,...,4403.8,4.5156,6.0129,6.0129,6.0268,40098.0,8133.8,16631.0,6912.5,0
Dorsvloer_vol_confetti,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,108440.0,2.5038,0.83142,1.2636,1.883,549320.0,131340.0,242430.0,111750.0,1


In [13]:
test.shape

(48, 1653)

# Las Vegas Wrapper - Feature Selection

In [14]:
def LVW(classifier_function, tX, ty, vX, vy, K, original_features):
    
    err = 0
    k = 0
    C = 100
    
    while k < K:
        print('k: ', k)
        ran_choice = range(1,len(original_features)-1)
        S1 = random.sample(original_features, random.choice(ran_choice))
        C1 = len(S1)
        
        x_train = tX[tX.columns.intersection(S1)]
        x_test = vX[vX.columns.intersection(S1)]
        
        err1 = classifier_function(tX, ty, vX, vy, acc = True)
        
        if (err1 > err) or (err1 == err and C1 < C):
            k = 0
            err = err1
            C = C1
            S = S1
        
        else:
            k += 1
            
    return S

# Prediction Models

## KNN

In [15]:
def KNN(tX, ty, vX, vy, acc = False, CV = 10):
    
    knn = KNeighborsClassifier()

    # CV
    param_grid = {'n_neighbors': range(1, 83)}

    grid_knn = GridSearchCV(knn, param_grid, cv = CV)
    grid_knn.fit(tX, ty)

    # test set
    predictions = grid_knn.predict(vX)
    
    if acc:
        return accuracy_score(vy, predictions)
    else:    
        return predictions

### Try different aggregations for the training and testing set using all features

#### Keep both rows

In [16]:
# train/test split
X_train = train.iloc[:, :-1]
Y_train = train.iloc[:,-1]

X_test = test.iloc[:, :-1]
Y_test = test.iloc[:,-1]

# KNN
Y_pred = KNN(X_train, Y_train, X_test, Y_test)

knn_precision = precision_score(Y_test, Y_pred)
knn_recall = recall_score(Y_test, Y_pred)
knn_f1 = f1_score(Y_test, Y_pred)



#### Sum

In [17]:
# try a different training and testing set
data_visuals_total = load_visuals_data(path_vis_train, agg = 'sum')

# merge training data with visuals data
train_total = pd.merge(data_visuals_total, data_training, left_index = True, right_index=True)

# same with testing set
data_visuals_test_total = load_visuals_data(path_vis_test, agg = 'sum')

# merge training data with visuals data
test_total = pd.merge(data_visuals_test_total, data_testing, left_index = True, right_index=True)

In [18]:
# train/test split
X_train_total = train_total.iloc[:, :-1]
Y_train_total = train_total.iloc[:,-1]

X_test_total = test_total.iloc[:, :-1]
Y_test_total = test_total.iloc[:,-1]

# KNN
Y_pred_total = KNN(X_train_total, Y_train_total, X_test_total, Y_test_total)

knn_precision_total = precision_score(Y_test_total, Y_pred_total)
knn_recall_total = recall_score(Y_test_total, Y_pred_total)
knn_f1_total = f1_score(Y_test_total, Y_pred_total)



#### Mean

In [19]:
# try a different training and testing set
data_visuals_mean = load_visuals_data(path_vis_train, agg = 'mean')

# merge training data with visuals data
train_mean = pd.merge(data_visuals_mean, data_training, left_index = True, right_index=True)

# same with testing set
data_visuals_test_mean = load_visuals_data(path_vis_test, agg = 'mean')

# merge training data with visuals data
test_mean = pd.merge(data_visuals_test_mean, data_testing, left_index = True, right_index=True)

In [20]:
# train/test split
X_train_mean = train_mean.iloc[:, :-1]
Y_train_mean = train_mean.iloc[:,-1]

X_test_mean = test_mean.iloc[:, :-1]
Y_test_mean = test_mean.iloc[:,-1]

# KNN
Y_pred_mean = KNN(X_train_mean, Y_train_mean, X_test_mean, Y_test_mean)

knn_precision_mean = precision_score(Y_test_mean, Y_pred_mean)
knn_recall_mean = recall_score(Y_test_mean, Y_pred_mean)
knn_f1_mean = f1_score(Y_test_mean, Y_pred_mean)



#### Max

In [21]:
# try a different training and testing set
data_visuals_max = load_visuals_data(path_vis_train, agg = 'max')

# merge training data with visuals data
train_max = pd.merge(data_visuals_max, data_training, left_index = True, right_index=True)

# same with testing set
data_visuals_test_max = load_visuals_data(path_vis_test, agg = 'max')

# merge training data with visuals data
test_max = pd.merge(data_visuals_test_max, data_testing, left_index = True, right_index=True)

In [22]:
# train/test split
X_train_max = train_max.iloc[:, :-1]
Y_train_max = train_max.iloc[:,-1]

X_test_max = test_max.iloc[:, :-1]
Y_test_max = test_max.iloc[:,-1]

# KNN
Y_pred_max = KNN(X_train_max, Y_train_max, X_test_max, Y_test_max)

knn_precision_max = precision_score(Y_test_max, Y_pred_max)
knn_recall_max = recall_score(Y_test_max, Y_pred_max)
knn_f1_max = f1_score(Y_test_max, Y_pred_max)



#### Min

In [23]:
# try a different training and testing set
data_visuals_min = load_visuals_data(path_vis_train, agg = 'min')

# merge training data with visuals data
train_min = pd.merge(data_visuals_min, data_training, left_index = True, right_index=True)

# same with testing set
data_visuals_test_min = load_visuals_data(path_vis_test, agg = 'min')

# merge training data with visuals data
test_min = pd.merge(data_visuals_test_min, data_testing, left_index = True, right_index=True)

In [24]:
# train/test split
X_train_min = train_min.iloc[:, :-1]
Y_train_min = train_min.iloc[:,-1]

X_test_min = test_min.iloc[:, :-1]
Y_test_min = test_min.iloc[:,-1]

# KNN
Y_pred_min = KNN(X_train_min, Y_train_min, X_test_min, Y_test_min)

knn_precision_min = precision_score(Y_test_min, Y_pred_min)
knn_recall_min = recall_score(Y_test_min, Y_pred_min)
knn_f1_min = f1_score(Y_test_min, Y_pred_min)



#### Keep only the first row

In [25]:
# try a different training and testing set
data_visuals_first = load_visuals_data(path_vis_train, agg = 'first')

# merge training data with visuals data
train_first = pd.merge(data_visuals_first, data_training, left_index = True, right_index=True)

# same with testing set
data_visuals_test_first = load_visuals_data(path_vis_test, agg = 'first')

# merge training data with visuals data
test_first = pd.merge(data_visuals_test_first, data_testing, left_index = True, right_index=True)

In [26]:
# train/test split
X_train_first = train_first.iloc[:, :-1]
Y_train_first = train_first.iloc[:,-1]

X_test_first = test_first.iloc[:, :-1]
Y_test_first = test_first.iloc[:,-1]

# KNN
Y_pred_first = KNN(X_train_first, Y_train_first, X_test_first, Y_test_first)

knn_precision_first = precision_score(Y_test_first, Y_pred_first)
knn_recall_first = recall_score(Y_test_first, Y_pred_first)
knn_f1_first = f1_score(Y_test_first, Y_pred_first)



#### Keep only the second row

In [27]:
# try a different training and testing set
data_visuals_last = load_visuals_data(path_vis_train, agg = 'last')

# merge training data with visuals data
train_last = pd.merge(data_visuals_last, data_training, left_index = True, right_index=True)

# same with testing set
data_visuals_test_last = load_visuals_data(path_vis_test, agg = 'last')

# merge training data with visuals data
test_last = pd.merge(data_visuals_test_last, data_testing, left_index = True, right_index=True)

In [28]:
# train/test split
X_train_last = train_last.iloc[:, :-1]
Y_train_last = train_last.iloc[:,-1]

X_test_last = test_last.iloc[:, :-1]
Y_test_last = test_last.iloc[:,-1]

# KNN
Y_pred_last = KNN(X_train_last, Y_train_last, X_test_last, Y_test_last)

knn_precision_last = precision_score(Y_test_last, Y_pred_last)
knn_recall_last = recall_score(Y_test_last, Y_pred_last)
knn_f1_last = f1_score(Y_test_last, Y_pred_last)



#### Print Precision, Recall, F1

In [29]:
print("KNN using all available Features")
print("-------------------------------------------------------------")

# none
print("KNN - Keep both rows")

print("Precision of KNN : " + str(knn_precision))
print("Recall of KNN : " + str(knn_recall))
print("F1 of KNN : " + str(knn_f1))
print("-------------------------------------------------------------")

# sum
print("KNN - Keep only columnwise sum")

print("Precision of KNN : " + str(knn_precision_total))
print("Recall of KNN : " + str(knn_recall_total))
print("F1 of KNN : " + str(knn_f1_total))
print("-------------------------------------------------------------")

# mean
print("KNN - Keep only columnwise mean")

print("Precision of KNN : " + str(knn_precision_mean))
print("Recall of KNN : " + str(knn_recall_mean))
print("F1 of KNN : " + str(knn_f1_mean))
print("-------------------------------------------------------------")

# max
print("KNN - Keep only columnwise max")

print("Precision of KNN : " + str(knn_precision_max))
print("Recall of KNN : " + str(knn_recall_max))
print("F1 of KNN : " + str(knn_f1_max))
print("-------------------------------------------------------------")

# min
print("KNN - Keep only columnwise min")

print("Precision of KNN : " + str(knn_precision_min))
print("Recall of KNN : " + str(knn_recall_min))
print("F1 of KNN : " + str(knn_f1_min))
print("-------------------------------------------------------------")

# first
print("KNN - Keep only the first row")

print("Precision of KNN : " + str(knn_precision_first))
print("Recall of KNN : " + str(knn_recall_first))
print("F1 of KNN : " + str(knn_f1_first))
print("-------------------------------------------------------------")

# last
print("KNN - Keep only the last row")

print("Precision of KNN : " + str(knn_precision_last))
print("Recall of KNN : " + str(knn_recall_last))
print("F1 of KNN : " + str(knn_f1_last))
print("-------------------------------------------------------------")

KNN using all available Features
-------------------------------------------------------------
KNN - Keep both rows
Precision of KNN : 0.5263157894736842
Recall of KNN : 0.8333333333333334
F1 of KNN : 0.6451612903225806
-------------------------------------------------------------
KNN - Keep only columnwise sum
Precision of KNN : 0.5161290322580645
Recall of KNN : 0.6666666666666666
F1 of KNN : 0.5818181818181819
-------------------------------------------------------------
KNN - Keep only columnwise mean
Precision of KNN : 0.5161290322580645
Recall of KNN : 0.6666666666666666
F1 of KNN : 0.5818181818181819
-------------------------------------------------------------
KNN - Keep only columnwise max
Precision of KNN : 0.5588235294117647
Recall of KNN : 0.7916666666666666
F1 of KNN : 0.6551724137931034
-------------------------------------------------------------
KNN - Keep only columnwise min
Precision of KNN : 0.5151515151515151
Recall of KNN : 0.7083333333333334
F1 of KNN : 0.59649122

### Try different aggregations for the training and testing set using all features

#### Keep both rows

In [31]:
# set seed
np.random.seed(1)

# get features
features = LVW(KNN, X_train, Y_train, X_test, Y_test, 2, range(0,X_train.shape[1]-1))

# train/test split
X_train_lvw = X_train[X_train.columns.intersection(features)]
X_test_lvw = X_test[X_test.columns.intersection(features)]

# KNN
Y_pred_lvw = KNN(X_train_lvw, Y_train, X_test_lvw, Y_test)

knn_precision_lvw = precision_score(Y_test, Y_pred_lvw)
knn_recall_lvw = recall_score(Y_test, Y_pred_lvw)
knn_f1_lvw = f1_score(Y_test, Y_pred_lvw)

k:  0




k:  0




k:  0




k:  0




k:  1




k:  0




k:  1




#### Keep the columnwise sum

In [32]:
# set seed
np.random.seed(1)

# get features
features_total = LVW(KNN, X_train_total, Y_train_total, X_test_total, Y_test_total, 2,
                     range(0,X_train_total.shape[1]-1))

# train/test split
X_train_lvw_total = X_train_total[X_train_total.columns.intersection(features_total)]
X_test_lvw_total = X_test_total[X_test_total.columns.intersection(features_total)]

# KNN
Y_pred_lvw_total = KNN(X_train_lvw_total, Y_train_total, X_test_lvw_total, Y_test_total)

knn_precision_lvw_total = precision_score(Y_test_total, Y_pred_lvw_total)
knn_recall_lvw_total = recall_score(Y_test_total, Y_pred_lvw_total)
knn_f1_lvw_total = f1_score(Y_test_total, Y_pred_lvw_total)

k:  0




k:  0




k:  0




k:  1




#### Keep only the columnwise mean

In [33]:
# set seed
np.random.seed(1)

# get features
features_mean = LVW(KNN, X_train_mean, Y_train_mean, X_test_mean, Y_test_mean, 2,
                     range(0,X_train_mean.shape[1]-1))

# train/test split
X_train_lvw_mean = X_train_mean[X_train_mean.columns.intersection(features_mean)]
X_test_lvw_mean = X_test_mean[X_test_mean.columns.intersection(features_mean)]

# KNN
Y_pred_lvw_mean = KNN(X_train_lvw_mean, Y_train_mean, X_test_lvw_mean, Y_test_mean)

knn_precision_lvw_mean = precision_score(Y_test_mean, Y_pred_lvw_mean)
knn_recall_lvw_mean = recall_score(Y_test_mean, Y_pred_lvw_mean)
knn_f1_lvw_mean = f1_score(Y_test_mean, Y_pred_lvw_mean)

k:  0




k:  0




k:  1




#### Keep only the columnwise max

In [34]:
# set seed
np.random.seed(1)

# get features
features_max = LVW(KNN, X_train_max, Y_train_max, X_test_max, Y_test_max, 2,
                     range(0,X_train_max.shape[1]-1))

# train/test split
X_train_lvw_max = X_train_max[X_train_max.columns.intersection(features_max)]
X_test_lvw_max = X_test_max[X_test_max.columns.intersection(features_max)]

# KNN
Y_pred_lvw_max = KNN(X_train_lvw_max, Y_train_max, X_test_lvw_max, Y_test_max)

knn_precision_lvw_max = precision_score(Y_test_max, Y_pred_lvw_max)
knn_recall_lvw_max = recall_score(Y_test_max, Y_pred_lvw_max)
knn_f1_lvw_max = f1_score(Y_test_max, Y_pred_lvw_max)

k:  0




k:  0




k:  0




k:  1




k:  0




k:  1




#### Keep only the columnwise min

In [35]:
# set seed
np.random.seed(1)

# get features
features_min = LVW(KNN, X_train_min, Y_train_min, X_test_min, Y_test_min, 2,
                     range(0,X_train_min.shape[1]-1))

# train/test split
X_train_lvw_min = X_train_min[X_train_min.columns.intersection(features_min)]
X_test_lvw_min = X_test_min[X_test_min.columns.intersection(features_min)]

# KNN
Y_pred_lvw_min = KNN(X_train_lvw_min, Y_train_min, X_test_lvw_min, Y_test_min)

knn_precision_lvw_min = precision_score(Y_test_min, Y_pred_lvw_min)
knn_recall_lvw_min = recall_score(Y_test_min, Y_pred_lvw_min)
knn_f1_lvw_min = f1_score(Y_test_min, Y_pred_lvw_min)

k:  0




k:  0




k:  1




k:  0




k:  1




#### Keep only the first row

In [36]:
# set seed
np.random.seed(1)

# get features
features_first = LVW(KNN, X_train_first, Y_train_first, X_test_first, Y_test_first, 2,
                     range(0,X_train_first.shape[1]-1))

# train/test split
X_train_lvw_first = X_train_first[X_train_first.columns.intersection(features_first)]
X_test_lvw_first = X_test_first[X_test_first.columns.intersection(features_first)]

# KNN
Y_pred_lvw_first = KNN(X_train_lvw_first, Y_train_first, X_test_lvw_first, Y_test_first)

knn_precision_lvw_first = precision_score(Y_test_first, Y_pred_lvw_first)
knn_recall_lvw_first = recall_score(Y_test_first, Y_pred_lvw_first)
knn_f1_lvw_first = f1_score(Y_test_first, Y_pred_lvw_first)

k:  0




k:  0




k:  1




#### Keep only the second row

In [37]:
# set seed
np.random.seed(1)

# get features
features_last = LVW(KNN, X_train_last, Y_train_last, X_test_last, Y_test_last, 2,
                     range(0,X_train_last.shape[1]-1))

# train/test split
X_train_lvw_last = X_train_last[X_train_last.columns.intersection(features_last)]
X_test_lvw_last = X_test_last[X_test_last.columns.intersection(features_last)]

# KNN
Y_pred_lvw_last = KNN(X_train_lvw_last, Y_train_last, X_test_lvw_last, Y_test_last)

knn_precision_lvw_last = precision_score(Y_test_last, Y_pred_lvw_last)
knn_recall_lvw_last = recall_score(Y_test_last, Y_pred_lvw_last)
knn_f1_lvw_last = f1_score(Y_test_last, Y_pred_lvw_last)

k:  0




k:  0




k:  1




#### Print Precision, Recall, F1

In [38]:
print("KNN using LVW Feature Selection")
print("-------------------------------------------------------------")

# none
print("KNN - Keep both rows")

print("Precision of KNN : " + str(knn_precision_lvw))
print("Recall of KNN : " + str(knn_recall_lvw))
print("F1 of KNN : " + str(knn_f1_lvw))
print("-------------------------------------------------------------")

# sum
print("KNN - Keep only columnwise sum")

print("Precision of KNN : " + str(knn_precision_lvw_total))
print("Recall of KNN : " + str(knn_recall_lvw_total))
print("F1 of KNN : " + str(knn_f1_lvw_total))
print("-------------------------------------------------------------")

# mean
print("KNN - Keep only columnwise mean")

print("Precision of KNN : " + str(knn_precision_lvw_mean))
print("Recall of KNN : " + str(knn_recall_lvw_mean))
print("F1 of KNN : " + str(knn_f1_lvw_mean))
print("-------------------------------------------------------------")

# max
print("KNN - Keep only columnwise max")

print("Precision of KNN : " + str(knn_precision_lvw_max))
print("Recall of KNN : " + str(knn_recall_lvw_max))
print("F1 of KNN : " + str(knn_f1_lvw_max))
print("-------------------------------------------------------------")

# last
print("KNN - Keep only columnwise min")

print("Precision of KNN : " + str(knn_precision_lvw_min))
print("Recall of KNN : " + str(knn_recall_lvw_min))
print("F1 of KNN : " + str(knn_f1_lvw_min))
print("-------------------------------------------------------------")

# first
print("KNN - Keep only the first row")

print("Precision of KNN : " + str(knn_precision_lvw_first))
print("Recall of KNN : " + str(knn_recall_lvw_first))
print("F1 of KNN : " + str(knn_f1_lvw_first))
print("-------------------------------------------------------------")

# last
print("KNN - Keep only the last row")

print("Precision of KNN : " + str(knn_precision_lvw_last))
print("Recall of KNN : " + str(knn_recall_lvw_last))
print("F1 of KNN : " + str(knn_f1_lvw_last))
print("-------------------------------------------------------------")

KNN using LVW Feature Selection
-------------------------------------------------------------
KNN - Keep both rows
Precision of KNN : 0.4444444444444444
Recall of KNN : 0.3333333333333333
F1 of KNN : 0.380952380952381
-------------------------------------------------------------
KNN - Keep only columnwise sum
Precision of KNN : 0.4782608695652174
Recall of KNN : 0.4583333333333333
F1 of KNN : 0.4680851063829787
-------------------------------------------------------------
KNN - Keep only columnwise mean
Precision of KNN : 0.4782608695652174
Recall of KNN : 0.4583333333333333
F1 of KNN : 0.4680851063829787
-------------------------------------------------------------
KNN - Keep only columnwise max
Precision of KNN : 0.4594594594594595
Recall of KNN : 0.7083333333333334
F1 of KNN : 0.5573770491803279
-------------------------------------------------------------
KNN - Keep only columnwise min
Precision of KNN : 0.5
Recall of KNN : 0.5416666666666666
F1 of KNN : 0.52
----------------------