<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preparation" data-toc-modified-id="Preparation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preparation</a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Load-and-prep-data" data-toc-modified-id="Load-and-prep-data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load and prep data</a></span></li><li><span><a href="#Las-Vegas-Wrapper-for-feature-selection" data-toc-modified-id="Las-Vegas-Wrapper-for-feature-selection-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Las Vegas Wrapper for feature selection</a></span></li></ul></li><li><span><a href="#Learning-algorithms" data-toc-modified-id="Learning-algorithms-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Learning algorithms</a></span></li><li><span><a href="#Reproducing-table-2" data-toc-modified-id="Reproducing-table-2-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Reproducing table 2</a></span></li></ul></div>

# Preparation

## Imports

In [1]:
import csv
import os
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
import time
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

## Load and prep data

In [47]:
def load_audio(trainpath="./data/CoE_dataset_icpr/Dev_Set/audio_descriptors/*",
               testpath="./data/CoE_dataset_icpr/Test_Set/audio_descriptors/*",
               trainrefpath="./data/CoeTraining.csv",
               testrefpath="./data/CoeTestLabels.csv"):
    """ loads all audio data and includes the target variable
    
    kwargs:
        trainpath: path to the folder containing all audio descriptor csv files of the train set
        testpath: path to the folder containing all audio descriptor csv files of the test set
        trainrefpath: path to the csv containing filename, movie name and the target variable for the train set
        testrefpath: path to the csv containing filename, movie name and the target variable for the test set
    """
    
    train = pd.DataFrame()
    test = pd.DataFrame()
    trainfiles = []
    testfiles = []

    # load training data
    for csvpath in tqdm(glob.glob(trainpath), desc='Loading audio train data'):
        trainfiles.append(csvpath.split('/')[-1].split('.csv')[0])
        tmp = pd.DataFrame(pd.read_csv(csvpath, header=None)).mean(axis=1)
        train = train.append(pd.DataFrame(tmp.values.flatten()).transpose())

    # load test data
    for csvpath in tqdm(glob.glob(testpath), desc='Loading audio test data'):
        testfiles.append(csvpath.split('/')[-1].split('.csv')[0])
        tmp = pd.DataFrame(pd.read_csv(csvpath, header=None)).mean(axis=1)
        test = test.append(pd.DataFrame(tmp.values.flatten()).transpose())

    # add filename and target variable
    train['fname'] = trainfiles
    test['fname'] = testfiles
    train = train.merge(pd.read_csv(trainrefpath)[['file_name', 'goodforairplanes']], left_on='fname', right_on='file_name').drop(columns=['file_name'])
    test = test.merge(pd.read_csv(testrefpath)[['file_name', 'goodforairplanes']], left_on='fname', right_on='file_name').drop(columns=['file_name'])

    # set file name as index
    train.set_index(['fname'], inplace=True)
    test.set_index(['fname'], inplace=True)

    # replace NAs in audio
    train = train.fillna(0)
    test = test.fillna(0)

    return train, test

In [48]:
def load_text(trainpath = "./data/CoE_dataset_icpr/Dev_Set/text_descriptors/tdf_idf_dev.csv",
              testpath = "./data/CoE_dataset_icpr/Test_Set/text_descriptors/tdf_idf_test.csv",
              trainrefpath="./data/CoeTraining.csv",
              testrefpath="./data/CoeTestLabels.csv"):
    """ loads text dataset and returns train and test DFs"""

    train = pd.read_csv(trainpath).T.dropna().reset_index(drop=True)
    test = pd.read_csv(testpath).T.dropna().reset_index(drop=True)
    trainref = pd.read_csv(trainrefpath).reset_index(drop=True)
    testref = pd.read_csv(testrefpath).reset_index(drop=True)
    
    train = train.join(trainref).drop(columns=['movie_name', "file_name"])
    test = test.join(testref).drop(columns=['movie_name', "file_name"])
    
    return train, test

In [49]:
audio_train, audio_test = load_audio()
text_train, text_test = load_text()

audio = audio_train.append(audio_test)

Loading audio train data: 100%|██████████| 95/95 [00:51<00:00,  1.83it/s]
Loading audio test data: 100%|██████████| 223/223 [01:58<00:00,  1.88it/s]


In [50]:
text_train.isna().any().any()

False

In [51]:
text_test#.isna().any().any()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6308,6309,6310,6311,6312,6313,6314,6315,6316,goodforairplanes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024316,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,
219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,
220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,
221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,


In [52]:
audio_train.isna().any().any()

False

In [53]:
audio_test.isna().any().any()

False

## Las Vegas Wrapper for feature selection

In [54]:
def LVW(tX, ty, vX, vy, K, original_features,
       classifier = {'KNN', 'Decision tree', 'Logistic Regression', 'SVM',
                     'Random forest', 'AdaBoost', 'Gradient Boosting Tree'}):
    
    
    acc = 0
    k = 0
    C = len(original_features)
    
    while k < K:
        #print('k: ', k)
        ran_choice = range(1,len(original_features)-1)
        S1 = random.sample(original_features, random.choice(ran_choice))
        C1 = len(S1)
        
        x_train = tX[tX.columns.intersection(S1)]
        x_test = vX[vX.columns.intersection(S1)]
        
        acc1 = Classifier(x_train, ty, x_test, vy, 10, classifier)['F1']
        
        if (acc1 > acc) or (acc1 == acc and C1 < C):
            k = 0
            acc = acc1
            C = C1
            S = S1
        
        else:
            k += 1
            
    return S

# Learning algorithms

In [55]:
def seperate_tvar(data, target_var='goodforairplanes'):
    """ seperate target variable from the rest of the dataset
    kwargs:
        data: dataframe containing dependent and independent variables
        target_var: column name of the target variable as a string
    """
    if target_var in data.columns:
        data_y = data[target_var]
        data_x = data.drop(columns=target_var)

    return data_y, data_x

In [56]:
# dict with classifier name: time spent for fitting and classifier obejcts
classifier = {"KNN": KNeighborsClassifier(),
              "Decision tree": DecisionTreeClassifier(),
              "SVM": SVC(),
              "Bagging": BaggingClassifier(),
              "Random forest": RandomForestClassifier(),
              "AdaBoost": AdaBoostClassifier(),
              "Naive bayes": GaussianNB(),
              "Logistic Regression": LogisticRegression(),
              "Gradient Boosting Tree": GradientBoostingClassifier()
              }

# seperate tvar for full audio datasets
audio_y, audio_x = seperate_tvar(audio)

# seperate tvar for tvt splits
audio_train_y, audio_train_x = seperate_tvar(audio_train)
audio_test_y, audio_test_x = seperate_tvar(audio_test)

text_train_y, text_train_x = seperate_tvar(text_train)
text_test_y, text_test_x = seperate_tvar(text_train)

# tables to store our results for table 2
table2_train, table2_full, table3_train = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
filler_audio_train, filler_audio_test = {}, {}
filler_text_train, filler_text_test = {}, {}

for c in classifier:
#     # apply LVW feature selection on train_x and test_x
#     if c is not "Random forest":
#         for set in [(audio_train_x, audio_train_y, audio_test_x, audio_test_y), (text_train_x, text_train_y, text_test_x, text_test_y)]:
#             features = LVW(set[0], set[1], set[2], set[3], 5, set[0].columns.to_list(), c)
#             train_x = set[0][set[0].columns.intersection(features)]
#             test_x = set[3][set[3].columns.intersection(features)]

    model = classifier[c]

    # recreating the audio results
#     yhat_audio_full = cross_validate(model, audio_x, audio_y, scoring=['precision', 'recall', 'f1'], cv=10)
    score_audio_train = cross_validate(model, audio_train_x, audio_train_y, scoring=['precision', 'recall', 'f1'], cv=10)
    yhat_audio_train = cross_val_predict(model, audio_train_x, audio_train_y, cv=10)
    yhat_audio_test = cross_val_predict(model, audio_test_x, audio_test_y, cv=10)
    
#     table2_full = table2_full.append([[c, 'Reproduced',
#                                        np.mean(yhat_audio_full['test_precision']),
#                                        np.mean(yhat_audio_full['test_recall']),
#                                        np.mean(yhat_audio_full['test_f1']),
#                                        np.mean(yhat_audio_full['fit_time']),
#                                        'Audio']])
    table2_train= table2_train.append([[c, 'Reproduced',
                                       np.mean(score_audio_train['test_precision']),
                                       np.mean(score_audio_train['test_recall']),
                                       np.mean(score_audio_train['test_f1']),
                                       np.mean(score_audio_train['fit_time']),
                                       'Audio']])
    filler_audio_train[c] = yhat_audio_train
    filler_audio_test[c] = yhat_audio_test
    
    # recreating the textual results
    score_text_train = cross_validate(model, text_train_x, text_train_y, scoring=["precision", "recall", "f1"], cv=10)
    yhat_text_train = cross_val_predict(model, text_train_x, text_train_y, cv=10)
    yhat_text_test = cross_val_predict(model, text_test_x, text_test_y, cv=10)
    
    table2_train= table2_train.append([[c, 'Reproduced',
                                       np.mean(score_text_train['test_precision']),
                                       np.mean(score_text_train['test_recall']),
                                       np.mean(score_text_train['test_f1']),
                                       np.mean(score_text_train['fit_time']),
                                       'Textual']])
    filler_text_train[c] = yhat_text_train
    filler_text_test[c] = yhat_text_test

    
# add audio results from the paper for comparison
table2_train = table2_train.append([['Logistic Regression', 'Paper', 0.507, 0.597, 0.546, np.nan, 'Audio'],
                                    ['Gradient Boosting Tree', 'Paper', 0.560, 0.617, 0.587, np.nan, 'Audio']])

# table2_full = table2_full.append([['Logistic Regression', 'Paper', 0.507, 0.597, 0.546, np.nan, 'Audio'],
#                                   ['Gradient Boosting Tree', 'Paper', 0.560, 0.617, 0.587, np.nan, 'Audio']])

# add textual results from the paper for comparison
table2_train = table2_train.append([['Naive Bayes', 'Paper', 0.545, 0.987, 0.702, np.nan, 'Textual'],
                                    ['k-Nearest neighbor', 'Paper', 0.549, 0.844, 0.666, np.nan, 'Textual'],
                                    ['SVM', 'Paper', 0.547, 1.000, 0.700, np.nan, 'Textual']])

# table2_full = table2_full.append([['Naive Bayes', 'Paper', 0.545, 0.987, 0.702, np.nan, 'Textual'],
#                                   ['k-Nearest neighbor', 'Paper', 0.549, 0.844, 0.666, np.nan, 'Textual'],
#                                   ['SVM', 'Paper', 0.547, 1.000, 0.700, np.nan, 'Textual']])


# name columns and add multilevel idx
table2_train.rename(columns={0: 'Algorithm', 1: 'Source', 2: 'Precision', 3: 'Recall',
                             4: 'F1', 5: 'Training time (s)', 6: 'Modality'}, inplace=True)

table2_train = table2_train.sort_values(by=["Modality", "Source", "Algorithm"]) # sort for multilevel idx to work

savefile = os.path.join("data", "out", "table2_audio_textual.csv")
if not os.path.isdir(os.path.split(savefile)[-2]):
    os.mkdir(os.path.split(savefile)[-2])
table2_train[table2_train['F1']>0.5].to_csv(savefile, index=False)

table2_train.set_index(['Modality', 'Source', 'Algorithm'], inplace=True)

# table2_full.rename(columns={0: 'Algorithm', 1: 'Source', 2: 'Precision', 3: 'Recall',
#                             4: 'F1', 5: 'Training time (s)', 6: 'Modality'}, inplace=True)
# table2_full.set_index(['Modality', 'Source', 'Algorithm'], inplace=True)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Reproducing table 3

In [57]:
table3_audio_train = pd.DataFrame.from_dict(filler_audio_train)
table3_audio_train = table3_audio_train.join(pd.read_csv("./data/CoeTraining.csv")[["goodforairplanes", "movie_name"]])
table3_audio_train.to_csv("./out/audio_yhat_train.csv", index=False)

table3_audio_test = pd.DataFrame.from_dict(filler_audio_test)
table3_audio_test = table3_audio_test.join(pd.read_csv("./data/CoE_dataset_icpr/Dev_Set/CoeDevelopmentTestdata.csv")[["goodforairplanes", "movie_name"]])
table3_audio_test.to_csv("./out/audio_yhat_test.csv", index=False)

table3_text_train = pd.DataFrame.from_dict(filler_text_train)
table3_text_train = table3_text_train.join(pd.read_csv("./data/CoeTestLabels.csv")[["goodforairplanes", "movie_name"]])
table3_text_train.to_csv("./out/text_yhat_train.csv", index=False)

table3_text_test = pd.DataFrame.from_dict(filler_text_test)
table3_text_test = table3_text_test.join(pd.read_csv("./data/CoeTestLabels.csv")[["goodforairplanes", "movie_name"]])
table3_text_test.to_csv("./out/text_yhat_test.csv", index=False)

## Reproducing table 2

- **Textual:** We were able to perform pretty similar to the paper's authors as well, except for the Naive Bayes classifier, with which we only achieved 0.466941 (which would thus not be taken into consideration by the authors' baseline of random guessing $\rightarrow 0,5$ compared to the paper's author's $0.702000$.
- **Audio:** We performed really well for the audio modality, achieving scores extremely close to those specified in the paper.

> This can be seen in the following table, which can be reproduced (see below) via `table2_train[table2_train['F1']>0.5]`:

Algorithm|Source|Precision|Recall|F1|Training time (s)|Modality
--|--|--|--|--|--|--
Gradient Boosting Tree|Paper|0.56|0.617|0.587||Audio
Logistic Regression|Paper|0.507|0.597|0.546||Audio
AdaBoost|Reproduced|0.5616666666666668|0.5666666666666667|0.558888888888889|0.04517538547515869|Audio
Bagging|Reproduced|0.5216666666666667|0.4333333333333334|0.4688888888888889|0.01115422248840332|Audio
Decision tree|Reproduced|0.4783333333333334|0.37|0.3927272727272727|0.0014510393142700196|Audio
Gradient Boosting Tree|Reproduced|0.5445238095238095|0.5466666666666666|0.5385858585858585|0.03991997241973877|Audio
KNN|Reproduced|0.5083333333333333|0.52|0.4996969696969697|0.0010471582412719727|Audio
Logistic Regression|Reproduced|0.5114285714285713|0.5633333333333334|0.5292968142968143|0.0017587661743164063|Audio
Naive bayes|Reproduced|0.5166666666666667|0.38333333333333336|0.42936507936507934|0.0011977195739746095|Audio
Random forest|Reproduced|0.4716666666666667|0.37|0.40969696969696967|0.011317920684814454|Audio
SVM|Reproduced|0.46761904761904766|0.6|0.5221445221445222|0.0013969898223876952|Audio
Naive Bayes|Paper|0.545|0.987|0.702||Textual
SVM|Paper|0.547|1.0|0.7||Textual
k-Nearest neighbor|Paper|0.549|0.844|0.666||Textual
AdaBoost|Reproduced|0.628095238095238|0.7|0.6487229437229438|0.11005747318267822|Textual
Bagging|Reproduced|0.7095238095238094|0.6500000000000001|0.639891774891775|0.05156505107879639|Textual
Decision tree|Reproduced|0.5327380952380952|0.6566666666666665|0.5773504273504274|0.009209346771240235|Textual
Gradient Boosting Tree|Reproduced|0.5858333333333333|0.7299999999999999|0.6458275058275058|0.24995677471160888|Textual
KNN|Reproduced|0.5543650793650793|0.9633333333333333|0.7020604395604396|0.002978992462158203|Textual
Logistic Regression|Reproduced|0.5378787878787878|1.0|0.6991596638655462|0.0037820100784301757|Textual
Naive bayes|Reproduced|0.512857142857143|0.4533333333333334|0.46694083694083705|0.004088997840881348|Textual
Random forest|Reproduced|0.5346031746031745|0.6666666666666666|0.5748934398934399|0.013343906402587891|Textual
SVM|Reproduced|0.5378787878787878|1.0|0.6991596638655462|0.020474004745483398|Textual

In [13]:
table2_train[table2_train['F1']>0.5]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Precision,Recall,F1,Training time (s)
Modality,Source,Algorithm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Audio,Paper,Gradient Boosting Tree,0.56,0.617,0.587,
Audio,Paper,Logistic Regression,0.507,0.597,0.546,
Audio,Reproduced,AdaBoost,0.561667,0.566667,0.558889,0.043517
Audio,Reproduced,Gradient Boosting Tree,0.555,0.55,0.547071,0.039629
Audio,Reproduced,Logistic Regression,0.511429,0.563333,0.529297,0.00148
Audio,Reproduced,SVM,0.467619,0.6,0.522145,0.001436
Textual,Paper,Naive Bayes,0.545,0.987,0.702,
Textual,Paper,SVM,0.547,1.0,0.7,
Textual,Paper,k-Nearest neighbor,0.549,0.844,0.666,
Textual,Reproduced,AdaBoost,0.675238,0.68,0.634177,0.103605
