In [25]:
import sklearn 
import pandas as pd
import xml.etree.ElementTree as ET
import random
import sys

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from math import sqrt
from scipy.stats import t


__First of all they did not mentioned what sklearn version they used!!!__


As described in the paper, the first step is to select base classifiers. 
The selected base classifiers are trained with default parameter settings with 10-fold cross-validation.
As input data, the training data set and its ground truth labels, per single modality is used.
For the audio MFCC features, we set NaN values to 0, and calculate the average of each MFCC coefficient over all frames.

# Load input data


# Description:

## Available Data
There are variouse csv files and data files available. It's very messy. 
There is one file called "CoE_dataset_offical_release.zip"! 
We extract this file and use this data included their for now! 

## Meta Data
In the original paper there is no information given what is included in the metadata. 
Looking at the paper describing the data set (Right Inflight? A Dataset for Exploring the Automatic Prediction of Movies Suitable for a Watching Situation
) we found out that as metadata they used language, year published, genre, country, runtime and age rating. We assume, since the author of our paper didn't say otherwise, that they used the same metadata. 

## Visual Data: 
The visual data is provied as a csv file for each movie, containing two rows. According to the paper of the dataset they calculated following visual features, Histogram of Oriented Gradients (HOG) gray, Color Moments, local binary patterns (LBP) and Gray Level Run Length Matrix, but don't say how the csv file represents them. Also as mentioned the csv file just has two rows which would not ad up to the mentioned 4 visual features. __We are treating all values as seperate column!__

## Audio Data: 
Audio features is also provided per movie as a csv file. Each audio feature consits of 12 coefficients for multiple frames.

## Textual Data
The textual data is just one file containing the tdf-idf matrix. The first line are the row names for each word. 
While the columns are the associated movie. __There is no indication to which movie each column belongs! Thus we need to assume this!__

__For now we assume the order is the same as in the df_labled_movies dataframe!!!__



In [2]:

df_labled_movies = pd.read_csv("./data/CoE_dataset/Dev_set/dev_set_groundtruth_and_trailers.csv", sep=';')
del df_labled_movies['trailer']
df_labled_movies = df_labled_movies[['movie','filename', 'goodforairplane']]
display(df_labled_movies.head(10))


### Load Meta Data ###

def load_meta_data( filenames ): 
    
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/XML/{file}.xml'
        with open(file_path) as f: 
            tree = ET.parse(f)
            movie = tree.find('movie')
            
            lang = movie.get('language')
            year = movie.get('year')
            genre = movie.get('genre')
            country = movie.get('country')
            runtime = movie.get('runtime')
            age_rating = movie.get('rated')
             
            raw_data.append( (file,lang,year,genre,country,runtime,age_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','language','year','genre','country','runtime','rated'])


df_meta_data = load_meta_data( df_labled_movies['filename']  )
display(df_meta_data.head(10))
display(df_meta_data.dtypes)

### Load Visual Data ###

def load_visual_data( filenames ):
    data_list = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/vis_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None)
        data_list.append(df_data)
        
    return pd.concat(data_list, axis = 0, keys = filenames,names=('filename','vis_data'),  sort=False)

df_visual_data = load_visual_data( df_labled_movies['filename']  )
display(df_visual_data.head(10))

### Load Audio Data ###

def load_audio_data( filenames ):
    data_list = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/audio_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None)
        data_list.append(df_data)
        
    return pd.concat(data_list, axis = 0, keys = filenames,names=('filename','freq_coeff'),  sort=False)

df_audio_data = load_audio_data( df_labled_movies['filename']  )
display(df_audio_data.head(20))


### Load textual Data ###

def load_text_data(filenames):
    

    data_list = []
    file_path = f'./data/CoE_dataset/Dev_set/text_descriptors/tdf_idf_dev.csv'
    #somehow pandas can not really handle that the first line is row names.(at least I didn't find a better way) 
    # thus we do it a little complicated here
    header_index = pd.read_csv(file_path, index_col=0,nrows=1 ).reset_index().columns
    df_data = pd.read_csv(file_path, header=None, index_col=False,skiprows=1)
    df_data.set_index(header_index, inplace=True)
    df_data.columns = filenames
    return df_data.T #row are should be represented by movie names

df_text_data = load_text_data(df_labled_movies['filename'] )
display(df_text_data.head(20))
display(df_text_data.shape)
display(df_text_data.describe())



Unnamed: 0,movie,filename,goodforairplane
0,Seventh Son,Seventh_Son,1
1,Welcome to Me,Welcome_to_Me,0
2,The Judge,The_Judge,0
3,Transformers Age of Extinction,Transformers__Age_of_Extinction,0
4,The Normal Heart,The_Normal_Heart,1
5,The Phantom Tollbooth,The_Phantom_Tollbooth,1
6,Andaz Apna Apna,Andaz_Apna_Apna,1
7,Hotel Transylvania,Hotel_Transylvania,1
8,The Matrix,The_Matrix,1
9,Into the Wild,Into_the_Wild,1


Unnamed: 0,filename,language,year,genre,country,runtime,rated
0,Seventh_Son,English,2014,"Action, Adventure, Fantasy","USA, UK, Canada, China",102 min,PG-13
1,Welcome_to_Me,English,2014,"Comedy, Drama",USA,105 min,R
2,The_Judge,English,2014,Drama,USA,141 min,R
3,Transformers__Age_of_Extinction,English,2014,"Action, Adventure, Sci-Fi","USA, China",165 min,PG-13
4,The_Normal_Heart,English,2014,Drama,USA,132 min,TV-MA
5,The_Phantom_Tollbooth,English,1970,"Family, Adventure, Animation",USA,90 min,G
6,Andaz_Apna_Apna,Hindi,1994,"Comedy, Family, Romance",India,160 min,PG
7,Hotel_Transylvania,English,2012,"Animation, Comedy, Family",USA,91 min,PG
8,The_Matrix,English,1999,"Action, Sci-Fi","USA, Australia",136 min,R
9,Into_the_Wild,"English, Danish",2007,"Adventure, Biography, Drama",USA,148 min,R


filename    object
language    object
year        object
genre       object
country     object
runtime     object
rated       object
dtype: object

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,816,817,818,819,820,821,822,823,824,825
filename,vis_data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Seventh_Son,0,0.047044,0.11619,0.13633,0.066194,0.072554,0.17267,0.21519,0.070574,0.071423,0.14938,...,731.69,502.01,1.897,2.2788,2.1412,2.9504,91672.0,22207.0,26201.0,14542.0
Seventh_Son,1,0.056526,0.12516,0.14628,0.082497,0.079331,0.17538,0.21839,0.093521,0.074837,0.15025,...,689.95,474.97,2.2676,2.5887,2.4022,3.2167,81373.0,21045.0,24225.0,13529.0
Welcome_to_Me,0,0.30717,0.33422,0.33112,0.33124,0.31114,0.33644,0.33616,0.34479,0.16983,0.27379,...,394.34,167.91,20.337,21.276,18.527,21.189,81665.0,13672.0,32531.0,13753.0
Welcome_to_Me,1,0.30466,0.33193,0.33124,0.33138,0.30788,0.3327,0.33357,0.34305,0.1733,0.28076,...,397.26,168.23,20.426,21.3,18.608,21.182,83171.0,13714.0,32774.0,13780.0
The_Judge,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,230400.0,119950.0,1e-06,0.002466,4e-06,0.002466,729320.0,119950.0,230400.0,119950.0
The_Judge,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,230400.0,119950.0,1e-06,0.002466,4e-06,0.002466,729320.0,119950.0,230400.0,119950.0
Transformers__Age_of_Extinction,0,0.19996,0.26934,0.27986,0.23725,0.30844,0.33242,0.32998,0.325,0.30735,0.33431,...,1112.6,668.67,15.79,14.923,15.017,14.779,208630.0,23968.0,47979.0,24059.0
Transformers__Age_of_Extinction,1,0.18913,0.25738,0.27465,0.23664,0.30332,0.32989,0.32888,0.32246,0.30543,0.33551,...,1120.6,669.56,15.086,14.7,14.859,14.723,211630.0,24019.0,48339.0,24090.0
The_Normal_Heart,0,0.0,0.0,0.0,0.0,0.038749,0.083701,0.10544,0.1215,0.038749,0.083701,...,34463.0,20376.0,1.3683,7.3447,8.0146,7.3798,145760.0,20730.0,35320.0,20831.0
The_Normal_Heart,1,0.0,0.0,0.0,0.0,0.20135,0.2979,0.39682,0.55336,0.20135,0.2979,...,41786.0,19786.0,13.071,11.296,11.202,11.306,79962.0,20617.0,45216.0,20738.0


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,23078,23079,23080,23081,23082,23083,23084,23085,23086,23087
filename,freq_coeff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Seventh_Son,0,,,,,,,,-51.235,-25.775,-17.41,...,,,,,,,,,,
Seventh_Son,1,,,,,,,,4.7601,10.414,15.935,...,,,,,,,,,,
Seventh_Son,2,,,,,,,,-8.6519,-6.1667,-7.3772,...,,,,,,,,,,
Seventh_Son,3,,,,,,,,-8.1397,-8.0911,-14.568,...,,,,,,,,,,
Seventh_Son,4,,,,,,,,-1.7245,2.1968,1.145,...,,,,,,,,,,
Seventh_Son,5,,,,,,,,0.93079,9.8801,15.528,...,,,,,,,,,,
Seventh_Son,6,,,,,,,,-2.2074,4.473,6.5692,...,,,,,,,,,,
Seventh_Son,7,,,,,,,,-2.6355,-1.6751,-6.13,...,,,,,,,,,,
Seventh_Son,8,,,,,,,,-0.3302,1.2823,-0.95568,...,,,,,,,,,,
Seventh_Son,9,,,,,,,,0.25014,7.6977,11.972,...,,,,,,,,,,


Unnamed: 0_level_0,24000,baby,baseball,big,doc,escort,frozen,heroes,high,huck,...,years.1,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Seventh_Son,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Welcome_to_Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The_Judge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Transformers__Age_of_Extinction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The_Normal_Heart,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.051657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The_Phantom_Tollbooth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Andaz_Apna_Apna,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Hotel_Transylvania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.041679,0.0,0.0,0.0,0.0
The_Matrix,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.15957,0.15957,0.0,0.0,0.0,0.0,0.0
Into_the_Wild,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(95, 3283)

Unnamed: 0,24000,baby,baseball,big,doc,escort,frozen,heroes,high,huck,...,years.1,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists
count,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,...,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0
mean,0.0,0.002288,0.001108,0.000353,0.0,0.0,0.0,0.000753,0.001395,0.0,...,0.003679,0.002638,0.0,0.006366,0.006366,0.001488,0.0,0.000531,0.0,0.0
std,0.0,0.019976,0.010803,0.003444,0.0,0.0,0.0,0.007335,0.008181,0.0,...,0.013408,0.013584,0.0,0.021099,0.021099,0.008699,0.0,0.005178,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.19349,0.10529,0.033572,0.0,0.0,0.0,0.071495,0.062724,0.0,...,0.073645,0.092014,0.0,0.15957,0.15957,0.067395,0.0,0.050467,0.0,0.0


# Preprocess Data

## Description 

Beside a short description for the audio data there is no more information on how to handle the other data. For example the runtime currently is not handles as a number but as a string(object)
Since sklearn mostly expects numerical inputs, we need to encode the data. 

For different class normally you would use one-hot-encoding, but since it's not specified let's try first the easiest approach which is Labelencoding.


### Audio Data: 
As mentiones in the paper, NaN values of the audio data are set to 0 and the average of each MFCC coefficient is calculated over all frames.





In [3]:

def pre_process_audio_data():
    df_data = df_audio_data.fillna(0.0)
    return df_data.mean(axis=1)
    
def pre_process_visual_data():
    #create columns of the two rows belonging to each movie
    df_data = df_visual_data.unstack()
    return df_data
    
    
df_audio_data_processed = pre_process_audio_data()
display(df_audio_data_processed.head(20))

df_visual_data_processed = pre_process_visual_data()
display(df_visual_data_processed.head(20))

filename       freq_coeff
Seventh_Son    0             33.737346
               1             -2.259660
               2              0.822080
               3             -0.298483
               4              0.680520
               5             -0.679905
               6              0.085080
               7             -0.249879
               8             -0.025137
               9             -0.134721
               10            -0.116094
               11            -0.098648
               12             0.066234
Welcome_to_Me  0             39.561047
               1             -4.593651
               2             -0.709224
               3             -1.020713
               4              0.160524
               5              0.001964
               6             -1.487054
dtype: float64

Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,...,821,821,822,822,823,823,824,824,825,825
vis_data,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
filename,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Seventh_Son,0.047044,0.056526,0.11619,0.12516,0.13633,0.14628,0.066194,0.082497,0.072554,0.079331,...,2.9504,3.2167,91672.0,81373.0,22207.0,21045.0,26201.0,24225.0,14542.0,13529.0
Welcome_to_Me,0.30717,0.30466,0.33422,0.33193,0.33112,0.33124,0.33124,0.33138,0.31114,0.30788,...,21.189,21.182,81665.0,83171.0,13672.0,13714.0,32531.0,32774.0,13753.0,13780.0
The_Judge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002466,0.002466,729320.0,729320.0,119950.0,119950.0,230400.0,230400.0,119950.0,119950.0
Transformers__Age_of_Extinction,0.19996,0.18913,0.26934,0.25738,0.27986,0.27465,0.23725,0.23664,0.30844,0.30332,...,14.779,14.723,208630.0,211630.0,23968.0,24019.0,47979.0,48339.0,24059.0,24090.0
The_Normal_Heart,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038749,0.20135,...,7.3798,11.306,145760.0,79962.0,20730.0,20617.0,35320.0,45216.0,20831.0,20738.0
The_Phantom_Tollbooth,0.016911,0.068953,0.014269,0.099843,0.016807,0.12155,0.031862,0.17175,0.029332,0.22677,...,0.004375,0.87925,230400.0,17662.0,38355.0,1955.1,73984.0,6105.2,38355.0,2000.1
Andaz_Apna_Apna,0.0,0.0,0.29416,0.29452,0.29007,0.2904,0.011351,0.011381,0.10093,0.10176,...,11.133,11.156,66358.0,66445.0,27000.0,27016.0,60473.0,60427.0,20441.0,20459.0
Hotel_Transylvania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00247,0.00247,725900.0,725900.0,119790.0,119790.0,230400.0,230400.0,119790.0,119790.0
The_Matrix,0.0,0.11486,0.0,0.20847,0.0,0.22465,0.0,0.24525,0.0,0.2825,...,0.003916,1.8843,230400.0,22739.0,55609.0,3850.1,129600.0,6361.4,55609.0,3976.7
Into_the_Wild,0.0,0.24998,0.0,0.32256,0.0,0.31273,0.0,0.24374,0.0,0.25535,...,0.004386,15.352,230400.0,26102.0,37959.0,3523.4,72900.0,6113.7,37959.0,3568.1


# Define Models

## Description 
These are the models described in the paper. It is not allways clear which exact models they used. (see comments)

In [4]:
from  sklearn.neighbors import KNeighborsClassifier, NearestCentroid #(not sure if this is the nearest mean classifiert) 
from  sklearn.tree import DecisionTreeClassifier
from  sklearn.linear_model import LogisticRegression
from  sklearn.svm import SVC #(not clear which SVC, there is also NuSVC )
from  sklearn.ensemble import BaggingClassifier
from  sklearn.ensemble import AdaBoostClassifier
from  sklearn.ensemble import GradientBoostingClassifier
from  sklearn.ensemble import RandomForestClassifier
from  sklearn.naive_bayes import GaussianNB # there are 3 different naive bayes classifiers, it is not stated which one they used 


model_list = [KNeighborsClassifier(),
              NearestCentroid(),
                    DecisionTreeClassifier(),
                    LogisticRegression(),
                    SVC(),
                    BaggingClassifier(),
                    AdaBoostClassifier(),
                    GradientBoostingClassifier(),
                    RandomForestClassifier(),
                    GaussianNB() 
                   ]

    


# Define Performance measures:

As mentioned in the paper the performant measueres are the following Precision and Recall and F1-Score. To be more precise the weighted average of Precision and Recall and F1-Score as stated in the dataset paper. 

In [5]:
from sklearn.model_selection import cross_validate

def calculate_metrics(clf,X,y ):
    metric =  cross_validate(clf, X, y, scoring=('precision_weighted','recall_weighted','f1_weighted'), return_train_score=False, cv=10)  
    return pd.Series({'precision':metric['test_precision_weighted'].mean(),'recall':metric['test_recall_weighted'].mean(),'F1':metric['test_f1_weighted'].mean() })

# Select Models

As defined in the paper they use 10-fold CV on the classifiers for training and keep all the classifiers where the metrics are above 0.5 for later stacking.


In [6]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

class MultiColumnLabelEncoder:
    
    def __init__(self, columns = None):
        self.columns = columns # list of column to encode

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        
        output = X.copy()
        
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        
        return output

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
    
def getModelName( object ): 

    if hasattr(object, '__module__') and hasattr(object, '__name__'):
        return  object.__name__
    elif hasattr(object, '__module__') and hasattr(object, '__class__'):
        return  object.__class__.__name__
    else:
        raise TypeError("Could not get name of object!")
    
def evaluate_models( X, y ):
    metrics = pd.DataFrame()

    for model in model_list:
        random.seed(123)
        m = calculate_metrics(model,X,y )
        metrics[getModelName(model)] = m

    return metrics.T


df_final_results = pd.DataFrame()

import warnings
warnings.filterwarnings('ignore')

In [7]:
    
df_train = pd.merge(df_labled_movies,df_meta_data, on='filename')
df_train.drop(['movie', 'filename'],axis=1, inplace=True)
display(df_train.head(2))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']



display("----  Lable encoded ----")
label_encoder = MultiColumnLabelEncoder(['language','year','genre','country','runtime','rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
display(metrics)

#convert runtime and year to actual number
df_X['runtime'] = df_X['runtime'].apply(lambda x: int(x.split(' ')[0]) )
df_X['year'] =  df_X['year'].apply(pd.to_numeric)

display("---- Lable encoded with float for year and runtime ----")
##optimizing encoding
label_encoder = MultiColumnLabelEncoder(['language','year','genre','country','rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
display(metrics)

display("---- Lable encoded without year ----")
label_encoder = MultiColumnLabelEncoder(['language','genre','country','rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
display(metrics)

# save the best of the for the final table 
metrics['Modality'] = 'metadata'
df_final_results = df_final_results.append(metrics)


display("---- OneHot Encoding ----")
##optimizing encoding further
X_onehotencoded = pd.get_dummies(df_X)
metrics = evaluate_models(X_onehotencoded, df_y)
display(metrics)


Unnamed: 0,goodforairplane,language,year,genre,country,runtime,rated
0,1,English,2014,"Action, Adventure, Fantasy","USA, UK, Canada, China",102 min,PG-13
1,0,English,2014,"Comedy, Drama",USA,105 min,R


'----  Lable encoded ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.5194,0.566162,0.52335
NearestCentroid,0.602922,0.583333,0.576202
DecisionTreeClassifier,0.507718,0.499596,0.482022
LogisticRegression,0.587637,0.585556,0.574014
SVC,0.297467,0.536869,0.382622
BaggingClassifier,0.470842,0.464242,0.453609
AdaBoostClassifier,0.516002,0.501717,0.490721
GradientBoostingClassifier,0.499794,0.500707,0.484501
RandomForestClassifier,0.503668,0.504646,0.487153
GaussianNB,0.47488,0.506667,0.481515


'---- Lable encoded with float for year and runtime ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.657535,0.618586,0.600698
NearestCentroid,0.466408,0.479091,0.465068
DecisionTreeClassifier,0.337437,0.347172,0.336767
LogisticRegression,0.512315,0.52404,0.493776
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.511906,0.512828,0.495845
AdaBoostClassifier,0.466827,0.474444,0.462241
GradientBoostingClassifier,0.421168,0.422929,0.407376
RandomForestClassifier,0.480388,0.494646,0.465267
GaussianNB,0.467194,0.499798,0.47404


'---- Lable encoded without year ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.632037,0.619697,0.589233
NearestCentroid,0.466408,0.479091,0.465068
DecisionTreeClassifier,0.37831,0.389596,0.371869
LogisticRegression,0.545546,0.549293,0.529509
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.494615,0.477576,0.462162
AdaBoostClassifier,0.466827,0.474444,0.462241
GradientBoostingClassifier,0.429316,0.43202,0.41589
RandomForestClassifier,0.526445,0.522828,0.510247
GaussianNB,0.517396,0.539798,0.499798


'---- OneHot Encoding ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.574384,0.554242,0.544317
NearestCentroid,0.392575,0.405354,0.387835
DecisionTreeClassifier,0.479864,0.478485,0.462399
LogisticRegression,0.335283,0.402727,0.359249
SVC,0.389456,0.453333,0.413346
BaggingClassifier,0.38462,0.409798,0.384517
AdaBoostClassifier,0.425618,0.416061,0.397258
GradientBoostingClassifier,0.323559,0.378485,0.341301
RandomForestClassifier,0.3438,0.386667,0.354105
GaussianNB,0.367316,0.400707,0.347878


In [8]:
from sklearn.preprocessing import Normalizer

################## Use textual data  ###################
display('################## Use textual data  ###################')

df_movies = df_labled_movies.drop(['movie'],axis=1)
df_train = pd.merge(df_movies,df_text_data, on='filename')
df_train.drop(['filename'],axis=1, inplace=True)
display(df_train.head(2))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']


display("---- RAW Data ----")
metrics = evaluate_models(df_X, df_y)
display(metrics)

# save  the final table 
metrics['Modality'] = 'textual'
df_final_results = df_final_results.append(metrics)

display("---- Normalize Data ----")
df_normalized_X = Normalizer().fit_transform(df_X)
metrics = evaluate_models(df_normalized_X, df_y)
display(metrics)




'################## Use textual data  ###################'

Unnamed: 0,goodforairplane,24000,baby,baseball,big,doc,escort,frozen,heroes,high,...,years.1,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


'---- RAW Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.341994,0.465152,0.366055
NearestCentroid,0.452087,0.561111,0.461862
DecisionTreeClassifier,0.483368,0.502424,0.462482
LogisticRegression,0.300554,0.54798,0.388116
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.499774,0.493131,0.459634
AdaBoostClassifier,0.582774,0.570202,0.546826
GradientBoostingClassifier,0.622112,0.643939,0.590491
RandomForestClassifier,0.4171,0.465253,0.402927
GaussianNB,0.537073,0.558182,0.538881


'---- Normalize Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.506534,0.53303,0.512809
NearestCentroid,0.555174,0.555253,0.531696
DecisionTreeClassifier,0.466064,0.513737,0.468389
LogisticRegression,0.300554,0.54798,0.388116
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.504288,0.529798,0.491955
AdaBoostClassifier,0.489057,0.510606,0.488059
GradientBoostingClassifier,0.536372,0.575253,0.528507
RandomForestClassifier,0.454808,0.505758,0.452508
GaussianNB,0.537073,0.558182,0.538881


In [9]:
from sklearn.preprocessing import StandardScaler,RobustScaler

################## Use visual data  ###################
display('################## Use visual data  ###################')

df_movies = df_labled_movies.drop(['movie'],axis=1)
df_train = pd.merge(df_movies,df_visual_data_processed, on='filename')
df_train.drop(['filename'],axis=1, inplace=True)
display(df_train.head(5))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']


display("---- RAW Data ----")
metrics = evaluate_models(df_X, df_y)
display(metrics)

display("---- Scaled Data ----")
df_scaled_X = StandardScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)

# save  the final table 
metrics['Modality'] = 'visual'
df_final_results = df_final_results.append(metrics)

display("---- RobustScaler Data ----")
df_scaled_X = RobustScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)


'################## Use visual data  ###################'

Unnamed: 0,goodforairplane,"(0, 0)","(0, 1)","(1, 0)","(1, 1)","(2, 0)","(2, 1)","(3, 0)","(3, 1)","(4, 0)",...,"(821, 0)","(821, 1)","(822, 0)","(822, 1)","(823, 0)","(823, 1)","(824, 0)","(824, 1)","(825, 0)","(825, 1)"
0,1,0.047044,0.056526,0.11619,0.12516,0.13633,0.14628,0.066194,0.082497,0.072554,...,2.9504,3.2167,91672.0,81373.0,22207.0,21045.0,26201.0,24225.0,14542.0,13529.0
1,0,0.30717,0.30466,0.33422,0.33193,0.33112,0.33124,0.33124,0.33138,0.31114,...,21.189,21.182,81665.0,83171.0,13672.0,13714.0,32531.0,32774.0,13753.0,13780.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002466,0.002466,729320.0,729320.0,119950.0,119950.0,230400.0,230400.0,119950.0,119950.0
3,0,0.19996,0.18913,0.26934,0.25738,0.27986,0.27465,0.23725,0.23664,0.30844,...,14.779,14.723,208630.0,211630.0,23968.0,24019.0,47979.0,48339.0,24059.0,24090.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038749,...,7.3798,11.306,145760.0,79962.0,20730.0,20617.0,35320.0,45216.0,20831.0,20738.0


'---- RAW Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.546787,0.542929,0.529648
NearestCentroid,0.405332,0.473333,0.405324
DecisionTreeClassifier,0.538254,0.542626,0.528211
LogisticRegression,0.561063,0.573939,0.555761
SVC,0.398547,0.55798,0.4225
BaggingClassifier,0.582886,0.565051,0.55455
AdaBoostClassifier,0.505459,0.509293,0.496069
GradientBoostingClassifier,0.477339,0.490202,0.472783
RandomForestClassifier,0.597453,0.580909,0.571998
GaussianNB,0.503584,0.518586,0.484453


'---- Scaled Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.570299,0.56,0.538146
NearestCentroid,0.515741,0.505354,0.487731
DecisionTreeClassifier,0.481938,0.506162,0.484809
LogisticRegression,0.580084,0.549798,0.538425
SVC,0.45307,0.538889,0.448757
BaggingClassifier,0.598845,0.577071,0.565515
AdaBoostClassifier,0.507985,0.504242,0.489265
GradientBoostingClassifier,0.500462,0.522424,0.506196
RandomForestClassifier,0.592787,0.587071,0.575469
GaussianNB,0.607893,0.587273,0.57359


'---- RobustScaler Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.558343,0.56,0.544837
NearestCentroid,0.576882,0.539596,0.508983
DecisionTreeClassifier,0.529937,0.556768,0.539513
LogisticRegression,0.513166,0.509394,0.501544
SVC,0.425366,0.467172,0.420111
BaggingClassifier,0.596847,0.58798,0.580978
AdaBoostClassifier,0.497985,0.493131,0.478154
GradientBoostingClassifier,0.475078,0.494242,0.477002
RandomForestClassifier,0.599573,0.587071,0.573644
GaussianNB,0.532297,0.525354,0.522063


In [10]:
from sklearn.preprocessing import StandardScaler,RobustScaler

################## Use audio data  ###################

display('################## Use audio data  ###################')

def load_audio_data( filenames ): # changed
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/audio_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None).T
        
        # preprocess data
        df_data = df_data.fillna(0)
        df_data = pd.DataFrame(df_data.mean(axis = 0)).T
        df_data["filename"] = file
        audio_data = audio_data.append(df_data)
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data = load_audio_data( df_labled_movies['filename']  )


df_train_audio = pd.merge(df_movies, df_audio_data, on='filename')
df_train_audio.drop(['filename'],axis=1, inplace=True)
df_X = df_train_audio.drop('goodforairplane',axis=1)
df_y = df_train_audio['goodforairplane']


# df_movies = df_labled_movies.drop(['movie'],axis=1)
# df_train = pd.merge(df_movies,pd.DataFrame(df_audio_data_processed), on='filename')
# df_train.drop(['filename'],axis=1, inplace=True)
# display(df_train.head(5))
# df_X = df_train.drop('goodforairplane',axis=1)
# df_y = df_train['goodforairplane']




display("---- RAW Data ----")
metrics = evaluate_models(df_X, df_y)
display(metrics)

# save  the final table 
metrics['Modality'] = 'audio'
df_final_results = df_final_results.append(metrics)

display("---- Scaled Data ----")
df_scaled_X = StandardScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)

display("---- RobustScaler Data ----")
df_scaled_X = RobustScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)

'################## Use audio data  ###################'

'---- RAW Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.442217,0.464444,0.446119
NearestCentroid,0.605951,0.547071,0.491725
DecisionTreeClassifier,0.488826,0.468384,0.443311
LogisticRegression,0.558033,0.54596,0.539093
SVC,0.341683,0.424949,0.368661
BaggingClassifier,0.483271,0.471111,0.454408
AdaBoostClassifier,0.52447,0.51202,0.491446
GradientBoostingClassifier,0.502299,0.503838,0.494488
RandomForestClassifier,0.490856,0.485152,0.474418
GaussianNB,0.550048,0.515758,0.498764


'---- Scaled Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.453864,0.450101,0.432996
NearestCentroid,0.555387,0.542828,0.528025
DecisionTreeClassifier,0.474566,0.457273,0.429314
LogisticRegression,0.531818,0.522626,0.514002
SVC,0.416675,0.463131,0.420189
BaggingClassifier,0.446986,0.447778,0.420659
AdaBoostClassifier,0.52447,0.51202,0.491446
GradientBoostingClassifier,0.499389,0.503838,0.487417
RandomForestClassifier,0.554897,0.536667,0.51784
GaussianNB,0.550048,0.515758,0.498764


'---- RobustScaler Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.400179,0.422727,0.405317
NearestCentroid,0.647124,0.547071,0.501967
DecisionTreeClassifier,0.499441,0.477273,0.449784
LogisticRegression,0.518855,0.511515,0.501448
SVC,0.48107,0.488485,0.470724
BaggingClassifier,0.450505,0.441111,0.436473
AdaBoostClassifier,0.52447,0.51202,0.491446
GradientBoostingClassifier,0.553675,0.546263,0.533925
RandomForestClassifier,0.536757,0.529596,0.524488
GaussianNB,0.550048,0.515758,0.498764


In [11]:
# This data frame will be changed, because here we have 1235 rows, instead of 95...
display(df_train.shape[0])
# One line was made for each filename / audio frame combination, we should have one resulting line with them as attributes.

# TODO: look through that and replace it with the code above if you agree with the approach :)

95

## Final base classifier filter

In [12]:
df_r = df_final_results
df_r = df_r[ (df_r['precision'] > 0.5) & (df_r['recall'] > 0.5) & (df_r['F1'] > 0.5) ]
display(df_r)


Unnamed: 0,precision,recall,F1,Modality
KNeighborsClassifier,0.632037,0.619697,0.589233,metadata
LogisticRegression,0.545546,0.549293,0.529509,metadata
RandomForestClassifier,0.526445,0.522828,0.510247,metadata
AdaBoostClassifier,0.582774,0.570202,0.546826,textual
GradientBoostingClassifier,0.622112,0.643939,0.590491,textual
GaussianNB,0.537073,0.558182,0.538881,textual
KNeighborsClassifier,0.570299,0.56,0.538146,visual
LogisticRegression,0.580084,0.549798,0.538425,visual
BaggingClassifier,0.598845,0.577071,0.565515,visual
GradientBoostingClassifier,0.500462,0.522424,0.506196,visual


As we can see the results table looks pretty different than in the paper. There is not really enough information in the paper to be sure that we are correctly reproducing the steps. 

With the audio data there is actually not really more we could do since we just end up with one coliumn of data as descirbed in the paper, but still the metrics is not as good as in the paper. 

__Is there something wrong already when we load the data ? Wrong data?__



# Task 3.2 Feature Selection

They use LVW for feature selection as described in the mentioned paper.
What is very confusion in this section is that in the end again they refer to Table 2. as also in the previous section. 
I would conclude that in the previous section they just wanted to refer to the selected classifiers and in this section to the metrics results. 
But still we would have different classifiers. 

The implementation of the LVW shouldn't be to compilcated. Maybe there is already some code out there.

### Implementation of LVW

Here I implemented the LVW from the pseudo code of the refered paper (since I have not found any code on it), with adapting that we use a greater F1 score, instead of a lower error.

It was not clearly stated how they actually "slightly modified" the LVW...

In [21]:
def randomSet(size):
    '''
    Returns a subset of available features
    '''  
    number_of_features = random.randint(1, size-1)
    features = np.empty(number_of_features, dtype = int)
    
    i = 0
    while i < number_of_features:
        rand = random.randint(0, size-1)
        if rand not in features:
            features[i] = rand
            i = i + 1
            
    return features  


def LearnAlgo(S1, D_X, D_Y, model):
    '''
    Calculates and returns metrics on given data frame with feature
    subset S1
    '''
    D_X = D_X[D_X.columns[S1]]
    metrics = calculate_metrics(model, D_X, D_Y)
    return metrics
    

def LVW(K, D_X, D_Y, model, output = True):
    '''
    Implementation of the Las Vegas Wrapper, according to the paper
    "Feature Selection and Classification - A probabilistic approach",
    modifed to maximizing F1 instead of minimizing error.
    '''
    k = 0
    C = 100
    metrics = {"precision": 0,
               "recall": 0,
               "F1": 0}
    
    size = D_X.columns.size
    if size == 1: # fix for dataframes with size 1
        return calculate_metrics(model, D_X, D_Y), [0]
    
    while k < K:
        S1 = randomSet(size)
        C1 = S1.size
        metrics_1 = LearnAlgo(S1, D_X, D_Y, model)
        
        if (metrics_1["F1"] > metrics["F1"] or 
            (metrics_1["F1"] == metrics["F1"] and C1 < C)):
            if output:
                print("Current best F1 = " + str(metrics_1["F1"]) + ", size = " + str(C1))
            k = 0
            metrics = metrics_1
            C = C1
            S = S1
        
        k = k + 1

    return metrics, S


### Preparation of data

Here we used the code segments created in 3.1 to rebuild our needed data sets.

In [13]:
# Textual data
df_train_text = pd.merge(df_movies,df_text_data, on='filename')
df_train_text.drop(['filename'],axis=1, inplace=True)
df_X_text = df_train_text.drop('goodforairplane',axis=1)
df_y_text = df_train_text['goodforairplane']
print("text:")
display(df_X_text.head(2))


#####################################################
# Visual data
df_train_visual = pd.merge(df_movies,df_visual_data_processed, on='filename')
df_train_visual.drop(['filename'],axis=1, inplace=True)
print("visual:")
df_X_visual = df_train_visual.drop('goodforairplane',axis=1)
df_y_visual = df_train_visual['goodforairplane']
df_scaled_X_visual = pd.DataFrame(StandardScaler().fit_transform(df_X_visual))
display(df_scaled_X_visual.head(2))


#####################################################
# Audio data

def load_audio_data( filenames ):
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/audio_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None).T
        
        # preprocess data
        df_data = df_data.fillna(0)
        df_data = pd.DataFrame(df_data.mean(axis = 0)).T
        df_data["filename"] = file
        audio_data = audio_data.append(df_data)
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data = load_audio_data( df_labled_movies['filename']  )


df_train_audio = pd.merge(df_movies, df_audio_data, on='filename')
df_train_audio.drop(['filename'],axis=1, inplace=True)
df_X_audio = df_train_audio.drop('goodforairplane',axis=1)
df_y_audio = df_train_audio['goodforairplane']

print("audio:")
display(df_X_audio.head(2))


#####################################################
# Meta data
df_train_meta = pd.merge(df_labled_movies,df_meta_data, on='filename')
df_train_meta.drop(['movie', 'filename'],axis=1, inplace=True)
df_X_meta = df_train_meta.drop('goodforairplane',axis=1)
df_X_meta['runtime'] = df_X_meta['runtime'].apply(lambda x: int(x.split(' ')[0]) )
df_X_meta['year'] =  df_X_meta['year'].apply(pd.to_numeric)
df_y_meta = df_train_meta['goodforairplane']
label_encoder = MultiColumnLabelEncoder(['language','genre','country','rated'])    
X_labelencoded_meta = pd.DataFrame(label_encoder.fit_transform(df_X_meta))
print("meta:")
display(X_labelencoded_meta.head(2))


text:


Unnamed: 0,24000,baby,baseball,big,doc,escort,frozen,heroes,high,huck,...,years.1,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


visual:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651
0,-0.291687,-0.564263,0.087605,-0.292158,0.214169,-0.146948,-0.242564,-0.521025,-0.242699,-0.708481,...,-0.297358,-0.6448,-0.305564,-0.409549,-0.263803,-0.268605,-0.319259,-0.481967,-0.308609,-0.437872
1,1.837915,1.420698,1.66709,1.18506,1.608901,1.139443,1.661874,1.322064,1.485797,0.969673,...,2.276885,1.732011,-0.31627,-0.403887,-0.315008,-0.440851,-0.300164,-0.376406,-0.313341,-0.431973


audio:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,67.562481,-4.5252,1.6463,-0.597742,1.36281,-1.361579,0.170381,-0.500409,-0.050339,-0.269793,-0.232489,-0.197553,0.13264
1,61.548885,-7.146781,-1.103407,-1.58802,0.249743,0.003055,-2.313552,0.371521,0.516853,-1.405396,-0.951247,1.316795,-0.095459


meta:


Unnamed: 0,language,year,genre,country,runtime,rated
0,2,2014,1,29,102,5
1,2,2014,31,19,105,6


### Trys for different data modalities

In [23]:
# Audio
LVW(100, df_X_audio, df_y_audio, KNeighborsClassifier())

Current best F1 = 0.47286916786916783, size = 1
Current best F1 = 0.5036705886705887, size = 9
Current best F1 = 0.5296536796536796, size = 9
Current best F1 = 0.536009916009916, size = 7
Current best F1 = 0.5554463129463129, size = 2
Current best F1 = 0.5728625078625078, size = 11
Current best F1 = 0.6951566951566951, size = 1


(precision    0.732025
 recall       0.705051
 F1           0.695157
 dtype: float64, array([2]))

In [24]:
# Textual
LVW(100, df_X_text, df_y_text, KNeighborsClassifier())

Current best F1 = 0.4682164785105961, size = 3082
Current best F1 = 0.4705247258188436, size = 848
Current best F1 = 0.48497354497354495, size = 1963
Current best F1 = 0.5055411908353085, size = 2593
Current best F1 = 0.5617828195769372, size = 3216
Current best F1 = 0.5647335072335072, size = 1136
Current best F1 = 0.5683752086693262, size = 1755
Current best F1 = 0.5810556763497939, size = 1463
Current best F1 = 0.6165299243240419, size = 2859


(precision    0.644486
 recall       0.647071
 F1           0.616530
 dtype: float64, array([3018, 1705, 3272, ...,  366, 2222, 1191]))

In [25]:
# Visual
LVW(100, df_scaled_X_visual, df_y_visual, KNeighborsClassifier())

Current best F1 = 0.5923101898101898, size = 528
Current best F1 = 0.615989565989566, size = 423
Current best F1 = 0.6577200577200577, size = 44


(precision    0.669832
 recall       0.660707
 F1           0.657720
 dtype: float64,
 array([ 145,  534, 1395, 1192,  404,  860, 1251,   52,  597,  546,  592,
        1142, 1241,  768, 1212,  827,  524,   15, 1295,  829, 1471,  116,
        1135,  821, 1581,  615,  309,  895,  156,  250,  628,  722, 1227,
         314,  346,  763,  265,   90,  345, 1188,  256,  573,  284,  129]))

In [26]:
# Meta
LVW(100, X_labelencoded_meta, df_y_meta, KNeighborsClassifier())

Current best F1 = 0.5193288193288192, size = 2
Current best F1 = 0.5645171495171495, size = 3
Current best F1 = 0.5673261923261923, size = 4
Current best F1 = 0.5692241092241093, size = 2


(precision    0.612211
 recall       0.596667
 F1           0.569224
 dtype: float64, array([0, 3]))

### Application of LVW on selected combinations

No information about seed and so on, just that in each run there are different subsets and therefore different results (we set a seed here for enabling reproduction). I stored the features we use (from our combinations and from them of the paper) into files so we dont have to execute this code every time.

In [27]:
K = 150 # max number of runs for finding better LVW combinations -> better results, longer runtime with higher K
df_final_results_lvw = pd.DataFrame()
df_x = pd.DataFrame()
df_y = pd.DataFrame()


def str2Class(str):
    return getattr(sys.modules[__name__], str)

random.seed(123)

for index, row in df_r.iterrows():
    model = str2Class(index)
    print(getModelName(model) + " - " + row["Modality"])
    
    # get correct data frame
    if row["Modality"] == "metadata":
        df_x = X_labelencoded_meta
        df_y = df_y_meta
    elif row["Modality"] == "visual":
        df_x = df_scaled_X_visual
        df_y = df_y_visual
        
    elif row["Modality"] == "textual":
        df_x = df_X_text
        df_y = df_y_text
    else:
        df_x = df_X_audio
        df_y = df_y_audio
    
    # Calculate and append LVW metrics
    metrics = pd.DataFrame()
    S = []
    if getModelName(model) == "RandomForestClassifier":
        print("skipping random forest..")
        # evaluate random forest withot LVW since it already performs feature selection
        random.seed(123)
        m = calculate_metrics(model(), df_x, df_y)
    else:
        # feature selection metrics
        random.seed(123)
        m, S = LVW(K, df_x, df_y, model(), False)
    print(m)
    features = ','.join(map(str, S))
    print(features)
    metrics[getModelName(model)] = m
    metrics = metrics.T
    metrics["Modality"] = row["Modality"]
    metrics["Features"] = features
    
    df_final_results_lvw = df_final_results_lvw.append(metrics)


KNeighborsClassifier - metadata
precision    0.612211
recall       0.596667
F1           0.569224
dtype: float64
0,3
LogisticRegression - metadata
precision    0.580071
recall       0.605758
F1           0.562425
dtype: float64
0,3
RandomForestClassifier - metadata
skipping random forest..
precision    0.509175
recall       0.505758
F1           0.483755
dtype: float64

BaggingClassifier - textual
precision    0.688945
recall       0.640808
F1           0.608291
dtype: float64
1113,1782,289,1275,2327,2461,528,1522,952,3010,33,2019,1509,1125,89,1944,2888,2373,1269,2647,2169,686,2066,1568,620,882,2308,681,2236,2856,1823,990,2498,1223,2405,1453,1516,1129,894,2097,2685,1292,2436,696,386,2690,652,411,2176,2571,663,1523,815,336,1245,642,561,215,15,1443,1584,169,3098,2883,2199,2145,1153,2516,3049,376,1588,1580,2131,2349,2483,913,1727,2340,1576,546,2854,1546,886,2197,921,3276,45,1131,2194,2762,592,1841,1229,3044,375,1448,50,3125,1662,1978,279,416,3202,2518,3001,874,1103,1938,3197,1832,3222,262

precision    0.738405
recall       0.696162
F1           0.674759
dtype: float64
558,2003,1487,3034,325,2972,2999,2588,1155,2843,294,92,1431,1511,3152,574,794,175,2880,3091,568,615,236,3056,508,1421,1846,2154,3010,1737,151,1599,2053,2306,2427,1502,1082,3215,2022,1330,1791,2463,672,1281,1639,491,1766,2723,2334,2231,1628,1647,1549,1446,3184,348,457,972,2508,22,814,109,182,2676,1966,1199,2613,1963,1711,469,2560,3118,3203,1838,1101,5,2795,1119,1690,1447,418,2776,420,1527,3199,1138,375,2286,1012,1423,232,1046,250,133,1777,1335,2745,996,32,1230,277,1161,3243,1875,1481,1096,856,1754,1064,371,608,1099,2848,2687,2940,414,2204,2571,71,304,685,1304,1968,534,2644,2746,3111,337,3212,1068,2681,1584,3228,1850,1000,2320,3097,41,1436,3128,2670,704,626,1477,328,1344,181,2150,3134,931,9,2719,2039,1483,177,1648,2707,1701,640,3121,952,2469,1496,1306,1843,2667,1221,3225,1851,2347,1047,2983,1126,1095,717,1624,1382,2491,415,895,200,1979,2113,102,3093,2859,2291,1728,1408,1372,37,1550,2730,2103,1726,1581,195,27

precision    0.692266
recall       0.674343
F1           0.667359
dtype: float64
387,1002,1974,3252,1697,945,2011,2722,663,2748,2846,3008,2916,1432,570,334,2782,1265,3233,1818,1084,799,392,1728,1918,2594,1052,1916,2012,550,2413,2888,1682,977,499,3176,2098,1497,2244,2284,2875,1297,477,655,407,2315,1849,1186,2519,405,2661,717,989,317,1737,2937,3015,2791,3060,820,2597,1908,3194,1484,2945,2973,425,624,2951,1988,2609,3,3267,531,2429,2505,889,2598,1356,1077,2202,534,487,2139,1158,1619,1696,1438,2105,1025,2344,4,1409,414,1912,465,449,2730,975,323,535,3250,1600,1857,1453,2491,1755,571,1237,2995,2515,364,1178,1487,1015,1669,2788,2743,1066,1471,497,2256,2339,2084,3102,1568,2817,2363,2496,1702,2586,2357,1902,938,1820,343,1816,1744,1657,3101,2052,740,1425,1384,857,1403,720,1940,2534,2761,2089,2384,2632,1689,537,21,1881,815,2905,398,1120,1288,2477,2837,2167,771,2399,870,2548,577,519,660,1205,1311,1276,3029,2120,2835,1183,1360,3141,2153,14,1580,2526,1783,2936,622,1468,1597,3248,2550,1915,1380,1102,2

precision    0.701584
recall       0.684141
F1           0.679509
dtype: float64
336,870,421,108,656,329,681,376,1271,444,514,414,1,451,1445,883,1148,558,601,939,612,1394,1458,719,787,1466,104,1487,152,976,79,295,1021,298,577,1248,229,57,283,203,100,982,227,1610,93,1188,1514,281,466,338,581,1460,317,1011,33,1251,1263,1389,750,668,677,300,1444,1178,991,1375,954,1288,1554,560,642,193,313,437,847,942,505,874,147,348,1634,1557,383,1405,741,918,757,48,1177,1561,999,890,872,1000,1581,1116,805,462,948,1548,8,478,783,19,1037,866,559,1425,363,87,128,1574,197,1018,441,1130,297,507,1087,1112,961,751,1401,374,52,624,1243,798,972,983,9,892,625,308,1077,1485,1270,667,205,599,365,1455,15,1208,727,789,353,1543,2,1409,76,68,1230,82,1202,487,71,371,1198,673,1583,799,1033,64,1505,515,1283,455,1482,1407,401,722,1014,670,356,616,1269,997,175,868,714,992,1045,400,1095,1430,241,1453,857,1385,574,710,1440,1582,1104,1492,1484,533,981,257,117,1183,493,844,1478,1415,1241,434,486,1536,546,1162,1539,795,1475,889,4

### Final results
Some of the results got better, some got worse since we only use a feature subspace where the fact if the full data is tried out is just random.

In [14]:
# Previous results
display(df_r)

# Results with feature selection
#display(df_final_results_lvw)#[df_final_results_lvw.columns.difference(["Features"])])

# save final data frame (TODO: uncomment when rerunning LVW)
# df_final_results_lvw.to_csv('./data/results.csv', sep=';', encoding='utf-8')

# Try out accessing data (saves time since recompiling takes long)
test_read = pd.read_csv('./data/results.csv', sep=';', encoding='utf-8', header = 0, index_col = 0)
test_read

Unnamed: 0,precision,recall,F1,Modality
KNeighborsClassifier,0.632037,0.619697,0.589233,metadata
LogisticRegression,0.545546,0.549293,0.529509,metadata
RandomForestClassifier,0.526445,0.522828,0.510247,metadata
AdaBoostClassifier,0.582774,0.570202,0.546826,textual
GradientBoostingClassifier,0.622112,0.643939,0.590491,textual
GaussianNB,0.537073,0.558182,0.538881,textual
KNeighborsClassifier,0.570299,0.56,0.538146,visual
LogisticRegression,0.580084,0.549798,0.538425,visual
BaggingClassifier,0.598845,0.577071,0.565515,visual
GradientBoostingClassifier,0.500462,0.522424,0.506196,visual


Unnamed: 0,precision,recall,F1,Modality,Features
KNeighborsClassifier,0.612211,0.596667,0.569224,metadata,03
LogisticRegression,0.580071,0.605758,0.562425,metadata,03
RandomForestClassifier,0.509175,0.505758,0.483755,metadata,
BaggingClassifier,0.688945,0.640808,0.608291,textual,"1113,1782,289,1275,2327,2461,528,1522,952,3010..."
AdaBoostClassifier,0.632032,0.619495,0.603464,textual,"1300,1423,2906,957,1638,1126,2679,1172,3074,30..."
GradientBoostingClassifier,0.738405,0.696162,0.674759,textual,"558,2003,1487,3034,325,2972,2999,2588,1155,284..."
GaussianNB,0.692266,0.674343,0.667359,textual,"387,1002,1974,3252,1697,945,2011,2722,663,2748..."
KNeighborsClassifier,0.688369,0.68303,0.674998,visual,"906,574,221,1457,177,299,777,316,1546,1383,730..."
DecisionTreeClassifier,0.690522,0.66596,0.652296,visual,"1442,434,858,673,1387,20,1026,1367,692,420,155..."
LogisticRegression,0.653923,0.642828,0.635188,visual,"740,1446,1575,572,1069,649,1647,1265,819,1471,..."


### LVW Feature selection on the classifiers of the paper
Since we have other classifiers as an output, I decided to also use the combinations stated in the paper, that should be also nice to compare them.

In [29]:
paper_combinations_modality = ['metadata', 'metadata', 'metadata','metadata', 'metadata',
                               'metadata', 'metadata', 'metadata', 'metadata', 'textual',
                               'textual', 'textual', 'visual', 'visual', 'visual', 'visual', 
                               'visual', 'visual', 'visual', 'audio', 'audio']
paper_combinations_classifier = ['KNeighborsClassifier', 'NearestCentroid', 'DecisionTreeClassifier','LogisticRegression', 
                                 'SVC', 'BaggingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 
                                 'GradientBoostingClassifier', 'GaussianNB', 'KNeighborsClassifier', 'SVC', 
                                 'KNeighborsClassifier', 'DecisionTreeClassifier', 'LogisticRegression', 'SVC', 
                                 'RandomForestClassifier', 'AdaBoostClassifier', 'GradientBoostingClassifier', 
                                 'LogisticRegression', 'GradientBoostingClassifier']

paper_combination_score = pd.DataFrame()
paper_combination_score["Modality"] = pd.Series(paper_combinations_modality)
paper_combination_score["Classifier"] = paper_combinations_classifier
paper_combination_score["Features"] = pd.Series()
paper_combination_score.set_index(["Classifier"], inplace = True)


K = 150 # max number of runs for finding better LVW combinations -> better results, longer runtime with higher K
df_final_results_paper = pd.DataFrame()
df_x = pd.DataFrame()
df_y = pd.DataFrame()


def str2Class(str):
    return getattr(sys.modules[__name__], str)

random.seed(123)

for index, row in paper_combination_score.iterrows():
    model = str2Class(index)
    print(getModelName(model) + " - " + row["Modality"])
    
    # get correct data frame
    if row["Modality"] == "metadata":
        df_x = X_labelencoded_meta
        df_y = df_y_meta
    elif row["Modality"] == "visual":
        df_x = df_scaled_X_visual
        df_y = df_y_visual
        
    elif row["Modality"] == "textual":
        df_x = df_X_text
        df_y = df_y_text
    else:
        df_x = df_X_audio
        df_y = df_y_audio
    
    # Calculate and append LVW metrics
    metrics = pd.DataFrame()
    S = []
    if getModelName(model) == "RandomForestClassifier":
        print("skipping random forest..")
        # evaluate random forest withot LVW since it already performs feature selection
        random.seed(123)
        m = calculate_metrics(model(), df_x, df_y)
    else:
        # feature selection metrics
        random.seed(123)
        m, S = LVW(K, df_x, df_y, model(), False)
    print(m)
    features = ','.join(map(str, S))
    print(features)
    metrics[getModelName(model)] = m
    metrics = metrics.T
    metrics["Modality"] = row["Modality"]
    metrics["Features"] = features
    
    df_final_results_paper = df_final_results_paper.append(metrics) 

KNeighborsClassifier - metadata
precision    0.612211
recall       0.596667
F1           0.569224
dtype: float64
0,3
NearestCentroid - metadata
precision    0.546039
recall       0.580404
F1           0.550910
dtype: float64
3,0
DecisionTreeClassifier - metadata
precision    0.600597
recall       0.607576
F1           0.559265
dtype: float64
3
LogisticRegression - metadata
precision    0.580071
recall       0.605758
F1           0.562425
dtype: float64
0,3
SVC - metadata
precision    0.590208
recall       0.594444
F1           0.550604
dtype: float64
3
BaggingClassifier - metadata
precision    0.654537
recall       0.634848
F1           0.586235
dtype: float64
3
RandomForestClassifier - metadata
skipping random forest..
precision    0.447834
recall       0.446061
F1           0.417068
dtype: float64

AdaBoostClassifier - metadata
precision    0.603628
recall       0.616667
F1           0.564265
dtype: float64
3
GradientBoostingClassifier - metadata
precision    0.611508
recall       0.

precision    0.300554
recall       0.547980
F1           0.388116
dtype: float64
768,2617
KNeighborsClassifier - visual
precision    0.667835
recall       0.644646
F1           0.632889
dtype: float64
1366,1327,1435,1280,698,32,287,836,155,1181,374,108,1069,1617,641,315,967,1531,540,423,1019,636,738,518,174,442,358,1488,1635,1642,1133,323,1210,284,297,1565,858,946,988,765,815,852,1552,1359,1561,221,1383,1165,389,244,1232,1123,621,886,989,265,262,1378,830,935,1637,522,91,347,558,921,1600,941,631,233,1346,1393,1595,1145,1228,1356,447,369,402,452,1529,1499,61,697,1547,632,258,1046,620,439,1017,491,482,673,85,56,1085,326,1200,446,1169,1292,3,19,1097,1374,1648,1367,797,873,350,464,713,58,1609,178,471,812,811,1144,798,681,957,1033,1536,520,1044,592,643,1350,857,1106,21,896,1124,1110,689,889,1184,159,747,71,365,912,273,96,1022,1494,275,567,543,316,657,993,1570,345,1343,377,728,1177,438,724,1222,876,411,1344,1012,679,199,1081,733,121,828,87,551,271,500,1313,1065,842,1333,445,1546,1060,145,118,

precision    0.672318
recall       0.650808
F1           0.641780
dtype: float64
883,76,1252,1108,1193,274,1199,1275,587,1498,1587,1414,402,1522,346,1399,979,1164,1221,1129,233,268,1122,1433,1251,984,1526,516,1500,725,900,1418,408,598,375,135,718,763,1212,1173,337,104,316,949,77,1584,1428,130,326,519,604,1541,165,162,331,84,50,1573,1437,1029,1117,1243,969,795,398,853,828,815,792,895,1513,1008,416,1345,477,978,663,813,1339,568,617,870,232,757,735,9,13,513,1361,614,381,371,1511,1144,1470,59,48,350,1305,768,1379,462,1217,87,444,1233,711,478,590,1565,1231,1032,762,1281,212,530,396,349,658,938,1027,628,1139,1449,51,1086,10,364,214,1213,1540,1600,1516,715,835,1310,1074,1206,925,1347,256,1225,493,496,1075,1507,662,92,961,1182,1512,606,115,236,483,1051,627,243,888,788,46,1271,1200,1558,1334,412,65,639,561,1556,1398,1162,880,306,1222,504,311,1288,1603,1605,1001,1564,826,523,1058,277,121,1632,1577,197,1447,1545,227,1065,1034,1627,1583,292,904,845,808,19,1596,1287,1092,1172,1063,852,18,1312,1555,

In [16]:
# (TODO: uncomment when rerunning LVW)
# F1_paper = [0.630, 0.591, 0.563, 0.578, 0.574, 0.631, 0.576, 0.536, 0.569, 0.702, 0.666, 0.707,
#             0.608, 0.535, 0.608, 0.580, 0.638, 0.654, 0.587, 0.546, 0.587]

# df_final_results_paper["F1 Paper"] = F1_paper
# df_final_results_paper["Difference"] =  df_final_results_paper["F1 Paper"] - df_final_results_paper["F1"]
# #df_final_results_paper

# # save final data frame 
# # df_final_results_paper.to_csv('./data/results_paper.csv', sep=';', encoding='utf-8')

# Try out accessing data (saves time since recompiling takes long)
test_read_paper = pd.read_csv('./data/results_paper.csv', sep=';', encoding='utf-8', header = 0, index_col = 0)
test_read_paper

Unnamed: 0,precision,recall,F1,Modality,Features,F1 Paper,Difference
KNeighborsClassifier,0.612211,0.596667,0.569224,metadata,03,0.63,0.060776
NearestCentroid,0.546039,0.580404,0.55091,metadata,30,0.591,0.04009
DecisionTreeClassifier,0.600597,0.607576,0.559265,metadata,3,0.563,0.003735
LogisticRegression,0.580071,0.605758,0.562425,metadata,03,0.578,0.015575
SVC,0.590208,0.594444,0.550604,metadata,3,0.574,0.023396
BaggingClassifier,0.654537,0.634848,0.586235,metadata,3,0.631,0.044765
RandomForestClassifier,0.447834,0.446061,0.417068,metadata,,0.576,0.158932
AdaBoostClassifier,0.603628,0.616667,0.564265,metadata,3,0.536,-0.028265
GradientBoostingClassifier,0.611508,0.576162,0.554581,metadata,21,0.569,0.014419
GaussianNB,0.720231,0.694141,0.687852,textual,"449,3082,1448,3241,164,1889,1791,801,3099,2106...",0.702,0.014148


Some, like Support Vector Machines with radial kernel for textual data differ a lot! (0.318884 worse!)

## Significance testing
Here we take a look on the test set + cv, and compare our results to a base line. According to the paper, the majority class baseline  is 0.5 (precision, recall and F1-score). We calculated it on our own because it differs a lot from the actual F1 if predicting always the majority class (around 0.7!)

Here we used our selected features from the task before to identify statistical significance.

We can assume that the F1 scores are normally distributed because of the CLT. We take a mean of 0.5 for the population of the data, and calculate our mean and standard deviation of F1 score out of the cross validation results.


### Loading of test data
There are problems with test data in the label file and actual XML files.. (does not match)

Sometimes the file ending is given (e.g. .xml, we removed it), sometimes the row is given as a string...

10.000km is given twice with label 0 and 1 (we just assume one of them)!

In [119]:
import os


df_movies_all = pd.read_csv("./data/CoE_dataset/Test_set/test_set_labels.csv", sep=';')
print(df_movies_all.shape[0])

# To avoid null values, we just removed " by hand...
display(df_movies_all[df_movies_all["file_name"].isnull()])

ex = 0
filenames = []
for file in df_movies_all['file_name']:
    # fix already given file type - fixed by hand now
    if ".mp4" in file:
        file = file.split(".mp4")[0]
    if ".xml" in file:
        file = file.split(".xml")[0]
    
    # check if file is given twice
    if file in filenames:
        # already dropped by hand
        print("file already in! " + file)
    else:    
        file_path = f'./data/CoE_dataset/Test_set/XML/{file}.xml'
        exists = os.path.isfile(file_path)
        if exists:
            ex = ex + 1
            filenames.append(file)
        else:
            print(file)
            print(str(file_path) + " not exists!")
            df_movies_all = df_movies_all[df_movies_all['file_name'] != file]

print("Existing movies: " + str(ex))
print(df_movies_all.shape[0])

224


Unnamed: 0,movie_name,file_name,goodforairplanes


A_Fish_Called_Wanda
./data/CoE_dataset/Test_set/XML/A_Fish_Called_Wanda.xml not exists!
Existing movies: 223
223


In [121]:
df_labled_movies_test = df_movies_all
df_labled_movies_test = df_labled_movies_test[['movie_name','file_name', 'goodforairplanes']]
df_labled_movies_test.columns = ['movie', 'filename', 'goodforairplane']
display(df_labled_movies_test.head(3))


############################################################
### Load Meta Data ###

def load_meta_data_test( filenames ): 
    
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/XML/{file}.xml'
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            with open(file_path) as f: 
                tree = ET.parse(f)
                movie = tree.find('movie')

                lang = movie.get('language')
                year = movie.get('year')
                genre = movie.get('genre')
                country = movie.get('country')
                runtime = movie.get('runtime')
                age_rating = movie.get('rated')

                raw_data.append( (file,lang,year,genre,country,runtime,age_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','language','year','genre','country','runtime','rated'])


df_meta_data_test = load_meta_data_test( df_labled_movies_test['filename']  )
display(df_meta_data_test.head(3))


############################################################
### Load Visual Data ###

def load_visual_data_test( filenames ):
    data_list = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/vis_descriptors/{file}.csv'
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            df_data = pd.read_csv(file_path,index_col=None, header=None)
            data_list.append(df_data)
        
    return pd.concat(data_list, axis = 0, keys = filenames,names=('filename','vis_data'),  sort=False)

df_visual_data_test = load_visual_data_test( df_labled_movies_test['filename']  )
df_visual_data_test = df_visual_data_test.unstack()
display(df_visual_data_test.head(3))


############################################################
### Load Audio Data ###

def load_audio_data_test( filenames ): # changed
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/audio_descriptors/{file}.csv'
        
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            df_data = pd.read_csv(file_path,index_col=None, header=None).T

            # preprocess data
            df_data = df_data.fillna(0)
            df_data = pd.DataFrame(df_data.mean(axis = 0)).T
            df_data["filename"] = file
            audio_data = audio_data.append(df_data)
            
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data_test = load_audio_data_test( df_labled_movies_test['filename']  )
display(df_audio_data_test.head(3))


############################################################
### Load textual Data ###

def load_text_data_test(filenames):
    

    data_list = []
    file_path = f'./data/CoE_dataset/Test_set/text_descriptors/tdf_idf_test.csv'
    #somehow pandas can not really handle that the first line is row names.(at least I didn't find a better way) 
    # thus we do it a little complicated here
    header_index = pd.read_csv(file_path, index_col=0,nrows=1 ).reset_index().columns
    df_data = pd.read_csv(file_path, header=None, index_col=False,skiprows=1)
    df_data.set_index(header_index, inplace=True)
    df_data.columns = filenames
    return df_data.T #row are should be represented by movie names

df_text_data_test = load_text_data_test(df_labled_movies_test['filename'] )
display(df_text_data_test.head(3))

Unnamed: 0,movie,filename,goodforairplane
0,Humpty Sharma Ki Dulhania,Humpty_Sharma_Ki_Dulhania,1
1,Homeland,Homeland,1
2,Trash,Trash,1


Unnamed: 0,filename,language,year,genre,country,runtime,rated
0,Humpty_Sharma_Ki_Dulhania,Hindi,2014,"Comedy, Drama, Romance",India,133 min,NOT RATED
1,Homeland,English,2011,"Drama, Mystery, Thriller",USA,55 min,TV-MA
2,Trash,"Portuguese, English",2014,"Adventure, Comedy, Crime","UK, Brazil",114 min,R


Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,...,821,821,822,822,823,823,824,824,825,825
vis_data,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
filename,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Humpty_Sharma_Ki_Dulhania,0.25284,0.25278,0.23444,0.2345,0.25556,0.25565,0.23095,0.23102,0.2672,0.26718,...,13.606,13.606,38450.0,38440.0,11844.0,11847.0,23975.0,23988.0,13069.0,13071.0
Homeland,0.1366,0.12622,0.2911,0.29358,0.38919,0.38603,0.25374,0.24926,0.18142,0.20572,...,8.6886,8.6112,13428.0,14415.0,2086.1,2519.3,5285.1,6909.0,2510.7,2963.8
Trash,0.23858,0.25246,0.24924,0.25729,0.34233,0.34281,0.32537,0.32128,0.28438,0.28786,...,11.442,11.627,37948.0,36342.0,11777.0,11420.0,24904.0,23160.0,12274.0,11730.0


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Humpty_Sharma_Ki_Dulhania,4.547587,-5.63623,1.434987,-0.279716,-0.669368,-1.271336,-0.705938,-0.263066,-0.273322,-0.794631,-0.060173,0.003418,-0.272562
Homeland,62.653646,-2.540778,0.94344,-1.226452,-0.285784,-0.821387,-0.986073,-1.069481,-1.126877,-0.613598,0.16874,-0.776176,0.38972
Trash,59.511905,-4.309526,-0.72833,-2.60298,0.1502,-0.210795,-0.315625,0.037404,-0.298176,0.943956,0.579414,0.388942,-0.008194


Unnamed: 0_level_0,1,1000,200000,acquired,ailing,avatar,avoid,babysitter,barbaric,battle,...,zero,zeus,zeus.1,zeus.2,zhonglian,zhuo,zombie,zombiehating,zombies,zuckerberg
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Humpty_Sharma_Ki_Dulhania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Homeland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Trash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Preparation of test data
Similar to train data

In [145]:
df_movies_test = df_labled_movies_test.drop(['movie'], axis = 1)


#####################################################
# Textual data
df_test_text = pd.merge(df_movies_test, df_text_data_test, on='filename')
df_test_text.drop(['filename'],axis=1, inplace=True)

df_X_text_test = df_test_text.drop('goodforairplane',axis=1)
df_y_text_test = df_test_text['goodforairplane']

print("text:" + str(df_X_text_test.shape[0]))
display(df_X_text_test.head(2))


#####################################################
# Visual data
df_test_visual = pd.merge(df_movies_test, df_visual_data_test, on='filename')
df_test_visual.drop(['filename'],axis=1, inplace=True)

df_X_visual_test = df_test_visual.drop('goodforairplane',axis=1)
df_y_visual_test = df_test_visual['goodforairplane']
df_scaled_X_visual_test = pd.DataFrame(StandardScaler().fit_transform(df_X_visual_test))

print("visual:"  + str(df_X_visual_test.shape[0]))
display(df_X_visual_test.head(2))


#####################################################
# Audio data

def load_audio_data( filenames ):
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/audio_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None).T
        
        # preprocess data
        df_data = df_data.fillna(0)
        df_data = pd.DataFrame(df_data.mean(axis = 0)).T
        df_data["filename"] = file
        audio_data = audio_data.append(df_data)
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data = load_audio_data( df_movies_test['filename']  )


df_test_audio = pd.merge(df_movies_test, df_audio_data_test, on='filename')
df_test_audio.drop(['filename'],axis=1, inplace=True)
df_X_audio_test = df_test_audio.drop('goodforairplane',axis=1)
df_y_audio_test = df_test_audio['goodforairplane']

print("audio:" + str(df_X_audio_test.shape[0]))
display(df_X_audio_test.head(2))


#####################################################
# Meta data
df_test_meta = pd.merge(df_movies_test,df_meta_data_test, on='filename')
df_test_meta.drop(['filename'],axis=1, inplace=True)

df_X_meta_test = df_test_meta.drop('goodforairplane',axis=1)
# Work around for runtime
df_X_meta_test['runtime'] = df_X_meta_test['runtime'].apply(lambda x: int(x.split(' ')[0]) if x != 'N/A' else 0)
df_X_meta_test['year'] =  df_X_meta_test['year'].apply(pd.to_numeric)

df_y_meta = df_test_meta['goodforairplane']

label_encoder = MultiColumnLabelEncoder(['language','genre','country','rated'])    
X_labelencoded_meta_test = pd.DataFrame(label_encoder.fit_transform(df_X_meta_test))

print("meta:"+ str(df_X_meta_test.shape[0]) )
display(X_labelencoded_meta_test.head(2))

text:223


Unnamed: 0,1,1000,200000,acquired,ailing,avatar,avoid,babysitter,barbaric,battle,...,zero,zeus,zeus.1,zeus.2,zhonglian,zhuo,zombie,zombiehating,zombies,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


visual:223


Unnamed: 0,"(0, 0)","(0, 1)","(1, 0)","(1, 1)","(2, 0)","(2, 1)","(3, 0)","(3, 1)","(4, 0)","(4, 1)",...,"(821, 0)","(821, 1)","(822, 0)","(822, 1)","(823, 0)","(823, 1)","(824, 0)","(824, 1)","(825, 0)","(825, 1)"
0,0.25284,0.25278,0.23444,0.2345,0.25556,0.25565,0.23095,0.23102,0.2672,0.26718,...,13.606,13.606,38450.0,38440.0,11844.0,11847.0,23975.0,23988.0,13069.0,13071.0
1,0.1366,0.12622,0.2911,0.29358,0.38919,0.38603,0.25374,0.24926,0.18142,0.20572,...,8.6886,8.6112,13428.0,14415.0,2086.1,2519.3,5285.1,6909.0,2510.7,2963.8


audio:223


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,4.547587,-5.63623,1.434987,-0.279716,-0.669368,-1.271336,-0.705938,-0.263066,-0.273322,-0.794631,-0.060173,0.003418,-0.272562
1,62.653646,-2.540778,0.94344,-1.226452,-0.285784,-0.821387,-0.986073,-1.069481,-1.126877,-0.613598,0.16874,-0.776176,0.38972


meta:223


Unnamed: 0,language,year,genre,country,runtime,rated
0,56,2014,59,21,133,3
1,5,2011,83,45,55,7


### Statistical significance
Tests for alpha = 0.05 on combinations from the paper, the baseline is the F1 score predicting always the most frequent class.

In [27]:
alpha = 0.05
df_y = df_y_text

def str2Class(str):
    return getattr(sys.modules[__name__], str)

def calculate_F1_scores_cv(clf, X, y):
    metric =  cross_validate(clf, X, y, scoring=('f1_weighted'), return_train_score = False, cv = 10)  
    return metric['test_score']


def t_test(f1_scores, population_mean = 0.5):
    mu_sample = f1_scores.mean()
    n = len(f1_scores)
    std = sqrt((abs(f1_scores - mu_sample)**2).mean())   
    
    return (mu_sample - population_mean) / (std / sqrt(n))



classifier_combinations = pd.read_csv('./data/results_paper.csv', sep=';', encoding='utf-8', 
                                      header = 0, index_col = 0)[["Modality", "Features"]]

# TODO: We have to decide if we take as a baseline the one from the paper (0.5) or predicting the most frequent class!
baseline =  (df_labled_movies[df_labled_movies['goodforairplane'] == 1]['goodforairplane'].count() /  
             df_labled_movies['goodforairplane'].count())
baseline_f1 = f1_score(([1] if baseline > 0.5 else [0]) * len(df_labled_movies['goodforairplane']), 
                       df_labled_movies['goodforairplane'])
print("Baseline: " + str(baseline_f1))


for index, row in classifier_combinations.iterrows():
        model = str2Class(index)

        # get correct data frame
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
        elif row["Modality"] == "textual":
            df_x = df_X_text
        else:
            df_x = df_X_audio
               
        # Calculate and append LVW metrics
        if getModelName(model) == "RandomForestClassifier":
            # evaluate random forest without selected features since it already performs feature selection
            random.seed(123)
            metric = calculate_F1_scores_cv(model(), df_x, df_y)
        else:
            # get results with features from LVW        
            features = [int(i) for i in row["Features"].split(",")]
            df_x_features = df_x[df_x.columns[features]]
            random.seed(123)
            metric = calculate_F1_scores_cv(model(), df_x_features, df_y)
        
        # Calculate one sample t-test score
        t_score = t_test(metric, baseline_f1)
        df = len(metric - 1)
        # calculate the critical value
        cv = t.ppf(1.0 - alpha, df)
        
        print(str(index) + " - " + row["Modality"] + ": " + str(t_score) + ", " + 
              ("reject" if abs(t_score) <= cv else "keep"))
        

Baseline: 0.707482993197279
KNeighborsClassifier - metadata: -2.356004359495605, keep
NearestCentroid - metadata: -3.3348291810906527, keep
DecisionTreeClassifier - metadata: -3.187220703536219, keep
LogisticRegression - metadata: -2.849044391095118, keep
SVC - metadata: -5.138966877832904, keep
BaggingClassifier - metadata: -3.4557412587345593, keep
RandomForestClassifier - metadata: -3.9894222876643606, keep
AdaBoostClassifier - metadata: -3.2712569875344513, keep
GradientBoostingClassifier - metadata: -5.3796153821418615, keep
GaussianNB - textual: -0.4329850430594301, reject
KNeighborsClassifier - textual: -1.72466891421405, reject
SVC - textual: -53.59221026589789, keep
KNeighborsClassifier - visual: -2.4373632492476696, keep
DecisionTreeClassifier - visual: -2.2877419553828187, keep
LogisticRegression - visual: -1.2602838653471002, reject
SVC - visual: -2.5186203216444016, keep
RandomForestClassifier - visual: -4.716531689695143, keep
AdaBoostClassifier - visual: -0.4375249467011

# 3.3 Classifier stacking

## Majority Voting
This is the simplest case, where we select classifiers and feature subspaces through the steps above, and assign final predicted labels through majority voting on the labels of the 21 classifiers.

In [32]:
# Here we have to decide if we take our combinations - or the ones from the paper (or just try both :))
classifier_combinations = pd.read_csv('./data/results_paper.csv', sep=';', encoding='utf-8', 
                                      header = 0, index_col = 0)[["Modality", "Features"]]
#display(classifier_combinations.head(2))
df_y = df_y_text

kf = KFold(n_splits = 10)
           
random.seed(123)

f1 = 0
recall = 0
precision = 0

for train_index, test_index in kf.split(df_X_text):
    predicitons = [0] * len(test_index)
    
    for index, row in classifier_combinations.iterrows():
        model = str2Class(index)

        # get correct data frame
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
        elif row["Modality"] == "textual":
            df_x = df_X_text
        else:
            df_x = df_X_audio
        
        
        # Calculate and append LVW metrics
        if getModelName(model) == "RandomForestClassifier":
            # evaluate random forest without selected features since it already performs feature selection
            random.seed(123)
            mod = model().fit(df_x.loc[train_index, :], df_y[train_index])
            pred = mod.predict(df_x.iloc[test_index, :])
            
        else:
            # get results with features from LVW        
            features = [int(i) for i in row["Features"].split(",")]
            df_x_features = df_x[df_x.columns[features]]
            
            # Predicitions
            random.seed(123)
            mod = model().fit(df_x_features.loc[train_index, :], df_y[train_index])
            pred = mod.predict(df_x_features.iloc[test_index, :])    
            
        predicitons = predicitons + pred
        
    predictions_majority = pd.Series(predicitons).map(lambda x: 0 if x < classifier_combinations.shape[0]/2 else 1)
    f1 = f1 + f1_score(df_y[test_index], predictions_majority)
    recall = recall + recall_score(df_y[test_index], predictions_majority)
    precision = precision + precision_score(df_y[test_index], predictions_majority)
    
print("Precision: " + str(precision/10))
print("Recall: " + str(recall/10))
print("F1 score: " + str(f1/10))

Precision: 0.6192460317460318
Recall: 0.7895238095238095
F1 score: 0.6909090909090909


## Label Stacking
Assume we have n instances and T base classifiers, then we can generate an n by T matrix consisting of predictions (labels) given by each classifier. Label combining strategy tries to build a second-level classifier based on this label matrix, and return a final prediction result for that.

## Label-Feature Stacking
Similar to label stacking, label-feature stacking strategy uses both base-classifier predictions and features as training data to predict output.

## Significance testing for Classifier stacking