In [1]:
import sklearn 
import pandas as pd
import xml.etree.ElementTree as ET
import random
import sys
import numpy as np

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from math import sqrt
from scipy.stats import t
from scipy import stats


__First of all they did not mentioned what sklearn version they used!!!__


As described in the paper, the first step is to select base classifiers. 
The selected base classifiers are trained with default parameter settings with 10-fold cross-validation.
As input data, the training data set and its ground truth labels, per single modality is used.
For the audio MFCC features, we set NaN values to 0, and calculate the average of each MFCC coefficient over all frames.

# Load input data


# Description:

## Available Data
There are variouse csv files and data files available. It's very messy. 
There is one file called "CoE_dataset_offical_release.zip"! 
We extract this file and use this data included their for now! 

## Meta Data
In the original paper there is no information given what is included in the metadata. 
Looking at the paper describing the data set (Right Inflight? A Dataset for Exploring the Automatic Prediction of Movies Suitable for a Watching Situation
) we found out that as metadata they used language, year published, genre, country, runtime and age rating. We assume, since the author of our paper didn't say otherwise, that they used the same metadata. 

## User Rating
User rating is just mentioned in Table 1 and the dataset paper. It is not mentioned again in the CoE paper, althought user rating is important for good performance as mentioned in the dataset paper. 

__I would assume that the CoE paper just used user rating in the "metadata", since it's saved in the same file"__

## Visual Data: 
The visual data is provied as a csv file for each movie, containing two rows. According to the paper of the dataset they calculated following visual features, Histogram of Oriented Gradients (HOG) gray, Color Moments, local binary patterns (LBP) and Gray Level Run Length Matrix, but don't say how the csv file represents them. Also as mentioned the csv file just has two rows which would not ad up to the mentioned 4 visual features. __We are treating all values as seperate column!__

## Audio Data: 
Audio features is also provided per movie as a csv file. Each audio feature consits of 12 coefficients for multiple frames.

## Textual Data
The textual data is just one file containing the tdf-idf matrix. The first line are the row names for each word. 
While the columns are the associated movie. __There is no indication to which movie each column belongs! Thus we need to assume this!__

__For now we assume the order is the same as in the df_labled_movies dataframe!!!__



In [2]:

df_labled_movies = pd.read_csv("./data/CoE_dataset/Dev_set/dev_set_groundtruth_and_trailers.csv", sep=';')
del df_labled_movies['trailer']
df_labled_movies = df_labled_movies[['movie','filename', 'goodforairplane']]
display(df_labled_movies.head(5))


###############################################################
### Load Meta Data ###

def load_meta_data( filenames ): 
    
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/XML/{file}.xml'
        with open(file_path) as f: 
            tree = ET.parse(f)
            movie = tree.find('movie')
            
            lang = movie.get('language')
            year = movie.get('year')
            genre = movie.get('genre')
            country = movie.get('country')
            runtime = movie.get('runtime')
            age_rating = movie.get('rated')
             
            raw_data.append( (file,lang,year,genre,country,runtime,age_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','language','year','genre','country','runtime','rated'])

df_meta_data = load_meta_data( df_labled_movies['filename']  )
#pd.merge(df_labled_movies.drop(['movie'],axis=1),df_meta_data, on='filename').to_csv('data/meta_data_exported.csv',sep=";",index = False)

display(df_meta_data.head(5))
#display(df_meta_data.dtypes)

###############################################################
### Load User Rating Data ###

def load_user_rating_data( filenames ): 
    
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/XML/{file}.xml'
        with open(file_path) as f: 
            tree = ET.parse(f)
            movie = tree.find('movie')
            
            tomato_rating = movie.get('tomatoUserRating')
            imbd_rating = movie.get('imdbRating')
            meta_rating = movie.get('metascore')
            if tomato_rating == "N/A":
                tomato_rating = np.nan
            if imbd_rating == "N/A":
                imbd_rating = np.nan
            if meta_rating == "N/A":
                meta_rating = np.nan
             
            raw_data.append( (file,tomato_rating,imbd_rating,meta_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','tomatorating','imbdrating','metarating'])


df_user_rating_data = load_user_rating_data( df_labled_movies['filename']  )
df_user_rating_data['tomatorating'] = pd.to_numeric(df_user_rating_data['tomatorating'])
df_user_rating_data['imbdrating'] = pd.to_numeric(df_user_rating_data['imbdrating'])
df_user_rating_data['metarating'] = pd.to_numeric(df_user_rating_data['metarating'])
#pd.merge(df_labled_movies.drop(['movie'],axis=1),df_user_rating_data, on='filename').to_csv('data/user_rating_data_exported.csv',sep=";",index = False)
df_user_rating_data.fillna(df_user_rating_data.mean(),inplace=True)

display(df_user_rating_data.head(5))
#display(df_user_rating_data.dtypes)


###############################################################
### Load meta data with user rating  ###
def load_meta_extended_data( filenames ): 
    
     
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/XML/{file}.xml'
        with open(file_path) as f: 
            tree = ET.parse(f)
            movie = tree.find('movie')
            
            lang = movie.get('language')
            year = movie.get('year')
            genre = movie.get('genre')
            country = movie.get('country')
            runtime = movie.get('runtime')
            age_rating = movie.get('rated')
            tomato_rating = movie.get('tomatoUserRating')
            imbd_rating = movie.get('imdbRating')
            meta_rating = movie.get('metascore')
            
            if tomato_rating == "N/A":
                tomato_rating = np.nan
            if imbd_rating == "N/A":
                imbd_rating = np.nan
            if meta_rating == "N/A":
                meta_rating = np.nan
            raw_data.append( (file,lang,year,genre,country,runtime,age_rating,tomato_rating,imbd_rating,meta_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','language','year','genre','country','runtime','rated','tomatorating','imbdrating','metarating'])


df_meta_extended_data = load_meta_extended_data( df_labled_movies['filename']  )
df_meta_extended_data['tomatorating'] = pd.to_numeric(df_meta_extended_data['tomatorating'])
df_meta_extended_data['imbdrating'] = pd.to_numeric(df_meta_extended_data['imbdrating'])
df_meta_extended_data['metarating'] = pd.to_numeric(df_meta_extended_data['metarating'])
df_meta_extended_data.fillna(df_meta_extended_data.mean(),inplace=True)

display(df_meta_extended_data.head(5))
#display(df_user_rating_data.dtypes)

###############################################################
### Load Visual Data ###

def load_visual_data( filenames ):
    data_list = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/vis_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None)
        data_list.append(df_data)
        
    return pd.concat(data_list, axis = 0, keys = filenames,names=('filename','vis_data'),  sort=False)

df_visual_data = load_visual_data( df_labled_movies['filename']  )

display(df_visual_data.head(5))


###############################################################
### Load Audio Data ###

# def load_audio_data( filenames ):
#     data_list = []
    
#     for file in filenames: 
#         file_path = f'./data/CoE_dataset/Dev_set/audio_descriptors/{file}.csv'
#         df_data = pd.read_csv(file_path,index_col=None, header=None)
#         data_list.append(df_data)
        
#     return pd.concat(data_list, axis = 0, keys = filenames,names=('filename','freq_coeff'),  sort=False)

# df_audio_data = load_audio_data( df_labled_movies['filename']  )
# display(df_audio_data.head(20))

def load_audio_data( filenames ):
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/audio_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None).T
        
        # preprocess data
        df_data = df_data.fillna(0)
        df_data = pd.DataFrame(df_data.mean(axis = 0)).T
        df_data["filename"] = file
        audio_data = audio_data.append(df_data)
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data = load_audio_data( df_labled_movies['filename']  )
#pd.merge(df_labled_movies.drop(['movie'],axis=1),df_audio_data, on='filename').to_csv('data/audio_data_exported.csv',sep=";",index = False)

display(df_audio_data.head(5))


###############################################################
### Load textual Data ###

def load_text_data(filenames):
    

    data_list = []
    file_path = f'./data/CoE_dataset/Dev_set/text_descriptors/tdf_idf_dev.csv'
    #somehow pandas can not really handle that the first line is row names.(at least I didn't find a better way) 
    # thus we do it a little complicated here
    header_index = pd.read_csv(file_path, index_col=0,nrows=1 ).reset_index().columns
    df_data = pd.read_csv(file_path, header=None, index_col=False,skiprows=1)
    df_data.set_index(header_index, inplace=True)
    df_data.columns = filenames
    return df_data.T #row are should be represented by movie names

df_text_data = load_text_data(df_labled_movies['filename'] )
display(df_text_data.head(5))
#display(df_text_data.shape)
#display(df_text_data.describe())



Unnamed: 0,movie,filename,goodforairplane
0,Seventh Son,Seventh_Son,1
1,Welcome to Me,Welcome_to_Me,0
2,The Judge,The_Judge,0
3,Transformers Age of Extinction,Transformers__Age_of_Extinction,0
4,The Normal Heart,The_Normal_Heart,1


Unnamed: 0,filename,language,year,genre,country,runtime,rated
0,Seventh_Son,English,2014,"Action, Adventure, Fantasy","USA, UK, Canada, China",102 min,PG-13
1,Welcome_to_Me,English,2014,"Comedy, Drama",USA,105 min,R
2,The_Judge,English,2014,Drama,USA,141 min,R
3,Transformers__Age_of_Extinction,English,2014,"Action, Adventure, Sci-Fi","USA, China",165 min,PG-13
4,The_Normal_Heart,English,2014,Drama,USA,132 min,TV-MA


Unnamed: 0,filename,tomatorating,imbdrating,metarating
0,Seventh_Son,2.9,5.5,30.0
1,Welcome_to_Me,3.4,6.2,67.0
2,The_Judge,3.8,7.5,48.0
3,Transformers__Age_of_Extinction,3.3,5.8,32.0
4,The_Normal_Heart,4.2,8.0,64.093333


Unnamed: 0,filename,language,year,genre,country,runtime,rated,tomatorating,imbdrating,metarating
0,Seventh_Son,English,2014,"Action, Adventure, Fantasy","USA, UK, Canada, China",102 min,PG-13,2.9,5.5,30.0
1,Welcome_to_Me,English,2014,"Comedy, Drama",USA,105 min,R,3.4,6.2,67.0
2,The_Judge,English,2014,Drama,USA,141 min,R,3.8,7.5,48.0
3,Transformers__Age_of_Extinction,English,2014,"Action, Adventure, Sci-Fi","USA, China",165 min,PG-13,3.3,5.8,32.0
4,The_Normal_Heart,English,2014,Drama,USA,132 min,TV-MA,4.2,8.0,64.093333


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,816,817,818,819,820,821,822,823,824,825
filename,vis_data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Seventh_Son,0,0.047044,0.11619,0.13633,0.066194,0.072554,0.17267,0.21519,0.070574,0.071423,0.14938,...,731.69,502.01,1.897,2.2788,2.1412,2.9504,91672.0,22207.0,26201.0,14542.0
Seventh_Son,1,0.056526,0.12516,0.14628,0.082497,0.079331,0.17538,0.21839,0.093521,0.074837,0.15025,...,689.95,474.97,2.2676,2.5887,2.4022,3.2167,81373.0,21045.0,24225.0,13529.0
Welcome_to_Me,0,0.30717,0.33422,0.33112,0.33124,0.31114,0.33644,0.33616,0.34479,0.16983,0.27379,...,394.34,167.91,20.337,21.276,18.527,21.189,81665.0,13672.0,32531.0,13753.0
Welcome_to_Me,1,0.30466,0.33193,0.33124,0.33138,0.30788,0.3327,0.33357,0.34305,0.1733,0.28076,...,397.26,168.23,20.426,21.3,18.608,21.182,83171.0,13714.0,32774.0,13780.0
The_Judge,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,230400.0,119950.0,1e-06,0.002466,4e-06,0.002466,729320.0,119950.0,230400.0,119950.0


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Seventh_Son,67.562481,-4.5252,1.6463,-0.597742,1.36281,-1.361579,0.170381,-0.500409,-0.050339,-0.269793,-0.232489,-0.197553,0.13264
Welcome_to_Me,61.548885,-7.146781,-1.103407,-1.58802,0.249743,0.003055,-2.313552,0.371521,0.516853,-1.405396,-0.951247,1.316795,-0.095459
The_Judge,65.038918,-4.171344,-0.455058,-0.094249,-0.365649,-0.182148,0.700715,-0.195335,-0.204333,-1.281841,0.301911,-0.197494,0.861993
Transformers__Age_of_Extinction,64.544291,-3.661545,-0.010532,-0.802876,-0.614974,-0.255984,0.810787,0.465672,0.271618,-0.691701,0.16124,0.310825,0.21462
The_Normal_Heart,60.433903,0.148386,1.713255,-0.203955,-1.187262,-2.310341,-1.726492,-0.512949,0.270257,-0.098537,0.705479,-0.059131,0.247545


Unnamed: 0_level_0,24000,baby,baseball,big,doc,escort,frozen,heroes,high,huck,...,years.1,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Seventh_Son,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Welcome_to_Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The_Judge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Transformers__Age_of_Extinction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The_Normal_Heart,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.051657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Preprocess Data

## Description 

Beside a short description for the audio data there is no more information on how to handle the other data. For example the runtime currently is not handles as a number but as a string(object)
Since sklearn mostly expects numerical inputs, we need to encode the data. 

For different class normally you would use one-hot-encoding, but since it's not specified let's try first the easiest approach which is Labelencoding.


### Audio Data: 
As mentiones in the paper, NaN values of the audio data are set to 0 and the average of each MFCC coefficient is calculated over all frames.

This is already treated in the section before.



In [3]:

# def pre_process_audio_data():
#     df_data = df_audio_data.fillna(0.0)
#     return df_data.mean(axis=1)
    
def pre_process_visual_data():
    #create columns of the two rows belonging to each movie
    df_data = df_visual_data.unstack()
    return df_data
    
def pre_process_meta_data(df_meta):
    #create columns of the two rows belonging to each movie
    df_data = df_meta.drop(['genre','country','language'],axis=1)
    df_data_genre = df_meta['genre'].apply( lambda x: x.replace(" ", "") ).str.get_dummies(sep=",")
    df_data_genre.columns = ['genre_'+str(x) for x in df_data_genre.columns]
    
    df_data_country = df_meta_data['country'].apply( lambda x: x.replace(" ", "") ).str.get_dummies(sep=",")
    df_data_country.columns = ['country_'+str(x) for x in df_data_country.columns]
    
    df_data_language = df_meta_data['language'].apply( lambda x: x.replace(" ", "") ).str.get_dummies(sep=",")
    df_data_language.columns = ['language_'+str(x) for x in df_data_language.columns]

    df_data = pd.concat([df_data,df_data_genre,df_data_country,df_data_language], axis=1)

    df_data['runtime'] = pd.to_numeric(df_data['runtime'].apply(lambda x: x.split(' ')[0] ) )
    df_data['year'] =  df_data['year'].apply(pd.to_numeric)

    return df_data

df_meta_data_processed = pre_process_meta_data(df_meta_data)
display(df_meta_data_processed.head(5))

pd.merge(df_labled_movies.drop(['movie'],axis=1),df_meta_data_processed, on='filename').to_csv('data/meta_data_processed_exported.csv',sep=";",index = False)
df_meta_extended_data_processed = pre_process_meta_data(df_meta_extended_data)
display(df_meta_extended_data_processed.head(5))



df_audio_data_processed = df_audio_data #pre_process_audio_data()
display(df_audio_data_processed.head(5))

df_visual_data_processed = pre_process_visual_data()
#pd.merge(df_labled_movies.drop(['movie'],axis=1),df_visual_data_processed, on='filename').to_csv('data/visual_data_exported.csv',sep=";",index = False)

display(df_visual_data_processed.head(5))

Unnamed: 0,filename,year,runtime,rated,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,...,language_Latin,language_Mandarin,language_Navajo,language_Russian,language_Serbian,language_Spanish,language_Swahili,language_Urdu,language_Vietnamese,language_Yiddish
0,Seventh_Son,2014,102,PG-13,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Welcome_to_Me,2014,105,R,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,The_Judge,2014,141,R,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Transformers__Age_of_Extinction,2014,165,PG-13,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The_Normal_Heart,2014,132,TV-MA,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,filename,year,runtime,rated,tomatorating,imbdrating,metarating,genre_Action,genre_Adventure,genre_Animation,...,language_Latin,language_Mandarin,language_Navajo,language_Russian,language_Serbian,language_Spanish,language_Swahili,language_Urdu,language_Vietnamese,language_Yiddish
0,Seventh_Son,2014,102,PG-13,2.9,5.5,30.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Welcome_to_Me,2014,105,R,3.4,6.2,67.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The_Judge,2014,141,R,3.8,7.5,48.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Transformers__Age_of_Extinction,2014,165,PG-13,3.3,5.8,32.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,The_Normal_Heart,2014,132,TV-MA,4.2,8.0,64.093333,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Seventh_Son,67.562481,-4.5252,1.6463,-0.597742,1.36281,-1.361579,0.170381,-0.500409,-0.050339,-0.269793,-0.232489,-0.197553,0.13264
Welcome_to_Me,61.548885,-7.146781,-1.103407,-1.58802,0.249743,0.003055,-2.313552,0.371521,0.516853,-1.405396,-0.951247,1.316795,-0.095459
The_Judge,65.038918,-4.171344,-0.455058,-0.094249,-0.365649,-0.182148,0.700715,-0.195335,-0.204333,-1.281841,0.301911,-0.197494,0.861993
Transformers__Age_of_Extinction,64.544291,-3.661545,-0.010532,-0.802876,-0.614974,-0.255984,0.810787,0.465672,0.271618,-0.691701,0.16124,0.310825,0.21462
The_Normal_Heart,60.433903,0.148386,1.713255,-0.203955,-1.187262,-2.310341,-1.726492,-0.512949,0.270257,-0.098537,0.705479,-0.059131,0.247545


Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,...,821,821,822,822,823,823,824,824,825,825
vis_data,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
filename,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Seventh_Son,0.047044,0.056526,0.11619,0.12516,0.13633,0.14628,0.066194,0.082497,0.072554,0.079331,...,2.9504,3.2167,91672.0,81373.0,22207.0,21045.0,26201.0,24225.0,14542.0,13529.0
Welcome_to_Me,0.30717,0.30466,0.33422,0.33193,0.33112,0.33124,0.33124,0.33138,0.31114,0.30788,...,21.189,21.182,81665.0,83171.0,13672.0,13714.0,32531.0,32774.0,13753.0,13780.0
The_Judge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002466,0.002466,729320.0,729320.0,119950.0,119950.0,230400.0,230400.0,119950.0,119950.0
Transformers__Age_of_Extinction,0.19996,0.18913,0.26934,0.25738,0.27986,0.27465,0.23725,0.23664,0.30844,0.30332,...,14.779,14.723,208630.0,211630.0,23968.0,24019.0,47979.0,48339.0,24059.0,24090.0
The_Normal_Heart,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038749,0.20135,...,7.3798,11.306,145760.0,79962.0,20730.0,20617.0,35320.0,45216.0,20831.0,20738.0


# Define Models

## Description 
These are the models described in the paper. It is not allways clear which exact models they used. (see comments)

In [4]:
from  sklearn.neighbors import KNeighborsClassifier, NearestCentroid #(not sure if this is the nearest mean classifiert) 
from  sklearn.tree import DecisionTreeClassifier
from  sklearn.linear_model import LogisticRegression
from  sklearn.svm import SVC #(not clear which SVC, there is also NuSVC )
from  sklearn.ensemble import BaggingClassifier
from  sklearn.ensemble import AdaBoostClassifier
from  sklearn.ensemble import GradientBoostingClassifier
from  sklearn.ensemble import RandomForestClassifier
from  sklearn.naive_bayes import GaussianNB # there are 3 different naive bayes classifiers, it is not stated which one they used 


model_list = [KNeighborsClassifier(),
                NearestCentroid(),
                DecisionTreeClassifier(random_state = 123),
                LogisticRegression(random_state = 123),
                SVC(random_state = 123),
                BaggingClassifier(random_state = 123),
                AdaBoostClassifier(random_state = 123),
                GradientBoostingClassifier(random_state = 123),
                RandomForestClassifier(random_state = 123),
                GaussianNB() 
                   ]

    


# Define Performance measures:

As mentioned in the paper the performant measueres are the following Precision and Recall and F1-Score. To be more precise the weighted average of Precision and Recall and F1-Score as stated in the dataset paper. 

In [5]:
from sklearn.model_selection import cross_validate

def calculate_metrics(clf,X,y ):
    metric =  cross_validate(clf, X, y, scoring=('precision_weighted','recall_weighted','f1_weighted'), return_train_score=False, cv=10)  
    return pd.Series({'precision':metric['test_precision_weighted'].mean(),'recall':metric['test_recall_weighted'].mean(),'F1':metric['test_f1_weighted'].mean() })

# Select Models

As defined in the paper they use 10-fold CV on the classifiers for training and keep all the classifiers where the metrics are above 0.5 for later stacking.


In [6]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

class MultiColumnLabelEncoder:
    
    def __init__(self, columns = None):
        self.columns = columns # list of column to encode

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        
        output = X.copy()
        
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        
        return output

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
    
def getModelName( object ): 

    if hasattr(object, '__module__') and hasattr(object, '__name__'):
        return  object.__name__
    elif hasattr(object, '__module__') and hasattr(object, '__class__'):
        return  object.__class__.__name__
    else:
        raise TypeError("Could not get name of object!")
    
def evaluate_models( X, y ):
    metrics = pd.DataFrame()

    for model in model_list:
        m = calculate_metrics(model,X,y )
        metrics[getModelName(model)] = m

    return metrics.T


df_final_results = pd.DataFrame()

import warnings
warnings.filterwarnings('ignore')

## Meta data

In [7]:
    
df_train = pd.merge(df_labled_movies,df_meta_data, on='filename')
df_train.drop(['movie', 'filename'],axis=1, inplace=True)
display(df_train.head(2))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']



display("----  Lable encoded ----")
label_encoder = MultiColumnLabelEncoder(['language','year','genre','country','runtime','rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
display(metrics)

#convert runtime and year to actual number
df_X['runtime'] = df_X['runtime'].apply(lambda x: int(x.split(' ')[0]) )
df_X['year'] =  df_X['year'].apply(pd.to_numeric)

display("---- Lable encoded with float for year and runtime ----")
##optimizing encoding
label_encoder = MultiColumnLabelEncoder(['language','year','genre','country','rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
display(metrics)

display("---- Lable encoded without year ----")
label_encoder = MultiColumnLabelEncoder(['language','genre','country','rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
display(metrics)

# save the best of the for the final table 
metrics['Modality'] = 'metadata'
df_final_results = df_final_results.append(metrics)


display("---- OneHot Encoding ----")
##optimizing encoding further by using the processed meta data

# df_train = pd.merge(df_labled_movies,df_meta_extended_data_processed, on='filename')
# df_train.drop(['movie', 'filename'],axis=1, inplace=True)
# display(df_train.head(2))
# df_X = df_train.drop('goodforairplane',axis=1)
# df_y = df_train['goodforairplane']

# label_encoder = MultiColumnLabelEncoder(['rated'])    
# X_labelencoded = label_encoder.fit_transform(df_X)
# metrics = evaluate_models(X_labelencoded, df_y)
# display(metrics)


# Save final data sets obtained with our best approach
df_y_meta = df_y
X_labelencoded_meta = X_labelencoded

Unnamed: 0,goodforairplane,language,year,genre,country,runtime,rated
0,1,English,2014,"Action, Adventure, Fantasy","USA, UK, Canada, China",102 min,PG-13
1,0,English,2014,"Comedy, Drama",USA,105 min,R


'----  Lable encoded ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.5194,0.566162,0.52335
NearestCentroid,0.602922,0.583333,0.576202
DecisionTreeClassifier,0.490991,0.490505,0.467364
LogisticRegression,0.587637,0.585556,0.574014
SVC,0.297467,0.536869,0.382622
BaggingClassifier,0.512169,0.499596,0.48254
AdaBoostClassifier,0.516002,0.501717,0.490721
GradientBoostingClassifier,0.501382,0.500707,0.486474
RandomForestClassifier,0.50988,0.498485,0.489622
GaussianNB,0.47488,0.506667,0.481515


'---- Lable encoded with float for year and runtime ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.657535,0.618586,0.600698
NearestCentroid,0.466408,0.479091,0.465068
DecisionTreeClassifier,0.404628,0.411818,0.401082
LogisticRegression,0.512315,0.52404,0.493776
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.524126,0.520909,0.509404
AdaBoostClassifier,0.466827,0.474444,0.462241
GradientBoostingClassifier,0.439652,0.441111,0.425558
RandomForestClassifier,0.503327,0.503737,0.486752
GaussianNB,0.467194,0.499798,0.47404


'---- Lable encoded without year ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.632037,0.619697,0.589233
NearestCentroid,0.466408,0.479091,0.465068
DecisionTreeClassifier,0.404628,0.411818,0.401082
LogisticRegression,0.545546,0.549293,0.529509
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.524126,0.520909,0.509404
AdaBoostClassifier,0.466827,0.474444,0.462241
GradientBoostingClassifier,0.439652,0.441111,0.425558
RandomForestClassifier,0.503327,0.503737,0.486752
GaussianNB,0.517396,0.539798,0.499798


'---- OneHot Encoding ----'

## Textual data

In [8]:
from sklearn.preprocessing import Normalizer

################## Use textual data  ###################
display('################## Use textual data  ###################')

df_movies = df_labled_movies.drop(['movie'],axis=1)
df_train = pd.merge(df_movies,df_text_data, on='filename')
df_train.drop(['filename'],axis=1, inplace=True)
display(df_train.head(2))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']


display("---- RAW Data ----")
metrics = evaluate_models(df_X, df_y)
display(metrics)

# save  the final table 
metrics['Modality'] = 'textual'
df_final_results = df_final_results.append(metrics)

display("---- Normalize Data ----")
df_normalized_X = Normalizer().fit_transform(df_X)
metrics = evaluate_models(df_normalized_X, df_y)
display(metrics)


# Save final data sets obtained with our best approach
df_y_text = df_y
df_X_text = df_X

'################## Use textual data  ###################'

Unnamed: 0,goodforairplane,24000,baby,baseball,big,doc,escort,frozen,heroes,high,...,years.1,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


'---- RAW Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.341994,0.465152,0.366055
NearestCentroid,0.452087,0.561111,0.461862
DecisionTreeClassifier,0.455147,0.46,0.427983
LogisticRegression,0.300554,0.54798,0.388116
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.509397,0.567273,0.496721
AdaBoostClassifier,0.575156,0.576162,0.558896
GradientBoostingClassifier,0.639572,0.655051,0.600616
RandomForestClassifier,0.446707,0.54,0.470888
GaussianNB,0.537073,0.558182,0.538881


'---- Normalize Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.506534,0.53303,0.512809
NearestCentroid,0.555174,0.555253,0.531696
DecisionTreeClassifier,0.522597,0.536869,0.511263
LogisticRegression,0.300554,0.54798,0.388116
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.529374,0.56798,0.523605
AdaBoostClassifier,0.494176,0.530606,0.502345
GradientBoostingClassifier,0.65407,0.629697,0.58193
RandomForestClassifier,0.488127,0.528889,0.491408
GaussianNB,0.537073,0.558182,0.538881


## Visual data

In [9]:
from sklearn.preprocessing import StandardScaler,RobustScaler

################## Use visual data  ###################
display('################## Use visual data  ###################')

df_movies = df_labled_movies.drop(['movie'],axis=1)
df_train = pd.merge(df_movies,df_visual_data_processed, on='filename')
df_train.drop(['filename'],axis=1, inplace=True)
display(df_train.head(5))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']


display("---- RAW Data ----")
metrics = evaluate_models(df_X, df_y)
display(metrics)

display("---- Scaled Data ----")
df_scaled_X = StandardScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)

# save  the final table 
metrics['Modality'] = 'visual'
df_final_results = df_final_results.append(metrics)

display("---- RobustScaler Data ----")
df_scaled_X = RobustScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)


# Save final data sets obtained with our best approach
df_y_visual = df_y
df_scaled_X_visual = pd.DataFrame(df_scaled_X)

'################## Use visual data  ###################'

Unnamed: 0,goodforairplane,"(0, 0)","(0, 1)","(1, 0)","(1, 1)","(2, 0)","(2, 1)","(3, 0)","(3, 1)","(4, 0)",...,"(821, 0)","(821, 1)","(822, 0)","(822, 1)","(823, 0)","(823, 1)","(824, 0)","(824, 1)","(825, 0)","(825, 1)"
0,1,0.047044,0.056526,0.11619,0.12516,0.13633,0.14628,0.066194,0.082497,0.072554,...,2.9504,3.2167,91672.0,81373.0,22207.0,21045.0,26201.0,24225.0,14542.0,13529.0
1,0,0.30717,0.30466,0.33422,0.33193,0.33112,0.33124,0.33124,0.33138,0.31114,...,21.189,21.182,81665.0,83171.0,13672.0,13714.0,32531.0,32774.0,13753.0,13780.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002466,0.002466,729320.0,729320.0,119950.0,119950.0,230400.0,230400.0,119950.0,119950.0
3,0,0.19996,0.18913,0.26934,0.25738,0.27986,0.27465,0.23725,0.23664,0.30844,...,14.779,14.723,208630.0,211630.0,23968.0,24019.0,47979.0,48339.0,24059.0,24090.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038749,...,7.3798,11.306,145760.0,79962.0,20730.0,20617.0,35320.0,45216.0,20831.0,20738.0


'---- RAW Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.546787,0.542929,0.529648
NearestCentroid,0.405332,0.473333,0.405324
DecisionTreeClassifier,0.491792,0.508182,0.488059
LogisticRegression,0.561063,0.573939,0.555761
SVC,0.398547,0.55798,0.4225
BaggingClassifier,0.616987,0.606162,0.604084
AdaBoostClassifier,0.505459,0.509293,0.496069
GradientBoostingClassifier,0.521968,0.534646,0.517804
RandomForestClassifier,0.476204,0.485152,0.467797
GaussianNB,0.503584,0.518586,0.484453


'---- Scaled Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.570299,0.56,0.538146
NearestCentroid,0.515741,0.505354,0.487731
DecisionTreeClassifier,0.484093,0.509293,0.48457
LogisticRegression,0.580084,0.549798,0.538425
SVC,0.45307,0.538889,0.448757
BaggingClassifier,0.584129,0.583939,0.575801
AdaBoostClassifier,0.507985,0.504242,0.489265
GradientBoostingClassifier,0.505687,0.521515,0.505821
RandomForestClassifier,0.453981,0.462929,0.445575
GaussianNB,0.607893,0.587273,0.57359


'---- RobustScaler Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.558343,0.56,0.544837
NearestCentroid,0.576882,0.539596,0.508983
DecisionTreeClassifier,0.484093,0.509293,0.48457
LogisticRegression,0.513166,0.509394,0.501544
SVC,0.425366,0.467172,0.420111
BaggingClassifier,0.601801,0.606162,0.591861
AdaBoostClassifier,0.497985,0.493131,0.478154
GradientBoostingClassifier,0.537062,0.532626,0.524928
RandomForestClassifier,0.453981,0.462929,0.445575
GaussianNB,0.532297,0.525354,0.522063


## Audio data

In [10]:
from sklearn.preprocessing import StandardScaler,RobustScaler

################## Use audio data  ###################

display('################## Use audio data  ###################')

def load_audio_data( filenames ): # changed
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/audio_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None).T
        
        # preprocess data
        df_data = df_data.fillna(0)
        df_data = pd.DataFrame(df_data.mean(axis = 0)).T
        df_data["filename"] = file
        audio_data = audio_data.append(df_data)
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data = load_audio_data( df_labled_movies['filename']  )


df_train_audio = pd.merge(df_movies, df_audio_data, on='filename')
df_train_audio.drop(['filename'],axis=1, inplace=True)
df_X = df_train_audio.drop('goodforairplane',axis=1)
df_y = df_train_audio['goodforairplane']


# df_movies = df_labled_movies.drop(['movie'],axis=1)
# df_train = pd.merge(df_movies,pd.DataFrame(df_audio_data_processed), on='filename')
# df_train.drop(['filename'],axis=1, inplace=True)
# display(df_train.head(5))
# df_X = df_train.drop('goodforairplane',axis=1)
# df_y = df_train['goodforairplane']




display("---- RAW Data ----")
metrics = evaluate_models(df_X, df_y)
display(metrics)

# save  the final table 
metrics['Modality'] = 'audio'
df_final_results = df_final_results.append(metrics)

display("---- Scaled Data ----")
df_scaled_X = StandardScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)

display("---- RobustScaler Data ----")
df_scaled_X = RobustScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)


# Save final data sets obtained with our best approach
df_y_audio = df_y
df_X_audio = df_X

'################## Use audio data  ###################'

'---- RAW Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.442217,0.464444,0.446119
NearestCentroid,0.605951,0.547071,0.491725
DecisionTreeClassifier,0.474011,0.457273,0.430179
LogisticRegression,0.558033,0.54596,0.539093
SVC,0.341683,0.424949,0.368661
BaggingClassifier,0.427441,0.435859,0.419515
AdaBoostClassifier,0.52447,0.51202,0.491446
GradientBoostingClassifier,0.515392,0.51697,0.506224
RandomForestClassifier,0.469181,0.474444,0.459993
GaussianNB,0.550048,0.515758,0.498764


'---- Scaled Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.453864,0.450101,0.432996
NearestCentroid,0.555387,0.542828,0.528025
DecisionTreeClassifier,0.474011,0.457273,0.430179
LogisticRegression,0.531818,0.522626,0.514002
SVC,0.416675,0.463131,0.420189
BaggingClassifier,0.427441,0.435859,0.419515
AdaBoostClassifier,0.52447,0.51202,0.491446
GradientBoostingClassifier,0.515392,0.51697,0.506224
RandomForestClassifier,0.469181,0.474444,0.459993
GaussianNB,0.550048,0.515758,0.498764


'---- RobustScaler Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.400179,0.422727,0.405317
NearestCentroid,0.647124,0.547071,0.501967
DecisionTreeClassifier,0.474011,0.457273,0.430179
LogisticRegression,0.518855,0.511515,0.501448
SVC,0.48107,0.488485,0.470724
BaggingClassifier,0.427441,0.435859,0.419515
AdaBoostClassifier,0.52447,0.51202,0.491446
GradientBoostingClassifier,0.515392,0.51697,0.506224
RandomForestClassifier,0.469181,0.474444,0.459993
GaussianNB,0.550048,0.515758,0.498764


## Final base classifier filter

In [11]:
df_r = df_final_results
df_r = df_r[ (df_r['precision'] > 0.5) & (df_r['recall'] > 0.5) & (df_r['F1'] > 0.5) ]
display(df_r)


Unnamed: 0,precision,recall,F1,Modality
KNeighborsClassifier,0.632037,0.619697,0.589233,metadata
LogisticRegression,0.545546,0.549293,0.529509,metadata
BaggingClassifier,0.524126,0.520909,0.509404,metadata
AdaBoostClassifier,0.575156,0.576162,0.558896,textual
GradientBoostingClassifier,0.639572,0.655051,0.600616,textual
GaussianNB,0.537073,0.558182,0.538881,textual
KNeighborsClassifier,0.570299,0.56,0.538146,visual
LogisticRegression,0.580084,0.549798,0.538425,visual
BaggingClassifier,0.584129,0.583939,0.575801,visual
GradientBoostingClassifier,0.505687,0.521515,0.505821,visual


As we can see the results table looks pretty different than in the paper. There is not really enough information in the paper to be sure that we are correctly reproducing the steps. 

With the audio data there is actually not really more we could do since we just end up with one coliumn of data as descirbed in the paper, but still the metrics is not as good as in the paper. 

__Is there something wrong already when we load the data ? Wrong data?__



## Loading of test data
There are problems with test data in the label file and actual XML files, some of them do not match. Sometimes the file ending is given (e.g. .xml, we removed it), sometimes the row is given as a string. We detected such movies with the code below, and afterwards fixed the detected movies by hand.

Additonally, the movie 10.000km is given twice with label 0 and 1 (we just assume one of them and deleted the other one)!

In [12]:
import os


df_movies_all = pd.read_csv("./data/CoE_dataset/Test_set/test_set_labels.csv", sep=';')
print(df_movies_all.shape[0])

# To avoid null values, we just removed " by hand...
display(df_movies_all[df_movies_all["file_name"].isnull()])

ex = 0
filenames = []
for file in df_movies_all['file_name']:
    # fix already given file type - fixed by hand now
    if ".mp4" in file:
        file = file.split(".mp4")[0]
    if ".xml" in file:
        file = file.split(".xml")[0]
    
    # check if file is given twice
    if file in filenames:
        # already dropped by hand
        print("file already in! " + file)
    else:    
        file_path = f'./data/CoE_dataset/Test_set/XML/{file}.xml'
        exists = os.path.isfile(file_path)
        if exists:
            ex = ex + 1
            filenames.append(file)
        else:
            print(file)
            print(str(file_path) + " not exists!")
            df_movies_all = df_movies_all[df_movies_all['file_name'] != file]

print("Existing movies: " + str(ex))
print(df_movies_all.shape[0])

224


Unnamed: 0,movie_name,file_name,goodforairplanes


A_Fish_Called_Wanda
./data/CoE_dataset/Test_set/XML/A_Fish_Called_Wanda.xml not exists!
Existing movies: 223
223


In [13]:
df_labled_movies_test = df_movies_all
df_labled_movies_test = df_labled_movies_test[['movie_name','file_name', 'goodforairplanes']]
df_labled_movies_test.columns = ['movie', 'filename', 'goodforairplane']
display(df_labled_movies_test.head(3))


############################################################
### Load Meta Data ###

def load_meta_data_test( filenames ): 
    
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/XML/{file}.xml'
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            with open(file_path) as f: 
                tree = ET.parse(f)
                movie = tree.find('movie')

                lang = movie.get('language')
                year = movie.get('year')
                genre = movie.get('genre')
                country = movie.get('country')
                runtime = movie.get('runtime')
                age_rating = movie.get('rated')

                raw_data.append( (file,lang,year,genre,country,runtime,age_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','language','year','genre','country','runtime','rated'])


df_meta_data_test = load_meta_data_test( df_labled_movies_test['filename']  )
display(df_meta_data_test.head(3))


############################################################
### Load Visual Data ###

def load_visual_data_test( filenames ):
    data_list = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/vis_descriptors/{file}.csv'
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            df_data = pd.read_csv(file_path,index_col=None, header=None)
            data_list.append(df_data)
        
    return pd.concat(data_list, axis = 0, keys = filenames,names=('filename','vis_data'),  sort=False)

df_visual_data_test = load_visual_data_test( df_labled_movies_test['filename']  )
df_visual_data_test = df_visual_data_test.unstack()
display(df_visual_data_test.head(3))


############################################################
### Load Audio Data ###

def load_audio_data_test( filenames ):
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/audio_descriptors/{file}.csv'
        
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            df_data = pd.read_csv(file_path,index_col=None, header=None).T

            # preprocess data
            df_data = df_data.fillna(0)
            df_data = pd.DataFrame(df_data.mean(axis = 0)).T
            df_data["filename"] = file
            audio_data = audio_data.append(df_data)
            
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data_test = load_audio_data_test( df_labled_movies_test['filename']  )
display(df_audio_data_test.head(3))


############################################################
### Load textual Data ###

def load_text_data_test(filenames):
    

    data_list = []
    file_path = f'./data/CoE_dataset/Test_set/text_descriptors/tdf_idf_test.csv'
    #somehow pandas can not really handle that the first line is row names.(at least I didn't find a better way) 
    # thus we do it a little complicated here
    header_index = pd.read_csv(file_path, index_col=0,nrows=1 ).reset_index().columns
    df_data = pd.read_csv(file_path, header=None, index_col=False,skiprows=1)
    df_data.set_index(header_index, inplace=True)
    df_data.columns = filenames
    return df_data.T #row are should be represented by movie names

df_text_data_test = load_text_data_test(df_labled_movies_test['filename'] )
display(df_text_data_test.head(3))

Unnamed: 0,movie,filename,goodforairplane
0,Humpty Sharma Ki Dulhania,Humpty_Sharma_Ki_Dulhania,1
1,Homeland,Homeland,1
2,Trash,Trash,1


Unnamed: 0,filename,language,year,genre,country,runtime,rated
0,Humpty_Sharma_Ki_Dulhania,Hindi,2014,"Comedy, Drama, Romance",India,133 min,NOT RATED
1,Homeland,English,2011,"Drama, Mystery, Thriller",USA,55 min,TV-MA
2,Trash,"Portuguese, English",2014,"Adventure, Comedy, Crime","UK, Brazil",114 min,R


Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,...,821,821,822,822,823,823,824,824,825,825
vis_data,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
filename,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Humpty_Sharma_Ki_Dulhania,0.25284,0.25278,0.23444,0.2345,0.25556,0.25565,0.23095,0.23102,0.2672,0.26718,...,13.606,13.606,38450.0,38440.0,11844.0,11847.0,23975.0,23988.0,13069.0,13071.0
Homeland,0.1366,0.12622,0.2911,0.29358,0.38919,0.38603,0.25374,0.24926,0.18142,0.20572,...,8.6886,8.6112,13428.0,14415.0,2086.1,2519.3,5285.1,6909.0,2510.7,2963.8
Trash,0.23858,0.25246,0.24924,0.25729,0.34233,0.34281,0.32537,0.32128,0.28438,0.28786,...,11.442,11.627,37948.0,36342.0,11777.0,11420.0,24904.0,23160.0,12274.0,11730.0


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Humpty_Sharma_Ki_Dulhania,4.547587,-5.63623,1.434987,-0.279716,-0.669368,-1.271336,-0.705938,-0.263066,-0.273322,-0.794631,-0.060173,0.003418,-0.272562
Homeland,62.653646,-2.540778,0.94344,-1.226452,-0.285784,-0.821387,-0.986073,-1.069481,-1.126877,-0.613598,0.16874,-0.776176,0.38972
Trash,59.511905,-4.309526,-0.72833,-2.60298,0.1502,-0.210795,-0.315625,0.037404,-0.298176,0.943956,0.579414,0.388942,-0.008194


Unnamed: 0_level_0,1,1000,200000,acquired,ailing,avatar,avoid,babysitter,barbaric,battle,...,zero,zeus,zeus.1,zeus.2,zhonglian,zhuo,zombie,zombiehating,zombies,zuckerberg
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Humpty_Sharma_Ki_Dulhania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Homeland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Trash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Preparation of test data
The process here is similar to the training data.

In [14]:
df_movies_test = df_labled_movies_test.drop(['movie'], axis = 1)


#####################################################
# Textual data
df_test_text = pd.merge(df_movies_test, df_text_data_test, on='filename')
df_test_text.drop(['filename'],axis=1, inplace=True)

df_X_text_test = df_test_text.drop('goodforairplane',axis=1)
df_y_text_test = df_test_text['goodforairplane']

print("text:" + str(df_X_text_test.shape[0]))
display(df_X_text_test.head(2))


#####################################################
# Visual data
df_test_visual = pd.merge(df_movies_test, df_visual_data_test, on='filename')
df_test_visual.drop(['filename'],axis=1, inplace=True)

df_X_visual_test = df_test_visual.drop('goodforairplane',axis=1)
df_y_visual_test = df_test_visual['goodforairplane']
df_scaled_X_visual_test = pd.DataFrame(StandardScaler().fit_transform(df_X_visual_test))

print("visual:"  + str(df_X_visual_test.shape[0]))
display(df_X_visual_test.head(2))


#####################################################
# Audio data
df_test_audio = pd.merge(df_movies_test, df_audio_data_test, on='filename')
df_test_audio.drop(['filename'],axis=1, inplace=True)
df_X_audio_test = df_test_audio.drop('goodforairplane',axis=1)
df_y_audio_test = df_test_audio['goodforairplane']

print("audio:" + str(df_X_audio_test.shape[0]))
display(df_X_audio_test.head(2))


#####################################################
# Meta data
df_test_meta = pd.merge(df_movies_test,df_meta_data_test, on='filename')
df_test_meta.drop(['filename'],axis=1, inplace=True)

df_X_meta_test = df_test_meta.drop('goodforairplane',axis=1)
# Work around for runtime
df_X_meta_test['runtime'] = df_X_meta_test['runtime'].apply(lambda x: int(x.split(' ')[0]) if x != 'N/A' else 0)
df_X_meta_test['year'] =  df_X_meta_test['year'].apply(pd.to_numeric)

df_y_meta_test = df_test_meta['goodforairplane']

label_encoder = MultiColumnLabelEncoder(['language','genre','country','rated'])    
X_labelencoded_meta_test = pd.DataFrame(label_encoder.fit_transform(df_X_meta_test))

print("meta:"+ str(df_X_meta_test.shape[0]) )
display(X_labelencoded_meta_test.head(2))

text:223


Unnamed: 0,1,1000,200000,acquired,ailing,avatar,avoid,babysitter,barbaric,battle,...,zero,zeus,zeus.1,zeus.2,zhonglian,zhuo,zombie,zombiehating,zombies,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


visual:223


Unnamed: 0,"(0, 0)","(0, 1)","(1, 0)","(1, 1)","(2, 0)","(2, 1)","(3, 0)","(3, 1)","(4, 0)","(4, 1)",...,"(821, 0)","(821, 1)","(822, 0)","(822, 1)","(823, 0)","(823, 1)","(824, 0)","(824, 1)","(825, 0)","(825, 1)"
0,0.25284,0.25278,0.23444,0.2345,0.25556,0.25565,0.23095,0.23102,0.2672,0.26718,...,13.606,13.606,38450.0,38440.0,11844.0,11847.0,23975.0,23988.0,13069.0,13071.0
1,0.1366,0.12622,0.2911,0.29358,0.38919,0.38603,0.25374,0.24926,0.18142,0.20572,...,8.6886,8.6112,13428.0,14415.0,2086.1,2519.3,5285.1,6909.0,2510.7,2963.8


audio:223


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,4.547587,-5.63623,1.434987,-0.279716,-0.669368,-1.271336,-0.705938,-0.263066,-0.273322,-0.794631,-0.060173,0.003418,-0.272562
1,62.653646,-2.540778,0.94344,-1.226452,-0.285784,-0.821387,-0.986073,-1.069481,-1.126877,-0.613598,0.16874,-0.776176,0.38972


meta:223


Unnamed: 0,language,year,genre,country,runtime,rated
0,56,2014,59,21,133,3
1,5,2011,83,45,55,7


## Further preparation for textual data
In the text data files, there are different words that appear in one movie and not in an other one.

Therefore, we add columns with 0 values for words that are not in the movies.

In [15]:
print("Training")
print(df_X_text.shape)
print(X_labelencoded_meta.shape)
print(df_X_audio.shape)
print(df_scaled_X_visual.shape)

print("Test")
print(df_X_text_test.shape)
print(X_labelencoded_meta_test.shape)
print(df_X_audio_test.shape)
print(df_scaled_X_visual_test.shape)


# check why test data is not consistent with train data for textual, so we 
# do not compare different words
for col in df_X_text_test.columns:
    if col not in df_X_text.columns:
        df_X_text[col] = pd.Series([0] * df_X_text.shape[0])

for col in df_X_text.columns:
    if col not in df_X_text_test.columns:
        df_X_text_test[col] = pd.Series([0] * df_X_text_test.shape[0])



# We have to order the columns, otherwise the features selected will not be at the same place
df_X_text_test = df_X_text_test.reindex(sorted(df_X_text_test.columns), axis=1)
display(df_X_text_test.head(3))

df_X_text = df_X_text.reindex(sorted(df_X_text.columns), axis=1)
display(df_X_text.head(3))

Training
(95, 3283)
(95, 6)
(95, 13)
(95, 1652)
Test
(223, 6317)
(223, 6)
(223, 13)
(223, 1652)


Unnamed: 0,00,1,10,100,1000,1000.1,100000,10000km,101yearold,10round,...,zeus.1,zeus.2,zhonglian,zhuo,zing,zombie,zombiehating,zombies,zoologists,zuckerberg
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0


Unnamed: 0,00,1,10,100,1000,1000.1,100000,10000km,101yearold,10round,...,zeus.1,zeus.2,zhonglian,zhuo,zing,zombie,zombiehating,zombies,zoologists,zuckerberg
0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0,0,0,0.0,0
1,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0,0,0,0.0,0
2,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0,0,0,0.0,0


# Task 3.2 Feature Selection

They use LVW for feature selection as described in the mentioned paper.
What is very confusion in this section is that in the end again they refer to Table 2. as also in the previous section. 
I would conclude that in the previous section they just wanted to refer to the selected classifiers and in this section to the metrics results. 
But still we would have different classifiers. 

The implementation of the LVW shouldn't be to compilcated. Maybe there is already some code out there.

### Implementation of LVW

Here I implemented the LVW from the pseudo code of the refered paper (since I have not found any code on it), with adapting that we use a greater F1 score, instead of a lower error.

It was not clearly stated how they actually "slightly modified" the LVW...

In [16]:
def randomSet(size):
    '''
    Returns a subset of available features
    '''  
    number_of_features = random.randint(1, size-1)
    features = np.empty(number_of_features, dtype = int)
    
    i = 0
    while i < number_of_features:
        rand = random.randint(0, size-1)
        if rand not in features:
            features[i] = rand
            i = i + 1
            
    return features  


def LearnAlgo(S1, D_X, D_Y, model):
    '''
    Calculates and returns metrics on given data frame with feature
    subset S1
    '''
    D_X = D_X[D_X.columns[S1]]
    metrics = calculate_metrics(model, D_X, D_Y)
    return metrics
    

def LVW(K, D_X, D_Y, model, output = True):
    '''
    Implementation of the Las Vegas Wrapper, according to the paper
    "Feature Selection and Classification - A probabilistic approach",
    modifed to maximizing F1 instead of minimizing error.
    '''
    k = 0
    C = 100
    metrics = {"precision": 0,
               "recall": 0,
               "F1": 0}
    
    size = D_X.columns.size
    if size == 1: # fix for dataframes with size 1
        return calculate_metrics(model, D_X, D_Y), [0]
    
    while k < K:
        S1 = randomSet(size)
        C1 = S1.size
        metrics_1 = LearnAlgo(S1, D_X, D_Y, model)
        
        if (metrics_1["F1"] > metrics["F1"] or 
            (metrics_1["F1"] == metrics["F1"] and C1 < C)):
            if output:
                print("Current best F1 = " + str(metrics_1["F1"]) + ", size = " + str(C1))
            k = 0
            metrics = metrics_1
            C = C1
            S = S1
        
        k = k + 1

    return metrics, S


### Trys for different data modalities

In [17]:
# Audio
LVW(5, df_X_audio, df_y_audio, KNeighborsClassifier())

Current best F1 = 0.33559884559884556, size = 2
Current best F1 = 0.457967772967773, size = 12
Current best F1 = 0.4871639471639472, size = 2
Current best F1 = 0.5754042254042254, size = 2


(precision    0.596786
 recall       0.585758
 F1           0.575404
 dtype: float64, array([1, 7]))

In [18]:
# Textual
LVW(5, df_X_text, df_y_text, KNeighborsClassifier())

Current best F1 = 0.4626710979652156, size = 3765
Current best F1 = 0.483067553361671, size = 1148
Current best F1 = 0.5168817021758197, size = 6183


(precision    0.534987
 recall       0.591313
 F1           0.516882
 dtype: float64, array([3093, 1770, 5367, ...,  398,  729, 4120]))

In [19]:
# Visual
LVW(5, df_scaled_X_visual, df_y_visual, KNeighborsClassifier())

Current best F1 = 0.5927152477152478, size = 1139


(precision    0.627671
 recall       0.605354
 F1           0.592715
 dtype: float64, array([ 505, 1052,  447, ...,  573, 1022,  350]))

In [20]:
# Meta
LVW(5, X_labelencoded_meta, df_y_meta, KNeighborsClassifier())

Current best F1 = 0.564422151922152, size = 5


(precision    0.616487
 recall       0.587475
 F1           0.564422
 dtype: float64, array([2, 3, 1, 5, 4]))

### Application of LVW on selected combinations

No information about seed and so on, just that in each run there are different subsets and therefore different results (we set a seed here for enabling reproduction). I stored the features we use (from our combinations and from them of the paper) into files so we dont have to execute this code every time.

In [21]:

def str2Class(str):
    if str == "KNeighborsClassifier" or str == "NearestCentroid" or str == "GaussianNB":
        return getattr(sys.modules[__name__], str)()
    return getattr(sys.modules[__name__], str)(random_state = 123)



def run_LVW_Selected_Combinations(K):
    
    df_final_results_lvw = pd.DataFrame()
    df_x = pd.DataFrame()
    df_y = pd.DataFrame()

    random.seed(123)
    
    for index, row in df_r.iterrows():
        model = str2Class(index)
        print(getModelName(model) + " - " + row["Modality"])

        # get correct data frame
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
            df_y = df_y_meta
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
            df_y = df_y_visual

        elif row["Modality"] == "textual":
            df_x = df_X_text
            df_y = df_y_text
        else:
            df_x = df_X_audio
            df_y = df_y_audio

        # Calculate and append LVW metrics
        metrics = pd.DataFrame()
        S = []
        if getModelName(model) == "RandomForestClassifier":
            print("skipping random forest..")
            # evaluate random forest withot LVW since it already performs feature selection
            random.seed(123)
            m = calculate_metrics(model, df_x, df_y)
        else:
            # feature selection metrics
            random.seed(123)
            m, S = LVW(K, df_x, df_y, model, False)
        print(m)
        features = ','.join(map(str, S))
        print(features)
        metrics[getModelName(model)] = m
        metrics = metrics.T
        metrics["Modality"] = row["Modality"]
        metrics["Features"] = features

        df_final_results_lvw = df_final_results_lvw.append(metrics)
    
    # Results with feature selection
    #display(df_final_results_lvw)#[df_final_results_lvw.columns.difference(["Features"])])

    # save final data frame (TODO: uncomment when rerunning LVW)
    df_final_results_lvw.to_csv('./data/results.csv', sep=';', encoding='utf-8')


### Final results
Some of the results got better, some got worse since we only use a feature subspace where the fact if the full data is tried out is just random.

In [22]:
# TODO: Uncomment if you want to rerun LVW!!!
# max number of runs for finding better LVW combinations -> better results, longer runtime with higher K   
#run_LVW_Selected_Combinations(100)

In [23]:
# Previous results
display(df_r)

# Access saved data from function run_LVW_Selected_Combinations()
test_read = pd.read_csv('./data/results.csv', sep=';', encoding='utf-8', header = 0, index_col = 0)
test_read

Unnamed: 0,precision,recall,F1,Modality
KNeighborsClassifier,0.632037,0.619697,0.589233,metadata
LogisticRegression,0.545546,0.549293,0.529509,metadata
BaggingClassifier,0.524126,0.520909,0.509404,metadata
AdaBoostClassifier,0.575156,0.576162,0.558896,textual
GradientBoostingClassifier,0.639572,0.655051,0.600616,textual
GaussianNB,0.537073,0.558182,0.538881,textual
KNeighborsClassifier,0.570299,0.56,0.538146,visual
LogisticRegression,0.580084,0.549798,0.538425,visual
BaggingClassifier,0.584129,0.583939,0.575801,visual
GradientBoostingClassifier,0.505687,0.521515,0.505821,visual


Unnamed: 0,precision,recall,F1,Modality,Features
KNeighborsClassifier,0.612211,0.596667,0.569224,metadata,03
LogisticRegression,0.580071,0.605758,0.562425,metadata,03
BaggingClassifier,0.618274,0.614646,0.567548,metadata,3
AdaBoostClassifier,0.66458,0.639697,0.621954,textual,"810,1139,2666,2865,1327,1972,2464,2728,1996,16..."
GradientBoostingClassifier,0.667745,0.684343,0.650515,textual,"3088,1323,2941,320,1994,3147,1075,2356,2004,17..."
GaussianNB,0.646028,0.640707,0.63765,textual,"1053,538,358,2194,2018,2292,1612,1481,2060,501..."
KNeighborsClassifier,0.680255,0.644646,0.630992,visual,"886,606,174,441,697,897,193,1566,713,1496,1010..."
LogisticRegression,0.676223,0.650707,0.637595,visual,"992,843,1246,1235,83,309,1110,708,620,1535,143..."
BaggingClassifier,0.671854,0.649495,0.643444,visual,"911,1434,1185,1455,1271,1069,229,306,105,756,3..."
GradientBoostingClassifier,0.681809,0.655758,0.637713,visual,"1477,183,687,25,1345,246,1255,203,1458,21,1320..."


### LVW Feature selection on the classifiers of the paper
Since we have other classifiers as an output, I decided to also use the combinations stated in the paper, that should be also nice to compare them.

In [24]:

def run_LVW_Paper_Combinations(K):
    
    paper_combinations_modality = ['metadata', 'metadata', 'metadata','metadata', 'metadata',
                                   'metadata', 'metadata', 'metadata', 'metadata', 'textual',
                                   'textual', 'textual', 'visual', 'visual', 'visual', 'visual', 
                                   'visual', 'visual', 'visual', 'audio', 'audio']
    paper_combinations_classifier = ['KNeighborsClassifier', 'NearestCentroid', 'DecisionTreeClassifier','LogisticRegression', 
                                     'SVC', 'BaggingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 
                                     'GradientBoostingClassifier', 'GaussianNB', 'KNeighborsClassifier', 'SVC', 
                                     'KNeighborsClassifier', 'DecisionTreeClassifier', 'LogisticRegression', 'SVC', 
                                     'RandomForestClassifier', 'AdaBoostClassifier', 'GradientBoostingClassifier', 
                                     'LogisticRegression', 'GradientBoostingClassifier']

    paper_combination_score = pd.DataFrame()
    paper_combination_score["Modality"] = pd.Series(paper_combinations_modality)
    paper_combination_score["Classifier"] = paper_combinations_classifier
    paper_combination_score["Features"] = pd.Series()
    paper_combination_score.set_index(["Classifier"], inplace = True)

    df_final_results_paper = pd.DataFrame()
    df_x = pd.DataFrame()
    df_y = pd.DataFrame()


    random.seed(123)

    for index, row in paper_combination_score.iterrows():
        model = str2Class(index)
        print(getModelName(model) + " - " + row["Modality"])

        # get correct data frame
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
            df_y = df_y_meta
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
            df_y = df_y_visual

        elif row["Modality"] == "textual":
            df_x = df_X_text
            df_y = df_y_text
        else:
            df_x = df_X_audio
            df_y = df_y_audio

        # Calculate and append LVW metrics
        metrics = pd.DataFrame()
        S = []
        if getModelName(model) == "RandomForestClassifier":
            print("skipping random forest..")
            # evaluate random forest withot LVW since it already performs feature selection
            random.seed(123)
            m = calculate_metrics(model, df_x, df_y)
        else:
            # feature selection metrics
            random.seed(123)
            m, S = LVW(K, df_x, df_y, model, False)
        print(m)
        features = ','.join(map(str, S))
        print(features)
        metrics[getModelName(model)] = m
        metrics = metrics.T
        metrics["Modality"] = row["Modality"]
        metrics["Features"] = features

        df_final_results_paper = df_final_results_paper.append(metrics) 

    F1_paper = [0.630, 0.591, 0.563, 0.578, 0.574, 0.631, 0.576, 0.536, 0.569, 0.702, 0.666, 0.707,
                0.608, 0.535, 0.608, 0.580, 0.638, 0.654, 0.587, 0.546, 0.587]

    df_final_results_paper["F1 Paper"] = F1_paper
    df_final_results_paper["Difference"] =  df_final_results_paper["F1 Paper"] - df_final_results_paper["F1"]
    #display(df_final_results_paper)

    # save final data frame 
    df_final_results_paper.to_csv('./data/results_paper.csv', sep=';', encoding='utf-8')
    

In [25]:
# TODO: Uncomment if you want to rerun LVW!!!
# max number of runs for finding better LVW combinations -> better results, longer runtime with higher K   
# run_LVW_Paper_Combinations(200)

KNeighborsClassifier - metadata
precision    0.612211
recall       0.596667
F1           0.569224
dtype: float64
0,3
NearestCentroid - metadata
precision    0.546039
recall       0.580404
F1           0.550910
dtype: float64
3,0
DecisionTreeClassifier - metadata
precision    0.600597
recall       0.607576
F1           0.559265
dtype: float64
3
LogisticRegression - metadata
precision    0.580071
recall       0.605758
F1           0.562425
dtype: float64
0,3
SVC - metadata
precision    0.590208
recall       0.594444
F1           0.550604
dtype: float64
3
BaggingClassifier - metadata
precision    0.618274
recall       0.614646
F1           0.567548
dtype: float64
3
RandomForestClassifier - metadata
skipping random forest..
precision    0.503327
recall       0.503737
F1           0.486752
dtype: float64

AdaBoostClassifier - metadata
precision    0.603628
recall       0.616667
F1           0.564265
dtype: float64
3
GradientBoostingClassifier - metadata
precision    0.611508
recall       0.

precision    0.300554
recall       0.547980
F1           0.388116
dtype: float64
1390,445,5192,5674,6755,6152,7580,1178,4892,6566,7500,7723,7058,4282,946,4977,4451,6088,2950,4652,773,5235,2701,2038,2270,3885,2095,903,5039,1676,604,5758,6825,4936,4017,5408,2779,5329,7228,5314,4987,7087,4012,4756,836,2652,5888,2854,4434,370,4674,6128,291,5369,6237,1011,2925,1568,2179,6169,5579,545,4143,2405,7541,3445,7956,6471,869,1197,7832,245,6523,6942,5602,5622,5077,5390,7439,329,7497,5817,2729,1800,1384,5856,7656,3821,7954,524,5121,55,5084,6427,7849,867,1834,365,324,5152,3830,5355,4469,5457,1459,6637
KNeighborsClassifier - visual
precision    0.655459
recall       0.644646
F1           0.636077
dtype: float64
1555,906,1556,936,710,958,1165,692,874,713,1583,340,1236,1178,800,1434,1508,1495,1028,1166,360,1080,1288,1136,575,890,1534,1194,1466,576,1006,1572,1565,803,1359,1418,899,335,571,947,957,1297,1238,438,1084,1245,601,1367,938,642,365,1306,1173,616,672,1591,1518,1427,850,1140,413,1551,659,543,1149,9

precision    0.603013
recall       0.603434
F1           0.593673
dtype: float64
9,5
GradientBoostingClassifier - audio
precision    0.663218
recall       0.655960
F1           0.650645
dtype: float64
2,10,0


In [26]:
# Accessing saved data (saves time since recompiling takes long)
test_read_paper = pd.read_csv('./data/results_paper.csv', sep=';', encoding='utf-8', header = 0, index_col = 0)
test_read_paper

Unnamed: 0,precision,recall,F1,Modality,Features,F1 Paper,Difference
KNeighborsClassifier,0.612211,0.596667,0.569224,metadata,03,0.63,0.060776
NearestCentroid,0.546039,0.580404,0.55091,metadata,30,0.591,0.04009
DecisionTreeClassifier,0.600597,0.607576,0.559265,metadata,3,0.563,0.003735
LogisticRegression,0.580071,0.605758,0.562425,metadata,03,0.578,0.015575
SVC,0.590208,0.594444,0.550604,metadata,3,0.574,0.023396
BaggingClassifier,0.618274,0.614646,0.567548,metadata,3,0.631,0.063452
RandomForestClassifier,0.503327,0.503737,0.486752,metadata,,0.576,0.089248
AdaBoostClassifier,0.603628,0.616667,0.564265,metadata,3,0.536,-0.028265
GradientBoostingClassifier,0.611508,0.576162,0.554581,metadata,21,0.569,0.014419
GaussianNB,0.713846,0.697475,0.690642,textual,"4054,2048,4573,5987,2805,83,4922,7685,5572,788...",0.702,0.011358


Some, like Support Vector Machines with radial kernel for textual data differ a lot! (0.318884 worse!)

# 3.3 Classifier stacking

## Majority Voting
This is the simplest case, where we select classifiers and feature subspaces through the steps above, and assign final predicted labels through majority voting on the labels of the 21 classifiers.

First we prepare a data frame containing our predictions that we have obtained by cv, so that we do not need to build it again for classifier stacking.

In [27]:
predictions_df = pd.DataFrame()
for i in range(0, 21):
    predictions_df[i] = pd.Series([0]*df_X_text.shape[0])
predictions_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Majority voting (CV)

In [28]:
classifier_combinations = pd.read_csv('./data/results_paper.csv', sep=';', encoding='utf-8', 
                                      header = 0, index_col = 0)[["Modality", "Features"]]
df_y = df_y_text

kf = KFold(n_splits = 10, random_state = 123)
           
random.seed(123)

f1_scores_voting = [0]*10
recall = 0
precision = 0

k = 0
for train_index, test_index in kf.split(df_X_text):
    predicitons = [0] * len(test_index)  
    
    i = 0
    for index, row in classifier_combinations.iterrows():
        model = str2Class(index)       
        
        # get correct data frame
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
        elif row["Modality"] == "textual":
            df_x = df_X_text
        else:
            df_x = df_X_audio
        
        
        # Calculate and append LVW metrics
        if getModelName(model) == "RandomForestClassifier":
            # evaluate random forest without selected features since it already performs feature selection
            mod = model.fit(df_x.loc[train_index, :], df_y[train_index])
            pred = mod.predict(df_x.iloc[test_index, :])
            
        else:
            # get results with features from LVW        
            features = [int(i) for i in row["Features"].split(",")]
            df_x_features = df_x[df_x.columns[features]]
            
            # Predicitions
            mod = model.fit(df_x_features.loc[train_index, :], df_y[train_index])
            pred = mod.predict(df_x_features.iloc[test_index, :])    
        
        # Add results to final label data set
        for j, p in enumerate(pred):
            predictions_df.loc[j + test_index[0] , i] = p
            
        predicitons = predicitons + pred
        i = i + 1
            
    predictions_majority = pd.Series(predicitons).map(lambda x: 0 if x < classifier_combinations.shape[0]/2 else 1)
    
    # Save F1 scores for significance testing later on
    f1_scores_voting[k] = f1_score(df_y[test_index], predictions_majority)
    k = k + 1
    
    recall = recall + recall_score(df_y[test_index], predictions_majority)
    precision = precision + precision_score(df_y[test_index], predictions_majority)
    
    
    
    
print("Precision: " + str(precision/10))
print("Recall: " + str(recall/10))
print("F1 score: " + str(np.mean(f1_scores_voting)))

print("F1 scores by CV: " + str(f1_scores_voting))

Precision: 0.620436507936508
Recall: 0.7728571428571429
F1 score: 0.6804029304029304
F1 scores by CV: [0.7692307692307692, 0.6666666666666667, 0.7692307692307692, 0.7142857142857143, 0.7692307692307692, 0.8333333333333333, 0.6153846153846154, 0.8333333333333333, 0.8333333333333333, 0.0]


### Majority voting (test)
Here we fit the models for the training data, and obtain test performance also by using 10-fold CV, like mentioned in the paper.

We did the CV for the training data to check its robustness against different splits, because somehow it makes no sense to split the test data at all.

In [29]:
# create prediction data frame for label stacking test
predictions_test_df = pd.DataFrame()
for i in range(0, 21):
    predictions_test_df[i] = pd.Series([0]*df_X_text_test.shape[0])
predictions_test_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df_y_test = df_y_meta_test

f1_scores_voting_test = [0]*10
recall = 0
precision = 0

k = 0
# use k-fold cv for training data and apply it on test data
for train_index, test_index in kf.split(df_X_text):
    predicitons = [0] * df_y_test.shape[0] 
    
    i = 0
    for index, row in classifier_combinations.iterrows():
        model = str2Class(index)       
        
        # get correct data frames
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
            df_x_test = X_labelencoded_meta_test
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
            df_x_test = df_scaled_X_visual_test
        elif row["Modality"] == "textual":
            df_x = df_X_text
            df_x_test = df_X_text_test
        else:
            df_x = df_X_audio
            df_x_test = df_X_audio_test
        
        
        # Calculate and append LVW metrics
        if getModelName(model) == "RandomForestClassifier":
            # evaluate random forest without selected features since it already performs feature selection
            # fit on training data
            mod = model.fit(df_x.loc[train_index, :], df_y[train_index])
            # evaluation for test data
            pred = mod.predict(df_x_test)
            
        else:
            # get results with features from LVW        
            features = [int(i) for i in row["Features"].split(",")]
            df_x_features = df_x[df_x.columns[features]]
            
            # Fit model for CV of train data
            mod = model.fit(df_x_features.loc[train_index, :], df_y[train_index])
            df_x_features_test = df_x_test[df_x_test.columns[features]]
            
            # Predicitions for test data
            pred = mod.predict(df_x_features_test) 
            
        # Add results to final label data set
        for j, p in enumerate(pred):
            predictions_test_df.loc[j , i] = p
            
        predicitons = predicitons + pred
        i = i + 1

    predictions_majority = pd.Series(predicitons).map(lambda x: 0 if x < classifier_combinations.shape[0]/2 else 1)
    f1_scores_voting_test[k] = f1_score(df_y_test, predictions_majority)
    k = k + 1

    recall = recall + recall_score(df_y_test, predictions_majority)
    precision = precision + precision_score(df_y_test, predictions_majority)
    
print("Precision: " + str(precision/10))
print("Recall: " + str(recall/10))
print("F1 score: " + str(np.mean(f1_scores_voting_test)))
print("F1 scores by CV: " + str(f1_scores_voting_test))

Precision: 0.6048082086287696
Recall: 0.5740740740740741
F1 score: 0.5773097100247325
F1 scores by CV: [0.5130434782608696, 0.4444444444444445, 0.4495412844036698, 0.718266253869969, 0.5977859778597786, 0.5172413793103449, 0.5609756097560976, 0.6159420289855072, 0.6433566433566432, 0.7125]


## Label Stacking
Assume we have n instances and T base classifiers, then we can generate an n by T matrix consisting of predictions (labels) given by each classifier. Label combining strategy tries to build a second-level classifier based on this label matrix, and return a final prediction result for that.

### Label Stacking (cv)
Here we use the prediction data frame obtained by the predictions already done in the previous section. The best one is obtained by AdaBoost as a meta learner, with an F1 score of 0.721267.

In [31]:
def calculate_metrics_extended(clf, X, y):
    metric =  cross_validate(clf, X, y, scoring=('precision_weighted','recall_weighted','f1_weighted'), 
                             return_train_score=False, cv=10)  
    return pd.Series({'precision':metric['test_precision_weighted'].mean(),'recall':metric['test_recall_weighted'].mean(),
                      'F1':metric['test_f1_weighted'].mean(),
                     'F1 scores': metric['test_f1_weighted']})


def evaluate_models_extended(X, y):
    metrics = pd.DataFrame()

    for model in model_list:
        m = calculate_metrics_extended(model, X, y)
        metrics[getModelName(model)] = m

    return metrics.T

In [32]:
display(predictions_df.head(2))

label_stack = evaluate_models_extended(predictions_df, df_y)
label_stack[["precision", "recall", "F1"]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1,1,1,1,0,1,1,1,0,1,...,1,0,1,0,0,0,0,1,1,1
1,0,1,1,1,1,1,1,1,0,1,...,1,0,0,0,0,0,0,0,0,0


Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.638801,0.657071,0.630147
NearestCentroid,0.673947,0.678384,0.665951
DecisionTreeClassifier,0.561979,0.56,0.53779
LogisticRegression,0.631283,0.608586,0.584848
SVC,0.630833,0.629697,0.604782
BaggingClassifier,0.592456,0.575253,0.566989
AdaBoostClassifier,0.592037,0.566364,0.550568
GradientBoostingClassifier,0.610649,0.599495,0.58773
RandomForestClassifier,0.548877,0.527778,0.513845
GaussianNB,0.59377,0.610808,0.598202


### Label Stacking (test)
We obtained the best results with the KNeighborsClassifier, with an F1 score of 0.546670.

In [33]:
display(predictions_test_df.head(2))


label_stack_scores = pd.DataFrame()
label_stack_scores["Classifier"] = pd.Series()
label_stack_scores["Precision"] = pd.Series()
label_stack_scores["Recall"] = pd.Series()
label_stack_scores["F1"] = pd.Series()
label_stack_scores["F1 scores"] = pd.Series()

for model in model_list:
    f1 = [0]*10
    recall = [0]*10
    precision = [0]*10
    i = 0
    for train_index, test_index in kf.split(predictions_df):
        # Fit model for CV of train data
        mod = model.fit(predictions_df.loc[train_index, :], df_y[train_index])
        # Predicitions for test data
        pred = mod.predict(predictions_test_df)
        
        f1[i] = f1_score(df_y_test, pred)
        recall[i] = recall_score(df_y_test, pred)
        precision[i] = precision_score(df_y_test, pred)
        i = i + 1
    
    label_stack_scores = label_stack_scores.append(pd.Series({"Classifier": getModelName(model),
                                        "Precision": np.mean(precision),
                                        "Recall": np.mean(recall),
                                        "F1": np.mean(f1),
                                        "F1 scores": f1}), 
                              ignore_index = True)
    
label_stack_scores[["Classifier", "Precision", "Recall", "F1"]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1,0,1,0,1,1,1,1,1,0,...,1,1,0,1,1,1,1,1,1,1
1,1,1,0,1,1,0,1,0,1,1,...,1,1,0,1,1,0,1,1,1,1


Unnamed: 0,Classifier,Precision,Recall,F1
0,KNeighborsClassifier,0.603455,0.486667,0.537146
1,NearestCentroid,0.590902,0.30963,0.403388
2,DecisionTreeClassifier,0.59343,0.58963,0.586393
3,LogisticRegression,0.585682,0.347407,0.422823
4,SVC,0.590478,0.392593,0.462589
5,BaggingClassifier,0.59138,0.292593,0.383436
6,AdaBoostClassifier,0.535322,0.188889,0.267023
7,GradientBoostingClassifier,0.596133,0.368889,0.44916
8,RandomForestClassifier,0.609943,0.285185,0.383515
9,GaussianNB,0.504913,0.125926,0.197846


## Label-Attribute Stacking
Similar to label stacking, label-feature stacking strategy uses both base-classifier predictions and features as training data to predict output.

For that reason, we merge the prediction labels obtained by the test data and merge them with all features.

In [34]:
# Adapt training data
new_cols = [0]*predictions_df.columns.size
for i, val in enumerate(predictions_df.columns):
    new_cols[i] = "l"+str(i)
predictions_df.columns = new_cols

label_feature_train = predictions_df.join(df_X_text) \
                                        .join(X_labelencoded_meta, rsuffix='_meta') \
                                        .join(df_X_audio) \
                                        .join(df_scaled_X_visual, rsuffix='vis') \

display(label_feature_train.head(2))


# Adapt test data
new_cols = [0]*predictions_test_df.columns.size
for i, val in enumerate(predictions_test_df.columns):
    new_cols[i] = "l"+str(i)
predictions_test_df.columns = new_cols

label_feature_test = predictions_test_df.join(df_X_text_test) \
                                        .join(X_labelencoded_meta_test, rsuffix='_meta') \
                                        .join(df_X_audio_test) \
                                        .join(df_scaled_X_visual_test, rsuffix='vis') \

display(label_feature_test.head(2))

Unnamed: 0,l0,l1,l2,l3,l4,l5,l6,l7,l8,l9,...,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651
0,1,1,1,1,0,1,1,1,0,1,...,0.346819,-0.256042,-0.326896,0.043705,-0.195445,0.287132,-0.321261,-0.013402,-0.304257,0.021692
1,0,1,1,1,1,1,1,1,0,1,...,2.576495,0.951241,-0.350476,0.052596,-0.301344,0.039139,-0.277715,0.14595,-0.314509,0.030172


Unnamed: 0,l0,l1,l2,l3,l4,l5,l6,l7,l8,l9,...,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651
0,1,0,1,0,1,1,1,1,1,0,...,1.195796,0.775823,-0.884788,-0.645061,-0.792237,-0.530113,-0.824228,-0.5463,-0.769062,-0.508099
1,1,1,0,1,1,0,1,0,1,1,...,0.517933,0.114551,-0.962248,-0.723853,-0.97322,-0.711025,-1.007682,-0.723235,-0.962423,-0.702882


### Label Attribute Stacking (CV)

In [35]:
label_feature_stack_cv = evaluate_models_extended(label_feature_train, df_y)
label_feature_stack_cv[["precision", "recall", "F1"]]

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.621403,0.630606,0.610811
NearestCentroid,0.532706,0.503737,0.483416
DecisionTreeClassifier,0.46595,0.460202,0.452873
LogisticRegression,0.606014,0.603333,0.595539
SVC,0.37006,0.482222,0.390764
BaggingClassifier,0.526562,0.534646,0.513684
AdaBoostClassifier,0.506339,0.505354,0.493656
GradientBoostingClassifier,0.526316,0.517576,0.495106
RandomForestClassifier,0.590955,0.587071,0.5782
GaussianNB,0.537073,0.558182,0.538881


### Label Attribute Stacking (Test)

In [36]:
label_feature_stack_scores = pd.DataFrame()
label_feature_stack_scores["Classifier"] = pd.Series()
label_feature_stack_scores["Precision"] = pd.Series()
label_feature_stack_scores["Recall"] = pd.Series()
label_feature_stack_scores["F1"] = pd.Series()
label_feature_stack_scores["F1 scores"] = pd.Series()

for model in model_list:
    f1 = [0]*10
    recall = [0]*10
    precision = [0]*10
    i = 0
    for train_index, test_index in kf.split(label_feature_train):
        # Fit model for CV of train data
        mod = model.fit(label_feature_train.loc[train_index, :], df_y[train_index])
        # Predicitions for test data
        pred = mod.predict(label_feature_test)
        
        f1[i] = f1_score(df_y_test, pred)
        recall[i] = recall_score(df_y_test, pred)
        precision[i] = precision_score(df_y_test, pred)
        i = i + 1
    
    label_feature_stack_scores = label_feature_stack_scores.append(pd.Series({"Classifier": getModelName(model),
                                        "Precision": np.mean(precision),
                                        "Recall": np.mean(recall),
                                        "F1": np.mean(f1),
                                        "F1 scores": f1}), 
                              ignore_index = True)
    
label_feature_stack_scores[["Classifier", "Precision", "Recall", "F1"]]

Unnamed: 0,Classifier,Precision,Recall,F1
0,KNeighborsClassifier,0.605097,0.651852,0.626941
1,NearestCentroid,0.615027,0.41037,0.486876
2,DecisionTreeClassifier,0.624244,0.555556,0.567829
3,LogisticRegression,0.608511,0.626667,0.608641
4,SVC,0.618358,0.838519,0.710022
5,BaggingClassifier,0.624924,0.466667,0.527745
6,AdaBoostClassifier,0.607032,0.611852,0.590236
7,GradientBoostingClassifier,0.620184,0.62963,0.620525
8,RandomForestClassifier,0.591186,0.493333,0.533179
9,GaussianNB,0.57623,0.645926,0.608811


# Significance Testing

## Testing for results obtained with Feature Selection
Here we made tests for alpha = 0.05 on combinations from the paper, the baseline is the F1 score predicting always the most frequent class  (around 0.7!).

We take a look on the cross validation settings of the results achieved by feature results, and compare our results to the baseline. According to the paper, the majority class baseline  is 0.5 (precision, recall and F1-score). We decided to calculate it on our own because it differs a lot from the actual F1 if predicting always the majority class.

The selected features from the task before are obtained to identify statistical significance.

We can assume that the F1 scores are normally distributed because of the CLT, because of that it is suitable to apply t tests. We take a the basline F1 score for the population of the data, and calculate our mean and standard deviation of F1 score out of the cross validation results.


In [37]:
alpha = 0.05

def calculate_F1_scores_cv(clf, X, y):
    metric =  cross_validate(clf, X, y, scoring=('f1_weighted'), return_train_score = False, cv = 10)  
    return metric['test_score']


def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + confidence) / 2., n-1)
    return m-h, m+h

In [38]:
classifier_combinations = pd.read_csv('./data/results_paper.csv', sep=';', encoding='utf-8', 
                                      header = 0)
classifier_combinations = classifier_combinations.rename(index=str, columns={"Unnamed: 0" : "Classifier"})

# Calculate Baseline
baseline =  (df_labled_movies[df_labled_movies['goodforairplane'] == 1]['goodforairplane'].count() /  
             df_labled_movies['goodforairplane'].count())
baseline_f1 = f1_score(([1] if baseline > 0.5 else [0]) * len(df_labled_movies['goodforairplane']), 
                       df_labled_movies['goodforairplane'])
print("Baseline: " + str(baseline_f1))


####################################################################
# Calculations for variances, confidence intervals, T-Test score and p-values
variances = [0] * classifier_combinations.shape[0]
conf = [""] * classifier_combinations.shape[0]
t_scores = [0] * classifier_combinations.shape[0]
p_vals = [0] * classifier_combinations.shape[0]
h0_baseline = [""] * classifier_combinations.shape[0]

t_scores_paper = [0] * classifier_combinations.shape[0]
p_vals_paper = [0] * classifier_combinations.shape[0]
h0_paper = [""] * classifier_combinations.shape[0]


# Go through all classifier modality combinations
i = 0
for index, row in classifier_combinations.iterrows():
    model = str2Class(row["Classifier"])

    # get correct data frame
    if row["Modality"] == "metadata":
        df_x = X_labelencoded_meta
    elif row["Modality"] == "visual":
        df_x = df_scaled_X_visual
    elif row["Modality"] == "textual":
        df_x = df_X_text
    else:
        df_x = df_X_audio

    # Calculate and append LVW metrics
    if getModelName(model) == "RandomForestClassifier":
        # evaluate random forest without selected features since it already performs feature selection
        metric = calculate_F1_scores_cv(model, df_x, df_y)
    else:
        # get results with features from LVW        
        features = [int(i) for i in row["Features"].split(",")]
        df_x_features = df_x[df_x.columns[features]]
        metric = calculate_F1_scores_cv(model, df_x_features, df_y)

    #########################################
    # Baseline tests (Mean = Baseline)
    
    # Calculate variance
    variances[i] = np.var(metric)
    
    # Confidence interval
    low, up = mean_confidence_interval(metric)
    conf[i] = "[" + str('{:.3f}'.format(low)) + ", " + str('{:.3f}'.format(up)) + "]"
    # Calculate one sample t-test score and p-value
    stat = stats.ttest_1samp(metric, baseline_f1)
    t_scores[i] = stat[0]
    p_vals[i] = stat[1]
    
    # Check H0 for baseline with two-sided test H0 = H1
    h0_baseline[i] = "reject" if  (p_vals[i] <= alpha) else "keep"
                                #or (p_vals[i]/2 < alpha and t_scores[i] > 0) \
   
    
    #########################################
    # Paper distribution tests

    # Calculate one sample t-test score and p-value
    stat = stats.ttest_1samp(metric, row["F1 Paper"])
    t_scores_paper[i] = stat[0]
    p_vals_paper[i] = stat[1]
    
    # Check H0 for baseline, two sided test
    h0_paper[i] = "reject" if  (p_vals[i] <= alpha) else "keep"
    
    i = i + 1
           
classifier_combinations["Variance"] = pd.Series(variances, index=classifier_combinations.index)
classifier_combinations["95% CI"] = pd.Series(conf, index=classifier_combinations.index)
classifier_combinations["T Score Baseline"] = pd.Series(t_scores, index=classifier_combinations.index)
classifier_combinations["p-value Baseline"] = pd.Series(p_vals, index=classifier_combinations.index)
classifier_combinations["H0 Baseline"] = pd.Series(h0_baseline, index=classifier_combinations.index)

classifier_combinations["T Score Paper"] = pd.Series(t_scores_paper, index=classifier_combinations.index)
classifier_combinations["p-value Paper"] = pd.Series(p_vals_paper, index=classifier_combinations.index)
classifier_combinations["H0 Paper"] = pd.Series(h0_paper, index=classifier_combinations.index)


Baseline: 0.707482993197279


In [39]:
# Baseline Results
display(classifier_combinations[["Classifier", "Modality", "F1", "Variance", "95% CI",
                                 "T Score Baseline", "p-value Baseline", "H0 Baseline"]])

Unnamed: 0,Classifier,Modality,F1,Variance,95% CI,T Score Baseline,p-value Baseline,H0 Baseline
0,KNeighborsClassifier,metadata,0.569224,0.034438,"[0.429, 0.709]",-2.235102,0.05225962,keep
1,NearestCentroid,metadata,0.55091,0.022044,"[0.439, 0.663]",-3.163697,0.01148175,reject
2,DecisionTreeClassifier,metadata,0.559265,0.021626,"[0.448, 0.670]",-3.023663,0.01439365,reject
3,LogisticRegression,metadata,0.562425,0.025923,"[0.441, 0.684]",-2.702841,0.02428047,reject
4,SVC,metadata,0.550604,0.009319,"[0.478, 0.623]",-4.875252,0.0008771209,reject
5,BaggingClassifier,metadata,0.567548,0.010989,"[0.489, 0.647]",-4.004752,0.003088216,reject
6,RandomForestClassifier,metadata,0.486752,0.023635,"[0.371, 0.603]",-4.307302,0.001969458,reject
7,AdaBoostClassifier,metadata,0.564265,0.019168,"[0.460, 0.669]",-3.103387,0.01265312,reject
8,GradientBoostingClassifier,metadata,0.554581,0.008078,"[0.487, 0.622]",-5.103551,0.0006420666,reject
9,GaussianNB,textual,0.690642,0.021861,"[0.579, 0.802]",-0.341707,0.7404143,keep


In [40]:
# Paper Results
display(classifier_combinations[["Classifier", "Modality", "F1","F1 Paper", "Variance", "95% CI",
                                 "T Score Paper", "p-value Paper", "H0 Paper"]])

Unnamed: 0,Classifier,Modality,F1,F1 Paper,Variance,95% CI,T Score Paper,p-value Paper,H0 Paper
0,KNeighborsClassifier,metadata,0.569224,0.63,0.034438,"[0.429, 0.709]",-0.982507,0.351523,keep
1,NearestCentroid,metadata,0.55091,0.591,0.022044,"[0.439, 0.663]",-0.810048,0.4388048,reject
2,DecisionTreeClassifier,metadata,0.559265,0.563,0.021626,"[0.448, 0.670]",-0.076198,0.9409284,reject
3,LogisticRegression,metadata,0.562425,0.578,0.025923,"[0.441, 0.684]",-0.29021,0.7782315,reject
4,SVC,metadata,0.550604,0.574,0.009319,"[0.478, 0.623]",-0.727056,0.4856726,reject
5,BaggingClassifier,metadata,0.567548,0.631,0.010989,"[0.489, 0.647]",-1.815917,0.1027666,reject
6,RandomForestClassifier,metadata,0.486752,0.576,0.023635,"[0.371, 0.603]",-1.741571,0.1155655,reject
7,AdaBoostClassifier,metadata,0.564265,0.536,0.019168,"[0.460, 0.669]",0.612469,0.5553846,reject
8,GradientBoostingClassifier,metadata,0.554581,0.569,0.008078,"[0.487, 0.622]",-0.481266,0.6418126,reject
9,GaussianNB,textual,0.690642,0.702,0.021861,"[0.579, 0.802]",-0.230455,0.822891,keep


## Significance testing for Classifier stacking
We should also compare them to the baseline, maybe also to the results using just a single classifier. This can of course also be done directly in the code above, maybe saves some work.

TODO: The baselines for cv and test would differ a bit, we could merge the data to get an F1 for the population.

In [41]:
baseline_test =  (df_labled_movies_test[df_labled_movies_test['goodforairplane'] == 1]['goodforairplane'].count() /  
             df_labled_movies_test['goodforairplane'].count())
baseline_f1_test = f1_score(([1] if baseline_test > 0.5 else [0]) * len(df_labled_movies_test['goodforairplane']), 
                       df_labled_movies_test['goodforairplane'])
print("Baseline: " + str(baseline_f1_test))


# Majority voting on train set
f1_scores_voting


#  Majority Voting on test set
f1_scores_voting_test


###
# I would just take the best meta learner from the stack, so just one row (like in the paper)

# Label stack on train set
label_stack["F1 scores"]


# label stack train
label_stack_scores["F1 scores"]


# Label-Feature-Stack
label_feature_stack_cv["F1 scores"]
label_feature_stack_scores["F1 scores"]

Baseline: 0.7541899441340781


0    [0.5947955390334572, 0.6204379562043796, 0.621...
1    [0.5112107623318386, 0.38862559241706157, 0.50...
2    [0.3473684210526316, 0.6506849315068493, 0.545...
3    [0.6482758620689656, 0.5166666666666666, 0.497...
4    [0.6757679180887372, 0.6598639455782314, 0.673...
5    [0.358974358974359, 0.5, 0.48113207547169806, ...
6    [0.6077738515901061, 0.5692883895131087, 0.637...
7    [0.5525291828793774, 0.6492537313432837, 0.558...
8    [0.625, 0.48068669527896996, 0.534979423868312...
9    [0.613240418118467, 0.5894736842105264, 0.5776...
Name: F1 scores, dtype: object