In [1]:
import sklearn 
import pandas as pd
import xml.etree.ElementTree as ET
import random
import sys
import numpy as np

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from math import sqrt
from scipy.stats import t
from scipy import stats


__First of all they did not mentioned what sklearn version they used!!!__


As described in the paper, the first step is to select base classifiers. 
The selected base classifiers are trained with default parameter settings with 10-fold cross-validation.
As input data, the training data set and its ground truth labels, per single modality is used.
For the audio MFCC features, we set NaN values to 0, and calculate the average of each MFCC coefficient over all frames.

# Load input data


# Description:

## Available Data
There are variouse csv files and data files available. It's very messy. 
There is one file called "CoE_dataset_offical_release.zip"! 
We extract this file and use this data included their for now! 

## Meta Data
In the original paper there is no information given what is included in the metadata. 
Looking at the paper describing the data set (Right Inflight? A Dataset for Exploring the Automatic Prediction of Movies Suitable for a Watching Situation
) we found out that as metadata they used language, year published, genre, country, runtime and age rating. We assume, since the author of our paper didn't say otherwise, that they used the same metadata. 

## User Rating
User rating is just mentioned in Table 1 and the dataset paper. It is not mentioned again in the CoE paper, althought user rating is important for good performance as mentioned in the dataset paper. 

__I would assume that the CoE paper just used user rating in the "metadata", since it's saved in the same file"__

## Visual Data: 
The visual data is provied as a csv file for each movie, containing two rows. According to the paper of the dataset they calculated following visual features, Histogram of Oriented Gradients (HOG) gray, Color Moments, local binary patterns (LBP) and Gray Level Run Length Matrix, but don't say how the csv file represents them. Also as mentioned the csv file just has two rows which would not ad up to the mentioned 4 visual features. __We are treating all values as seperate column!__

## Audio Data: 
Audio features is also provided per movie as a csv file. Each audio feature consits of 12 coefficients for multiple frames.

## Textual Data
The textual data is just one file containing the tdf-idf matrix. The first line are the row names for each word. 
While the columns are the associated movie. __There is no indication to which movie each column belongs! Thus we need to assume this!__

__For now we assume the order is the same as in the df_labled_movies dataframe!!!__



In [2]:

df_labled_movies = pd.read_csv("./data/CoE_dataset/Dev_set/dev_set_groundtruth_and_trailers.csv", sep=';')
del df_labled_movies['trailer']
df_labled_movies = df_labled_movies[['movie','filename', 'goodforairplane']]
display(df_labled_movies.head(5))


###############################################################
### Load Meta Data ###

def load_meta_data( filenames ): 
    
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/XML/{file}.xml'
        with open(file_path) as f: 
            tree = ET.parse(f)
            movie = tree.find('movie')
            
            lang = movie.get('language')
            year = movie.get('year')
            genre = movie.get('genre')
            country = movie.get('country')
            runtime = movie.get('runtime')
            age_rating = movie.get('rated')
             
            raw_data.append( (file,lang,year,genre,country,runtime,age_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','language','year','genre','country','runtime','rated'])

df_meta_data = load_meta_data( df_labled_movies['filename']  )
#pd.merge(df_labled_movies.drop(['movie'],axis=1),df_meta_data, on='filename').to_csv('data/meta_data_exported.csv',sep=";",index = False)

display(df_meta_data.head(5))
#display(df_meta_data.dtypes)

###############################################################
### Load User Rating Data ###

def load_user_rating_data( filenames ): 
    
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/XML/{file}.xml'
        with open(file_path) as f: 
            tree = ET.parse(f)
            movie = tree.find('movie')
            
            tomato_rating = movie.get('tomatoUserRating')
            imbd_rating = movie.get('imdbRating')
            meta_rating = movie.get('metascore')
            if tomato_rating == "N/A":
                tomato_rating = np.nan
            if imbd_rating == "N/A":
                imbd_rating = np.nan
            if meta_rating == "N/A":
                meta_rating = np.nan
             
            raw_data.append( (file,tomato_rating,imbd_rating,meta_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','tomatorating','imbdrating','metarating'])


df_user_rating_data = load_user_rating_data( df_labled_movies['filename']  )
df_user_rating_data['tomatorating'] = pd.to_numeric(df_user_rating_data['tomatorating'])
df_user_rating_data['imbdrating'] = pd.to_numeric(df_user_rating_data['imbdrating'])
df_user_rating_data['metarating'] = pd.to_numeric(df_user_rating_data['metarating'])
#pd.merge(df_labled_movies.drop(['movie'],axis=1),df_user_rating_data, on='filename').to_csv('data/user_rating_data_exported.csv',sep=";",index = False)
df_user_rating_data.fillna(df_user_rating_data.mean(),inplace=True)

display(df_user_rating_data.head(5))
#display(df_user_rating_data.dtypes)


###############################################################
### Load meta data with user rating  ###
def load_meta_extended_data( filenames ): 
    
     
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/XML/{file}.xml'
        with open(file_path) as f: 
            tree = ET.parse(f)
            movie = tree.find('movie')
            
            lang = movie.get('language')
            year = movie.get('year')
            genre = movie.get('genre')
            country = movie.get('country')
            runtime = movie.get('runtime')
            age_rating = movie.get('rated')
            tomato_rating = movie.get('tomatoUserRating')
            imbd_rating = movie.get('imdbRating')
            meta_rating = movie.get('metascore')
            
            if tomato_rating == "N/A":
                tomato_rating = np.nan
            if imbd_rating == "N/A":
                imbd_rating = np.nan
            if meta_rating == "N/A":
                meta_rating = np.nan
            raw_data.append( (file,lang,year,genre,country,runtime,age_rating,tomato_rating,imbd_rating,meta_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','language','year','genre','country','runtime','rated','tomatorating','imbdrating','metarating'])


df_meta_extended_data = load_meta_extended_data( df_labled_movies['filename']  )
df_meta_extended_data['tomatorating'] = pd.to_numeric(df_meta_extended_data['tomatorating'])
df_meta_extended_data['imbdrating'] = pd.to_numeric(df_meta_extended_data['imbdrating'])
df_meta_extended_data['metarating'] = pd.to_numeric(df_meta_extended_data['metarating'])
df_meta_extended_data.fillna(df_meta_extended_data.mean(),inplace=True)

display(df_meta_extended_data.head(5))
#display(df_user_rating_data.dtypes)

###############################################################
### Load Visual Data ###

def load_visual_data( filenames ):
    data_list = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/vis_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None)
        data_list.append(df_data)
        
    return pd.concat(data_list, axis = 0, keys = filenames,names=('filename','vis_data'),  sort=False)

df_visual_data = load_visual_data( df_labled_movies['filename']  )

display(df_visual_data.head(5))


###############################################################
### Load Audio Data ###


def load_audio_data( filenames ):
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/audio_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None).T
        
        # preprocess data
        df_data = df_data.fillna(0)
        df_data = pd.DataFrame(df_data.mean(axis = 0)).T
        df_data["filename"] = file
        audio_data = audio_data.append(df_data)
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data = load_audio_data( df_labled_movies['filename']  )
#pd.merge(df_labled_movies.drop(['movie'],axis=1),df_audio_data, on='filename').to_csv('data/audio_data_exported.csv',sep=";",index = False)

display(df_audio_data.head(5))


###############################################################
### Load textual Data ###

def load_text_data(filenames):
    

    data_list = []
    file_path = f'./data/CoE_dataset/Dev_set/text_descriptors/tdf_idf_dev.csv'
    #somehow pandas can not really handle that the first line is row names.(at least I didn't find a better way) 
    # thus we do it a little complicated here
    header_index = pd.read_csv(file_path, index_col=0,nrows=1 ).reset_index().columns
    df_data = pd.read_csv(file_path, header=None, index_col=False,skiprows=1)
    df_data.set_index(header_index, inplace=True)
    df_data.columns = filenames
    return df_data.T #row are should be represented by movie names

df_text_data = load_text_data(df_labled_movies['filename'] )
display(df_text_data.head(5))



Unnamed: 0,movie,filename,goodforairplane
0,Seventh Son,Seventh_Son,1
1,Welcome to Me,Welcome_to_Me,0
2,The Judge,The_Judge,0
3,Transformers Age of Extinction,Transformers__Age_of_Extinction,0
4,The Normal Heart,The_Normal_Heart,1


Unnamed: 0,filename,language,year,genre,country,runtime,rated
0,Seventh_Son,English,2014,"Action, Adventure, Fantasy","USA, UK, Canada, China",102 min,PG-13
1,Welcome_to_Me,English,2014,"Comedy, Drama",USA,105 min,R
2,The_Judge,English,2014,Drama,USA,141 min,R
3,Transformers__Age_of_Extinction,English,2014,"Action, Adventure, Sci-Fi","USA, China",165 min,PG-13
4,The_Normal_Heart,English,2014,Drama,USA,132 min,TV-MA


Unnamed: 0,filename,tomatorating,imbdrating,metarating
0,Seventh_Son,2.9,5.5,30.0
1,Welcome_to_Me,3.4,6.2,67.0
2,The_Judge,3.8,7.5,48.0
3,Transformers__Age_of_Extinction,3.3,5.8,32.0
4,The_Normal_Heart,4.2,8.0,64.093333


Unnamed: 0,filename,language,year,genre,country,runtime,rated,tomatorating,imbdrating,metarating
0,Seventh_Son,English,2014,"Action, Adventure, Fantasy","USA, UK, Canada, China",102 min,PG-13,2.9,5.5,30.0
1,Welcome_to_Me,English,2014,"Comedy, Drama",USA,105 min,R,3.4,6.2,67.0
2,The_Judge,English,2014,Drama,USA,141 min,R,3.8,7.5,48.0
3,Transformers__Age_of_Extinction,English,2014,"Action, Adventure, Sci-Fi","USA, China",165 min,PG-13,3.3,5.8,32.0
4,The_Normal_Heart,English,2014,Drama,USA,132 min,TV-MA,4.2,8.0,64.093333


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,816,817,818,819,820,821,822,823,824,825
filename,vis_data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Seventh_Son,0,0.047044,0.11619,0.13633,0.066194,0.072554,0.17267,0.21519,0.070574,0.071423,0.14938,...,731.69,502.01,1.897,2.2788,2.1412,2.9504,91672.0,22207.0,26201.0,14542.0
Seventh_Son,1,0.056526,0.12516,0.14628,0.082497,0.079331,0.17538,0.21839,0.093521,0.074837,0.15025,...,689.95,474.97,2.2676,2.5887,2.4022,3.2167,81373.0,21045.0,24225.0,13529.0
Welcome_to_Me,0,0.30717,0.33422,0.33112,0.33124,0.31114,0.33644,0.33616,0.34479,0.16983,0.27379,...,394.34,167.91,20.337,21.276,18.527,21.189,81665.0,13672.0,32531.0,13753.0
Welcome_to_Me,1,0.30466,0.33193,0.33124,0.33138,0.30788,0.3327,0.33357,0.34305,0.1733,0.28076,...,397.26,168.23,20.426,21.3,18.608,21.182,83171.0,13714.0,32774.0,13780.0
The_Judge,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,230400.0,119950.0,1e-06,0.002466,4e-06,0.002466,729320.0,119950.0,230400.0,119950.0


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Seventh_Son,67.562481,-4.5252,1.6463,-0.597742,1.36281,-1.361579,0.170381,-0.500409,-0.050339,-0.269793,-0.232489,-0.197553,0.13264
Welcome_to_Me,61.548885,-7.146781,-1.103407,-1.58802,0.249743,0.003055,-2.313552,0.371521,0.516853,-1.405396,-0.951247,1.316795,-0.095459
The_Judge,65.038918,-4.171344,-0.455058,-0.094249,-0.365649,-0.182148,0.700715,-0.195335,-0.204333,-1.281841,0.301911,-0.197494,0.861993
Transformers__Age_of_Extinction,64.544291,-3.661545,-0.010532,-0.802876,-0.614974,-0.255984,0.810787,0.465672,0.271618,-0.691701,0.16124,0.310825,0.21462
The_Normal_Heart,60.433903,0.148386,1.713255,-0.203955,-1.187262,-2.310341,-1.726492,-0.512949,0.270257,-0.098537,0.705479,-0.059131,0.247545


Unnamed: 0_level_0,24000,baby,baseball,big,doc,escort,frozen,heroes,high,huck,...,years.1,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Seventh_Son,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Welcome_to_Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The_Judge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Transformers__Age_of_Extinction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The_Normal_Heart,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.051657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Preprocess Data

## Description 

Beside a short description for the audio data there is no more information on how to handle the other data. For example the runtime currently is not handles as a number but as a string(object)
Since sklearn mostly expects numerical inputs, we need to encode the data. 

For different class normally you would use one-hot-encoding, but since it's not specified let's try first the easiest approach which is Labelencoding.


### Audio Data: 
As mentiones in the paper, NaN values of the audio data are set to 0 and the average of each MFCC coefficient is calculated over all frames.

This is already treated in the section before.



In [3]:

    
def pre_process_visual_data():
    #create columns of the two rows belonging to each movie
    df_data = df_visual_data.unstack()
    return df_data
    
def pre_process_meta_data(df_meta):
    #create columns of the two rows belonging to each movie
    df_data = df_meta.drop(['genre','country','language'],axis=1)
    df_data_genre = df_meta['genre'].apply( lambda x: x.replace(" ", "") ).str.get_dummies(sep=",")
    df_data_genre.columns = ['genre_'+str(x) for x in df_data_genre.columns]
    
    df_data_country = df_meta_data['country'].apply( lambda x: x.replace(" ", "") ).str.get_dummies(sep=",")
    df_data_country.columns = ['country_'+str(x) for x in df_data_country.columns]
    
    df_data_language = df_meta_data['language'].apply( lambda x: x.replace(" ", "") ).str.get_dummies(sep=",")
    df_data_language.columns = ['language_'+str(x) for x in df_data_language.columns]

    df_data = pd.concat([df_data,df_data_genre,df_data_country,df_data_language], axis=1)

    df_data['runtime'] = pd.to_numeric(df_data['runtime'].apply(lambda x: int(x.split(' ')[0]) if x != 'N/A' else 0))
    df_data['year'] =  df_data['year'].apply(pd.to_numeric)

    return df_data

df_meta_data_processed = pre_process_meta_data(df_meta_data)
display(df_meta_data_processed.head(5))

#pd.merge(df_labled_movies.drop(['movie'],axis=1),df_meta_data_processed, on='filename').to_csv('data/meta_data_processed_exported.csv',sep=";",index = False)
df_meta_extended_data_processed = pre_process_meta_data(df_meta_extended_data)
display(df_meta_extended_data_processed.head(5))



df_audio_data_processed = df_audio_data #pre_process_audio_data()
display(df_audio_data_processed.head(5))

df_visual_data_processed = pre_process_visual_data()
#pd.merge(df_labled_movies.drop(['movie'],axis=1),df_visual_data_processed, on='filename').to_csv('data/visual_data_exported.csv',sep=";",index = False)

display(df_visual_data_processed.head(5))

Unnamed: 0,filename,year,runtime,rated,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,...,language_Latin,language_Mandarin,language_Navajo,language_Russian,language_Serbian,language_Spanish,language_Swahili,language_Urdu,language_Vietnamese,language_Yiddish
0,Seventh_Son,2014,102,PG-13,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Welcome_to_Me,2014,105,R,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,The_Judge,2014,141,R,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Transformers__Age_of_Extinction,2014,165,PG-13,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The_Normal_Heart,2014,132,TV-MA,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,filename,year,runtime,rated,tomatorating,imbdrating,metarating,genre_Action,genre_Adventure,genre_Animation,...,language_Latin,language_Mandarin,language_Navajo,language_Russian,language_Serbian,language_Spanish,language_Swahili,language_Urdu,language_Vietnamese,language_Yiddish
0,Seventh_Son,2014,102,PG-13,2.9,5.5,30.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Welcome_to_Me,2014,105,R,3.4,6.2,67.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The_Judge,2014,141,R,3.8,7.5,48.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Transformers__Age_of_Extinction,2014,165,PG-13,3.3,5.8,32.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,The_Normal_Heart,2014,132,TV-MA,4.2,8.0,64.093333,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Seventh_Son,67.562481,-4.5252,1.6463,-0.597742,1.36281,-1.361579,0.170381,-0.500409,-0.050339,-0.269793,-0.232489,-0.197553,0.13264
Welcome_to_Me,61.548885,-7.146781,-1.103407,-1.58802,0.249743,0.003055,-2.313552,0.371521,0.516853,-1.405396,-0.951247,1.316795,-0.095459
The_Judge,65.038918,-4.171344,-0.455058,-0.094249,-0.365649,-0.182148,0.700715,-0.195335,-0.204333,-1.281841,0.301911,-0.197494,0.861993
Transformers__Age_of_Extinction,64.544291,-3.661545,-0.010532,-0.802876,-0.614974,-0.255984,0.810787,0.465672,0.271618,-0.691701,0.16124,0.310825,0.21462
The_Normal_Heart,60.433903,0.148386,1.713255,-0.203955,-1.187262,-2.310341,-1.726492,-0.512949,0.270257,-0.098537,0.705479,-0.059131,0.247545


Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,...,821,821,822,822,823,823,824,824,825,825
vis_data,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
filename,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Seventh_Son,0.047044,0.056526,0.11619,0.12516,0.13633,0.14628,0.066194,0.082497,0.072554,0.079331,...,2.9504,3.2167,91672.0,81373.0,22207.0,21045.0,26201.0,24225.0,14542.0,13529.0
Welcome_to_Me,0.30717,0.30466,0.33422,0.33193,0.33112,0.33124,0.33124,0.33138,0.31114,0.30788,...,21.189,21.182,81665.0,83171.0,13672.0,13714.0,32531.0,32774.0,13753.0,13780.0
The_Judge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002466,0.002466,729320.0,729320.0,119950.0,119950.0,230400.0,230400.0,119950.0,119950.0
Transformers__Age_of_Extinction,0.19996,0.18913,0.26934,0.25738,0.27986,0.27465,0.23725,0.23664,0.30844,0.30332,...,14.779,14.723,208630.0,211630.0,23968.0,24019.0,47979.0,48339.0,24059.0,24090.0
The_Normal_Heart,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038749,0.20135,...,7.3798,11.306,145760.0,79962.0,20730.0,20617.0,35320.0,45216.0,20831.0,20738.0


# Define Models

## Description 
These are the models described in the paper. It is not allways clear which exact models they used. (see comments)

In [4]:
from  sklearn.neighbors import KNeighborsClassifier, NearestCentroid #(not sure if this is the nearest mean classifiert) 
from  sklearn.tree import DecisionTreeClassifier
from  sklearn.linear_model import LogisticRegression
from  sklearn.svm import SVC #(not clear which SVC, there is also NuSVC )
from  sklearn.ensemble import BaggingClassifier
from  sklearn.ensemble import AdaBoostClassifier
from  sklearn.ensemble import GradientBoostingClassifier
from  sklearn.ensemble import RandomForestClassifier
from  sklearn.naive_bayes import GaussianNB # there are 3 different naive bayes classifiers, it is not stated which one they used 


model_list = [KNeighborsClassifier(),
                NearestCentroid(),
                DecisionTreeClassifier(random_state = 123),
                LogisticRegression(random_state = 123),
                SVC(random_state = 123),
                BaggingClassifier(random_state = 123),
                AdaBoostClassifier(random_state = 123),
                GradientBoostingClassifier(random_state = 123),
                RandomForestClassifier(random_state = 123),
                GaussianNB() 
                   ]

    


# Define Performance measures:

As mentioned in the paper the performant measueres are the following Precision and Recall and F1-Score. To be more precise the weighted average of Precision and Recall and F1-Score as stated in the dataset paper. 

In [5]:
from sklearn.model_selection import cross_validate

def calculate_metrics(clf,X,y ):
    metric =  cross_validate(clf, X, y, scoring=('precision_weighted','recall_weighted','f1_weighted'), return_train_score=False, cv=10)  
    return pd.Series({'precision':metric['test_precision_weighted'].mean(),'recall':metric['test_recall_weighted'].mean(),'F1':metric['test_f1_weighted'].mean() })

# Select Models

As defined in the paper they use 10-fold CV on the classifiers for training and keep all the classifiers where the metrics are above 0.5 for later stacking.


In [6]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

class MultiColumnLabelEncoder:
    
    def __init__(self, columns = None):
        self.columns = columns # list of column to encode

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        
        output = X.copy()
        
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        
        return output

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
    
def getModelName( object ): 

    if hasattr(object, '__module__') and hasattr(object, '__name__'):
        return  object.__name__
    elif hasattr(object, '__module__') and hasattr(object, '__class__'):
        return  object.__class__.__name__
    else:
        raise TypeError("Could not get name of object!")
    
def evaluate_models( X, y ):
    metrics = pd.DataFrame()

    for model in model_list:
        m = calculate_metrics(model,X,y )
        metrics[getModelName(model)] = m

    return metrics.T


df_final_results = pd.DataFrame()

import warnings
warnings.filterwarnings('ignore')

## Meta data

In [7]:
    
df_train = pd.merge(df_labled_movies,df_meta_data, on='filename')
df_train.drop(['movie', 'filename'],axis=1, inplace=True)
display(df_train.head(2))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']



display("----  Lable encoded ----")
label_encoder = MultiColumnLabelEncoder(['language','year','genre','country','runtime','rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
display(metrics)

#convert runtime and year to actual number
df_X['runtime'] = df_X['runtime'].apply(lambda x: int(x.split(' ')[0]) )
df_X['year'] =  df_X['year'].apply(pd.to_numeric)

display("---- Lable encoded with float for year and runtime ----")
##optimizing encoding
label_encoder = MultiColumnLabelEncoder(['language','year','genre','country','rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
display(metrics)

display("---- Lable encoded without year ----")
label_encoder = MultiColumnLabelEncoder(['language','genre','country','rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
display(metrics)


display("---- OneHot Encoding ----")
##optimizing encoding further by using the processed meta data

df_train = pd.merge(df_labled_movies,df_meta_extended_data_processed, on='filename')
df_train.drop(['movie', 'filename'],axis=1, inplace=True)
display(df_train.head(2))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']

label_encoder = MultiColumnLabelEncoder(['rated'])    
X_labelencoded = label_encoder.fit_transform(df_X)
metrics = evaluate_models(X_labelencoded, df_y)
# save the best of the for the final table 
metrics['Modality'] = 'metadata'
df_final_results = df_final_results.append(metrics)

display(metrics)


# Save final data sets obtained with our best approach
df_y_meta = df_y
X_labelencoded_meta = X_labelencoded

Unnamed: 0,goodforairplane,language,year,genre,country,runtime,rated
0,1,English,2014,"Action, Adventure, Fantasy","USA, UK, Canada, China",102 min,PG-13
1,0,English,2014,"Comedy, Drama",USA,105 min,R


'----  Lable encoded ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.5194,0.566162,0.52335
NearestCentroid,0.602922,0.583333,0.576202
DecisionTreeClassifier,0.490991,0.490505,0.467364
LogisticRegression,0.587637,0.585556,0.574014
SVC,0.297467,0.536869,0.382622
BaggingClassifier,0.512169,0.499596,0.48254
AdaBoostClassifier,0.516002,0.501717,0.490721
GradientBoostingClassifier,0.501382,0.500707,0.486474
RandomForestClassifier,0.50988,0.498485,0.489622
GaussianNB,0.47488,0.506667,0.481515


'---- Lable encoded with float for year and runtime ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.657535,0.618586,0.600698
NearestCentroid,0.466408,0.479091,0.465068
DecisionTreeClassifier,0.404628,0.411818,0.401082
LogisticRegression,0.512315,0.52404,0.493776
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.524126,0.520909,0.509404
AdaBoostClassifier,0.466827,0.474444,0.462241
GradientBoostingClassifier,0.439652,0.441111,0.425558
RandomForestClassifier,0.503327,0.503737,0.486752
GaussianNB,0.467194,0.499798,0.47404


'---- Lable encoded without year ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.632037,0.619697,0.589233
NearestCentroid,0.466408,0.479091,0.465068
DecisionTreeClassifier,0.404628,0.411818,0.401082
LogisticRegression,0.545546,0.549293,0.529509
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.524126,0.520909,0.509404
AdaBoostClassifier,0.466827,0.474444,0.462241
GradientBoostingClassifier,0.439652,0.441111,0.425558
RandomForestClassifier,0.503327,0.503737,0.486752
GaussianNB,0.517396,0.539798,0.499798


'---- OneHot Encoding ----'

Unnamed: 0,goodforairplane,year,runtime,rated,tomatorating,imbdrating,metarating,genre_Action,genre_Adventure,genre_Animation,...,language_Latin,language_Mandarin,language_Navajo,language_Russian,language_Serbian,language_Spanish,language_Swahili,language_Urdu,language_Vietnamese,language_Yiddish
0,1,2014,102,PG-13,2.9,5.5,30.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2014,105,R,3.4,6.2,67.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,precision,recall,F1,Modality
KNeighborsClassifier,0.484398,0.492626,0.48159,metadata
NearestCentroid,0.383315,0.394242,0.378167,metadata
DecisionTreeClassifier,0.555659,0.547273,0.539899,metadata
LogisticRegression,0.442538,0.454242,0.443005,metadata
SVC,0.414265,0.449293,0.407568,metadata
BaggingClassifier,0.439428,0.438889,0.434495,metadata
AdaBoostClassifier,0.408812,0.442929,0.413217,metadata
GradientBoostingClassifier,0.439845,0.454242,0.443651,metadata
RandomForestClassifier,0.395541,0.393636,0.379418,metadata
GaussianNB,0.424487,0.489394,0.413467,metadata


## Textual data

In [8]:
from sklearn.preprocessing import Normalizer

################## Use textual data  ###################
display('################## Use textual data  ###################')

df_movies = df_labled_movies.drop(['movie'],axis=1)
df_train = pd.merge(df_movies,df_text_data, on='filename')
df_train.drop(['filename'],axis=1, inplace=True)
display(df_train.head(10))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']


display("---- RAW Data ----")
metrics = evaluate_models(df_X, df_y)
display(metrics)


display("---- Normalize Data ----")
df_normalized_X = Normalizer().fit_transform(df_X)
metrics = evaluate_models(df_normalized_X, df_y)
display(metrics)

# save  the final table 
metrics['Modality'] = 'textual'
df_final_results = df_final_results.append(metrics)

# Save final data sets obtained with our best approach
df_y_text = df_y
df_X_text = df_X #pd.DataFrame(df_normalized_X, df_X.index, df_X.columns)

'################## Use textual data  ###################'

Unnamed: 0,goodforairplane,24000,baby,baseball,big,doc,escort,frozen,heroes,high,...,years.1,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.051657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.041679,0.0,0.0,0.0,0.0
8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.15957,0.15957,0.0,0.0,0.0,0.0,0.0
9,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


'---- RAW Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.341994,0.465152,0.366055
NearestCentroid,0.452087,0.561111,0.461862
DecisionTreeClassifier,0.455147,0.46,0.427983
LogisticRegression,0.300554,0.54798,0.388116
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.509397,0.567273,0.496721
AdaBoostClassifier,0.575156,0.576162,0.558896
GradientBoostingClassifier,0.639572,0.655051,0.600616
RandomForestClassifier,0.446707,0.54,0.470888
GaussianNB,0.537073,0.558182,0.538881


'---- Normalize Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.506534,0.53303,0.512809
NearestCentroid,0.555174,0.555253,0.531696
DecisionTreeClassifier,0.522597,0.536869,0.511263
LogisticRegression,0.300554,0.54798,0.388116
SVC,0.300554,0.54798,0.388116
BaggingClassifier,0.529374,0.56798,0.523605
AdaBoostClassifier,0.494176,0.530606,0.502345
GradientBoostingClassifier,0.65407,0.629697,0.58193
RandomForestClassifier,0.488127,0.528889,0.491408
GaussianNB,0.537073,0.558182,0.538881


## Visual data

In [9]:
from sklearn.preprocessing import StandardScaler,RobustScaler

################## Use visual data  ###################
display('################## Use visual data  ###################')

df_movies = df_labled_movies.drop(['movie'],axis=1)
df_train = pd.merge(df_movies,df_visual_data_processed, on='filename')
df_train.drop(['filename'],axis=1, inplace=True)
display(df_train.head(5))
df_X = df_train.drop('goodforairplane',axis=1)
df_y = df_train['goodforairplane']


display("---- RAW Data ----")
metrics = evaluate_models(df_X, df_y)
display(metrics)

display("---- Scaled Data ----")
df_scaled_X = StandardScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)


display("---- RobustScaler Data ----")
df_scaled_X = RobustScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)

# save  the final table 
metrics['Modality'] = 'visual'
df_final_results = df_final_results.append(metrics)

# Save final data sets obtained with our best approach
df_y_visual = df_y
df_scaled_X_visual = df_X#pd.DataFrame(df_scaled_X, df_X.index, df_X.columns)


'################## Use visual data  ###################'

Unnamed: 0,goodforairplane,"(0, 0)","(0, 1)","(1, 0)","(1, 1)","(2, 0)","(2, 1)","(3, 0)","(3, 1)","(4, 0)",...,"(821, 0)","(821, 1)","(822, 0)","(822, 1)","(823, 0)","(823, 1)","(824, 0)","(824, 1)","(825, 0)","(825, 1)"
0,1,0.047044,0.056526,0.11619,0.12516,0.13633,0.14628,0.066194,0.082497,0.072554,...,2.9504,3.2167,91672.0,81373.0,22207.0,21045.0,26201.0,24225.0,14542.0,13529.0
1,0,0.30717,0.30466,0.33422,0.33193,0.33112,0.33124,0.33124,0.33138,0.31114,...,21.189,21.182,81665.0,83171.0,13672.0,13714.0,32531.0,32774.0,13753.0,13780.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002466,0.002466,729320.0,729320.0,119950.0,119950.0,230400.0,230400.0,119950.0,119950.0
3,0,0.19996,0.18913,0.26934,0.25738,0.27986,0.27465,0.23725,0.23664,0.30844,...,14.779,14.723,208630.0,211630.0,23968.0,24019.0,47979.0,48339.0,24059.0,24090.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038749,...,7.3798,11.306,145760.0,79962.0,20730.0,20617.0,35320.0,45216.0,20831.0,20738.0


'---- RAW Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.546787,0.542929,0.529648
NearestCentroid,0.405332,0.473333,0.405324
DecisionTreeClassifier,0.491792,0.508182,0.488059
LogisticRegression,0.561063,0.573939,0.555761
SVC,0.398547,0.55798,0.4225
BaggingClassifier,0.616987,0.606162,0.604084
AdaBoostClassifier,0.505459,0.509293,0.496069
GradientBoostingClassifier,0.521968,0.534646,0.517804
RandomForestClassifier,0.476204,0.485152,0.467797
GaussianNB,0.503584,0.518586,0.484453


'---- Scaled Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.570299,0.56,0.538146
NearestCentroid,0.515741,0.505354,0.487731
DecisionTreeClassifier,0.484093,0.509293,0.48457
LogisticRegression,0.580084,0.549798,0.538425
SVC,0.45307,0.538889,0.448757
BaggingClassifier,0.584129,0.583939,0.575801
AdaBoostClassifier,0.507985,0.504242,0.489265
GradientBoostingClassifier,0.505687,0.521515,0.505821
RandomForestClassifier,0.453981,0.462929,0.445575
GaussianNB,0.607893,0.587273,0.57359


'---- RobustScaler Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.558343,0.56,0.544837
NearestCentroid,0.576882,0.539596,0.508983
DecisionTreeClassifier,0.484093,0.509293,0.48457
LogisticRegression,0.513166,0.509394,0.501544
SVC,0.425366,0.467172,0.420111
BaggingClassifier,0.601801,0.606162,0.591861
AdaBoostClassifier,0.497985,0.493131,0.478154
GradientBoostingClassifier,0.537062,0.532626,0.524928
RandomForestClassifier,0.453981,0.462929,0.445575
GaussianNB,0.532297,0.525354,0.522063


## Audio data

In [10]:
from sklearn.preprocessing import StandardScaler,RobustScaler

################## Use audio data  ###################

display('################## Use audio data  ###################')

def load_audio_data( filenames ): # changed
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Dev_set/audio_descriptors/{file}.csv'
        df_data = pd.read_csv(file_path,index_col=None, header=None).T
        
        # preprocess data
        df_data = df_data.fillna(0)
        df_data = pd.DataFrame(df_data.mean(axis = 0)).T
        df_data["filename"] = file
        audio_data = audio_data.append(df_data)
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data = load_audio_data( df_labled_movies['filename']  )


df_train_audio = pd.merge(df_movies, df_audio_data, on='filename')
df_train_audio.drop(['filename'],axis=1, inplace=True)
df_X = df_train_audio.drop('goodforairplane',axis=1)
df_y = df_train_audio['goodforairplane']


display("---- RAW Data ----")
metrics = evaluate_models(df_X, df_y)
display(metrics)

display("---- Scaled Data ----")
df_scaled_X = StandardScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)

# save  the final table 
metrics['Modality'] = 'audio'
df_final_results = df_final_results.append(metrics)

# Save final data sets obtained with our best approach
df_y_audio = df_y
df_X_audio = df_X

display("---- RobustScaler Data ----")
df_scaled_X = RobustScaler().fit_transform(df_X)
metrics = evaluate_models(df_scaled_X, df_y)
display(metrics)


'################## Use audio data  ###################'

'---- RAW Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.442217,0.464444,0.446119
NearestCentroid,0.605951,0.547071,0.491725
DecisionTreeClassifier,0.474011,0.457273,0.430179
LogisticRegression,0.558033,0.54596,0.539093
SVC,0.341683,0.424949,0.368661
BaggingClassifier,0.427441,0.435859,0.419515
AdaBoostClassifier,0.52447,0.51202,0.491446
GradientBoostingClassifier,0.515392,0.51697,0.506224
RandomForestClassifier,0.469181,0.474444,0.459993
GaussianNB,0.550048,0.515758,0.498764


'---- Scaled Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.453864,0.450101,0.432996
NearestCentroid,0.555387,0.542828,0.528025
DecisionTreeClassifier,0.474011,0.457273,0.430179
LogisticRegression,0.531818,0.522626,0.514002
SVC,0.416675,0.463131,0.420189
BaggingClassifier,0.427441,0.435859,0.419515
AdaBoostClassifier,0.52447,0.51202,0.491446
GradientBoostingClassifier,0.515392,0.51697,0.506224
RandomForestClassifier,0.469181,0.474444,0.459993
GaussianNB,0.550048,0.515758,0.498764


'---- RobustScaler Data ----'

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.400179,0.422727,0.405317
NearestCentroid,0.647124,0.547071,0.501967
DecisionTreeClassifier,0.474011,0.457273,0.430179
LogisticRegression,0.518855,0.511515,0.501448
SVC,0.48107,0.488485,0.470724
BaggingClassifier,0.427441,0.435859,0.419515
AdaBoostClassifier,0.52447,0.51202,0.491446
GradientBoostingClassifier,0.515392,0.51697,0.506224
RandomForestClassifier,0.469181,0.474444,0.459993
GaussianNB,0.550048,0.515758,0.498764


## Final base classifier filter

In [11]:
df_r = df_final_results
df_r = df_r[ (df_r['precision'] > 0.5) & (df_r['recall'] > 0.5) & (df_r['F1'] > 0.5) ]
display(df_r)


Unnamed: 0,precision,recall,F1,Modality
DecisionTreeClassifier,0.555659,0.547273,0.539899,metadata
KNeighborsClassifier,0.506534,0.53303,0.512809,textual
NearestCentroid,0.555174,0.555253,0.531696,textual
DecisionTreeClassifier,0.522597,0.536869,0.511263,textual
BaggingClassifier,0.529374,0.56798,0.523605,textual
GradientBoostingClassifier,0.65407,0.629697,0.58193,textual
GaussianNB,0.537073,0.558182,0.538881,textual
KNeighborsClassifier,0.558343,0.56,0.544837,visual
NearestCentroid,0.576882,0.539596,0.508983,visual
LogisticRegression,0.513166,0.509394,0.501544,visual


As we can see the results table looks pretty different than in the paper. There is not really enough information in the paper to be sure that we are correctly reproducing the steps. 

With the audio data there is actually not really more we could do since we just end up with one coliumn of data as descirbed in the paper, but still the metrics is not as good as in the paper. 

__Is there something wrong already when we load the data ? Wrong data?__



## Loading of test data
There are problems with test data in the label file and actual XML files, some of them do not match. Sometimes the file ending is given (e.g. .xml, we removed it), sometimes the row is given as a string. We detected such movies with the code below, and afterwards fixed the detected movies by hand.

Additonally, the movie 10.000km is given twice with label 0 and 1 (we just assume one of them and deleted the other one)!

In [12]:
import os


df_movies_all = pd.read_csv("./data/CoE_dataset/Test_set/test_set_labels.csv", sep=';')
print(df_movies_all.shape[0])

# To avoid null values, we just removed " by hand...
display(df_movies_all[df_movies_all["file_name"].isnull()])

ex = 0
filenames = []
for file in df_movies_all['file_name']:
    # fix already given file type - fixed by hand now
    if ".mp4" in file:
        file = file.split(".mp4")[0]
    if ".xml" in file:
        file = file.split(".xml")[0]
    
    # check if file is given twice
    if file in filenames:
        # already dropped by hand
        print("file already in! " + file)
    else:    
        file_path = f'./data/CoE_dataset/Test_set/XML/{file}.xml'
        exists = os.path.isfile(file_path)
        if exists:
            ex = ex + 1
            filenames.append(file)
        else:
            print(file)
            print(str(file_path) + " not exists!")
            df_movies_all = df_movies_all[df_movies_all['file_name'] != file]

print("Existing movies: " + str(ex))
print(df_movies_all.shape[0])

224


Unnamed: 0,movie_name,file_name,goodforairplanes


A_Fish_Called_Wanda
./data/CoE_dataset/Test_set/XML/A_Fish_Called_Wanda.xml not exists!
Existing movies: 223
223


In [13]:
df_labled_movies_test = df_movies_all
df_labled_movies_test = df_labled_movies_test[['movie_name','file_name', 'goodforairplanes']]
df_labled_movies_test.columns = ['movie', 'filename', 'goodforairplane']
display(df_labled_movies_test.head(3))


############################################################
### Load Meta Data ###

def load_meta_data_test( filenames ): 
    
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/XML/{file}.xml'
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            with open(file_path) as f: 
                tree = ET.parse(f)
                movie = tree.find('movie')

                lang = movie.get('language')
                year = movie.get('year')
                genre = movie.get('genre')
                country = movie.get('country')
                runtime = movie.get('runtime')
                age_rating = movie.get('rated')

                raw_data.append( (file,lang,year,genre,country,runtime,age_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','language','year','genre','country','runtime','rated'])


df_meta_data_test = load_meta_data_test( df_labled_movies_test['filename']  )
display(df_meta_data_test.head(3))



###############################################################
### Load meta data with user rating  ###

def load_meta_extended_data_test( filenames ): 
    
    raw_data = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/XML/{file}.xml'
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            with open(file_path) as f: 
                tree = ET.parse(f)
                movie = tree.find('movie')

                lang = movie.get('language')
                year = movie.get('year')
                genre = movie.get('genre')
                country = movie.get('country')
                runtime = movie.get('runtime')
                age_rating = movie.get('rated')
                tomato_rating = movie.get('tomatoUserRating')
                imbd_rating = movie.get('imdbRating')
                meta_rating = movie.get('metascore')

                if tomato_rating == "N/A":
                    tomato_rating = np.nan
                if imbd_rating == "N/A":
                    imbd_rating = np.nan
                if meta_rating == "N/A":
                    meta_rating = np.nan
                if country == "N/A":
                    country = np.nan
                if genre == "N/A":
                    genre = np.nan
                raw_data.append( (file,lang,year,genre,country,runtime,age_rating,tomato_rating,imbd_rating,meta_rating) )
    
    return pd.DataFrame(raw_data, columns=['filename','language','year','genre','country','runtime','rated','tomatorating','imbdrating','metarating'])



df_meta_extended_data_test = load_meta_extended_data_test( df_labled_movies_test['filename']  )
df_meta_extended_data_test['tomatorating'] = pd.to_numeric(df_meta_extended_data_test['tomatorating'])
df_meta_extended_data_test['imbdrating'] = pd.to_numeric(df_meta_extended_data_test['imbdrating'])
df_meta_extended_data_test['metarating'] = pd.to_numeric(df_meta_extended_data_test['metarating'])
df_meta_extended_data_test.fillna(df_meta_extended_data_test.mean(),inplace=True)

display(df_meta_extended_data_test.head(3))



############################################################
### Load Visual Data ###

def load_visual_data_test( filenames ):
    data_list = []
    
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/vis_descriptors/{file}.csv'
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            df_data = pd.read_csv(file_path,index_col=None, header=None)
            data_list.append(df_data)
        
    return pd.concat(data_list, axis = 0, keys = filenames,names=('filename','vis_data'),  sort=False)

df_visual_data_test = load_visual_data_test( df_labled_movies_test['filename']  )
df_visual_data_test = df_visual_data_test.unstack()
display(df_visual_data_test.head(3))


############################################################
### Load Audio Data ###

def load_audio_data_test( filenames ):
    data_list = []
    
    audio_data = pd.DataFrame()
    for file in filenames: 
        file_path = f'./data/CoE_dataset/Test_set/audio_descriptors/{file}.csv'
        
        exists = os.path.isfile(file_path)
        if not exists:
            print(file + " not exists!")
        else:
            df_data = pd.read_csv(file_path,index_col=None, header=None).T

            # preprocess data
            df_data = df_data.fillna(0)
            df_data = pd.DataFrame(df_data.mean(axis = 0)).T
            df_data["filename"] = file
            audio_data = audio_data.append(df_data)
            
    audio_data = audio_data.set_index("filename") 
    return audio_data

df_audio_data_test = load_audio_data_test( df_labled_movies_test['filename']  )
display(df_audio_data_test.head(3))


############################################################
### Load textual Data ###

def load_text_data_test(filenames):
    

    data_list = []
    file_path = f'./data/CoE_dataset/Test_set/text_descriptors/tdf_idf_test.csv'
    #somehow pandas can not really handle that the first line is row names.(at least I didn't find a better way) 
    # thus we do it a little complicated here
    header_index = pd.read_csv(file_path, index_col=0,nrows=1 ).reset_index().columns
    df_data = pd.read_csv(file_path, header=None, index_col=False,skiprows=1)
    df_data.set_index(header_index, inplace=True)
    df_data.columns = filenames
    return df_data.T #row are should be represented by movie names

df_text_data_test = load_text_data_test(df_labled_movies_test['filename'] )
display(df_text_data_test.head(3))

Unnamed: 0,movie,filename,goodforairplane
0,Humpty Sharma Ki Dulhania,Humpty_Sharma_Ki_Dulhania,1
1,Homeland,Homeland,1
2,Trash,Trash,1


Unnamed: 0,filename,language,year,genre,country,runtime,rated
0,Humpty_Sharma_Ki_Dulhania,Hindi,2014,"Comedy, Drama, Romance",India,133 min,NOT RATED
1,Homeland,English,2011,"Drama, Mystery, Thriller",USA,55 min,TV-MA
2,Trash,"Portuguese, English",2014,"Adventure, Comedy, Crime","UK, Brazil",114 min,R


Unnamed: 0,filename,language,year,genre,country,runtime,rated,tomatorating,imbdrating,metarating
0,Humpty_Sharma_Ki_Dulhania,Hindi,2014,"Comedy, Drama, Romance",India,133 min,NOT RATED,3.2,6.0,65.470588
1,Homeland,English,2011,"Drama, Mystery, Thriller",USA,55 min,TV-MA,3.570423,8.5,65.470588
2,Trash,"Portuguese, English",2014,"Adventure, Comedy, Crime","UK, Brazil",114 min,R,3.5,7.1,63.0


Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,...,821,821,822,822,823,823,824,824,825,825
vis_data,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
filename,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Humpty_Sharma_Ki_Dulhania,0.25284,0.25278,0.23444,0.2345,0.25556,0.25565,0.23095,0.23102,0.2672,0.26718,...,13.606,13.606,38450.0,38440.0,11844.0,11847.0,23975.0,23988.0,13069.0,13071.0
Homeland,0.1366,0.12622,0.2911,0.29358,0.38919,0.38603,0.25374,0.24926,0.18142,0.20572,...,8.6886,8.6112,13428.0,14415.0,2086.1,2519.3,5285.1,6909.0,2510.7,2963.8
Trash,0.23858,0.25246,0.24924,0.25729,0.34233,0.34281,0.32537,0.32128,0.28438,0.28786,...,11.442,11.627,37948.0,36342.0,11777.0,11420.0,24904.0,23160.0,12274.0,11730.0


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Humpty_Sharma_Ki_Dulhania,4.547587,-5.63623,1.434987,-0.279716,-0.669368,-1.271336,-0.705938,-0.263066,-0.273322,-0.794631,-0.060173,0.003418,-0.272562
Homeland,62.653646,-2.540778,0.94344,-1.226452,-0.285784,-0.821387,-0.986073,-1.069481,-1.126877,-0.613598,0.16874,-0.776176,0.38972
Trash,59.511905,-4.309526,-0.72833,-2.60298,0.1502,-0.210795,-0.315625,0.037404,-0.298176,0.943956,0.579414,0.388942,-0.008194


Unnamed: 0_level_0,1,1000,200000,acquired,ailing,avatar,avoid,babysitter,barbaric,battle,...,zero,zeus,zeus.1,zeus.2,zhonglian,zhuo,zombie,zombiehating,zombies,zuckerberg
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Humpty_Sharma_Ki_Dulhania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Homeland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Trash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Preparation of test data
The process here is similar to the training data.

In [14]:
df_movies_test = df_labled_movies_test.drop(['movie'], axis = 1)


#####################################################
# Textual data
df_test_text = pd.merge(df_movies_test, df_text_data_test, on='filename')
df_test_text.drop(['filename'],axis=1, inplace=True)

df_X_text_test = df_test_text.drop('goodforairplane',axis=1)
df_y_text_test = df_test_text['goodforairplane']

print("text:" + str(df_X_text_test.shape[0]))
display(df_X_text_test.head(2))


#####################################################
# Visual data
df_test_visual = pd.merge(df_movies_test, df_visual_data_test, on='filename')
df_test_visual.drop(['filename'],axis=1, inplace=True)

df_X_visual_test = df_test_visual.drop('goodforairplane',axis=1)
df_y_visual_test = df_test_visual['goodforairplane']
df_scaled_X_visual_test = df_X_visual_test#pd.DataFrame(StandardScaler().fit_transform(df_X_visual_test))

print("visual:"  + str(df_X_visual_test.shape[0]))
display(df_X_visual_test.head(2))


#####################################################
# Audio data
df_test_audio = pd.merge(df_movies_test, df_audio_data_test, on='filename')
df_test_audio.drop(['filename'],axis=1, inplace=True)
df_X_audio_test = df_test_audio.drop('goodforairplane',axis=1)
df_y_audio_test = df_test_audio['goodforairplane']

print("audio:" + str(df_X_audio_test.shape[0]))
display(df_X_audio_test.head(2))


#####################################################
# Meta data

def pre_process_meta_data_test(df_meta):
    #create columns of the two rows belonging to each movie
    df_data = df_meta.drop(['genre','country','language'],axis=1)
    df_data_genre = df_meta['genre'].apply( lambda x: x.replace(" ", "") ).str.get_dummies(sep=",")
    df_data_genre.columns = ['genre_'+str(x) for x in df_data_genre.columns]
    
    df_data_country = df_meta['country'].apply( lambda x: x.replace(" ", "") ).str.get_dummies(sep=",")
    df_data_country.columns = ['country_'+str(x) for x in df_data_country.columns]
    
    df_data_language = df_meta['language'].apply( lambda x: x.replace(" ", "") ).str.get_dummies(sep=",")
    df_data_language.columns = ['language_'+str(x) for x in df_data_language.columns]

    df_data = pd.concat([df_data,df_data_genre,df_data_country,df_data_language], axis=1)

    df_data['runtime'] = pd.to_numeric(df_data['runtime'].apply(lambda x: int(x.split(' ')[0]) if x != 'N/A' else 0))
    df_data['year'] =  df_data['year'].apply(pd.to_numeric)

    return df_data

df_meta_data_test_p = pre_process_meta_data_test(df_meta_extended_data_test)

df_test_meta = pd.merge(df_movies_test,df_meta_data_test_p, on='filename')
df_test_meta.drop(['filename'],axis=1, inplace=True)

df_X_meta_test = df_test_meta.drop('goodforairplane',axis=1)


df_y_meta_test = df_test_meta['goodforairplane']

label_encoder = MultiColumnLabelEncoder(['rated'])    
X_labelencoded_meta_test = label_encoder.fit_transform(df_X_meta_test)

display(X_labelencoded_meta_test.head(2))

text:223


Unnamed: 0,1,1000,200000,acquired,ailing,avatar,avoid,babysitter,barbaric,battle,...,zero,zeus,zeus.1,zeus.2,zhonglian,zhuo,zombie,zombiehating,zombies,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


visual:223


Unnamed: 0,"(0, 0)","(0, 1)","(1, 0)","(1, 1)","(2, 0)","(2, 1)","(3, 0)","(3, 1)","(4, 0)","(4, 1)",...,"(821, 0)","(821, 1)","(822, 0)","(822, 1)","(823, 0)","(823, 1)","(824, 0)","(824, 1)","(825, 0)","(825, 1)"
0,0.25284,0.25278,0.23444,0.2345,0.25556,0.25565,0.23095,0.23102,0.2672,0.26718,...,13.606,13.606,38450.0,38440.0,11844.0,11847.0,23975.0,23988.0,13069.0,13071.0
1,0.1366,0.12622,0.2911,0.29358,0.38919,0.38603,0.25374,0.24926,0.18142,0.20572,...,8.6886,8.6112,13428.0,14415.0,2086.1,2519.3,5285.1,6909.0,2510.7,2963.8


audio:223


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,4.547587,-5.63623,1.434987,-0.279716,-0.669368,-1.271336,-0.705938,-0.263066,-0.273322,-0.794631,-0.060173,0.003418,-0.272562
1,62.653646,-2.540778,0.94344,-1.226452,-0.285784,-0.821387,-0.986073,-1.069481,-1.126877,-0.613598,0.16874,-0.776176,0.38972


Unnamed: 0,year,runtime,rated,tomatorating,imbdrating,metarating,genre_Action,genre_Adventure,genre_Animation,genre_Biography,...,language_Russian,language_ScottishGaelic,language_SignLanguages,language_Sioux,language_Spanish,language_Swedish,language_SwissGerman,language_Turkish,language_Urdu,language_Vietnamese
0,2014,133,3,3.2,6.0,65.470588,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2011,55,7,3.570423,8.5,65.470588,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Further preparation for meta and textual data
In the text data files, there are different words that appear in one movie and not in an other one.

Therefore, we add columns with 0 values for words that are not in the movies.

The same is done for meta data with differing languages and genres.

In [15]:
print("Training")
print(df_X_text.shape)
print(X_labelencoded_meta.shape)
print(df_X_audio.shape)
print(df_scaled_X_visual.shape)

print("Test")
print(df_X_text_test.shape)
print(X_labelencoded_meta_test.shape)
print(df_X_audio_test.shape)
print(df_scaled_X_visual_test.shape)

##############################################
# Textual

# check why test data is not consistent with train data for textual, so we 
# do not compare different words
for col in df_X_text_test.columns:
    if col not in df_X_text.columns:
        df_X_text[col] = pd.Series([0] * df_X_text.shape[0])

for col in df_X_text.columns:
    if col not in df_X_text_test.columns:
        df_X_text_test[col] = pd.Series([0] * df_X_text_test.shape[0])



# We have to order the columns, otherwise the features selected will not be at the same place
df_X_text_test = df_X_text_test.reindex(sorted(df_X_text_test.columns), axis=1)
display(df_X_text_test.head(3))

df_X_text = df_X_text.reindex(sorted(df_X_text.columns), axis=1)
display(df_X_text.head(3))

#display(X_labelencoded_meta_test.loc[1:100, :])

##############################################
# Meta
for col in X_labelencoded_meta_test.columns:
    if col not in X_labelencoded_meta.columns:
        X_labelencoded_meta[col] = pd.Series([0] * X_labelencoded_meta.shape[0])

for col in X_labelencoded_meta.columns:
    if col not in X_labelencoded_meta_test.columns:
        X_labelencoded_meta_test[col] = pd.Series([0] * X_labelencoded_meta_test.shape[0])

# # We have to order the columns, otherwise the features selected will not be at the same place
X_labelencoded_meta_test = X_labelencoded_meta_test.reindex(sorted(X_labelencoded_meta_test.columns), axis=1)
display(X_labelencoded_meta_test.head(3))

X_labelencoded_meta = X_labelencoded_meta.reindex(sorted(X_labelencoded_meta.columns), axis=1)
display(X_labelencoded_meta.head(3))

Training
(95, 3283)
(95, 71)
(95, 13)
(95, 1652)
Test
(223, 6317)
(223, 103)
(223, 13)
(223, 1652)


Unnamed: 0,00,1,10,100,1000,1000.1,100000,10000km,101yearold,10round,...,zeus.1,zeus.2,zhonglian,zhuo,zing,zombie,zombiehating,zombies,zoologists,zuckerberg
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0


Unnamed: 0,00,1,10,100,1000,1000.1,100000,10000km,101yearold,10round,...,zeus.1,zeus.2,zhonglian,zhuo,zing,zombie,zombiehating,zombies,zoologists,zuckerberg
0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0,0,0,0.0,0
1,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0,0,0,0.0,0
2,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0,0,0,0.0,0


Unnamed: 0,country_Argentina,country_Australia,country_Bahamas,country_Belgium,country_BosniaandHerzegovina,country_Brazil,country_Bulgaria,country_Canada,country_China,country_CzechRepublic,...,language_SwissGerman,language_Turkish,language_Urdu,language_Vietnamese,language_Yiddish,metarating,rated,runtime,tomatorating,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,65.470588,3,133,3.2,2014
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,65.470588,7,55,3.570423,2011
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,63.0,6,114,3.5,2014


Unnamed: 0,country_Argentina,country_Australia,country_Bahamas,country_Belgium,country_BosniaandHerzegovina,country_Brazil,country_Bulgaria,country_Canada,country_China,country_CzechRepublic,...,language_SwissGerman,language_Turkish,language_Urdu,language_Vietnamese,language_Yiddish,metarating,rated,runtime,tomatorating,year
0,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,30.0,5,102,2.9,2014
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,67.0,6,105,3.4,2014
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,48.0,6,141,3.8,2014


# Task 3.2 Feature Selection

They use LVW for feature selection as described in the mentioned paper.
What is very confusion in this section is that in the end again they refer to Table 2. as also in the previous section. 
I would conclude that in the previous section they just wanted to refer to the selected classifiers and in this section to the metrics results. 
But still we would have different classifiers. 

The implementation of the LVW shouldn't be to compilcated. Maybe there is already some code out there.

### Implementation of LVW

Here I implemented the LVW from the pseudo code of the refered paper (since I have not found any code on it), with adapting that we use a greater F1 score, instead of a lower error.

It was not clearly stated how they actually "slightly modified" the LVW...

In [16]:
def randomSet(size):
    '''
    Returns a subset of available features
    '''  
    number_of_features = random.randint(1, size-1)
    features = np.empty(number_of_features, dtype = int)
    
    i = 0
    while i < number_of_features:
        rand = random.randint(0, size-1)
        if rand not in features:
            features[i] = rand
            i = i + 1
            
    return features  


def LearnAlgo(S1, D_X, D_Y, model):
    '''
    Calculates and returns metrics on given data frame with feature
    subset S1
    '''
    D_X = D_X[D_X.columns[S1]]
    metrics = calculate_metrics(model, D_X, D_Y)
    return metrics
    

def LVW(K, D_X, D_Y, model, output = True):
    '''
    Implementation of the Las Vegas Wrapper, according to the paper
    "Feature Selection and Classification - A probabilistic approach",
    modifed to maximizing F1 instead of minimizing error.
    '''
    k = 0
    C = 100
    metrics = {"precision": 0,
               "recall": 0,
               "F1": 0}
    
    size = D_X.columns.size
    if size == 1: # fix for dataframes with size 1
        return calculate_metrics(model, D_X, D_Y), [0]
    
    while k < K:
        S1 = randomSet(size)
        C1 = S1.size
        metrics_1 = LearnAlgo(S1, D_X, D_Y, model)
        
        if (metrics_1["F1"] > metrics["F1"] or 
            (metrics_1["F1"] == metrics["F1"] and C1 < C)):
            if output:
                print("Current best F1 = " + str(metrics_1["F1"]) + ", size = " + str(C1))
            k = 0
            metrics = metrics_1
            C = C1
            S = S1
        
        k = k + 1

    return metrics, S


### Trys for different data modalities

In [17]:
# Audio
LVW(5, df_X_audio, df_y_audio, KNeighborsClassifier())

Current best F1 = 0.49645909645909647, size = 2
Current best F1 = 0.5722488622488623, size = 1
Current best F1 = 0.5906371406371406, size = 6


(precision    0.603776
 recall       0.601515
 F1           0.590637
 dtype: float64, array([4, 5, 1, 0, 9, 2]))

In [18]:
# Textual
LVW(5, df_X_text, df_y_text, KNeighborsClassifier())

Current best F1 = 0.4361258186258185, size = 580
Current best F1 = 0.5404322257263433, size = 4865


(precision    0.564087
 recall       0.583333
 F1           0.540432
 dtype: float64, array([6619, 5778, 4993, ..., 2135, 6550, 6215]))

In [19]:
# Visual
LVW(5, df_scaled_X_visual, df_y_visual, KNeighborsClassifier())

Current best F1 = 0.5478018278018277, size = 225
Current best F1 = 0.5572133422133421, size = 340
Current best F1 = 0.5599130499130499, size = 361
Current best F1 = 0.6069578569578569, size = 364


(precision    0.643819
 recall       0.621515
 F1           0.606958
 dtype: float64,
 array([ 825,  752,  965,  680, 1347,  533,  639, 1001,  597, 1368, 1060,
         649,  678,  434,  517, 1621,  621,  406,  577, 1575,  777,  628,
         903,  407, 1012, 1409,  921,  748,  929, 1009,  510,   22,  420,
         391, 1396, 1064, 1632,  600, 1529, 1555,  548, 1118,  784,  684,
        1071, 1033,  463,  904, 1408,  961, 1323,  514, 1233, 1339,  475,
        1448, 1363,  584, 1607, 1426, 1457, 1571, 1376, 1437, 1183, 1000,
        1589,  549, 1550,  972, 1058, 1614, 1343,  918, 1643, 1586, 1214,
        1270,  794,   41, 1606, 1403,  703,  753,  884, 1300,  575, 1039,
         952, 1526,  568,  726,  551,  496,  501,  956,  980,  834,  383,
         807, 1576,  488,  837, 1463,  920,  702,  734,  390,   64, 1450,
         482, 1215,   67, 1151, 1580, 1025, 1467,  740,   65, 1374,  943,
        1164, 1635, 1598,  529,  988,  120, 1487, 1069,  616,   85,   73,
         370,  781, 1148, 

In [20]:
# Meta
LVW(5, X_labelencoded_meta, df_y_meta, KNeighborsClassifier())

Current best F1 = 0.44324175824175827, size = 67
Current best F1 = 0.48158952158952156, size = 96
Current best F1 = 0.48950937950937945, size = 99
Current best F1 = 0.5257627557627557, size = 91


(precision    0.542358
 recall       0.537071
 F1           0.525763
 dtype: float64,
 array([  3,  33,  95,  68,  51,  31,  22, 103,  76, 106,  27,  40,  39,
         55, 101,  32,  77, 105,  44,  69,   1,  30,  37,  85,  79, 111,
        107,  73,  96,  10,  75,  58,  70,  42,  17,  92,   7, 104,  25,
         50,  11,  56,  21, 109,  41, 102,  43,  18,  63,  83,  78,  28,
         52,  98,  90,  81,  80,  66, 108,  12,  14,  91,  49,  47,  87,
         62,   2,  23,  36,  84,  86,  67,  99,   6,  71,  93, 112, 110,
         34, 100,  64,  65,  16,  48,  24,  94,   9,  45,  88,  97,  35]))

### Application of LVW on selected combinations

No information about seed and so on, just that in each run there are different subsets and therefore different results (we set a seed here for enabling reproduction). I stored the features we use (from our combinations and from them of the paper) into files so we dont have to execute this code every time.

In [21]:

def str2Class(str):
    if str == "KNeighborsClassifier" or str == "NearestCentroid" or str == "GaussianNB":
        return getattr(sys.modules[__name__], str)()
    return getattr(sys.modules[__name__], str)(random_state = 123)



def run_LVW_Selected_Combinations(K):
    
    df_final_results_lvw = pd.DataFrame()
    df_x = pd.DataFrame()
    df_y = pd.DataFrame()

    random.seed(123)
    
    for index, row in df_r.iterrows():
        model = str2Class(index)
        print(getModelName(model) + " - " + row["Modality"])

        # get correct data frame
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
            df_y = df_y_meta
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
            df_y = df_y_visual

        elif row["Modality"] == "textual":
            df_x = df_X_text
            df_y = df_y_text
        else:
            df_x = df_X_audio
            df_y = df_y_audio

        # Calculate and append LVW metrics
        metrics = pd.DataFrame()
        S = []
        if getModelName(model) == "RandomForestClassifier":
            print("skipping random forest..")
            # evaluate random forest withot LVW since it already performs feature selection
            random.seed(123)
            m = calculate_metrics(model, df_x, df_y)
        else:
            # feature selection metrics
            random.seed(123)
            m, S = LVW(K, df_x, df_y, model, False)
        print(m)
        features = ','.join(map(str, S))
        print(features)
        metrics[getModelName(model)] = m
        metrics = metrics.T
        metrics["Modality"] = row["Modality"]
        metrics["Features"] = features

        df_final_results_lvw = df_final_results_lvw.append(metrics)
    
    # Results with feature selection
    #display(df_final_results_lvw)#[df_final_results_lvw.columns.difference(["Features"])])

    # save final data frame (TODO: uncomment when rerunning LVW)
    df_final_results_lvw.to_csv('./data/results.csv', sep=';', encoding='utf-8')


### Final results
Some of the results got better, some got worse since we only use a feature subspace where the fact if the full data is tried out is just random.

In [22]:
# TODO: Uncomment if you want to rerun LVW!!!
# max number of runs for finding better LVW combinations -> better results, longer runtime with higher K   
run_LVW_Selected_Combinations(200)

DecisionTreeClassifier - metadata
precision    0.671380
recall       0.638788
F1           0.630269
dtype: float64
48,37,31,85,12,34,5,73,46,67,4,113,97,94,7,102,6,38,41,58,106
KNeighborsClassifier - textual
precision    0.683421
recall       0.669091
F1           0.638991
dtype: float64
331,611,5401,2314,7577,3479,6974,1766,2594,2047,1628,632,6457,4275,6055,346,2754,2096,3988,2706,4439,7817,1162,3921,1457,5992,1406,310,6019,2851,4571,4166,362,2433,6633,1947,7398,6157,360,900,2312,254,1095,6340,6413,1403,7440,1596,1984,6828,3626,4359,7944,2845,6578,87,7297,7346,6568,115,6796,1227,7291,6096,4544,5790,4944,4608,7136,352,4667,2696,4126,658,316,1165,7613,5857,5920,5172,5335,197,4634,5607,5589,1142,2418,1568,7237,2680,662,424,7970,3427,3537,6656,933,3731,275,6564,7842,6403,3937,3763,6137,4226,3302,6087,2854,334,7873,1256,4354,305,277,936,124,340,429,1896,99,7909,6617,4659,1276,6562,1334,7305,5491,2258,3582,6898,3715,4222,5604,5654,3022,4186,3044,3762,433,1780,6891,3736,5873,3438,5381,6482,6

precision    0.678831
recall       0.678182
F1           0.645215
dtype: float64
3453,5928,3612,2297,7986,4282,2328,5127,6405,7489,1293,5675,528,4951,7258,4653,1375,561,1497,5328,4312,5581,7327,642,5810,3620,5332,3376,7732,3823,4211,2861,2554,3004,4554,6112,4530,4913,4866,4945,2085,4053,5101,4712,4668,7348,4954,3199,3110,2662,6608,1225,6343,4897,4341,7336,5131,2964,3863,2269,1073,5305,400,7527,7695,3474,2032,7243,5198,1895,6840,4820,857,978,2133,3597,2289,1976,6581,2891,1525,4071,5717,6327,3727,3099,6156,2439,4555,3184,2868,3947,635,4234,6360,2876,6530,2050,6312,1910,3886,2953,769,5823,835,6121,7238,3699,6877,2632,92,596,4728,2280,3173,3432,701,5446,3956,5014,2613,4868,6345,1643,2622,3605,1019,2401,4363,3133,5335,5289,7431,5890,1150,6150,1406,3564,434,7944,2199,8009,873,2421,2321,2161,3520,5395,2314,2760,3105,1466,1793,2383,3658,4617,4216,5878,2241,1787,1634,6650,1732,2558,5960,2264,6991,2870,2073,3839,3743,2414,2968,4999,2190,7089,7627,489,6265,5214,4968,3411,2550,3774,3977,2061,3013,

precision    0.670996
recall       0.652121
F1           0.628818
dtype: float64
2466,5583,5459,6393,7244,7179,7166,6008,6781,4926,7786,6736,3637,5561,2476,6768,6793,6716,4015,2230,6969,7987,4699,2352,7215,5041,5030,3514,6859,3672,5298,3741,3387,5347,7171,3626,6099,4670,6269,7880,7121,3971,5350,7229,7956,7133,2188,3485,4099,2996,4541,7857,6775,6757,2637,7794,4737,5056,6790,4215,5104,3234,5147,7209,4370,3046,6802,6778,4462,2824,6666,3240,3792,7950,3932,5642,6892,5809,4191,7322,5868,3256,3835,4002,4840,4244,6024,6993,5336,5696,6900,7507,4556,4626,4470,2266,4618,4267,5590,7387,4325,2895,7271,2359,3312,5832,4961,6682,6267,2373,5792,5633,7928,5942,3071,6729,3871,2872,7945,5655,3809,4235,3687,5399,2959,3704,7448,5419,3763,3522,4767,5505,2725,2718,5906,4269,3508,4798,2251,5737,7356,7143,7615,5150,7949,3340,2692,4041,2391,3449,4135,6420,2187,3782,4491,2381,4096,2605,7593,6770,6179,6879,4268,2481,5353,7427,4240,4110,5828,6197,5489,2626,7275,7024,7552,6946,6078,3888,5067,3341,5471,2816,5255,3003

precision    0.750310
recall       0.713434
F1           0.681780
dtype: float64
3504,5486,6500,3875,5280,7873,7292,6392,5474,776,6294,3789,5055,4044,6788,2285,7091,2697,7395,7839,5559,4831,533,6727,7332,1085,6049,4692,7173,2221,1521,1048,2299,6682,6708,1406,4630,6685,5554,5910,1956,7187,4226,2769,3559,5804,1440,1120,6744,1900,5791,3094,7034,370,979,7713,7052,5392,6496,5220,2525,5803,1892,2466,2752,6584,4132,6046,5687,7527,6099,5893,3495,1751,2374,6135,3613,5339,7081,1193,3470,7454,6646,843,2348,4977,1675,6769,5378,7486,1908,6064,6524,4743,1143,3030,7111,5859,3176,7987,3415,1255,4936,3469,7267,5135,3056,4174,5052,4960,4055,6789,2272,1781,4846,6980,2550,3639,4081,5146,7008,6460,7865,2655,1274,4885,5204,2333,7336,6189,7670,4713,4567,522,2693,2326,6951,7573,8014,7620,927,7112,991,2362,6840,7517,328,446,3505,6642,6072,5543,761,3310,1642,6325,7976,4018,3463,7266,1712,2581,2479,610,4925,3160,2472,5154,4571,2501,6268,3043,6182,5137,7100,6174,990,6874,1806,1174,7535,1002,1041,3796,3019,1608,73

precision    0.686145
recall       0.683030
F1           0.675815
dtype: float64
7044,4170,7633,7114,4917,6397,6408,4094,4997,7178,7152,5965,2273,7024,5703,7980,7572,4660,3783,5937,3116,2824,2542,6976,6905,5967,2939,4403,6306,7271,7530,7528,6606,3893,5038,3333,6694,6410,4145,2670,5577,3403,7464,4340,5579,5809,2306,6902,7641,5552,3774,6063,2845,2638,2969,6039,5694,7858,3613,5478,3074,6243,2860,4011,3946,2867,6637,2780,4837,2522,4684,2409,3942,5402,4039,5177,6487,5114,3218,6845,3299,5805,7073,5380,7267,7675,69,2644,4720,2307,6961,3098,2977,6445,3536,7428,2453,2558,7626,6399,5042,5688,7256,3664,7205,4574,5670,5509,3877,3180,3647,7994,6076,4503,2394,7302,2880,3634,3422,7420,3870,8031,6503,7047,3892,6151,3958,7967,7120,2596,4477,6320,3884,2737,3376,7194,5379,4561,5398,4543,110,6652,4875,3975,6474,3475,4744,7973,6107,6523,5581,5568,6178,49,2203,4595,3077,4291,7486,7779,4276,3669,6514,7220,5615,6074,6824,5239,4455,6959,2725,5523,5546,4018,2778,7221,6055,2714,3523,6960,2439,4803,5723,4003,6135

precision    0.720844
recall       0.697273
F1           0.692278
dtype: float64
548,178,1574,834,545,220,776,1098,1151,680,697,326,276,690,1148,683,1437,502,335,3,893,1584,179,1223,773,143,13,646,1499,918,208,189,1366,291,258,1618,597,881,1174,976,543,961,624,703,1068,1641,989,423,1247,1308,1079,1156,645,25,815,1575,1323,1050,889,1402,1102,1307,1631,1365,1221,995,1066,1369,859,767,1044,65,1544,1321,371,1486,169,997,1347,536,348,685,1135,801,150,1480,935,793,1173,1594,688,33,387,1200,180,1286,744,26,720,469,830,1089,1026,1482,122,589,1013,1565
NearestCentroid - visual
precision    0.653089
recall       0.628687
F1           0.617516
dtype: float64
1383,659,33,1233,1573,62,567,613,77,256,1266,653,1077,1446
LogisticRegression - visual
precision    0.643304
recall       0.629495
F1           0.620596
dtype: float64
275,583,1519,192,942,451,735,1366,514,492,1574,561,714,602,812,1648,456,1489,499,422,177,494,635,439,335,626,592,1014,1289,1226,822,928,590,1291,756,1342,1487,849,1564,51,848,6

precision    0.698989
recall       0.661717
F1           0.647468
dtype: float64
1191,748,696,426,167,366,494,192,283,1505,1016,1474,765,570,909,888,1078,90,889,622,1484,1251,1270,24,1140,205,1223,1624,557,1284,1073,135,1300,432,1273,541,179,710,12,789,914,1084,138,1520,150,1224,220,71,1055,869,1346,407,137,617,1371,625,1595,1258,284,293,1138,1172,339,1189,1568,140,1031,107,1193,674,115,1376,1599,1077,1296,1290,279,1012,676,1405,965,520,460,1637,282,528,1471,878,1585,1280,1205,1384,1342,558,1307,1118,1115,1201,243,320,29,384,1490,434,538,173,916,1218,652,183,1407,251,825,1158,870,603,1498,92,343,324,592,1003,1037,253,247,876,163,352,615,707,133,610,1250,605,1522,971,681,1204,81,236,1309,585,657,292,390,1192,1453,521,350,154,34,93,951,1060,1534,1538,1423,750,309,862,325,265,992,942,87,816,1427,331,96,1174,866,15,1252,989,647,354,1107,1149,201,359,1308,988,44,903,530,633,1542,1648,397,1600,511,1267,377,1122,732,1064,1472,744,431,1061,1377,1470,391,1446,930,1572,188,222,157,80,830,1019,12

In [23]:
# Previous results
display(df_r)

# Access saved data from function run_LVW_Selected_Combinations()
test_read = pd.read_csv('./data/results.csv', sep=';', encoding='utf-8', header = 0, index_col = 0)
test_read

Unnamed: 0,precision,recall,F1,Modality
DecisionTreeClassifier,0.555659,0.547273,0.539899,metadata
KNeighborsClassifier,0.506534,0.53303,0.512809,textual
NearestCentroid,0.555174,0.555253,0.531696,textual
DecisionTreeClassifier,0.522597,0.536869,0.511263,textual
BaggingClassifier,0.529374,0.56798,0.523605,textual
GradientBoostingClassifier,0.65407,0.629697,0.58193,textual
GaussianNB,0.537073,0.558182,0.538881,textual
KNeighborsClassifier,0.558343,0.56,0.544837,visual
NearestCentroid,0.576882,0.539596,0.508983,visual
LogisticRegression,0.513166,0.509394,0.501544,visual


Unnamed: 0,precision,recall,F1,Modality,Features
DecisionTreeClassifier,0.67138,0.638788,0.630269,metadata,"48,37,31,85,12,34,5,73,46,67,4,113,97,94,7,102..."
KNeighborsClassifier,0.683421,0.669091,0.638991,textual,"331,611,5401,2314,7577,3479,6974,1766,2594,204..."
NearestCentroid,0.678831,0.678182,0.645215,textual,"3453,5928,3612,2297,7986,4282,2328,5127,6405,7..."
DecisionTreeClassifier,0.681568,0.648889,0.641436,textual,"4847,6571,3430,6108,3467,7943,2589,1543,4615,5..."
BaggingClassifier,0.670996,0.652121,0.628818,textual,"2466,5583,5459,6393,7244,7179,7166,6008,6781,4..."
GradientBoostingClassifier,0.75031,0.713434,0.68178,textual,"3504,5486,6500,3875,5280,7873,7292,6392,5474,7..."
GaussianNB,0.686145,0.68303,0.675815,textual,"7044,4170,7633,7114,4917,6397,6408,4094,4997,7..."
KNeighborsClassifier,0.720844,0.697273,0.692278,visual,"548,178,1574,834,545,220,776,1098,1151,680,697..."
NearestCentroid,0.653089,0.628687,0.617516,visual,"1383,659,33,1233,1573,62,567,613,77,256,1266,6..."
LogisticRegression,0.643304,0.629495,0.620596,visual,"275,583,1519,192,942,451,735,1366,514,492,1574..."


### LVW Feature selection on the classifiers of the paper
Since we have other classifiers as an output, I decided to also use the combinations stated in the paper, that should be also nice to compare them.

In [24]:

def run_LVW_Paper_Combinations(K):
    
    paper_combinations_modality = ['metadata', 'metadata', 'metadata','metadata', 'metadata',
                                   'metadata', 'metadata', 'metadata', 'metadata', 'textual',
                                   'textual', 'textual', 'visual', 'visual', 'visual', 'visual', 
                                   'visual', 'visual', 'visual', 'audio', 'audio']
    paper_combinations_classifier = ['KNeighborsClassifier', 'NearestCentroid', 'DecisionTreeClassifier','LogisticRegression', 
                                     'SVC', 'BaggingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 
                                     'GradientBoostingClassifier', 'GaussianNB', 'KNeighborsClassifier', 'SVC', 
                                     'KNeighborsClassifier', 'DecisionTreeClassifier', 'LogisticRegression', 'SVC', 
                                     'RandomForestClassifier', 'AdaBoostClassifier', 'GradientBoostingClassifier', 
                                     'LogisticRegression', 'GradientBoostingClassifier']

    paper_combination_score = pd.DataFrame()
    paper_combination_score["Modality"] = pd.Series(paper_combinations_modality)
    paper_combination_score["Classifier"] = paper_combinations_classifier
    paper_combination_score["Features"] = pd.Series()
    paper_combination_score.set_index(["Classifier"], inplace = True)

    df_final_results_paper = pd.DataFrame()
    df_x = pd.DataFrame()
    df_y = pd.DataFrame()


    random.seed(123)

    for index, row in paper_combination_score.iterrows():
        model = str2Class(index)
        print(getModelName(model) + " - " + row["Modality"])

        # get correct data frame
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
            df_y = df_y_meta
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
            df_y = df_y_visual

        elif row["Modality"] == "textual":
            df_x = df_X_text
            df_y = df_y_text
        else:
            df_x = df_X_audio
            df_y = df_y_audio

        # Calculate and append LVW metrics
        metrics = pd.DataFrame()
        S = []
        if getModelName(model) == "RandomForestClassifier":
            print("skipping random forest..")
            # evaluate random forest withot LVW since it already performs feature selection
            random.seed(123)
            m = calculate_metrics(model, df_x, df_y)
        else:
            # feature selection metrics
            random.seed(123)
            m, S = LVW(K, df_x, df_y, model, False)
        print(m)
        features = ','.join(map(str, S))
        print(features)
        metrics[getModelName(model)] = m
        metrics = metrics.T
        metrics["Modality"] = row["Modality"]
        metrics["Features"] = features

        df_final_results_paper = df_final_results_paper.append(metrics) 

    F1_paper = [0.630, 0.591, 0.563, 0.578, 0.574, 0.631, 0.576, 0.536, 0.569, 0.702, 0.666, 0.707,
                0.608, 0.535, 0.608, 0.580, 0.638, 0.654, 0.587, 0.546, 0.587]

    df_final_results_paper["F1 Paper"] = F1_paper
    df_final_results_paper["Difference"] =  df_final_results_paper["F1 Paper"] - df_final_results_paper["F1"]
    #display(df_final_results_paper)

    # save final data frame 
    df_final_results_paper.to_csv('./data/results_paper.csv', sep=';', encoding='utf-8')
    

In [25]:
# TODO: Uncomment if you want to rerun LVW!!!
# max number of runs for finding better LVW combinations -> better results, longer runtime with higher K   
run_LVW_Paper_Combinations(200)

KNeighborsClassifier - metadata
precision    0.617213
recall       0.599495
F1           0.589546
dtype: float64
69,63,97,2,79,106,108,110,98,83,105,65,77,71,75,84,70,104,96,3,89,100,99,101,94,85,72,5,17,95,91,7,23,90,13,102,28,34,87,11,27,29,107,20,1,80,41,113,26,22,6,8,81,24,42,32,68,10,86,38,37,18,67
NearestCentroid - metadata
precision    0.574361
recall       0.615758
F1           0.539099
dtype: float64
106,48,96,44,25
DecisionTreeClassifier - metadata
precision    0.630174
recall       0.621919
F1           0.619452
dtype: float64
91,99,102,85,97,86,76,74,103,0,95,84,80,90,79,13,89,1,100,109,107,101,87,4,113,98,17,10,5,9,83,21,92,93,8,3,78,96,88,23,32,2,77,36,15,31,44,14,26,6,24,82,37,104,48,33,7,27,81,47,30,94,50,61,18,46,65,64,43,111,75,19,68,39
LogisticRegression - metadata
precision    0.653385
recall       0.637778
F1           0.596167
dtype: float64
96,31,56,2,64,27,61,42,70,88,94,63,74,85,79,30,14,54,13,59,8,84,95,44,108
SVC - metadata
precision    0.623069
recall       

precision    0.697266
recall       0.673030
F1           0.641974
dtype: float64
2226,2799,3845,4879,5395,6547,3710,7337,7891,2098,5694,7388,1225,1008,6674,73,7701,5579,6089,2173,6080,4320,5221,7839,2984,4117,4840,6758,8016,1354,6209,2442,6692,5458,1306,7497,5583,4733,4867,4951,3007,310,2502,4046,4016,4639,4572,6748,7194,7734,7216,6589,7915,5680,727,3976,7602,4357,4561,4440,7508,7171,1492,5017,6092,1689,6742,242,5643,7776,7477,4173,2072,5541,8017,6786,3717,4104,3984,3716,7953,7144,6321,692,6660,929,7530,4973,3901,2608,4530,4383,1212,4709,6310,3607,6502,5852,5267,688,5671,4373,6926,4450,4559,7932,6850,7140,3307,7672,38,2928,5561,7627,5113,4939,4949,6834,5718,6581,2713,5404,5871,6045,1769,5768,4687,3674,1744,5149,4717,2233,3292,352,3291,5748,3946,6132,6783,6958,2649,746,3128,7305,5180,5698,5927,7395,5571,7606,7568,7269,7759,7185,3025,1723,6720,320,7246,4809,5161,4378,5278,5126,7975,7379,6344,4265,17,2930,12,4012,7034,6250,5106,3165,1658,4389,4835,3645,4248,4990,6819,6983,3118,5183,4465,5

precision    0.300554
recall       0.547980
F1           0.388116
dtype: float64
2762,3927,6379,5249,7978,811,7915,5314,338,4628,6861,6578,154,1424,3155,790,4649,3670,6540,4785,6345,1160,5729,7023,7698,4520,5560,2050,934,1117,399,1919,2347,6068,2010,4742,8029,7970,6238,5024,1727
KNeighborsClassifier - visual
precision    0.720844
recall       0.697273
F1           0.692278
dtype: float64
548,178,1574,834,545,220,776,1098,1151,680,697,326,276,690,1148,683,1437,502,335,3,893,1584,179,1223,773,143,13,646,1499,918,208,189,1366,291,258,1618,597,881,1174,976,543,961,624,703,1068,1641,989,423,1247,1308,1079,1156,645,25,815,1575,1323,1050,889,1402,1102,1307,1631,1365,1221,995,1066,1369,859,767,1044,65,1544,1321,371,1486,169,997,1347,536,348,685,1135,801,150,1480,935,793,1173,1594,688,33,387,1200,180,1286,744,26,720,469,830,1089,1026,1482,122,589,1013,1565
DecisionTreeClassifier - visual
precision    0.692088
recall       0.683939
F1           0.681861
dtype: float64
43,320,1488,879,1472,1531,3

In [26]:
# Accessing saved data (saves time since recompiling takes long)
test_read_paper = pd.read_csv('./data/results_paper.csv', sep=';', encoding='utf-8', header = 0, index_col = 0)
test_read_paper[["Modality", "precision", "recall", "F1", "F1 Paper", "Difference"]]

Unnamed: 0,Modality,precision,recall,F1,F1 Paper,Difference
KNeighborsClassifier,metadata,0.617213,0.599495,0.589546,0.63,0.040454
NearestCentroid,metadata,0.574361,0.615758,0.539099,0.591,0.051901
DecisionTreeClassifier,metadata,0.630174,0.621919,0.619452,0.563,-0.056452
LogisticRegression,metadata,0.653385,0.637778,0.596167,0.578,-0.018167
SVC,metadata,0.623069,0.611717,0.599093,0.574,-0.025093
BaggingClassifier,metadata,0.595123,0.580202,0.574685,0.631,0.056315
RandomForestClassifier,metadata,0.342068,0.372323,0.35197,0.576,0.22403
AdaBoostClassifier,metadata,0.626233,0.615354,0.585141,0.536,-0.049141
GradientBoostingClassifier,metadata,0.647344,0.656667,0.609539,0.569,-0.040539
GaussianNB,textual,0.641715,0.640808,0.630979,0.702,0.071021


Some, like Support Vector Machines with radial kernel for textual data differ a lot! (0.318884 worse!)

# 3.3 Classifier stacking

## Majority Voting
This is the simplest case, where we select classifiers and feature subspaces through the steps above, and assign final predicted labels through majority voting on the labels of the 21 classifiers.

First we prepare a data frame containing our predictions that we have obtained by cv, so that we do not need to build it again for classifier stacking.

In [27]:
predictions_df = pd.DataFrame()
for i in range(0, 21):
    predictions_df[i] = pd.Series([0]*df_X_text.shape[0])
predictions_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Majority voting (CV)

In [28]:
classifier_combinations = pd.read_csv('./data/results_paper.csv', sep=';', encoding='utf-8', 
                                      header = 0, index_col = 0)[["Modality", "Features"]]
df_y = df_y_text

kf = KFold(n_splits = 10, random_state = 123)
           
random.seed(123)

f1_scores_voting = [0]*10
recall = 0
precision = 0

k = 0
for train_index, test_index in kf.split(df_X_text):
    predicitons = [0] * len(test_index)  
    
    i = 0
    for index, row in classifier_combinations.iterrows():
        model = str2Class(index)       
        
        # get correct data frame
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
        elif row["Modality"] == "textual":
            df_x = df_X_text
        else:
            df_x = df_X_audio
        
        
        # Calculate and append LVW metrics
        if getModelName(model) == "RandomForestClassifier":
            # evaluate random forest without selected features since it already performs feature selection
            mod = model.fit(df_x.loc[train_index, :], df_y[train_index])
            pred = mod.predict(df_x.iloc[test_index, :])
            
        else:
            # get results with features from LVW        
            features = [int(i) for i in row["Features"].split(",")]
            df_x_features = df_x[df_x.columns[features]]
            
            # Predicitions
            mod = model.fit(df_x_features.loc[train_index, :], df_y[train_index])
            pred = mod.predict(df_x_features.iloc[test_index, :])    
        
        # Add results to final label data set
        for j, p in enumerate(pred):
            predictions_df.loc[j + test_index[0] , i] = p
            
        predicitons = predicitons + pred
        i = i + 1
            
    predictions_majority = pd.Series(predicitons).map(lambda x: 0 if x < classifier_combinations.shape[0]/2 else 1)
    
    # Save F1 scores for significance testing later on
    f1_scores_voting[k] = f1_score(df_y[test_index], predictions_majority)
    k = k + 1
    
    recall = recall + recall_score(df_y[test_index], predictions_majority)
    precision = precision + precision_score(df_y[test_index], predictions_majority)
    
    
    
    
print("Precision: " + str(precision/10))
print("Recall: " + str(recall/10))
print("F1 score: " + str(np.mean(f1_scores_voting)))

print("F1 scores by CV: " + str(f1_scores_voting))

Precision: 0.6220238095238095
Recall: 0.8323809523809524
F1 score: 0.7085983624218918
F1 scores by CV: [0.75, 0.7692307692307692, 0.8235294117647058, 0.7142857142857143, 0.8571428571428571, 0.7692307692307693, 0.8333333333333333, 0.7692307692307693, 0.8000000000000002, 0.0]


### Majority voting (test)
Here we fit the models for the training data, and obtain test performance also by using 10-fold CV, like mentioned in the paper.

We did the CV for the training data to check its robustness against different splits, because somehow it makes no sense to split the test data at all.

In [29]:
# create prediction data frame for label stacking test
predictions_test_df = pd.DataFrame()
for i in range(0, 21):
    predictions_test_df[i] = pd.Series([0]*df_X_text_test.shape[0])
predictions_test_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df_y_test = df_y_meta_test

f1_scores_voting_test = [0]*10
recall = 0
precision = 0

k = 0
# use k-fold cv for training data and apply it on test data
for train_index, test_index in kf.split(df_X_text):
    predicitons = [0] * df_y_test.shape[0] 
    
    i = 0
    for index, row in classifier_combinations.iterrows():
        model = str2Class(index)       
        
        # get correct data frames
        if row["Modality"] == "metadata":
            df_x = X_labelencoded_meta
            df_x_test = X_labelencoded_meta_test
        elif row["Modality"] == "visual":
            df_x = df_scaled_X_visual
            df_x_test = df_scaled_X_visual_test
        elif row["Modality"] == "textual":
            df_x = df_X_text
            df_x_test = df_X_text_test
        else:
            df_x = df_X_audio
            df_x_test = df_X_audio_test
        
        
        # Calculate and append LVW metrics
        if getModelName(model) == "RandomForestClassifier":
            # evaluate random forest without selected features since it already performs feature selection
            # fit on training data
            mod = model.fit(df_x.loc[train_index, :], df_y[train_index])
            # evaluation for test data
            pred = mod.predict(df_x_test)
            
        else:
            # get results with features from LVW        
            features = [int(i) for i in row["Features"].split(",")]
            df_x_features = df_x[df_x.columns[features]]
            
            # Fit model for CV of train data
            mod = model.fit(df_x_features.loc[train_index, :], df_y[train_index])
            df_x_features_test = df_x_test[df_x_test.columns[features]]
            
            # Predicitions for test data
            pred = mod.predict(df_x_features_test) 
            
        # Add results to final label data set
        for j, p in enumerate(pred):
            predictions_test_df.loc[j , i] = p
            
        predicitons = predicitons + pred
        i = i + 1

    predictions_majority = pd.Series(predicitons).map(lambda x: 0 if x < classifier_combinations.shape[0]/2 else 1)
    f1_scores_voting_test[k] = f1_score(df_y_test, predictions_majority)
    k = k + 1

    recall = recall + recall_score(df_y_test, predictions_majority)
    precision = precision + precision_score(df_y_test, predictions_majority)
    
print("Precision: " + str(precision/10))
print("Recall: " + str(recall/10))
print("F1 score: " + str(np.mean(f1_scores_voting_test)))
print("F1 scores by CV: " + str(f1_scores_voting_test))

Precision: 0.5743226836743283
Recall: 0.7533333333333332
F1 score: 0.651460103256611
F1 scores by CV: [0.6495176848874598, 0.6184210526315789, 0.6229508196721312, 0.6516129032258063, 0.6491803278688525, 0.6855345911949686, 0.6472491909385113, 0.6561514195583595, 0.6451612903225806, 0.688821752265861]


## Label Stacking
Assume we have n instances and T base classifiers, then we can generate an n by T matrix consisting of predictions (labels) given by each classifier. Label combining strategy tries to build a second-level classifier based on this label matrix, and return a final prediction result for that.

### Label Stacking (cv)
Here we use the prediction data frame obtained by the predictions already done in the previous section. The best one is obtained by NearestCentroid as a meta learner, with an F1 score of 0.744057.

In [31]:
def calculate_metrics_extended(clf, X, y):
    metric =  cross_validate(clf, X, y, scoring=('precision_weighted','recall_weighted','f1_weighted'), 
                             return_train_score=False, cv=10)  
    return pd.Series({'precision':metric['test_precision_weighted'].mean(),'recall':metric['test_recall_weighted'].mean(),
                      'F1':metric['test_f1_weighted'].mean(),
                     'F1 scores': metric['test_f1_weighted']})


def evaluate_models_extended(X, y):
    metrics = pd.DataFrame()

    for model in model_list:
        m = calculate_metrics_extended(model, X, y)
        metrics[getModelName(model)] = m

    return metrics.T

In [32]:
display(predictions_df.head(2))

label_stack = evaluate_models_extended(predictions_df, df_y)
label_stack[["precision", "recall", "F1"]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1,1,1,1,1,0,1,1,1,1,...,1,1,0,0,1,0,1,1,1,1
1,0,1,1,1,1,1,1,1,1,1,...,1,0,1,1,0,0,0,0,0,0


Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.716673,0.704343,0.69623
NearestCentroid,0.763656,0.746566,0.744057
DecisionTreeClassifier,0.540813,0.539798,0.529899
LogisticRegression,0.676554,0.660808,0.649405
SVC,0.715491,0.706162,0.700039
BaggingClassifier,0.647631,0.639798,0.636934
AdaBoostClassifier,0.671207,0.65697,0.6498
GradientBoostingClassifier,0.695963,0.671212,0.658197
RandomForestClassifier,0.675529,0.651717,0.644557
GaussianNB,0.7431,0.725455,0.721316


### Label Stacking (test)
We obtained the best results with the AdaBoostClassifier, with an F1 score of 0.637919.

In [33]:
display(predictions_test_df.head(2))


label_stack_scores = pd.DataFrame()
label_stack_scores["Classifier"] = pd.Series()
label_stack_scores["Precision"] = pd.Series()
label_stack_scores["Recall"] = pd.Series()
label_stack_scores["F1"] = pd.Series()
label_stack_scores["F1 scores"] = pd.Series()

for model in model_list:
    f1 = [0]*10
    recall = [0]*10
    precision = [0]*10
    i = 0
    for train_index, test_index in kf.split(predictions_df):
        # Fit model for CV of train data
        mod = model.fit(predictions_df.loc[train_index, :], df_y[train_index])
        # Predicitions for test data
        pred = mod.predict(predictions_test_df)
        
        f1[i] = f1_score(df_y_test, pred)
        recall[i] = recall_score(df_y_test, pred)
        precision[i] = precision_score(df_y_test, pred)
        i = i + 1
    
    label_stack_scores = label_stack_scores.append(pd.Series({"Classifier": getModelName(model),
                                        "Precision": np.mean(precision),
                                        "Recall": np.mean(recall),
                                        "F1": np.mean(f1),
                                        "F1 scores": f1}), 
                              ignore_index = True)
    
label_stack_scores[["Classifier", "Precision", "Recall", "F1"]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1,1,0,0,1,0,0,0,0,0,...,1,0,1,0,0,1,0,1,1,1
1,1,1,0,1,1,0,0,0,0,1,...,1,1,0,1,1,1,0,1,1,1


Unnamed: 0,Classifier,Precision,Recall,F1
0,KNeighborsClassifier,0.59207,0.668889,0.627959
1,NearestCentroid,0.57521,0.574815,0.574891
2,DecisionTreeClassifier,0.601738,0.651852,0.62473
3,LogisticRegression,0.598634,0.652593,0.623804
4,SVC,0.572556,0.606667,0.588034
5,BaggingClassifier,0.609175,0.553333,0.578541
6,AdaBoostClassifier,0.598354,0.684444,0.637919
7,GradientBoostingClassifier,0.602636,0.651111,0.625254
8,RandomForestClassifier,0.600504,0.520741,0.556993
9,GaussianNB,0.543287,0.54963,0.546341


## Label-Attribute Stacking
Similar to label stacking, label-feature stacking strategy uses both base-classifier predictions and features as training data to predict output.

For that reason, we merge the prediction labels obtained by the test data and merge them with all features.

In [34]:
# Adapt training data
new_cols = [0]*predictions_df.columns.size
for i, val in enumerate(predictions_df.columns):
    new_cols[i] = "l"+str(i)
predictions_df.columns = new_cols

label_feature_train = predictions_df.join(df_X_text) \
                                        .join(X_labelencoded_meta, rsuffix='_meta') \
                                        .join(df_X_audio) \
                                        .join(df_scaled_X_visual, rsuffix='vis') \

display(label_feature_train.head(2))


# Adapt test data
new_cols = [0]*predictions_test_df.columns.size
for i, val in enumerate(predictions_test_df.columns):
    new_cols[i] = "l"+str(i)
predictions_test_df.columns = new_cols

label_feature_test = predictions_test_df.join(df_X_text_test) \
                                        .join(X_labelencoded_meta_test, rsuffix='_meta') \
                                        .join(df_X_audio_test) \
                                        .join(df_scaled_X_visual_test, rsuffix='vis') \

display(label_feature_test.head(2))

Unnamed: 0,l0,l1,l2,l3,l4,l5,l6,l7,l8,l9,...,"(821, 0)","(821, 1)","(822, 0)","(822, 1)","(823, 0)","(823, 1)","(824, 0)","(824, 1)","(825, 0)","(825, 1)"
0,1,1,1,1,1,0,1,1,1,1,...,2.9504,3.2167,91672.0,81373.0,22207.0,21045.0,26201.0,24225.0,14542.0,13529.0
1,0,1,1,1,1,1,1,1,1,1,...,21.189,21.182,81665.0,83171.0,13672.0,13714.0,32531.0,32774.0,13753.0,13780.0


Unnamed: 0,l0,l1,l2,l3,l4,l5,l6,l7,l8,l9,...,"(821, 0)","(821, 1)","(822, 0)","(822, 1)","(823, 0)","(823, 1)","(824, 0)","(824, 1)","(825, 0)","(825, 1)"
0,1,1,0,0,1,0,0,0,0,0,...,13.606,13.606,38450.0,38440.0,11844.0,11847.0,23975.0,23988.0,13069.0,13071.0
1,1,1,0,1,1,0,0,0,0,1,...,8.6886,8.6112,13428.0,14415.0,2086.1,2519.3,5285.1,6909.0,2510.7,2963.8


### Label Attribute Stacking (CV)

Best result: RandomForestClassifier with 0.642249.

In [35]:
label_feature_stack_cv = evaluate_models_extended(label_feature_train, df_y)
label_feature_stack_cv[["precision", "recall", "F1"]]

Unnamed: 0,precision,recall,F1
KNeighborsClassifier,0.546787,0.542929,0.529648
NearestCentroid,0.405332,0.473333,0.405324
DecisionTreeClassifier,0.494425,0.506162,0.488474
LogisticRegression,0.535126,0.543737,0.528366
SVC,0.398547,0.55798,0.4225
BaggingClassifier,0.597849,0.556869,0.541559
AdaBoostClassifier,0.566269,0.562929,0.553006
GradientBoostingClassifier,0.514057,0.513535,0.502999
RandomForestClassifier,0.67399,0.649697,0.642249
GaussianNB,0.503584,0.518586,0.484453


### Label Attribute Stacking (Test)

Best result: 0.650156 from SVC.

In [36]:
label_feature_stack_scores = pd.DataFrame()
label_feature_stack_scores["Classifier"] = pd.Series()
label_feature_stack_scores["Precision"] = pd.Series()
label_feature_stack_scores["Recall"] = pd.Series()
label_feature_stack_scores["F1"] = pd.Series()
label_feature_stack_scores["F1 scores"] = pd.Series()

for model in model_list:
    f1 = [0]*10
    recall = [0]*10
    precision = [0]*10
    i = 0
    for train_index, test_index in kf.split(label_feature_train):
        # Fit model for CV of train data
        mod = model.fit(label_feature_train.loc[train_index, :], df_y[train_index])
        # Predicitions for test data
        pred = mod.predict(label_feature_test)
        
        f1[i] = f1_score(df_y_test, pred)
        recall[i] = recall_score(df_y_test, pred)
        precision[i] = precision_score(df_y_test, pred)
        i = i + 1
    
    label_feature_stack_scores = label_feature_stack_scores.append(pd.Series({"Classifier": getModelName(model),
                                        "Precision": np.mean(precision),
                                        "Recall": np.mean(recall),
                                        "F1": np.mean(f1),
                                        "F1 scores": f1}), 
                              ignore_index = True)
    
label_feature_stack_scores[["Classifier", "Precision", "Recall", "F1"]]

Unnamed: 0,Classifier,Precision,Recall,F1
0,KNeighborsClassifier,0.591511,0.554074,0.571189
1,NearestCentroid,0.591594,0.555556,0.547353
2,DecisionTreeClassifier,0.606563,0.592593,0.595649
3,LogisticRegression,0.596453,0.522222,0.556697
4,SVC,0.613331,0.799259,0.650156
5,BaggingClassifier,0.623167,0.575556,0.597043
6,AdaBoostClassifier,0.585324,0.602963,0.59325
7,GradientBoostingClassifier,0.601075,0.674074,0.634128
8,RandomForestClassifier,0.609779,0.526667,0.563505
9,GaussianNB,0.568302,0.595556,0.574522


# Significance Testing

## Testing for results obtained with Feature Selection
Here we made tests for alpha = 0.05 on combinations from the paper, the baseline is the F1 score predicting always the most frequent class  (around 0.7!).

We take a look on the cross validation settings of the results achieved by feature results, and compare our results to the baseline. According to the paper, the majority class baseline  is 0.5 (precision, recall and F1-score). We decided to calculate it on our own because it differs a lot from the actual F1 if predicting always the majority class.

The selected features from the task before are obtained to identify statistical significance.

We can assume that the F1 scores are normally distributed because of the CLT, because of that it is suitable to apply t tests. We take a the basline F1 score for the population of the data, and calculate our mean and standard deviation of F1 score out of the cross validation results.


In [37]:
alpha = 0.05

def calculate_F1_scores_cv(clf, X, y):
    metric =  cross_validate(clf, X, y, scoring=('f1_weighted'), return_train_score = False, cv = 10)  
    return metric['test_score']


def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + confidence) / 2., n-1)
    return m-h, m+h

In [55]:
classifier_combinations = pd.read_csv('./data/results_paper.csv', sep=';', encoding='utf-8', 
                                      header = 0)
classifier_combinations = classifier_combinations.rename(index=str, columns={"Unnamed: 0" : "Classifier"})

# Calculate Baseline
baseline =  (df_labled_movies[df_labled_movies['goodforairplane'] == 1]['goodforairplane'].count() /  
             df_labled_movies['goodforairplane'].count())
baseline_f1 = f1_score(([1] if baseline > 0.5 else [0]) * len(df_labled_movies['goodforairplane']), 
                       df_labled_movies['goodforairplane'])
print("Baseline: " + str(baseline_f1))


####################################################################
# Calculations for variances, confidence intervals, T-Test score and p-values


variances = [0] * classifier_combinations.shape[0]
conf = [""] * classifier_combinations.shape[0]
t_scores = [0] * classifier_combinations.shape[0]
p_vals = [0] * classifier_combinations.shape[0]
h0_baseline = [""] * classifier_combinations.shape[0]

t_scores_paper = [0] * classifier_combinations.shape[0]
p_vals_paper = [0] * classifier_combinations.shape[0]
h0_paper = [""] * classifier_combinations.shape[0]

# Stacks
mv_cv_t = [0] * classifier_combinations.shape[0]
mv_cv_p = [0] * classifier_combinations.shape[0]
mv_cv_h0 = [""] * classifier_combinations.shape[0]

label_cv_t = [0] * classifier_combinations.shape[0]
label_cv_p = [0] * classifier_combinations.shape[0]
label_cv_h0 = [""] * classifier_combinations.shape[0]

la_cv_t = [0] * classifier_combinations.shape[0]
la_cv_p = [0] * classifier_combinations.shape[0]
la_cv_h0 = [""] * classifier_combinations.shape[0]



# Go through all classifier modality combinations
i = 0
for index, row in classifier_combinations.iterrows():
    model = str2Class(row["Classifier"])

    # get correct data frame
    if row["Modality"] == "metadata":
        df_x = X_labelencoded_meta
    elif row["Modality"] == "visual":
        df_x = df_scaled_X_visual
    elif row["Modality"] == "textual":
        df_x = df_X_text
    else:
        df_x = df_X_audio

    # Calculate and append LVW metrics
    if getModelName(model) == "RandomForestClassifier":
        # evaluate random forest without selected features since it already performs feature selection
        metric = calculate_F1_scores_cv(model, df_x, df_y)
    else:
        # get results with features from LVW        
        features = [int(i) for i in row["Features"].split(",")]
        df_x_features = df_x[df_x.columns[features]]
        metric = calculate_F1_scores_cv(model, df_x_features, df_y)

    #########################################
    # Baseline tests (Mean = Baseline)
    
    # Calculate variance
    variances[i] = np.var(metric)
    
    # Confidence interval
    low, up = mean_confidence_interval(metric)
    conf[i] = "[" + str('{:.3f}'.format(low)) + ", " + str('{:.3f}'.format(up)) + "]"
    # Calculate one sample t-test score and p-value
    stat = stats.ttest_1samp(metric, baseline_f1)
    t_scores[i] = stat[0]
    p_vals[i] = stat[1]
    
    # Check H0 for baseline with two-sided test H0 = H1
    h0_baseline[i] = "reject" if  (p_vals[i]/2 <= alpha and t_scores[i] <= 0) else "keep"
                                #or (p_vals[i]/2 < alpha and t_scores[i] > 0) \
   
    
    #########################################
    # Paper distribution tests

    # Calculate one sample t-test score and p-value
    stat = stats.ttest_1samp(metric, row["F1 Paper"])
    t_scores_paper[i] = stat[0]
    p_vals_paper[i] = stat[1]
    
    # Check H0 for baseline, two sided test
    h0_paper[i] = "reject" if  (p_vals_paper[i] <= alpha) else "keep"
    
    #########################################
    # Stack tests
    
    # Majority Voting (CV)
    # Here we perform tests for H0: mu_stack > mu_classifier
    stat = stats.ttest_1samp(metric, np.mean(f1_scores_voting))
    mv_cv_t[i] = stat[0]
    mv_cv_p[i] = stat[1]
    # Check H0 for baseline, one sided test
    mv_cv_h0[i] = "reject" if  (mv_cv_p[i]/2 < alpha and t_scores[i] < 0) else "keep"
    
    
    # Label Stacking (CV)
    stat = stats.ttest_1samp(metric, np.mean(label_stack.loc["NearestCentroid"]["F1 scores"][0]))
    label_cv_t[i] = stat[0]
    label_cv_p[i] = stat[1]
    # Check H0 for baseline, one sided test
    label_cv_h0[i] = "reject" if  (label_cv_p[i]/2 < alpha and t_scores[i] < 0) else "keep"
    
    # Label Feature Stacking (CV)
    stat = stats.ttest_1samp(metric, np.mean(label_feature_stack_cv.loc["RandomForestClassifier"]["F1 scores"][0]))
    la_cv_t[i] = stat[0]
    la_cv_p[i] = stat[1]
    # Check H0 for baseline, one sided test
    la_cv_h0[i] = "reject" if  (la_cv_p[i]/2 < alpha and t_scores[i] < 0) else "keep"
    
    i = i + 1
           
classifier_combinations["Variance"] = pd.Series(variances, index=classifier_combinations.index)
classifier_combinations["95% CI"] = pd.Series(conf, index=classifier_combinations.index)
classifier_combinations["T Score Baseline"] = pd.Series(t_scores, index=classifier_combinations.index)
classifier_combinations["p-value Baseline"] = pd.Series(p_vals, index=classifier_combinations.index)
classifier_combinations["H0 Baseline"] = pd.Series(h0_baseline, index=classifier_combinations.index)

classifier_combinations["T Score Paper"] = pd.Series(t_scores_paper, index=classifier_combinations.index)
classifier_combinations["p-value Paper"] = pd.Series(p_vals_paper, index=classifier_combinations.index)
classifier_combinations["H0 Paper"] = pd.Series(h0_paper, index=classifier_combinations.index)

classifier_combinations["T Score Majority"] = pd.Series(mv_cv_t, index=classifier_combinations.index)
classifier_combinations["p-value Majority"] = pd.Series(mv_cv_p, index=classifier_combinations.index)
classifier_combinations["H0 Majority"] = pd.Series(mv_cv_h0, index=classifier_combinations.index)

classifier_combinations["T Score Label"] = pd.Series(label_cv_t, index=classifier_combinations.index)
classifier_combinations["p-value Label"] = pd.Series(label_cv_p, index=classifier_combinations.index)
classifier_combinations["H0 Label"] = pd.Series(label_cv_h0, index=classifier_combinations.index)

classifier_combinations["T Score Label Feature"] = pd.Series(la_cv_t, index=classifier_combinations.index)
classifier_combinations["p-value Label Feature"] = pd.Series(la_cv_p, index=classifier_combinations.index)
classifier_combinations["H0 Label Feature"] = pd.Series(la_cv_h0, index=classifier_combinations.index)


Baseline: 0.707482993197279


In [56]:
# Baseline Results
display(classifier_combinations[["Classifier", "Modality", "F1", "Variance", "95% CI",
                                 "T Score Baseline", "p-value Baseline", "H0 Baseline"]])

Unnamed: 0,Classifier,Modality,F1,Variance,95% CI,T Score Baseline,p-value Baseline,H0 Baseline
0,KNeighborsClassifier,metadata,0.589546,0.017284,"[0.490, 0.689]",-2.691257,0.02474553,reject
1,NearestCentroid,metadata,0.539099,0.030329,"[0.408, 0.670]",-2.900643,0.01757695,reject
2,DecisionTreeClassifier,metadata,0.619452,0.045452,"[0.459, 0.780]",-1.23875,0.2467717,keep
3,LogisticRegression,metadata,0.596167,0.021132,"[0.487, 0.706]",-2.297236,0.04721246,reject
4,SVC,metadata,0.599093,0.036224,"[0.456, 0.743]",-1.708477,0.1217229,keep
5,BaggingClassifier,metadata,0.574685,0.022916,"[0.461, 0.689]",-2.631763,0.02727969,reject
6,RandomForestClassifier,metadata,0.35197,0.02615,"[0.230, 0.474]",-6.595362,9.980679e-05,reject
7,AdaBoostClassifier,metadata,0.585141,0.023244,"[0.470, 0.700]",-2.407365,0.03941895,reject
8,GradientBoostingClassifier,metadata,0.609539,0.032569,"[0.473, 0.746]",-1.628158,0.1379328,keep
9,GaussianNB,textual,0.630979,0.023841,"[0.515, 0.747]",-1.486425,0.1713341,keep


In [51]:
# Paper Results
display(classifier_combinations[["Classifier", "Modality", "F1","F1 Paper", "Variance", "95% CI",
                                 "T Score Paper", "p-value Paper", "H0 Paper"]])

Unnamed: 0,Classifier,Modality,F1,F1 Paper,Variance,95% CI,T Score Paper,p-value Paper,H0 Paper
0,KNeighborsClassifier,metadata,0.589546,0.63,0.017284,"[0.490, 0.689]",-0.923143,0.380016,keep
1,NearestCentroid,metadata,0.539099,0.591,0.030329,"[0.408, 0.670]",-0.894068,0.3945653,keep
2,DecisionTreeClassifier,metadata,0.619452,0.563,0.045452,"[0.459, 0.780]",0.794371,0.447419,keep
3,LogisticRegression,metadata,0.596167,0.578,0.021132,"[0.487, 0.706]",0.374913,0.7164073,keep
4,SVC,metadata,0.599093,0.574,0.036224,"[0.456, 0.743]",0.395526,0.7016664,keep
5,BaggingClassifier,metadata,0.574685,0.631,0.022916,"[0.461, 0.689]",-1.11604,0.293315,keep
6,RandomForestClassifier,metadata,0.35197,0.576,0.02615,"[0.230, 0.474]",-4.156134,0.00246182,reject
7,AdaBoostClassifier,metadata,0.585141,0.536,0.023244,"[0.470, 0.700]",0.966971,0.3588227,keep
8,GradientBoostingClassifier,metadata,0.609539,0.569,0.032569,"[0.473, 0.746]",0.673889,0.5173133,keep
9,GaussianNB,textual,0.630979,0.702,0.023841,"[0.515, 0.747]",-1.379894,0.2009301,keep


## Significance testing for Classifier stacking
We compared the stack results in the section before with the results from single classifiers. We should obtain significant better results with stacking approaches.

Also we compared all of them against the baseline and values from the paper.

### Obtaining final stack results

In [57]:
stack_results = pd.DataFrame()
stack_results["Precision"] = pd.Series()
stack_results["Recall"] = pd.Series()
stack_results["F1"] = pd.Series()
stack_results["F1 scores"] = pd.Series()

stack_results = stack_results.append(pd.Series({"Precision": 0.6351190476190476,
                               "Recall": 0.8061904761904761,
                               "F1": np.mean(f1_scores_voting),
                               "F1 scores": f1_scores_voting}, name = "Voting (cv)"))

stack_results = stack_results.append(pd.Series({"Precision": label_stack.loc["NearestCentroid"]["precision"],
                               "Recall": label_stack.loc["NearestCentroid"]["recall"],
                               "F1": label_stack.loc["NearestCentroid"]["F1"],
                               "F1 scores": label_stack.loc["NearestCentroid"]["F1 scores"]}, 
                                               name = "Label Stacking (cv)"))

stack_results = stack_results.append(pd.Series({"Precision": label_feature_stack_cv.loc["RandomForestClassifier"]["precision"],
                               "Recall": label_feature_stack_cv.loc["RandomForestClassifier"]["recall"],
                               "F1": label_feature_stack_cv.loc["RandomForestClassifier"]["F1"],
                               "F1 scores": label_feature_stack_cv.loc["RandomForestClassifier"]["F1 scores"]}, 
                                               name = "Label Attribute Stacking (cv)"))

stack_results = stack_results.append(pd.Series({"Precision": 0.5853638889089406,
                               "Recall": 0.7355555555555556,
                               "F1": np.mean(f1_scores_voting_test),
                               "F1 scores": f1_scores_voting_test}, name = "Voting (test)"))

best_label_test = label_stack_scores.loc[label_stack_scores["Classifier"] == "AdaBoostClassifier"]
stack_results = stack_results.append(pd.Series({"Precision": best_label_test["Precision"].values[0],
                               "Recall": best_label_test["Recall"].values[0],
                               "F1": best_label_test["F1"].values[0],
                               "F1 scores": best_label_test["F1 scores"].values[0]}, 
                                               name = "Label Stacking (test)"))

stack_results[["Precision", "Recall", "F1"]]

Unnamed: 0,Precision,Recall,F1
Voting (cv),0.635119,0.80619,0.708598
Label Stacking (cv),0.763656,0.746566,0.744057
Label Attribute Stacking (cv),0.67399,0.649697,0.642249
Voting (test),0.585364,0.735556,0.65146
Label Stacking (test),0.598354,0.684444,0.637919


### Testing against single classifiers from CV

In [58]:
display(classifier_combinations[["Classifier", "Modality", "F1", "Variance", "95% CI",
                                 "T Score Majority", "p-value Majority", "H0 Majority"]])

Unnamed: 0,Classifier,Modality,F1,Variance,95% CI,T Score Majority,p-value Majority,H0 Majority
0,KNeighborsClassifier,metadata,0.589546,0.017284,"[0.490, 0.689]",-2.716709,0.02373536,reject
1,NearestCentroid,metadata,0.539099,0.030329,"[0.408, 0.670]",-2.919857,0.01703578,reject
2,DecisionTreeClassifier,metadata,0.619452,0.045452,"[0.459, 0.780]",-1.254445,0.2412789,keep
3,LogisticRegression,metadata,0.596167,0.021132,"[0.487, 0.706]",-2.320254,0.04546693,reject
4,SVC,metadata,0.599093,0.036224,"[0.456, 0.743]",-1.726057,0.1184154,keep
5,BaggingClassifier,metadata,0.574685,0.022916,"[0.461, 0.689]",-2.653867,0.02630891,reject
6,RandomForestClassifier,metadata,0.35197,0.02615,"[0.230, 0.474]",-6.616054,9.745965e-05,reject
7,AdaBoostClassifier,metadata,0.585141,0.023244,"[0.470, 0.700]",-2.429313,0.03802539,reject
8,GradientBoostingClassifier,metadata,0.609539,0.032569,"[0.473, 0.746]",-1.646699,0.1340259,keep
9,GaussianNB,textual,0.630979,0.023841,"[0.515, 0.747]",-1.508096,0.1658029,keep


In [59]:
display(classifier_combinations[["Classifier", "Modality", "F1", "Variance", "95% CI",
                                 "T Score Label", "p-value Label", "H0 Label"]])

Unnamed: 0,Classifier,Modality,F1,Variance,95% CI,T Score Label,p-value Label,H0 Label
0,KNeighborsClassifier,metadata,0.589546,0.017284,"[0.490, 0.689]",-5.217334,0.0005511537,reject
1,NearestCentroid,metadata,0.539099,0.030329,"[0.408, 0.670]",-4.807578,0.0009634925,reject
2,DecisionTreeClassifier,metadata,0.619452,0.045452,"[0.459, 0.780]",-2.796471,0.02083242,reject
3,LogisticRegression,metadata,0.596167,0.021132,"[0.487, 0.706]",-4.581734,0.001324538,reject
4,SVC,metadata,0.599093,0.036224,"[0.456, 0.743]",-3.453348,0.007236372,reject
5,BaggingClassifier,metadata,0.574685,0.022916,"[0.461, 0.689]",-4.825568,0.0009396737,reject
6,RandomForestClassifier,metadata,0.35197,0.02615,"[0.230, 0.474]",-8.649009,1.180781e-05,reject
7,AdaBoostClassifier,metadata,0.585141,0.023244,"[0.470, 0.700]",-4.585628,0.001317207,reject
8,GradientBoostingClassifier,metadata,0.609539,0.032569,"[0.473, 0.746]",-3.46834,0.007067237,reject
9,GaussianNB,textual,0.630979,0.023841,"[0.515, 0.747]",-3.637241,0.00542413,reject


In [60]:
display(classifier_combinations[["Classifier", "Modality", "F1", "Variance", "95% CI",
                                  "T Score Label Feature", "p-value Label Feature", "H0 Label Feature"]])

Unnamed: 0,Classifier,Modality,F1,Variance,95% CI,T Score Label Feature,p-value Label Feature,H0 Label Feature
0,KNeighborsClassifier,metadata,0.589546,0.017284,"[0.490, 0.689]",-3.036462,0.01409842,reject
1,NearestCentroid,metadata,0.539099,0.030329,"[0.408, 0.670]",-3.161239,0.01152723,reject
2,DecisionTreeClassifier,metadata,0.619452,0.045452,"[0.459, 0.780]",-1.451623,0.1805558,keep
3,LogisticRegression,metadata,0.596167,0.021132,"[0.487, 0.706]",-2.609428,0.02829731,reject
4,SVC,metadata,0.599093,0.036224,"[0.456, 0.743]",-1.946925,0.08337743,reject
5,BaggingClassifier,metadata,0.574685,0.022916,"[0.461, 0.689]",-2.931561,0.01671452,reject
6,RandomForestClassifier,metadata,0.35197,0.02615,"[0.230, 0.474]",-6.876007,7.258875e-05,reject
7,AdaBoostClassifier,metadata,0.585141,0.023244,"[0.470, 0.700]",-2.705039,0.02419321,reject
8,GradientBoostingClassifier,metadata,0.609539,0.032569,"[0.473, 0.746]",-1.879631,0.09286097,reject
9,GaussianNB,textual,0.630979,0.023841,"[0.515, 0.747]",-1.780348,0.1087164,keep


### Tests against baseline
Test for no statistical difference, hypothesis is rejected for significantly lower or higher values.

In [61]:
baseline_test =  (df_labled_movies_test[df_labled_movies_test['goodforairplane'] == 1]['goodforairplane'].count() /  
             df_labled_movies_test['goodforairplane'].count())
baseline_f1_test = f1_score(([1] if baseline_test > 0.5 else [0]) * len(df_labled_movies_test['goodforairplane']), 
                       df_labled_movies_test['goodforairplane'])
print("Baseline: " + str(baseline_f1_test))


variances = [0] * stack_results.shape[0]
conf = [""] * stack_results.shape[0]
t_scores = [0] * stack_results.shape[0]
p_vals = [0] * stack_results.shape[0]
h0_baseline = [""] * stack_results.shape[0]

i = 0
for index, row in stack_results.iterrows():
    
    val = row["F1 scores"]
     # Calculate variance
    variances[i] = np.var(val)
    
    # Confidence interval
    low, up = mean_confidence_interval(val)
    conf[i] = "[" + str('{:.3f}'.format(low)) + ", " + str('{:.3f}'.format(up)) + "]"
    
    baseline = baseline_f1_test
    if index.split("(")[1].split(")")[0] == "cv":
        baseline = baseline_f1
    stat = stats.ttest_1samp(val, baseline)
    t_scores[i] = stat[0]
    p_vals[i] = stat[1]
    # Check H0 for baseline, two sided test
    h0_baseline[i] = "reject" if  (p_vals[i] <= alpha) else "keep"
    
    i = i + 1
    
stack_results["Variance"] = pd.Series(variances, index=stack_results.index)
stack_results["95% CI"] = pd.Series(conf, index=stack_results.index)
stack_results["T Score Baseline"] = pd.Series(t_scores, index=stack_results.index)
stack_results["p-value Baseline"] = pd.Series(p_vals, index=stack_results.index)
stack_results["H0 Baseline"] = pd.Series(h0_baseline, index=stack_results.index)

stack_results[["Precision", "Recall", "F1", "Variance", "95% CI", "T Score Baseline", "p-value Baseline", "H0 Baseline"]]

Baseline: 0.7541899441340781


Unnamed: 0,Precision,Recall,F1,Variance,95% CI,T Score Baseline,p-value Baseline,H0 Baseline
Voting (cv),0.635119,0.80619,0.708598,0.057407,"[0.528, 0.889]",0.013965,0.9891622,keep
Label Stacking (cv),0.763656,0.746566,0.744057,0.008705,"[0.674, 0.814]",1.175984,0.2697699,keep
Label Attribute Stacking (cv),0.67399,0.649697,0.642249,0.016003,"[0.547, 0.738]",-1.547003,0.1562668,keep
Voting (test),0.585364,0.735556,0.65146,0.000455,"[0.635, 0.668]",-14.448575,1.561112e-07,reject
Label Stacking (test),0.598354,0.684444,0.637919,0.001167,"[0.612, 0.664]",-10.212518,3.002845e-06,reject


### Tests against paper values

In [62]:
stack_results["F1 Paper"] = [0.71, 0.78, 0.75, 0.70, 0.73]

t_scores = [0] * stack_results.shape[0]
p_vals = [0] * stack_results.shape[0]
h0_baseline = [""] * stack_results.shape[0]

i = 0
for index, row in stack_results.iterrows():
    
    val = row["F1 scores"]
    stat = stats.ttest_1samp(val, row["F1 Paper"])
    t_scores[i] = stat[0]
    p_vals[i] = stat[1]
    # Check H0 for baseline, two sided test
    h0_baseline[i] = "reject" if  (p_vals[i] <= alpha) else "keep"
    
    i = i + 1
    
stack_results["Variance"] = pd.Series(variances, index=stack_results.index)
stack_results["95% CI"] = pd.Series(conf, index=stack_results.index)
stack_results["T Score Paper"] = pd.Series(t_scores, index=stack_results.index)
stack_results["p-value Paper"] = pd.Series(p_vals, index=stack_results.index)
stack_results["H0 Paper"] = pd.Series(h0_baseline, index=stack_results.index)

stack_results[["Precision", "Recall", "F1", "F1 Paper", "Variance", "95% CI", 
               "T Score Paper", "p-value Paper", "H0 Paper"]]

Unnamed: 0,Precision,Recall,F1,F1 Paper,Variance,95% CI,T Score Paper,p-value Paper,H0 Paper
Voting (cv),0.635119,0.80619,0.708598,0.71,0.057407,"[0.528, 0.889]",-0.01755,0.986381,keep
Label Stacking (cv),0.763656,0.746566,0.744057,0.78,0.008705,"[0.674, 0.814]",-1.15568,0.27757,keep
Label Attribute Stacking (cv),0.67399,0.649697,0.642249,0.75,0.016003,"[0.547, 0.738]",-2.555281,0.030926,reject
Voting (test),0.585364,0.735556,0.65146,0.7,0.000455,"[0.635, 0.668]",-6.826958,7.7e-05,reject
Label Stacking (test),0.598354,0.684444,0.637919,0.73,0.001167,"[0.612, 0.664]",-8.087832,2e-05,reject
