# Experiment Design ex2 - CoE: textual and audio features

## Imports

In [1]:
import csv
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
import time
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_validate

## Load data

In [2]:
def _load_audio_descriptors(pattern, desc):
    """Read every csv file matching `pattern` into one dataframe row per file.

    Each file is reduced to the mean of each of its rows (`mean(axis=1)`), and these
    means become the columns of the resulting row. The file name without the `.csv`
    suffix is stored in the `fname` column.
    """
    import os  # local import: only needed for the portable file-name extraction below

    names = []
    rows = []
    # glob returns paths in arbitrary order; sorting keeps the row order reproducible
    for csvpath in tqdm(sorted(glob.glob(pattern)), desc=desc):
        # basename handles the path separator of the current platform
        names.append(os.path.basename(csvpath).split('.csv')[0])
        row_means = pd.read_csv(csvpath, header=None).mean(axis=1)
        rows.append(pd.DataFrame(row_means.values.flatten()).transpose())

    # concatenate once instead of appending inside the loop (DataFrame.append is
    # quadratic and no longer exists in pandas >= 2.0)
    features = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()
    features['fname'] = names
    return features


def _attach_target(features, refpath):
    """Merge the target variable from the reference csv, index by file name and replace NAs with 0."""
    reference = pd.read_csv(refpath)[['file_name', 'goodforairplanes']]
    # default (inner) merge: files without a matching `file_name` in the reference csv are dropped
    merged = features.merge(reference, left_on='fname', right_on='file_name').drop(columns=['file_name'])
    return merged.set_index(['fname']).fillna(0)


def load_audio(trainpath = "./data/CoE_dataset_icpr/Dev_Set/audio_descriptors/*",
               testpath = "./data/CoE_dataset_icpr/Test_Set/audio_descriptors/*",
              trainrefpath = "./data/CoE_dataset_icpr/Dev_Set/CoeDevelopmentTrainingdata.csv",
              testrefpath = "./data/CoE_dataset_icpr/Dev_Set/CoeDevelopmentTestdata.csv"):
    """ loads all audio data and includes the target variable
    kwargs:
        trainpath: glob pattern matching all audio descriptor csv files of the train set
        testpath: glob pattern matching all audio descriptor csv files of the test set
        trainrefpath: path to the csv containing filename, movie name and the target variable for the train set
        testrefpath: path to the csv containing filename, movie name and the target variable for the test set
    returns:
        (train, test): dataframes indexed by file name; one column per descriptor row mean
        plus the target column `goodforairplanes`; missing values are replaced by 0
    """
    train = _attach_target(_load_audio_descriptors(trainpath, 'Loading audio train data'), trainrefpath)
    test = _attach_target(_load_audio_descriptors(testpath, 'Loading audio test data'), testrefpath)

    return train, test

In [3]:
def load_text(trainpath = "./data/CoE_dataset_icpr/Dev_Set/text_descriptors/tdf_idf_dev.csv",
              testpath = "./data/CoE_dataset_icpr/Test_Set/text_descriptors/tdf_idf_test.csv"):
    """ loads the textual descriptors of the train and test set
    kwargs:
        trainpath: path to the csv containing the text descriptors of the train set
        testpath: path to the csv containing the text descriptors of the test set
    returns:
        (train, test): the two csv files as dataframes, exactly as read by pandas
    """
    train = pd.read_csv(trainpath)
    test = pd.read_csv(testpath)

    return train, test

In [4]:
# Load both feature sets once with the default paths; the audio loader reads one csv
# per file and reports its progress, so this cell takes a while to run.
audio_train, audio_test = load_audio()
text_train, text_test = load_text()

Loading audio train data: 100%|██████████| 95/95 [00:50<00:00,  1.87it/s]
Loading audio test data: 100%|██████████| 223/223 [01:56<00:00,  1.92it/s]


## Las Vegas Wrapper for feature selection

In [5]:
def LVW(classifier_function, tX, ty, vX, vy, K, original_features):
    """ Las Vegas Wrapper: randomised search for a small feature subset with a high score

    Random feature subsets are drawn repeatedly. A subset becomes the new best one when it
    scores strictly higher than the current best, or scores the same with fewer features.
    The search stops after K consecutive draws that did not replace the best subset.

    kwargs:
        classifier_function: callable invoked as
            classifier_function(x_train, y_train, x_test, y_test, acc=True);
            its return value is used as the score (higher is better)
        tX: dataframe containing the training features
        ty: target variable of the training set
        vX: dataframe containing the validation features
        vy: target variable of the validation set
        K: number of consecutive non-improving draws after which the search stops
        original_features: column labels the subsets are drawn from
    returns:
        list with the best feature subset found; all of original_features if no draw
        was accepted (e.g. K <= 0)
    raises:
        ValueError: if original_features is empty
    """
    # random.sample needs a real sequence; this also accepts ranges, sets and pandas indexes
    features = list(original_features)
    if not features:
        raise ValueError("original_features must contain at least one feature")

    # start from the full feature set so there is always a subset to return
    S = features
    C = len(features)
    acc = 0
    k = 0

    # draw proper subsets (1 .. n-1 features); a single feature can only be drawn as itself
    max_size = max(1, len(features) - 1)

    while k < K:
        S1 = random.sample(features, random.randint(1, max_size))
        C1 = len(S1)

        x_train = tX[tX.columns.intersection(S1)]
        x_test = vX[vX.columns.intersection(S1)]

        # score the candidate on the selected columns only, not on the full dataframes
        acc1 = classifier_function(x_train, ty, x_test, vy, acc = True)

        if (acc1 > acc) or (acc1 == acc and C1 < C):
            # improvement: remember the candidate and restart the patience counter
            k = 0
            acc = acc1
            C = C1
            S = S1

        else:
            k += 1

    return S

## Learning algorithms

In [6]:
def _split_target(frame, target_var):
    """ returns (y, x) for one dataframe; y is None and x a copy of frame if the target column is absent """
    if target_var in frame.columns:
        return frame[target_var], frame.drop(columns=target_var)
    return None, frame.copy()


def seperate_tvar(train, test, target_var):
    """ separate training/test set and the target variable
    kwargs:
        train: dataframe containing dependent and independent variables
        test: dataframe containing dependent and independent variables
        target_var: column name of the target variable as a string
    returns:
        (train_y, train_x, test_y, test_x): the target column and the remaining columns
        of each dataframe; if a dataframe has no target column (e.g. an unlabelled test
        set) its y is None and its x is an unchanged copy of the dataframe
    """
    train_y, train_x = _split_target(train, target_var)
    test_y, test_x = _split_target(test, target_var)

    return train_y, train_x, test_y, test_x

In [7]:
# classifier name -> unfitted scikit-learn estimator (default hyperparameters)
classifier = {"Logistic Regression": LogisticRegression(),
              "Gradient Boosting Tree": GradientBoostingClassifier(),
              "KNN": KNeighborsClassifier(),
              "SVM": SVC(),
              "Naive bayes": GaussianNB()}

# one row per algorithm; Precision, Recall and F-score hold one value per class
res = pd.DataFrame(columns=['Algorithm', 'Precision', 'Recall', 'F-score', 'Training time (s)']).set_index(['Algorithm'])

# the split does not depend on the classifier, so it is done once before the loop
train_y, train_x, test_y, test_x = seperate_tvar(audio_train, audio_test, 'goodforairplanes')

# NOTE: LVW feature selection is not applied here. LVW() calls its first argument as a
# function returning a score, so the estimators in `classifier` cannot be passed to it
# directly; wrap fit/predict/score in a function before enabling it.

for c, model in classifier.items():
    # perf_counter is a monotonic clock intended for measuring short durations
    start_time = time.perf_counter()
    model.fit(train_x, train_y)
    dur = time.perf_counter() - start_time
    pred_y = model.predict(test_x)
    prfs = precision_recall_fscore_support(test_y, pred_y)
    res.loc[c] = [prfs[0], prfs[1], prfs[2], dur]

res



Unnamed: 0_level_0,Precision,Recall,F-score,Training time (s)
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,"[0.29411764705882354, 0.3870967741935484]","[0.20833333333333334, 0.5]","[0.24390243902439027, 0.43636363636363634]",0.002007
Gradient Boosting Tree,"[0.4, 0.391304347826087]","[0.4166666666666667, 0.375]","[0.4081632653061225, 0.3829787234042554]",0.040689
KNN,"[0.6086956521739131, 0.6]","[0.5833333333333334, 0.625]","[0.5957446808510638, 0.6122448979591836]",0.000675
SVM,"[0.55, 0.5357142857142857]","[0.4583333333333333, 0.625]","[0.5, 0.576923076923077]",0.001297
Naive bayes,"[0.5161290322580645, 0.5294117647058824]","[0.6666666666666666, 0.375]","[0.5818181818181819, 0.4390243902439025]",0.000799
