<a href="https://colab.research.google.com/github/micolspitale93/COCOvoices/blob/master/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preparing Feature Sets**
We will create a csv file for each modality (visual, audio, and text) and save to be used for modeling.

## Mounting drive, import libs, and constants declarations

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# all imports go here
import pandas as pd
import os
import pickle
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
import numpy as np
import xgboost as xgb
!pip install boruta
from boruta import BorutaPy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 1.5 MB/s 
Installing collected packages: boruta
Successfully installed boruta-0.3


In [3]:
# path to data folder 
DATA_DIR = '/content/drive/Shareddrives/Affective Intelligence & Robotics Lab (AFAR)/Projects/Jiaee Micol Project/BSFT_dataset_features_extracted/'
MODALITY = ["face"]#["audio", "face"]
SUBDIR_MODALITY = ["bsft_librosa/", "bsft_openface/collated/"]

# path to labels file 
LABELS_PATH = '/content/drive/Shareddrives/Affective Intelligence & Robotics Lab (AFAR)/Projects/Jiaee Micol Project/labels.xlsx'

# path to results directory 
RESULTS_DIR = '/content/drive/Shareddrives/Affective Intelligence & Robotics Lab (AFAR)/Projects/Jiaee Micol Project/results/'

# path to feature_sets directory 
FEATURE_DIR = '/content/drive/Shareddrives/Affective Intelligence & Robotics Lab (AFAR)/Projects/Jiaee Micol Project/feature_sets/'

## Functions definition

In [4]:
def pre_process_outsideCV(input_df):
  # drop columns with NaN values
  df = input_df.dropna(axis='columns')
  df = df.loc[:, (df != df.iloc[0]).any()] 
  return df

In [5]:
def define_columns(df, column_to_ignore):
  #for col in df.columns:
  #  #if '_c' in col:
  #    column_to_ignore.append(col)
  colnames = []
  colnames.append('filename')
  colnames.append('label')
  colnames.append('person_id')
  colnames.append('week')
  colnames.append('person_group')
  colnames.append('week_group')
  for col in df.columns:
    if col not in column_to_ignore:
      mean_fname = str(col) + '__mean'
      median_fname = str(col) + '__median'
      stddev_fname = str(col) + '__stddev'
      autocorr_name = str(col) + '__autocorr'
      colnames.append(mean_fname)
      colnames.append(median_fname)
      colnames.append(stddev_fname)
      colnames.append(autocorr_name)
  return colnames

In [6]:
def prepare_fixed_length_vector(data, colnames, columns_to_ignore, data_dir):
  # new dataframe that stores one fixed-length feature vector per video
  # each vector contains the (1) mean, (2) standard deviation, (3) median, and (4) autocorrelation with lag 24
  final_df = pd.DataFrame(columns=colnames)
  first = True
  print(len(data))
  for filename in data:
    if "_S.csv" in filename:
      # read the data
      print(data_dir + filename)
      curr_df = pd.read_csv(data_dir + filename)
      # new row values (the new row to be added)
      new_row = []
      # append the filename, label, person_id, and week number
      new_row.append(filename)
      new_row.append(label_dict[filename])
      split_filename = filename.split('_')
      person_id = split_filename[1]
      week = split_filename[2]
      new_row.append(person_id)
      new_row.append(week)
      if week == "21Oct" or week == "18Nov":
        week_group = 1
      elif week == "27Oct" or week == "25Nov":
        week_group = 2
      elif week == "04Nov" or week == "02Dec":
        week_group = 3
      elif week == "11Nov" or week == "09Dec":
        week_group = 4
      person_group = person_id.replace("S","")
      new_row.append(person_group)
      new_row.append(week_group)

      # loop through each relevant column of this dataframe
      for col in curr_df:
        if col not in columns_to_ignore:
          colvalues = curr_df[col].values
          # compute standard time-series attributes
          mean = np.mean(colvalues)
          median = np.median(colvalues)
          stddev = np.std(colvalues)

          # compute autocorrelation with lag 24
          autocorr = curr_df[col].autocorr(lag=24)  

          # add to the new df row
          new_row.append(mean)
          new_row.append(median)
          new_row.append(stddev)
          new_row.append(autocorr)

      new_row_series = pd.Series(new_row, index = final_df.columns)
      final_df = final_df.append(new_row_series, ignore_index=True)
      print(final_df.shape)
  return final_df

In [7]:
def make_label_dictonary(data_files, labels):
  # this maps each file name to a label
  label_dict = {}
  for file in data_files:
    splitfile = file.split('_')
    participant_id = splitfile[1]
    week = splitfile[2]
    if week == "21Oct" or week == "18Nov":
      week = 1
    elif week == "27Oct" or week == "25Nov":
      week = 2
    elif week == "04Nov" or week == "02Dec":
      week = 3
    elif week == "11Nov" or week == "09Dec":
      week = 4
    role = splitfile[3]
    label = labels[labels['participantID_week']==participant_id + "_"+str(week)]["label"].values[0] #TODO: to check according to the labels file
    label_dict[file] = label
  return label_dict

## Running code for preparing features

In [None]:
# load the labels
labels = pd.read_excel(LABELS_PATH, sheet_name='labels')
#print(labels.head())
mode_label_dict = {}
mode_colnames = {}
for mode in MODALITY:
  if mode == "audio":
    columns_to_ignore  = []
    subdir = SUBDIR_MODALITY[0]
  elif mode == "face":
    columns_to_ignore = ['frame', 'face_id', 'timestamp', 'confidence', 'success']
    subdir = SUBDIR_MODALITY[1]
  data_files = os.listdir(DATA_DIR + subdir)
  sample_df = pd.read_csv(DATA_DIR + subdir+  data_files[0])
  label_dict = make_label_dictonary(data_files, labels)
  col_names = define_columns(sample_df, columns_to_ignore)
  print(col_names)
  prepared_df = prepare_fixed_length_vector(data_files, col_names, columns_to_ignore, DATA_DIR + subdir)
  prepared_df.to_csv(FEATURE_DIR + mode +'_fixed_length_vectors.csv')
  mode_label_dict[mode] = label_dict
  mode_colnames[mode] = col_names

['filename', 'label', 'person_id', 'week', 'person_group', 'week_group', 'mel_1__mean', 'mel_1__median', 'mel_1__stddev', 'mel_1__autocorr', 'mel_2__mean', 'mel_2__median', 'mel_2__stddev', 'mel_2__autocorr', 'mel_3__mean', 'mel_3__median', 'mel_3__stddev', 'mel_3__autocorr', 'mel_4__mean', 'mel_4__median', 'mel_4__stddev', 'mel_4__autocorr', 'mel_5__mean', 'mel_5__median', 'mel_5__stddev', 'mel_5__autocorr', 'mel_6__mean', 'mel_6__median', 'mel_6__stddev', 'mel_6__autocorr', 'mel_7__mean', 'mel_7__median', 'mel_7__stddev', 'mel_7__autocorr', 'mel_8__mean', 'mel_8__median', 'mel_8__stddev', 'mel_8__autocorr', 'mel_9__mean', 'mel_9__median', 'mel_9__stddev', 'mel_9__autocorr', 'mel_10__mean', 'mel_10__median', 'mel_10__stddev', 'mel_10__autocorr', 'mel_11__mean', 'mel_11__median', 'mel_11__stddev', 'mel_11__autocorr', 'mel_12__mean', 'mel_12__median', 'mel_12__stddev', 'mel_12__autocorr', 'mel_13__mean', 'mel_13__median', 'mel_13__stddev', 'mel_13__autocorr', 'mel_14__mean', 'mel_14__me

# **Unimodal modeling**
In this section we will predict XX via monomodal approaches using BSFT dataset.
The ML models selected are:
- 'logistic_regression',
-        'rbf_svm',
-          'decision_tree',
-          'linear_svm',
-          'adaboost',
-          'xgboost',
-          'bagging',
-          'rforest'

The basic deep learning models are:
- 'LSTM'
- 'TCN'

We will predict XX via audio, face, or text. 

## Import libs and dataframe definition

In [8]:
from xgboost import plot_importance
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn import tree
from xgboost import XGBClassifier
!pip install optuna
import optuna
import warnings
import itertools
optuna.logging.set_verbosity(optuna.logging.FATAL)
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
from sklearn.model_selection import LeaveOneOut, LeaveOneGroupOut

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.2-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 2.1 MB/s 
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 10.2 MB/s 
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 57.2 MB/s 
Collecting Mako
  Downloading Mako-1.2.3-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 2.7 MB/s 
Collecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.10.0-py2.py3-none-any.whl (112 kB)
[K     |████████████████████████████████| 112 kB 65.8 MB/s 
[?25hCollecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl (147 

In [9]:
# Check if the label proportions
DF_fixed_length = {}
for mode in MODALITY:
  DF_fixed_length[mode] = pd.read_csv(FEATURE_DIR + mode+'_fixed_length_vectors.csv')
  label_proportion = DF_fixed_length[mode]['label'].sum()/len(DF_fixed_length[mode])
  print('BASELINE (classifier that always predicts "Positive Affect level"): ', label_proportion)


BASELINE (classifier that always predicts "Positive Affect level"):  0.5853658536585366


## Functions definitions

In [10]:
def objective(trial, clf_name, X, y):
    if clf_name == 'logistic_regression':
        C = trial.suggest_loguniform('C', 1e-2, 1e+2)
        clf_model = LogisticRegression(C=C, max_iter=1e+5, solver='liblinear' , random_state=RANDOM_STATE)
   
    elif clf_name == 'linear_svm':
        C = trial.suggest_loguniform('C', 1e-2, 1e+2)
        degree = trial.suggest_int('degree',1, 50)
        gamma = trial.suggest_loguniform('gamma',0.001,10000)
        clf_model = SVC(C=C, kernel='linear', degree=degree,gamma=gamma, random_state=RANDOM_STATE)

    elif clf_name == 'rbf_svm': 
        C = trial.suggest_loguniform('C', 1e-2, 1e+2)
        degree = trial.suggest_int('degree',1, 50)
        gamma = trial.suggest_loguniform('gamma',1e-2,1e+2)
        clf_model = SVC(C=C, kernel='rbf', degree=degree,gamma=gamma, random_state=RANDOM_STATE)

    elif clf_name == 'decision_tree':
        max_depth = trial.suggest_int('max_depth', 2, 20)
        clf_model = DecisionTreeClassifier(max_depth=max_depth, random_state=RANDOM_STATE)

    elif clf_name == 'rforest':
        n_estimators = trial.suggest_int('n_estimators', 2, 20)
        max_depth = int(trial.suggest_float('max_depth', 1, 32, log=True))
        clf_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=RANDOM_STATE)

    elif clf_name == 'adaboost':
        n_estimators = trial.suggest_int('n_estimators', 20, 100)
        clf_model = AdaBoostClassifier(n_estimators=n_estimators, random_state=RANDOM_STATE)

    elif clf_name == 'xgboost':
        n_estimators = trial.suggest_int('n_estimators', 20, 100)
        clf_model = xgb.XGBClassifier(n_estimators=n_estimators, random_state=RANDOM_STATE)

    elif clf_name == 'bagging':
        n_estimators = trial.suggest_int('n_estimators', 20, 100)
        clf_model = BaggingClassifier(n_estimators=n_estimators, random_state=RANDOM_STATE)

    metrics = {'acc' : 'accuracy',
               'auc' : 'roc_auc',
               'f1' : 'f1'}
    clf = make_pipeline(MinMaxScaler(), clf_model)
    score = cross_validate(clf, X, y, scoring=metrics)
    return score['test_acc'].mean()

In [11]:
# define models
MODELS = ['logistic_regression',
          'rbf_svm',
          'decision_tree',
          'linear_svm',
          'adaboost',
          'xgboost',
          'bagging',
          'rforest']
# define modeling constants
#GROUP = "WEEK" # WEEK or PERSON
NON_MODELING_COLS = ['week_group', 'person_group', 'week', 'label', 'person_id', 'filename'] # columns not used in X features
RANDOM_STATE = 0 # random state for all models and cross-validation splits
NUM_SPLITS = 5 # number of cross-validation splits for model evaluation
NUM_REPEATS = 10 # number of times we will repeat the cross-validation (with different splits)

In [12]:
def run_modeling_experiment_logo(df, num_splits, num_repeats, NT, verbose, group):
  model_predictions = {} # predictions made by each model
  model_ground_truths = {} # should all be the same, but might as well store these
  model_features_ranked = {} # features ranked during features-selection
  model_feature_importance = {}
  model_features_selected = {} # features selected during feature-selection
  model_params = {} # params of trained models
  model_cm = {} # confusion matrix of trained models
  model_probas = {}

  # store raw results across all folds
  all_model_results = pd.DataFrame(data=None, columns=['model', 'cv_fold', 'accuracy', 'auc', 'precision', 'recall', 'f1', 'feature_importance'])
  all_values = pd.DataFrame(data=None, columns=['model', 'cv_fold', 'ground_truths', 'probas', 'preds'])

  df_without_NaN = pre_process_outsideCV(df)
  # loop through models
  for model in MODELS:
    # print model
    print(model)
    x_old = df_without_NaN
    # get X and y
    y = df_without_NaN['label']
    X = df_without_NaN.drop(NON_MODELING_COLS, axis=1)
    groups_week = x_old["week_group"]
    groups_participants = x_old["person_group"]
    ids = x_old["filename"]
    participants = x_old["person_id"]
    weeks = x_old["week"]
    if group == "WEEK":
      groups = groups_week
    elif group == "PERSON":
      groups = groups_participants
    # enter cross-validation loop
    #rskf = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state = RANDOM_STATE)
    logo = LeaveOneGroupOut()


    # initialize fold numbers and predictions/ground truth lists
    fold_num = 0
    predictions = []
    probs = []
    ground_truths = [] #Y_test
    features_ranked = []
    features_importance = []
    


    #for train_index, test_index in rskf.split(X, y):
    for train_index, test_index in logo.split(X, y, groups=groups):
      # get train and test indices
      #print("TRAIN:", train_index, "TEST:", test_index)
      id_test = ids.loc[test_index]
      participant_test = participants.loc[test_index]
      picture_test = weeks.loc[test_index]
      X_train, X_test = X.loc[train_index], X.loc[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]
      # get the scaled version of the train and test set
      mm = MinMaxScaler()
      X_train_scaled = pd.DataFrame(mm.fit_transform(X_train), columns= X_train.columns)
      X_test_scaled = pd.DataFrame(mm.transform(X_test), columns=X_test.columns)

      ## compute the optimal hyper-parameters on the training set
      sampler = optuna.samplers.TPESampler(multivariate=True)
      # sampler = optuna.samplers.RandomSampler()
      name = model+'_'+str(fold_num)
      # default is Tree-structured Parzen Estimator (TPE) optimization algorithm
      study = optuna.create_study(direction='maximize', 
                                  sampler=sampler, 
                                  study_name=name, 
                                  pruner=optuna.pruners.HyperbandPruner(min_resource=1, reduction_factor=3))
      ## we use X_train in the following, not X_train_scaled, because optuna pipeline in objective function has MinMaxScaler() already
      study.optimize(lambda trial: objective(trial, model, X_train, y_train), n_trials=NT)
      trial = study.best_trial
      best_params = trial.params
     
      if model == 'logistic_regression': 
        optimal_clf = LogisticRegression(solver='lbfgs', **best_params)
        
      elif model == 'linear_svm': 
        optimal_clf = SVC(kernel='linear', probability=True, **best_params)

      elif model == 'rbf_svm': 
        optimal_clf = SVC(kernel='rbf', probability=True, **best_params)

      elif model == 'decision_tree': 
        feature_names = [f"feature {i}" for i in range(X_train.shape[1])]
        optimal_clf = DecisionTreeClassifier(**best_params)
        

      elif model == 'rforest': 
        optimal_clf = RandomForestClassifier(**best_params)

      elif model == 'adaboost':  
        optimal_clf = AdaBoostClassifier(**best_params)

      elif model == 'xgboost': 
        optimal_clf = xgb.XGBClassifier(**best_params)

      elif model == 'bagging': 
        optimal_clf = BaggingClassifier(**best_params)
        

      # now re-train the optimal model on the train set and test on the held-out test set

      optimal_clf.fit(X_train_scaled, y_train)
      preds = optimal_clf.predict(X_test_scaled)
      probas = optimal_clf.predict_proba(X_test_scaled)[:, 1]
      acc = accuracy_score(y_test, preds)
      try:
          auc = roc_auc_score(y_test, probas)
      except ValueError:
          auc = 0
          pass
      precision = precision_score(y_test, preds, labels=np.unique(preds))
      recall = recall_score(y_test, preds, labels=np.unique(preds))
      f1 = f1_score(y_test, preds,  labels=np.unique(preds))
      pars = optimal_clf.get_params()
      feature_importance = []
      # store results 
      all_model_results.loc[len(all_model_results.index)] = [model, fold_num, acc, auc, precision, recall, f1, feature_importance] 
      all_values.loc[len(all_values.index)]= [model, fold_num, y_test, probas, preds]
      probs.append(probas)
      predictions.append(preds)
      ground_truths.append(y_test)
      model_params[model + '__fold-' + str(fold_num)] = pars

      # increment fold_numroc_auc_score
      fold_num +=1

      # print for sanity
      curr_avg_acc = round(all_model_results[all_model_results['model']==model]['accuracy'].mean(), 2) 
      curr_avg_auc = round(all_model_results[all_model_results['model']==model]['auc'].mean(), 2)
      #curr_avg_fi = all_model_results[all_model_results['model']==model]['feature_importance'].mean()
      print('participant_test: ' + str(participant_test) +'   acc: ' + str(curr_avg_acc) + '   auc: ' + str(curr_avg_auc))

    # store model predictions and ground truths
    model_predictions[model] = predictions
    model_ground_truths[model] = ground_truths
    model_probas[model] = probs
    ground_truths_concatenate = np.concatenate( ground_truths, axis=0 )
    predictions_concatenate = np.concatenate( predictions, axis=0 )
    model_cm[model] = confusion_matrix(ground_truths_concatenate, predictions_concatenate, labels=[0,1])     
    #feature_importance = all_model_results[all_model_results['model'] == model]['feature_importance'].mean()   

    # verbose
    if verbose:
      print('model: ', model)
      print('mean accuracy: ', all_model_results[all_model_results['model'] == model]['accuracy'].mean())
      print('stddev accuracy: ', all_model_results[all_model_results['model'] == model]['accuracy'].std())
      print('mean auc: ', all_model_results[all_model_results['model'] == model]['auc'].mean())
      print('stddev auc: ', all_model_results[all_model_results['model'] == model]['auc'].std())
      print('mean precision: ', all_model_results[all_model_results['model'] == model]['precision'].mean())
      print('stddev precision: ', all_model_results[all_model_results['model'] == model]['precision'].std())
      print('mean recall: ', all_model_results[all_model_results['model'] == model]['recall'].mean())
      print('stddev recall: ', all_model_results[all_model_results['model'] == model]['recall'].std())
      print('mean f1: ', all_model_results[all_model_results['model'] == model]['f1'].mean())
      print('stddev f1: ', all_model_results[all_model_results['model'] == model]['f1'].std())
      print()
      print()
    print()
    
  return all_model_results, all_values, model_predictions, model_ground_truths, model_probas, model_features_ranked, model_features_selected, model_params, model_cm, feature_importance

In [13]:
def run_modeling_experiment(df, num_splits, num_repeats, NT, verbose):
  model_predictions = {} # predictions made by each model
  model_ground_truths = {} # should all be the same, but might as well store these
  model_features_ranked = {} # features ranked during features-selection
  model_feature_importance = {}
  model_features_selected = {} # features selected during feature-selection
  model_params = {} # params of trained models
  model_probas = {}

  # store raw results across all folds
  all_model_results = pd.DataFrame(data=None, columns=['model', 'cv_fold', 'accuracy', 'auc', 'precision', 'recall', 'f1'])
  all_values = pd.DataFrame(data=None, columns=['model', 'cv_fold', 'ground_truths', 'probas', 'preds'])

  df_without_NaN = pre_process_outsideCV(df)
  # loop through models
  for model in MODELS:
    # print model
    print(model)

    # get X and y
    x_old = df_without_NaN
    y = df_without_NaN['label']
    X = df_without_NaN.drop(NON_MODELING_COLS, axis=1)
    ids = x_old["filename"]

    # enter cross-validation loop
    rskf = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state = RANDOM_STATE)

    # initialize fold numbers and predictions/ground truth lists
    fold_num = 0
    predictions = []
    ground_truths = []
    probs = []
    features_ranked = []
    features_importance = []

    for train_index, test_index in rskf.split(X, y):
      # get train and test indices

      X_train, X_test = X.loc[train_index], X.loc[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]
      id_test = ids.loc[test_index]
      # get the scaled version of the train and test set
      mm = MinMaxScaler()
      X_train_scaled = pd.DataFrame(mm.fit_transform(X_train), columns= X_train.columns)
      X_test_scaled = pd.DataFrame(mm.transform(X_test), columns=X_test.columns)

      ## compute the optimal hyper-parameters on the training set
      sampler = optuna.samplers.TPESampler(multivariate=True)
      # sampler = optuna.samplers.RandomSampler()
      name = model+'_'+str(fold_num)
      # default is Tree-structured Parzen Estimator (TPE) optimization algorithm
      study = optuna.create_study(direction='maximize', 
                                  sampler=sampler, 
                                  study_name=name, 
                                  pruner=optuna.pruners.HyperbandPruner(min_resource=1, reduction_factor=3))
      ## we use X_train in the following, not X_train_scaled, because optuna pipeline in objective function has MinMaxScaler() already
      study.optimize(lambda trial: objective(trial, model, X_train, y_train), n_trials=NT)
      trial = study.best_trial
      best_params = trial.params
     
      if model == 'logistic_regression': 
        optimal_clf = LogisticRegression(solver='lbfgs', **best_params)
        
      elif model == 'linear_svm': 
        optimal_clf = SVC(kernel='linear', probability=True, **best_params)

      elif model == 'rbf_svm': 
        optimal_clf = SVC(kernel='rbf', probability=True, **best_params)

      elif model == 'decision_tree': 
        optimal_clf = DecisionTreeClassifier(**best_params)

      elif model == 'rforest': 
        optimal_clf = RandomForestClassifier(**best_params)

      elif model == 'adaboost':  
        optimal_clf = AdaBoostClassifier(**best_params)

      elif model == 'xgboost': 
        optimal_clf = xgb.XGBClassifier(**best_params)

      elif model == 'bagging': 
        optimal_clf = BaggingClassifier(**best_params)
        

      # now re-train the optimal model on the train set and test on the held-out test set

      optimal_clf.fit(X_train_scaled, y_train)
      preds = optimal_clf.predict(X_test_scaled)
      probas = optimal_clf.predict_proba(X_test_scaled)[:, 1]
      acc = accuracy_score(y_test, preds)
      auc = roc_auc_score(y_test, probas)
      precision = precision_score(y_test, preds, labels=np.unique(preds))
      recall = recall_score(y_test, preds, labels=np.unique(preds))
      f1 = f1_score(y_test, preds,  labels=np.unique(preds))
      pars = optimal_clf.get_params()

      # store results 
      all_model_results.loc[len(all_model_results.index)] = [model, fold_num, acc, auc, precision, recall, f1]
      all_values.loc[len(all_values.index)]= [mode, model, fold_num, y_test, probas, preds]
      probs.append(probas)
      predictions.append(preds)
      ground_truths.append(y_test)
      model_params[model + '__fold-' + str(fold_num)] = pars

      # increment fold_num
      fold_num +=1

      # print for sanity
      curr_avg_acc = round(all_model_results[all_model_results['model']==model]['accuracy'].mean(), 2) 
      curr_avg_auc = round(all_model_results[all_model_results['model']==model]['auc'].mean(), 2)
      print('participant_picture_test: ' + str(id_test) +' acc: ' + str(curr_avg_acc) + '   auc: ' + str(curr_avg_auc))

    # store model predictions and ground truths
    model_predictions[model] = predictions
    model_ground_truths[model] = ground_truths
    model_probas[model] = probs

    # verbose
    if verbose:
      print('model: ', model)
      print('mean accuracy: ', all_model_results[all_model_results['model'] == model]['accuracy'].mean())
      print('stddev accuracy: ', all_model_results[all_model_results['model'] == model]['accuracy'].std())
      print('mean auc: ', all_model_results[all_model_results['model'] == model]['auc'].mean())
      print('stddev auc: ', all_model_results[all_model_results['model'] == model]['auc'].std())
      print('mean precision: ', all_model_results[all_model_results['model'] == model]['precision'].mean())
      print('stddev precision: ', all_model_results[all_model_results['model'] == model]['precision'].std())
      print('mean recall: ', all_model_results[all_model_results['model'] == model]['recall'].mean())
      print('stddev recall: ', all_model_results[all_model_results['model'] == model]['recall'].std())
      print('mean f1: ', all_model_results[all_model_results['model'] == model]['f1'].mean())
      print('stddev f1: ', all_model_results[all_model_results['model'] == model]['f1'].std())
      print()
      print()
    print()
  return all_model_results, all_values, model_predictions, model_ground_truths, model_probas, model_features_ranked, model_features_selected, model_params

## Running experiments

In [58]:
# run experiments monomodal for FACE, and AUDIO
for mode in MODALITY:
  model_results, model_values, model_preds, model_gtruths, model_probas, model_featranks, model_featselected, model_params = run_modeling_experiment(DF_fixed_length[mode], 2, 1, 1, True)
  model_results.to_csv(RESULTS_DIR + mode +'_all_model_results.csv')
  model_values["mode"] = mode
  model_values.to_csv(RESULTS_DIR + mode +'_all_model_values.csv')

                  model cv_fold  \
0   logistic_regression       0   
1   logistic_regression       1   
2               rbf_svm       0   
3               rbf_svm       1   
4         decision_tree       0   
5         decision_tree       1   
6            linear_svm       0   
7            linear_svm       1   
8              adaboost       0   
9              adaboost       1   
10              xgboost       0   
11              xgboost       1   
12              bagging       0   
13              bagging       1   
14              rforest       0   
15              rforest       1   

                                        ground_truths  \
0   [1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, ...   
1   [1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, ...   
2   [1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, ...   
3   [1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, ...   
4   [1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, ...   
5   [1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, ...   
6   [

In [None]:
# leave one week and subject out
for mode in MODALITY:
  group = "WEEK"
  model_results, model_values, model_preds, model_gtruths, model_probas, model_featranks, model_featselected, model_params = run_modeling_experiment_logo(DF_fixed_length[mode], NUM_SPLITS, NUM_REPEATS, 50, True, group)
  model_results.to_csv(RESULTS_DIR + mode +'_'+group+'_all_model_results.csv')
  group = "SUBJECT"
  model_results, model_values, model_preds, model_gtruths, model_probas, model_featranks, model_featselected, model_params = run_modeling_experiment_logo(DF_fixed_length[mode], NUM_SPLITS, NUM_REPEATS, 50, True, group)
  model_results.to_csv(RESULTS_DIR + mode +'_'+group+'_all_model_results.csv')

# **Multimodal modeling**

## **Early Fusion**
Fuse the audio, face, and text data at the feature-level. 

### Concatenate feature vectors

In [14]:
# Check if the label proportions
DF_fixed_length = {}
for mode in MODALITY:
  DF_fixed_length[mode] = pd.read_csv(FEATURE_DIR + mode+'_fixed_length_vectors.csv')
  label_proportion = DF_fixed_length[mode]['label'].sum()/len(DF_fixed_length[mode])
  print('BASELINE (classifier that always predicts "Positive Affect level"): ', label_proportion)

BASELINE (classifier that always predicts "Positive Affect level"):  0.5853658536585366


In [15]:
df = pd.concat([DF_fixed_length[MODALITY[0]], DF_fixed_length[MODALITY[1]]],  axis=1, join="inner")
df_early_fusion = pre_process_outsideCV(df) # df without NaN values
print(df_early_fusion.shape)

IndexError: ignored

### Running experiments

In [None]:
# run experiments monomodal for concatenated feature vector
model_results, model_values, model_preds, model_gtruths, model_probas, model_featranks, model_featselected, model_params = run_modeling_experiment(df_early_fusion, NUM_SPLITS, NUM_REPEATS, 50, True)
model_results.to_csv(RESULTS_DIR + mode +'_all_model_results_early_fusion.csv')

In [None]:
# leave one week and subject out
group = "WEEK"
model_results, model_values, model_preds, model_gtruths, model_probas, model_featranks, model_featselected, model_params = run_modeling_experiment_logo(df_early_fusion, NUM_SPLITS, NUM_REPEATS, 50, True, group)
model_results.to_csv(RESULTS_DIR + mode +'_'+group+'_all_model_results_early_fusion.csv')
group = "SUBJECT"
model_results, model_values, model_preds, model_gtruths, model_probas, model_featranks, model_featselected, model_params = run_modeling_experiment_logo(df_early_fusion, NUM_SPLITS, NUM_REPEATS, 50, True, group)
model_results.to_csv(RESULTS_DIR + mode +'_'+group+'_all_model_results_early_fusion.csv')

## **Late Fusion**
Concatenation of modalities at decision-level. Fusion mechanisms:  averaging, voting schemes, weighting based on channel noise and signal variance, or a learned model (see [this paper](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8269806&fbclid=IwAR1C_TKJXvLIdOsgkOiwX_A10pZocHEiBOvhgwjeYgWiTf9B7_N3PiszMQM&tag=1)) 

### Dataframe creation from unimodal predictions

In [16]:
def evaluation_metrics(gt, preds, probas):
  acc = accuracy_score(gt, preds)
  auc = roc_auc_score(gt, probas)
  precision = precision_score(gt, preds, labels=np.unique(preds))
  recall = recall_score(gt, preds, labels=np.unique(preds))
  f1 = f1_score(gt, preds,  labels=np.unique(preds))
  return acc, auc, precision, recall, f1

In [17]:
from ast import literal_eval
df = {}
MODALITY = ["audio", "face"]
for mode in MODALITY:
  df[mode] = pd.read_csv(RESULTS_DIR + mode +'_all_model_values.csv')
df = pd.concat([df[MODALITY[0]],df[MODALITY[1]]], ignore_index = True)#, df[MODALITY[1]]])

### Major voting (hard and soft)
The final classifier decision is either the class label predicted most frequently by the unimodal classifiers (hard majority voting) or the class label with the highest average predicted class probability across the unimodal classifiers (soft majority voting).

In [64]:
def major_voting_late_fusion(df, mv_type):
  #mv_type (string): major voting type: hard or soft
  # store raw results across all folds
  all_model_results = pd.DataFrame(data=None, columns=['model', 'cv_fold', 'accuracy', 'auc', 'precision', 'recall', 'f1'])
  for model in MODELS:
    df_model = df.loc[df['model'] == model]
    # define as many df_model as the number of modalities to take into account
    df_model_mode1 = df_model.loc[df_model['mode'] == MODALITY[0]].reset_index() 
    df_model_mode2 = df_model.loc[df_model['mode'] == MODALITY[1]].reset_index() 
    #loop through folds:
    for index, row in df_model_mode1.iterrows():
      fold_num = df_model_mode1["cv_fold"][index]
      gt = [float(i) for i in ((df_model_mode1["ground_truths"][index].replace("[","")).replace("]", "")).split()]
      pred_1 = [float(i) for i in ((df_model_mode1["preds"][index].replace("[","")).replace("]", "")).split()] 
      pred_2 =  [float(i) for i in((df_model_mode2["preds"][index].replace("[","")).replace("]", "")).split()]
      prob_1 =  [float(i) for i in ((df_model_mode1["probas"][index].replace("[","")).replace("]", "")).split()]
      prob_2 = [float(i) for i in((df_model_mode2["probas"][index].replace("[","")).replace("]", "")).split()]
      pred_major_voting = []
      prob_major_voting = []
      for i in range(0,len(pred_1)):
        if mv_type == "hard":
          if pred_1[i] >= pred_2[i]:
            pred_major_voting.append(pred_1[i])
            prob_major_voting.append(prob_1[i])
          elif pred_1[i] < pred_2[i]:
            pred_major_voting.append(pred_2[i])
            prob_major_voting.append(prob_2[i])
        elif mv_type == "soft":
          if prob_1[i] >= prob_2[i]:
            prob_major_voting.append(prob_1[i])
            pred_major_voting.append(pred_1[i])
          elif prob_1[i] < prob_2[i]:
            prob_major_voting.append(prob_2[i])
            pred_major_voting.append(pred_2[i])
      [acc, auc, precision, recall, f1] =  evaluation_metrics(gt, pred_major_voting, prob_major_voting)
      all_model_results.loc[len(all_model_results.index)] = [model, fold_num, acc, auc, precision, recall, f1]
      print(all_model_results.head())
  return all_model_results


In [66]:
## for each model, get the most frequent class label predicted for each unimodal classifiers 
MV_TYPES = ["hard", "soft"]
for mv_type in MV_TYPES:
  model_results = major_voting_late_fusion(df, mv_type)
  model_results.to_csv(RESULTS_DIR + 'multimodal_all_model_results_major_voting_'+mv_type+'.csv')

                 model cv_fold  accuracy       auc  precision    recall  \
0  logistic_regression       0  0.714286  0.675926   0.714286  0.833333   

         f1  
0  0.769231  
                 model cv_fold  accuracy       auc  precision    recall  \
0  logistic_regression       0  0.714286  0.675926   0.714286  0.833333   
1  logistic_regression       1  0.700000  0.729167   0.800000  0.666667   

         f1  
0  0.769231  
1  0.727273  
                 model cv_fold  accuracy       auc  precision    recall  \
0  logistic_regression       0  0.714286  0.675926   0.714286  0.833333   
1  logistic_regression       1  0.700000  0.729167   0.800000  0.666667   
2              rbf_svm       0  0.571429  0.500000   0.571429  1.000000   

         f1  
0  0.769231  
1  0.727273  
2  0.727273  
                 model cv_fold  accuracy       auc  precision    recall  \
0  logistic_regression       0  0.714286  0.675926   0.714286  0.833333   
1  logistic_regression       1  0.700000  0.72

### Stacking (hard and soft)
Building a meta-model (it gives more context). A final classifier is trained on either the predicted class labels of the unimodal classifiers (hard stacking) or the predicted class probabilities of the unimodal classifiers (soft stacking).

In [None]:
NON_MODELING_COLS = ['model', 'cv_fold', 'mode']

In [67]:
from sklearn.ensemble import StackingClassifier
# TODOO!!
def stacking(df, num_splits, num_repeats, NT, verbose):
  all_model_results = pd.DataFrame(data=None, columns=['model', 'cv_fold', 'accuracy', 'auc', 'precision', 'recall', 'f1'])

  df_without_NaN = pre_process_outsideCV(df)
  # loop through models
  for model in MODELS:
    # print model
    print(model)

    # get X and y
    x_old = df_without_NaN
    y = df_without_NaN['label']
    X = df_without_NaN.drop(NON_MODELING_COLS, axis=1)
    ids = x_old["filename"]

    # enter cross-validation loop
    rskf = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state = RANDOM_STATE)

    # initialize fold numbers and predictions/ground truth lists
    fold_num = 0
    predictions = []
    ground_truths = []
    probs = []
    features_ranked = []
    features_importance = []

    for train_index, test_index in rskf.split(X, y):
      # get train and test indices

      X_train, X_test = X.loc[train_index], X.loc[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]
      id_test = ids.loc[test_index]
      # get the scaled version of the train and test set
      mm = MinMaxScaler()
      X_train_scaled = pd.DataFrame(mm.fit_transform(X_train), columns= X_train.columns)
      X_test_scaled = pd.DataFrame(mm.transform(X_test), columns=X_test.columns)

      ## compute the optimal hyper-parameters on the training set
      sampler = optuna.samplers.TPESampler(multivariate=True)
      # sampler = optuna.samplers.RandomSampler()
      name = model+'_'+str(fold_num)
      # default is Tree-structured Parzen Estimator (TPE) optimization algorithm
      study = optuna.create_study(direction='maximize', 
                                  sampler=sampler, 
                                  study_name=name, 
                                  pruner=optuna.pruners.HyperbandPruner(min_resource=1, reduction_factor=3))
      ## we use X_train in the following, not X_train_scaled, because optuna pipeline in objective function has MinMaxScaler() already
      study.optimize(lambda trial: objective(trial, model, X_train, y_train), n_trials=NT)
      trial = study.best_trial
      best_params = trial.params
     
      if model == 'logistic_regression': 
        optimal_clf = LogisticRegression(solver='lbfgs', **best_params)
        
      elif model == 'linear_svm': 
        optimal_clf = SVC(kernel='linear', probability=True, **best_params)

      elif model == 'rbf_svm': 
        optimal_clf = SVC(kernel='rbf', probability=True, **best_params)

      elif model == 'decision_tree': 
        optimal_clf = DecisionTreeClassifier(**best_params)

      elif model == 'rforest': 
        optimal_clf = RandomForestClassifier(**best_params)

      elif model == 'adaboost':  
        optimal_clf = AdaBoostClassifier(**best_params)

      elif model == 'xgboost': 
        optimal_clf = xgb.XGBClassifier(**best_params)

      elif model == 'bagging': 
        optimal_clf = BaggingClassifier(**best_params)
        

      # now re-train the optimal model on the train set and test on the held-out test set
      estimators = [optimal_clf_1, optimal_clf_2]
      stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(solver='lbfgs', **best_params))
      stacking_clf.fit(X_train_scaled, y_train)
      preds = stacking_clf.predict(X_test_scaled)
      probas = stacking_clf.predict_proba(X_test_scaled)[:, 1]
      acc = accuracy_score(y_test, preds)
      auc = roc_auc_score(y_test, probas)
      precision = precision_score(y_test, preds, labels=np.unique(preds))
      recall = recall_score(y_test, preds, labels=np.unique(preds))
      f1 = f1_score(y_test, preds,  labels=np.unique(preds))
      pars = stacking_clf.get_params()

      # store results 
      all_model_results.loc[len(all_model_results.index)] = [model, fold_num, acc, auc, precision, recall, f1]
      all_values.loc[len(all_values.index)]= [mode, model, fold_num, y_test, probas, preds]
      probs.append(probas)
      predictions.append(preds)
      ground_truths.append(y_test)
      model_params[model + '__fold-' + str(fold_num)] = pars

      # increment fold_num
      fold_num +=1

      # print for sanity
      curr_avg_acc = round(all_model_results[all_model_results['model']==model]['accuracy'].mean(), 2) 
      curr_avg_auc = round(all_model_results[all_model_results['model']==model]['auc'].mean(), 2)
      print('participant_picture_test: ' + str(id_test) +' acc: ' + str(curr_avg_acc) + '   auc: ' + str(curr_avg_auc))

    # store model predictions and ground truths
    model_predictions[model] = predictions
    model_ground_truths[model] = ground_truths
    model_probas[model] = probs

    # verbose
    if verbose:
      print('model: ', model)
      print('mean accuracy: ', all_model_results[all_model_results['model'] == model]['accuracy'].mean())
      print('stddev accuracy: ', all_model_results[all_model_results['model'] == model]['accuracy'].std())
      print('mean auc: ', all_model_results[all_model_results['model'] == model]['auc'].mean())
      print('stddev auc: ', all_model_results[all_model_results['model'] == model]['auc'].std())
      print('mean precision: ', all_model_results[all_model_results['model'] == model]['precision'].mean())
      print('stddev precision: ', all_model_results[all_model_results['model'] == model]['precision'].std())
      print('mean recall: ', all_model_results[all_model_results['model'] == model]['recall'].mean())
      print('stddev recall: ', all_model_results[all_model_results['model'] == model]['recall'].std())
      print('mean f1: ', all_model_results[all_model_results['model'] == model]['f1'].mean())
      print('stddev f1: ', all_model_results[all_model_results['model'] == model]['f1'].std())
      print()
      print()
    print()
  return all_model_results, all_values, model_predictions, model_ground_truths, model_probas, model_features_ranked, model_features_selected, model_params
  

### Hybrid Stacking (hard and soft)
An early fusion vector with features from a set of modalities is concatenated with either the predicted class labels of the set of corresponding unimodal classifiers (hard hybrid fusion) or the predicted class probabilities of the set of corresponding unimodal classifiers (soft hybrid fusion), in order to create final feature vectors that are used in a classifier.

### Others approaches (not implemented)
**Bagging** 
A Bagging classifier is an ensemble meta-estimator that fits base classifiers each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction.
sklearn.ensemble.BaggingClassifier


**Boosting**
Boosting is an ensemble learning method that combines a set of weak learners into a strong learner to minimize training errors. In boosting, a random sample of data is selected, fitted with a model and then trained sequentially—that is, each model tries to compensate for the weaknesses of its predecessor.