# Task 4: The Challenge

## Prepare Dataset

In [39]:
import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from tqdm import tqdm

from sklearn.model_selection import GroupShuffleSplit, GroupKFold, cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, auc, confusion_matrix, make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [40]:
# load datasets

df_train = pd.read_csv("task_3_training_e8da4715deef7d56_f8b7378_generic.csv", header=0)
df_test = pd.read_csv("task_4_test_dd4bd32b08b776e6_daf99ad_generic.csv", header=0)

# keep high level features (GEMS) as separate array for training

hl_train = df_train[df_train.columns[175:]]

# drop columns 175-201, since they contain the high-level features not available in the test set

df_train = df_train.drop(df_train.columns[175:], axis=1)

# drop valence and arousal as specified in the task description as well as pianist, segment and snippet column

df_train = df_train.drop(["arousal", "valence", "pianist_id", "segment_id", "snippet_id"], axis=1)
df_test = df_test.drop(["pianist_id", "segment_id", "snippet_id"], axis=1)

In [41]:
# separate data and labels from training set

X_train = df_train[df_train.columns[:-1]]
y_train = df_train[df_train.columns[-1]]


## Predicting high-level features

In [42]:
# normalize data

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(df_test)

# use non-optimized xgb to predict high level features
# multioutputregressor trains one regressor for each high level feature

xgb_multi = MultiOutputRegressor(xgb.XGBRegressor(objective="reg:squarederror", random_state=42), n_jobs=-1)
xgb_multi.fit(X_train, hl_train)
hl_pred = xgb_multi.predict(X_test)

In [5]:
hl_pred

array([[2.9042423 , 3.0406234 , 1.4873655 , ..., 0.10163505, 0.04431162,
        0.03267397],
       [3.0142393 , 2.9344585 , 1.86377   , ..., 0.01307283, 0.0660662 ,
        0.34811604],
       [2.512981  , 2.1732771 , 1.7583784 , ..., 0.04756289, 0.37012482,
        0.02735191],
       ...,
       [2.9266903 , 2.861176  , 1.5889468 , ..., 0.07562069, 0.24788648,
        0.07221433],
       [2.5819573 , 2.550523  , 1.387153  , ..., 0.26977864, 0.7670786 ,
        0.04646803],
       [2.8836863 , 3.0191398 , 1.3788074 , ..., 0.29734248, 0.7923114 ,
        0.13583864]], dtype=float32)

In [4]:
# merge high level features together with test set
columns = ["gems_wonder","gems_transcendence","gems_tenderness","gems_nostalgia","gems_peacefulness","gems_power","gems_joyful_activation","gems_tension","gems_sadness","gemmes_movement","gemmes_force","gemmes_interior","gemmes_wandering","gemmes_flow","gems_wonder_binary","gems_transcendence_binary","gems_tenderness_binary","gems_nostalgia_binary","gems_peacefulness_binary","gems_power_binary","gems_joyful_activation_binary","gems_tension_binary","gems_sadness_binary","gemmes_movement_binary","gemmes_force_binary","gemmes_interior_binary","gemmes_wandering_binary","gemmes_flow_binary"]
df_test = pd.concat([df_test, pd.DataFrame(hl_pred, columns=columns)], axis=1)
df_test.to_csv("df_test_restored_hl.csv", index=False)
df_test

NameError: name 'hl_pred' is not defined

## Predict quadrants

In [43]:
# reload df_train to restore high level features removed earlier and drop non usable columns and get df_test from saved file (so the notebook doesn't need to be run every time)

df_train = pd.read_csv("task_3_training_e8da4715deef7d56_f8b7378_generic.csv", header=0)
df_test = pd.read_csv("df_test_restored_hl.csv", header=0)

# save segment_id column for GroupShuffleSplit and drop pianist, segment and snippet column
group_idx = df_train["segment_id"]

df_train = df_train.drop(["arousal", "valence", "pianist_id", "segment_id", "snippet_id"], axis=1)

In [44]:
# The last 14 columns (GEMS) are binary. So far we predicted values in range [0,1]. We round the values to match the binary type

df_test[df_test.columns[-14:]] = df_test[df_test.columns[-14:]].round(decimals=0).astype(int)

In [45]:
# prepare data and label

X = df_train.drop(["quadrant"], axis=1)
y = df_train["quadrant"]

In [46]:
# custom scoring function for GridSearchCV

# Labels and corresponding mood
# 1: happy
# 2: angry
# 3: sad
# 4: relaxed

def filmotion_scoring(y, y_pred):
    balance = 0
    revenue_matrix = np.array([[5,-5,-5,2],[-5,10,2,-5],[-5,2,10,-5],[2,-5,-2,5]])
    for true, pred in zip(y, y_pred):
        balance += revenue_matrix[int(true)-1,int(pred)-1]
    return balance/len(y)

In [47]:
mask = np.logical_or(np.logical_or(group_idx==20, group_idx==21), group_idx==23)

# define test set
X_test = X.loc[mask]
y_test = y.loc[mask]
group_idx_test = group_idx.loc[mask]

# define training set
mask = np.invert(mask)
X_train = X.loc[mask]
y_train = y.loc[mask]
group_idx_train = group_idx.loc[mask]

In [48]:
# we will use GroupKFold which seperates our data into k folds in which the groups differ, in a way that in 
# the first fold only group 1,2 in the second fold only group 3,4 ... are listed
gkf = GroupKFold(n_splits=6)

In [49]:
# make custom scorer for grid search using previously defined scoring function
filmotion = make_scorer(filmotion_scoring)

In [50]:
# Getting the best parameter settings using GridSearch

#########
## KNN ##
#########

knn_params = {"n_neighbors" : [3, 5, 25, 50], "algorithm": ["auto"], "weights": ["uniform"]}

knn_model = GridSearchCV(KNeighborsClassifier(), param_grid=knn_params, cv=gkf, scoring=filmotion)
knn_model.fit(X_train, y_train, groups=group_idx_train)

#########
## XGB ##
#########

xgb_params = {'min_child_weight': [4], 'gamma': [0], 'max_depth': [4],'n_estimators': [800]}

xgb_model = GridSearchCV(estimator=XGBClassifier(use_label_encoder=False, eval_metric='merror'), param_grid=xgb_params, cv=gkf, scoring=filmotion)
xgb_model.fit(X_train, y_train, groups=group_idx_train)

########
## RF ##
########

rf_params = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

rf_model = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_params, cv=gkf, scoring=filmotion)
rf_model.fit(X_train, y_train, groups=group_idx_train)

#########################
## Logistic Regression ##
#########################

log_reg_param = {"penalty": ['l2'], 'C': [1], "dual":[True]}
reg_model = GridSearchCV(LogisticRegression(solver='liblinear', max_iter=8000), param_grid=log_reg_param, cv=gkf, scoring=filmotion)
reg_model.fit(X_train, y_train, groups=group_idx_train)

NameError: name 'XGBClassifier' is not defined