# Task 4: The Challenge

## Prepare Dataset

In [145]:
import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from tqdm import tqdm

from sklearn.model_selection import GroupShuffleSplit, GroupKFold, cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, auc, confusion_matrix, make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [46]:
# load datasets

df_train = pd.read_csv("task_3_training_e8da4715deef7d56_f8b7378_generic.csv", header=0)
df_test = pd.read_csv("task_4_test_dd4bd32b08b776e6_daf99ad_generic.csv", header=0)

# keep high level features (GEMS) as separate array for training

hl_train = df_train[df_train.columns[175:]]

# drop columns 175-201, since they contain the high-level features not available in the test set

df_train = df_train.drop(df_train.columns[175:], axis=1)

# drop valence and arousal as specified in the task description as well as pianist, segment and snippet column

df_train = df_train.drop(["arousal", "valence", "pianist_id", "segment_id", "snippet_id"], axis=1)
df_test = df_test.drop(["pianist_id", "segment_id", "snippet_id"], axis=1)

In [47]:
# separate data and labels from training set

X_train = df_train[df_train.columns[:-1]]
y_train = df_train[df_train.columns[-1]]


## Predicting high-level features

In [4]:
# normalize data

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(df_test)

# use non-optimized xgb to predict high level features
# multioutputregressor trains one regressor for each high level feature

xgb_multi = MultiOutputRegressor(xgb.XGBRegressor(objective="reg:squarederror", random_state=42), n_jobs=-1)
xgb_multi.fit(X_train, hl_train)
hl_pred = xgb_multi.predict(X_test)

In [5]:
hl_pred

array([[2.9042423 , 3.0406234 , 1.4873655 , ..., 0.10163505, 0.04431162,
        0.03267397],
       [3.0142393 , 2.9344585 , 1.86377   , ..., 0.01307283, 0.0660662 ,
        0.34811604],
       [2.512981  , 2.1732771 , 1.7583784 , ..., 0.04756289, 0.37012482,
        0.02735191],
       ...,
       [2.9266903 , 2.861176  , 1.5889468 , ..., 0.07562069, 0.24788648,
        0.07221433],
       [2.5819573 , 2.550523  , 1.387153  , ..., 0.26977864, 0.7670786 ,
        0.04646803],
       [2.8836863 , 3.0191398 , 1.3788074 , ..., 0.29734248, 0.7923114 ,
        0.13583864]], dtype=float32)

In [6]:
# merge high level features together with test set
columns = ["gems_wonder","gems_transcendence","gems_tenderness","gems_nostalgia","gems_peacefulness","gems_power","gems_joyful_activation","gems_tension","gems_sadness","gemmes_movement","gemmes_force","gemmes_interior","gemmes_wandering","gemmes_flow","gems_wonder_binary","gems_transcendence_binary","gems_tenderness_binary","gems_nostalgia_binary","gems_peacefulness_binary","gems_power_binary","gems_joyful_activation_binary","gems_tension_binary","gems_sadness_binary","gemmes_movement_binary","gemmes_force_binary","gemmes_interior_binary","gemmes_wandering_binary","gemmes_flow_binary"]
df_test = pd.concat([df_test, pd.DataFrame(hl_pred, columns=columns)], axis=1)
df_test.to_csv("df_test_restored_hl.csv", index=False)
df_test

Unnamed: 0,essentia_dissonance_mean,essentia_dissonance_stdev,essentia_dynamic_complexity,essentia_loudness,essentia_onset_rate,essentia_pitch_salience_mean,essentia_pitch_salience_stdev,essentia_spectral_centroid_mean,essentia_spectral_centroid_stdev,essentia_spectral_complexity_mean,...,gems_peacefulness_binary,gems_power_binary,gems_joyful_activation_binary,gems_tension_binary,gems_sadness_binary,gemmes_movement_binary,gemmes_force_binary,gemmes_interior_binary,gemmes_wandering_binary,gemmes_flow_binary
0,0.229886,0.062064,4.818434,93184008.0,3.600000,0.614987,0.090779,1365.162964,331.913849,28.465279,...,-0.046054,1.097160,0.779082,0.078369,-0.015770,0.287028,0.351035,0.101635,0.044312,0.032674
1,0.178258,0.062866,3.003938,109259888.0,4.000000,0.564438,0.094290,1215.545288,271.362946,27.719908,...,-0.052386,0.318361,0.148020,0.431009,-0.067539,0.348549,0.613892,0.013073,0.066066,0.348116
2,0.209623,0.063750,1.983438,98162960.0,4.600000,0.553583,0.111736,1339.914185,255.108398,28.067129,...,0.200367,0.274190,0.515871,0.603049,-0.055371,0.400845,0.461815,0.047563,0.370125,0.027352
3,0.215004,0.061839,1.470698,72971816.0,3.915029,0.642106,0.092053,1143.090576,483.887421,28.758064,...,0.177475,0.520224,0.708390,0.462060,0.022678,0.570064,0.361350,0.192380,0.089040,0.096276
4,0.187017,0.072593,2.613981,80688832.0,2.600000,0.564319,0.104814,1148.319458,238.749054,28.587963,...,0.042277,0.858141,-0.026433,0.343827,0.188275,0.258797,0.455369,0.428896,0.048531,0.165239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2080,0.239235,0.048011,1.178853,50233408.0,2.516549,0.640804,0.073967,1298.177979,268.525360,27.625000,...,0.047049,0.228876,0.146252,0.363076,0.163203,0.048167,0.551620,-0.001526,0.440754,0.640478
2081,0.293661,0.060317,2.598517,178390704.0,2.400000,0.676194,0.080788,1602.075806,396.212982,29.527779,...,0.259998,0.630264,0.346204,0.758534,0.066646,0.727459,0.563753,0.123525,0.215549,0.058116
2082,0.272440,0.058219,1.851364,178757696.0,4.600000,0.681900,0.081200,1246.282837,252.229889,30.159721,...,-0.061432,0.364309,0.436263,0.261264,0.151829,0.283592,0.427677,0.075621,0.247886,0.072214
2083,0.273417,0.060010,1.930694,285332608.0,2.000000,0.708230,0.066477,1536.803711,552.418701,29.668982,...,0.085237,0.890854,0.566359,0.685380,0.134412,0.725270,0.593679,0.269779,0.767079,0.046468


## Predict quadrants

In [146]:
# reload df_train to restore high level features removed earlier and drop non usable columns and get df_test from saved file (so the notebook doesn't need to be run every time)

df_train = pd.read_csv("task_3_training_e8da4715deef7d56_f8b7378_generic.csv", header=0)
df_test = pd.read_csv("df_test_restored_hl.csv", header=0)

# save segment_id column for GroupShuffleSplit and drop pianist, segment and snippet column
group_idx = df_train["segment_id"]

df_train = df_train.drop(["arousal", "valence", "pianist_id", "segment_id", "snippet_id"], axis=1)

In [147]:
# prepare data and label

X = df_train.drop(["quadrant"], axis=1)
y = df_train["quadrant"]

In [148]:
# custom scoring function for GridSearchCV

# Labels and corresponding mood
# 1: happy
# 2: angry
# 3: sad
# 4: relaxed

def filmotion_scoring(y, y_pred):
    balance = 0
    revenue_matrix = np.array([[5,-5,-5,2],[-5,10,2,-5],[-5,2,10,-5],[2,-5,-2,5]])
    for true, pred in zip(y, y_pred):
        balance += revenue_matrix[int(true)-1,int(pred)-1]
    return balance/len(y)

In [149]:
mask = np.logical_or(np.logical_or(group_idx==20, group_idx==21), group_idx==23)

# define test set
X_test = X.loc[mask]
y_test = y.loc[mask]
group_idx_test = group_idx.loc[mask]

# define training set
mask = np.invert(mask)
X_train = X.loc[mask]
y_train = y.loc[mask]
group_idx_train = group_idx.loc[mask]

In [150]:
# we will use GroupKFold which seperates our data into k folds in which the groups differ, in a way that in 
# the first fold only group 1,2 in the second fold only group 3,4 ... are listed
gkf = GroupKFold(n_splits=6)

In [151]:
# make custom scorer for grid search using previously defined scoring function
filmotion = make_scorer(filmotion_scoring)

In [155]:
knn_param = {"n_neighbors" : [3, 5, 25, 50], "algorithm": ["auto"],
            "weights": ["uniform"]}
knn_model = GridSearchCV(KNeighborsClassifier(), param_grid=knn_param, cv=gkf, scoring=filmotion)
knn_model.fit(X_train, y_train, groups=group_idx_train)
knn_param = knn_model.best_params_
knn_model = knn_model.best_estimator_

In [159]:
knn_prob = knn_model.predict(X_train)
training_score = cross_val_score(knn_model, X_train, y_train, cv=gkf, groups=group_idx_train, scoring=filmotion)
print("KNearest has accuracy: ",training_score.mean())

KNearest has accuracy:  0.4950680580657534


In [160]:
# TODO: Ensemble and get final estimation