In [1]:
import numpy as np
import pandas as pd

from utils import get_fields
from statsmodels.stats.outliers_influence import variance_inflation_factor 

In [2]:
# Experiment key: 'mf' selects the music-focused study (perceptual descriptors).
experiment = "mf"

# get_fields yields a pair; only its first element (the field names) is kept.
ordered_fields, _ = get_fields(experiment)
# Materialise as a list, keeping the order supplied by get_fields (not alphabetical).
ordered_fields = [*ordered_fields]

## Load control ratings

In [3]:
# Read the ratings table for the selected experiment; the file name is derived
# from `experiment` and resolved relative to the current working directory.
ratings_df = pd.read_csv(f"{experiment}_ratings.csv")

## Compute mean ratings per `stimulus_id`

In [4]:
# Remove the participant identifier before aggregating so it is not averaged
# together with the rating columns.
# errors="ignore" keeps this cell re-runnable: `ratings_df` is reassigned here, so
# a second execution would otherwise raise KeyError (the column is already gone).
ratings_df = ratings_df.drop(columns=["prolific_id"], errors="ignore")

# One row per stimulus: the mean of every remaining column across its ratings.
ratings_means_df = ratings_df.groupby("stimulus_id").mean()
ratings_means_df.head()

Unnamed: 0_level_0,Electric/Acoustic,Distorted/Clear,Many Instruments/Few Instruments,Loud/Soft,Heavy/Light,High pitch/Low pitch,Wide pitch variation/Narrow pitch variation,Punchy/Smooth,Harmonious/Disharmonious,Clear melody/No melody,Repetitive/Non-repetitive,Complex rhythm/Simple rhythm,Fast tempo/Slow tempo,Dense/Sparse,Strong beat/Weak beat
stimulus_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
-BaTPbE0Gdo,3.666667,6.5,4.333333,5.833333,6.666667,2.666667,4.666667,6.0,1.5,2.166667,4.333333,5.666667,3.5,4.166667,4.166667
-KKsNKY4V8k,3.0,5.166667,2.5,2.5,5.0,2.5,3.666667,4.0,2.666667,2.5,2.666667,5.0,2.5,3.166667,3.166667
-Mqc2csT3ZM,2.666667,3.5,4.666667,4.666667,5.333333,2.666667,4.666667,3.5,3.5,3.333333,2.833333,5.333333,3.333333,3.666667,4.0
-NEHGAMiA2I,4.833333,6.166667,5.5,5.666667,6.166667,3.666667,4.5,5.5,2.333333,3.666667,2.833333,4.833333,4.333333,4.5,4.333333
-SEKfzdaIK0,3.666667,3.166667,5.333333,3.5,2.833333,4.166667,4.5,2.166667,3.0,3.0,2.666667,4.833333,2.666667,4.0,3.0


In [5]:
# Rule of thumb used here: a VIF above 5 indicates high multicollinearity.
# Each descriptor column is mean-centred before the VIF is computed
# (presumably so that no explicit intercept column is required - TODO confirm).
descriptor_means = ratings_means_df[ordered_fields]
vif_data = descriptor_means - descriptor_means.mean(axis=0)

# One VIF per descriptor, in the same order as `ordered_fields`.
design_matrix = vif_data.values
vif = pd.DataFrame(
    {
        "variable": ordered_fields,
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
    }
)

print(vif)

                                       variable       VIF
0                             Electric/Acoustic  2.173015
1                               Distorted/Clear  3.485856
2              Many Instruments/Few Instruments  1.868239
3                                     Loud/Soft  4.549181
4                                   Heavy/Light  7.091429
5                          High pitch/Low pitch  2.300148
6   Wide pitch variation/Narrow pitch variation  1.761620
7                                 Punchy/Smooth  3.789109
8                      Harmonious/Disharmonious  3.767592
9                        Clear melody/No melody  3.970719
10                    Repetitive/Non-repetitive  1.970653
11                 Complex rhythm/Simple rhythm  1.602566
12                        Fast tempo/Slow tempo  1.743356
13                                 Dense/Sparse  3.341959
14                        Strong beat/Weak beat  3.491619
