In [1]:
import pickle
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Preprocessing

In [2]:
annotation_path = 'FI/gt/annotation_training.pkl'
with open(annotation_path, 'rb') as f:
    annotation = pickle.load(f, encoding='latin1')

In [3]:
df_train = pd.DataFrame()
audios_features_dir = "data/train/audios_features/"

for f in os.listdir(audios_features_dir):
    # check if the file is csv
    if (f.rsplit(".")[-1] == "csv"):
        df = pd.read_csv(os.path.join(audios_features_dir, f), sep=";")

        # calculate mean for each feature
        df = pd.DataFrame(df.mean()).transpose()
        
        # add label
        filename = f.rsplit(".")[0] + "." + f.rsplit(".")[1] + ".mp4"
        df["extraversion"] = annotation['extraversion'][filename]
        df["neuroticism"] = annotation['neuroticism'][filename]
        df["agreeableness"] = annotation['agreeableness'][filename]
        df["conscientiousness"] = annotation['conscientiousness'][filename]
        df["openness"] = annotation['openness'][filename]
        
        df_train = pd.concat([df_train, df], axis=0, ignore_index=True)

In [4]:
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,507,508,509,510,511,extraversion,neuroticism,agreeableness,conscientiousness,openness
0,0.203672,-0.429291,0.081586,0.185123,0.152433,0.004605,-0.552288,-0.777085,-0.091837,-0.070481,...,-0.806242,0.839549,0.041291,0.091085,0.360024,0.551402,0.5,0.527473,0.650485,0.744444
1,0.179766,-0.407752,0.565502,0.231725,0.061883,0.044931,-0.574203,-0.754546,-0.200325,0.0405,...,-0.776032,0.785381,0.019143,-0.105647,0.241413,0.392523,0.427083,0.516484,0.475728,0.466667
2,0.228794,-0.432109,0.623883,0.201075,0.07498,0.010355,-0.651151,-0.737271,-0.008256,-0.060556,...,-0.804595,0.961633,-0.099022,-0.281925,0.274971,0.317757,0.322917,0.549451,0.368932,0.544444
3,0.20225,-0.542036,1.062031,0.173404,0.10274,0.02518,-0.500155,-0.91562,0.189747,-0.071974,...,-0.780025,0.911415,-0.049688,-0.068375,0.261487,0.299065,0.291667,0.373626,0.320388,0.344444
4,0.121846,-0.348027,-0.314708,0.288748,0.221186,0.34351,-0.49179,-0.374474,0.191107,-0.256439,...,-0.42592,0.982384,-0.003899,0.017689,0.230127,0.476636,0.604167,0.593407,0.572816,0.611111


In [5]:
df_train.shape

(6000, 517)

In [6]:
df_train.isnull().sum().sum()

0

In [7]:
features = df_train.iloc[:, :512].to_numpy()
labels = df_train.iloc[:, 512:].to_numpy()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

# Training

In [9]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [10]:
score = 1 - mean_squared_error(y_test, y_pred)
print(score)

0.9844431859633631
