In [1]:
# put download.sh and this notebook in the same folder on google drive. Remember to modify path!
from google.colab import drive
drive.mount('/content/gdrive')

import os

path = "/content/gdrive/MyDrive/DL_Project/Data"
os.chdir(path)

Mounted at /content/gdrive


In [2]:
import librosa
import os
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
#from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import csv
import torch.optim as optim
from tqdm import tqdm
from os.path import exists

# **FEATURE EXTRACTION**

In [3]:
configs = [
    [2048, 512, 2048],
    [2048, 512, 1024],
    [2048, 1024, 1024],
    [1024, 512, 1024],
    [1024, 256, 1024]
]

def process_data(personality_dir="Personality_Scores", metadata_dir="Metadata", audio_dir="Audio_clips"):
    df_personality = get_personality_scores(personality_dir)
    df_metadata = get_metadata(metadata_dir)
    df_feature = get_features(audio_dir)
    
    df = pd.merge(df_personality, df_metadata, left_on="Clip_ID", right_on="Clip_ID")
    df = pd.merge(df, df_feature, left_on="Clip_ID", right_on="Clip_ID")
    
    return df

def get_personality_scores(data_dir_path):
    df = pd.read_csv(data_dir_path + "/Score_011.csv")

    for i in range(1, 11):
        df_tmp = pd.read_csv(data_dir_path + f"/Score_0{i:02d}.csv")
        
        df["Extraversion"] = df["Extraversion"] + df_tmp["Extraversion"]
        df["Agreeableness"] = df["Agreeableness"] + df_tmp["Agreeableness"]
        df["Conscientiousness"] = df["Conscientiousness"] + df_tmp["Conscientiousness"]
        df["Neuroticism"] = df["Neuroticism"] + df_tmp["Neuroticism"]
        df["Openness"] = df["Openness"] + df_tmp["Openness"]

    df["Extraversion"] = df["Extraversion"] / 11
    df["Agreeableness"] = df["Agreeableness"] / 11
    df["Conscientiousness"] = df["Conscientiousness"] / 11
    df["Neuroticism"] = df["Neuroticism"] / 11
    df["Openness"] = df["Openness"] / 11
    
    return df

def get_metadata(data_dir_path):
    df = pd.read_csv(data_dir_path + "/Metadata.csv")
    return df

def get_features(data_dir_path):
    data = []
    
    for filename in os.listdir(data_dir_path):
        f = os.path.join(data_dir_path, filename)
        
        y, sr = librosa.load(f)
        output = []
        pad_len = 0
        for config in configs:
            s = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=config[0], hop_length=config[1], win_length=config[2])
            pad_len = max(pad_len, len(s[0]))
            output.append(s)
        
        for i, cur in enumerate(output):
            cur = np.pad(cur, ((0,0), (0, pad_len-len(cur[0]))))
            output[i] = cur
        output = np.array(output)

        data.append([filename.split(".")[0], output])
    
    df = pd.DataFrame(data, columns=["Clip_ID", "features"])
    return df


In [4]:
!pip3 install pickle5
import pickle5 as pickle

Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[?25l[K     |█▎                              | 10 kB 33.0 MB/s eta 0:00:01[K     |██▋                             | 20 kB 36.7 MB/s eta 0:00:01[K     |███▉                            | 30 kB 35.9 MB/s eta 0:00:01[K     |█████▏                          | 40 kB 24.8 MB/s eta 0:00:01[K     |██████▍                         | 51 kB 19.8 MB/s eta 0:00:01[K     |███████▊                        | 61 kB 21.8 MB/s eta 0:00:01[K     |█████████                       | 71 kB 23.5 MB/s eta 0:00:01[K     |██████████▎                     | 81 kB 25.1 MB/s eta 0:00:01[K     |███████████▌                    | 92 kB 27.3 MB/s eta 0:00:01[K     |████████████▉                   | 102 kB 25.6 MB/s eta 0:00:01[K     |██████████████                  | 112 kB 25.6 MB/s eta 0:00:01[K     |███████████████▍                | 122 kB 25.6 MB/s eta 0:00:01[K     |████████████████▋ 

In [226]:
if exists("./processed_data.pkl"):
  with open("./processed_data.pkl", 'rb') as f:
    data = pickle.load(f)
else:
  ! bash download.sh
  data = process_data()
  data.to_pickle("processed_data.pkl")

# **HYPER PARAMETERS**

In [227]:
agg_type = 'numerical'
features = ['Extraversion', 'Agreeableness', 'Conscientiousness', 'Neuroticism', 'Openness']

if agg_type == 'numerical':
    agg = ''
else:
    agg = '_categorical'

batch_size = 16
lr = 1e-3
epochs = 200

# **DATA LOADER**

In [228]:
if agg_type=='numerical':
    cutoff = 0
    data['Extraversion'] = np.where(data['Extraversion'] > cutoff,1,0)
    data['Agreeableness'] = np.where(data['Agreeableness'] > cutoff,1,0)
    data['Conscientiousness'] = np.where(data['Conscientiousness'] > cutoff,1,0)
    data['Neuroticism'] = np.where(data['Neuroticism'] > cutoff,1,0)
    data['Openness'] = np.where(data['Openness'] > cutoff,1,0)


In [229]:
#Split dataframe into train, validation, and test sets
train, val, test = np.split(
    data.sample(frac=1, random_state=101),
    [int(.7*len(data)), int(.85*len(data))]
)

In [230]:
train.head()

Unnamed: 0,Clip_ID,Extraversion,Agreeableness,Conscientiousness,Neuroticism,Openness,Speaker_ID,Gender,Status,features
107,cut_feb2103-guest-0-2,1,0,1,0,1,279,M,J,"[[[0.016320303, 0.0054691625, 0.001031203, 0.0..."
74,cut_feb1802-guest-4-16,1,1,1,0,1,3,M,J,"[[[0.14793351, 0.5178786, 0.60005057, 0.137282..."
374,cut_feb0301-human-9,0,1,0,0,0,134,M,G,"[[[0.0004180302, 0.00038920733, 0.00029120274,..."
274,cut_feb2803-guest-5-21,1,0,1,1,1,241,M,J,"[[[0.00033727323, 0.0004824664, 0.001124168, 0..."
212,cut_feb2404-guest-4-30,1,1,1,0,1,208,M,J,"[[[0.39541173, 0.07909767, 0.0041097235, 0.007..."


In [231]:
type(data['features'].iloc[5])

numpy.ndarray

In [232]:
train['features'].iloc[5].shape

(5, 128, 862)

In [233]:
train.shape

(448, 10)

In [234]:
type(train)

pandas.core.frame.DataFrame

In [235]:
def get_h(arr):
    vals = list()
    for i in range(1, len(arr)):
        vals.append(arr[i] - arr[i-1])

    vals_uni, vals_counts = np.unique(np.array(vals), return_counts=True)

    count_dict = dict(zip(vals_uni, vals_counts))

    vals_prob = np.array(vals_counts) / np.array(vals_counts).sum()

    count = len(vals_uni)

    """

    for i in range(len(vals_uni)):
        if vals_counts[i] != 1:
            print("non-unique encountered", vals_uni[i], vals_counts[i])

        if vals_uni[i] == 0:
            print("zero count", vals_prob[i])

    print(len(vals))
    print(len(vals_uni))

    print("vals_uni", len(vals_uni), vals_uni[:50])
    print("vals_counts", len(vals_counts), vals_counts[:50])
    print("vals_prob", len(vals_prob), vals_prob[:50])
    print("count_dict", len(count_dict), count_dict)
    print("count", count)

    """

    p_logp = list()

    k = 0

    for i, val in enumerate(vals_uni):

        p = vals_prob[i]

        if p < 10**(-5):
            print(p)
            k += 1
        else:
            if p == 0:
                p_logp.append(0)
            else:
                p_logp.append(p * np.log(p))

    #p_logp = vals_uni * np.log(vals_uni)
    p_logp_sum = np.array(p_logp).sum()
    #print(p_logp_sum)
    h = - p_logp_sum / np.log(count)
    return h


In [236]:
print(train['features'].iloc[0][0][0].min())
print(train['features'].iloc[0][0][0].max())
print(train['features'].iloc[0][0][0].mean())
print(get_h(train['features'].iloc[0][0][0]))

0.0
0.09506792
0.002189333
0.6146113531026953


In [245]:
a_min_columns = ["a_min" + str(i) for i in range(128)]
b_min_columns = ["b_min" + str(i) for i in range(128)]
c_min_columns = ["c_min" + str(i) for i in range(128)]
d_min_columns = ["d_min" + str(i) for i in range(128)]
e_min_columns = ["e_min" + str(i) for i in range(128)]


a_max_columns = ["a_max" + str(i) for i in range(128)]
b_max_columns = ["b_max" + str(i) for i in range(128)]
c_max_columns = ["c_max" + str(i) for i in range(128)]
d_max_columns = ["d_max" + str(i) for i in range(128)]
e_max_columns = ["e_max" + str(i) for i in range(128)]


a_mean_columns = ["a_mean" + str(i) for i in range(128)]
b_mean_columns = ["b_mean" + str(i) for i in range(128)]
c_mean_columns = ["c_mean" + str(i) for i in range(128)]
d_mean_columns = ["d_mean" + str(i) for i in range(128)]
e_mean_columns = ["e_mean" + str(i) for i in range(128)]


a_h_columns = ["a_h" + str(i) for i in range(128)]
b_h_columns = ["b_h" + str(i) for i in range(128)]
c_h_columns = ["c_h" + str(i) for i in range(128)]
d_h_columns = ["d_h" + str(i) for i in range(128)]
e_h_columns = ["e_h" + str(i) for i in range(128)]



In [246]:
data_feats_train = pd.DataFrame()

samples_count = train.shape[0]

for i in range(len(a_min_columns)):

    data_feats_train[str(a_min_columns[i])] = [0]*samples_count
    data_feats_train[str(b_min_columns[i])] = [0]*samples_count
    data_feats_train[str(c_min_columns[i])] = [0]*samples_count
    data_feats_train[str(d_min_columns[i])] = [0]*samples_count
    data_feats_train[str(e_min_columns[i])] = [0]*samples_count

    data_feats_train[str(a_max_columns[i])] = [0]*samples_count
    data_feats_train[str(b_max_columns[i])] = [0]*samples_count
    data_feats_train[str(c_max_columns[i])] = [0]*samples_count
    data_feats_train[str(d_max_columns[i])] = [0]*samples_count
    data_feats_train[str(e_max_columns[i])] = [0]*samples_count

    data_feats_train[str(a_mean_columns[i])] = [0]*samples_count
    data_feats_train[str(b_mean_columns[i])] = [0]*samples_count
    data_feats_train[str(c_mean_columns[i])] = [0]*samples_count
    data_feats_train[str(d_mean_columns[i])] = [0]*samples_count
    data_feats_train[str(e_mean_columns[i])] = [0]*samples_count

    data_feats_train[str(a_h_columns[i])] = [0]*samples_count
    data_feats_train[str(b_h_columns[i])] = [0]*samples_count
    data_feats_train[str(c_h_columns[i])] = [0]*samples_count
    data_feats_train[str(d_h_columns[i])] = [0]*samples_count
    data_feats_train[str(e_h_columns[i])] = [0]*samples_count

data_feats_train = data_feats_train.copy().astype('float64')

for i in range(len(a_min_columns)):

    for row_num in range(samples_count):

        data_feats_train[str(a_min_columns[i])].iloc[row_num] = train['features'].iloc[row_num][0][i].min()
        data_feats_train[str(b_min_columns[i])].iloc[row_num] = train['features'].iloc[row_num][1][i].min()
        data_feats_train[str(c_min_columns[i])].iloc[row_num] = train['features'].iloc[row_num][2][i].min()
        data_feats_train[str(d_min_columns[i])].iloc[row_num] = train['features'].iloc[row_num][3][i].min()
        data_feats_train[str(e_min_columns[i])].iloc[row_num] = train['features'].iloc[row_num][4][i].min()

        data_feats_train[str(a_max_columns[i])].iloc[row_num] = train['features'].iloc[row_num][0][i].max()
        data_feats_train[str(b_max_columns[i])].iloc[row_num] = train['features'].iloc[row_num][1][i].max()
        data_feats_train[str(c_max_columns[i])].iloc[row_num] = train['features'].iloc[row_num][2][i].max()
        data_feats_train[str(d_max_columns[i])].iloc[row_num] = train['features'].iloc[row_num][3][i].max()
        data_feats_train[str(e_max_columns[i])].iloc[row_num] = train['features'].iloc[row_num][4][i].max()

        data_feats_train[str(a_mean_columns[i])].iloc[row_num] = train['features'].iloc[row_num][0][i].mean()
        data_feats_train[str(b_mean_columns[i])].iloc[row_num] = train['features'].iloc[row_num][1][i].mean()
        data_feats_train[str(c_mean_columns[i])].iloc[row_num] = train['features'].iloc[row_num][2][i].mean()
        data_feats_train[str(d_mean_columns[i])].iloc[row_num] = train['features'].iloc[row_num][3][i].mean()
        data_feats_train[str(e_mean_columns[i])].iloc[row_num] = train['features'].iloc[row_num][4][i].mean()

        data_feats_train[str(a_h_columns[i])].iloc[row_num] = get_h(train['features'].iloc[row_num][0][i])
        data_feats_train[str(b_h_columns[i])].iloc[row_num] = get_h(train['features'].iloc[row_num][1][i])
        data_feats_train[str(c_h_columns[i])].iloc[row_num] = get_h(train['features'].iloc[row_num][2][i])
        data_feats_train[str(d_h_columns[i])].iloc[row_num] = get_h(train['features'].iloc[row_num][3][i])
        data_feats_train[str(e_h_columns[i])].iloc[row_num] = get_h(train['features'].iloc[row_num][4][i])

    print(f"job done: {i} of 128")

gen_stat = train[["Gender", "Status"]]
gen_stat.reset_index(inplace=True, drop = True)
gen_stat["Gender"] = gen_stat["Gender"].replace({'M': 0, 'F': 1})
gen_stat["Status"] = gen_stat["Status"].replace({'G': 0, 'J': 1})

data_feats_train = pd.concat([gen_stat, data_feats_train], axis=1)
data_feats_train

  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  
  from ipykernel import kernelapp as app
  app.launch_new_instance()


job done: 0 of 128
job done: 1 of 128
job done: 2 of 128
job done: 3 of 128
job done: 4 of 128
job done: 5 of 128
job done: 6 of 128
job done: 7 of 128
job done: 8 of 128
job done: 9 of 128
job done: 10 of 128
job done: 11 of 128
job done: 12 of 128
job done: 13 of 128
job done: 14 of 128
job done: 15 of 128
job done: 16 of 128
job done: 17 of 128
job done: 18 of 128
job done: 19 of 128
job done: 20 of 128
job done: 21 of 128
job done: 22 of 128
job done: 23 of 128
job done: 24 of 128
job done: 25 of 128
job done: 26 of 128
job done: 27 of 128
job done: 28 of 128
job done: 29 of 128
job done: 30 of 128
job done: 31 of 128
job done: 32 of 128
job done: 33 of 128
job done: 34 of 128
job done: 35 of 128
job done: 36 of 128
job done: 37 of 128
job done: 38 of 128
job done: 39 of 128
job done: 40 of 128
job done: 41 of 128
job done: 42 of 128
job done: 43 of 128
job done: 44 of 128
job done: 45 of 128
job done: 46 of 128
job done: 47 of 128
job done: 48 of 128
job done: 49 of 128
job done: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Gender,Status,a_min0,b_min0,c_min0,d_min0,e_min0,a_max0,b_max0,c_max0,...,a_mean127,b_mean127,c_mean127,d_mean127,e_mean127,a_h127,b_h127,c_h127,d_h127,e_h127
0,0,1,0.0,0.0,0.0,0.0,0.000006,0.095068,0.072835,0.045593,...,1.486511e-08,1.151150e-08,1.151149e-08,5.752497e-09,7.605755e-09,0.614611,0.614611,0.355358,0.614611,1.0
1,0,1,0.0,0.0,0.0,0.0,0.000004,3.863860,3.119115,0.937131,...,9.598727e-10,5.333483e-10,5.333458e-10,2.665233e-10,5.892638e-10,0.614611,0.614611,0.355358,0.614611,1.0
2,0,0,0.0,0.0,0.0,0.0,0.000009,0.003301,0.003073,0.002675,...,4.779123e-08,2.273461e-08,2.273460e-08,1.136087e-08,3.111371e-08,0.614611,0.614611,0.355358,0.614611,1.0
3,0,1,0.0,0.0,0.0,0.0,0.000047,10.013515,6.100081,3.768315,...,1.648249e-10,2.738223e-11,2.737804e-11,1.368336e-11,1.311275e-10,0.614611,0.614611,0.355358,0.614611,1.0
4,0,1,0.0,0.0,0.0,0.0,0.000016,1.068840,0.698902,0.698902,...,1.678920e-06,1.329024e-06,1.329024e-06,6.641357e-07,8.454110e-07,0.614611,0.614611,0.355358,0.614611,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,0,0,0.0,0.0,0.0,0.0,0.009485,24.907949,15.939569,13.826316,...,7.269821e-08,5.769438e-08,5.769437e-08,2.883085e-08,3.653847e-08,0.614611,0.614611,0.355358,0.614611,1.0
444,0,0,0.0,0.0,0.0,0.0,0.000006,0.311565,0.184745,0.106751,...,1.854203e-07,6.260152e-08,6.260152e-08,3.128305e-08,1.326944e-07,0.614611,0.614611,0.355358,0.614611,1.0
445,0,1,0.0,0.0,0.0,0.0,0.000005,0.329194,0.237889,0.237889,...,1.421597e-09,1.090110e-09,1.090107e-09,5.447461e-10,7.323429e-10,0.614611,0.614611,0.355358,0.614611,1.0
446,0,1,0.0,0.0,0.0,0.0,0.000022,17.189899,11.555379,11.555379,...,7.018744e-08,5.617489e-08,5.617488e-08,2.807156e-08,3.505705e-08,0.614611,0.614611,0.355358,0.614611,1.0


In [247]:
data_feats_train.to_csv('data_feats_train.csv', index=False)

In [248]:
data_Extraversion_y_train =         train["Extraversion"]
data_Agreeableness_y_train =        train["Agreeableness"]
data_Conscientiousness_y_train =    train["Conscientiousness"]
data_Neuroticism_y_train =          train["Neuroticism"]
data_Openness_y_train =             train["Openness"]

features = ['Extraversion', 'Agreeableness', 'Conscientiousness', 'Neuroticism', 'Openness']

y_train_list = [data_Extraversion_y_train, 
               data_Agreeableness_y_train, 
               data_Conscientiousness_y_train,
               data_Neuroticism_y_train,
               data_Openness_y_train
               ]


In [249]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

model_list = list()

for i in range(5):

    y = y_train_list[i]
    x = data_feats_train

    model = LogisticRegression(random_state=0)
    model.fit(x, y)

    model_list.append(model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [250]:
train_accuracy_list = list()
train_precision_list = list()
train_recall_list = list()

for i in range(5):

    model = model_list[i]

    y = y_train_list[i]
    x = data_feats_train

    pred = pd.Series(model.predict(x))
    y = y.reset_index(drop=True)

    z = pd.concat([y, pred], axis=1)
    z.columns = ['True', 'Prediction']
    #z.head()

    accuracy = metrics.accuracy_score(y, pred)
    precision = metrics.precision_score(y, pred)
    recall = metrics.recall_score(y, pred)

    train_accuracy_list.append(accuracy)
    train_precision_list.append(precision)
    train_recall_list.append(recall)

    print(f"{features[i]}: Accuracy = \t{accuracy}")
    print(f"{features[i]}: Precision = \t{precision}")
    print(f"{features[i]}: Recall =    \t{recall}\n")
    

Extraversion: Accuracy = 	0.8616071428571429
Extraversion: Precision = 	0.8871473354231975
Extraversion: Recall =    	0.9158576051779935

Agreeableness: Accuracy = 	0.8861607142857143
Agreeableness: Precision = 	0.9067524115755627
Agreeableness: Recall =    	0.9276315789473685

Conscientiousness: Accuracy = 	0.9977678571428571
Conscientiousness: Precision = 	0.9975369458128078
Conscientiousness: Recall =    	1.0

Neuroticism: Accuracy = 	0.9285714285714286
Neuroticism: Precision = 	0.9078947368421053
Neuroticism: Recall =    	0.7340425531914894

Openness: Accuracy = 	0.8526785714285714
Openness: Precision = 	0.8790035587188612
Openness: Recall =    	0.8853046594982079



# Validation dataset

In [251]:
data_feats_val = pd.DataFrame()

samples_count = val.shape[0]

for i in range(len(a_min_columns)):

    data_feats_val[str(a_min_columns[i])] = [0]*samples_count
    data_feats_val[str(b_min_columns[i])] = [0]*samples_count
    data_feats_val[str(c_min_columns[i])] = [0]*samples_count
    data_feats_val[str(d_min_columns[i])] = [0]*samples_count
    data_feats_val[str(e_min_columns[i])] = [0]*samples_count

    data_feats_val[str(a_max_columns[i])] = [0]*samples_count
    data_feats_val[str(b_max_columns[i])] = [0]*samples_count
    data_feats_val[str(c_max_columns[i])] = [0]*samples_count
    data_feats_val[str(d_max_columns[i])] = [0]*samples_count
    data_feats_val[str(e_max_columns[i])] = [0]*samples_count

    data_feats_val[str(a_mean_columns[i])] = [0]*samples_count
    data_feats_val[str(b_mean_columns[i])] = [0]*samples_count
    data_feats_val[str(c_mean_columns[i])] = [0]*samples_count
    data_feats_val[str(d_mean_columns[i])] = [0]*samples_count
    data_feats_val[str(e_mean_columns[i])] = [0]*samples_count

    data_feats_val[str(a_h_columns[i])] = [0]*samples_count
    data_feats_val[str(b_h_columns[i])] = [0]*samples_count
    data_feats_val[str(c_h_columns[i])] = [0]*samples_count
    data_feats_val[str(d_h_columns[i])] = [0]*samples_count
    data_feats_val[str(e_h_columns[i])] = [0]*samples_count

data_feats_val = data_feats_val.copy().astype('float64')

  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [252]:
for i in range(len(a_min_columns)):

    for row_num in range(samples_count):

        data_feats_val[str(a_min_columns[i])].iloc[row_num] = val['features'].iloc[row_num][0][i].min()
        data_feats_val[str(b_min_columns[i])].iloc[row_num] = val['features'].iloc[row_num][1][i].min()
        data_feats_val[str(c_min_columns[i])].iloc[row_num] = val['features'].iloc[row_num][2][i].min()
        data_feats_val[str(d_min_columns[i])].iloc[row_num] = val['features'].iloc[row_num][3][i].min()
        data_feats_val[str(e_min_columns[i])].iloc[row_num] = val['features'].iloc[row_num][4][i].min()

        data_feats_val[str(a_max_columns[i])].iloc[row_num] = val['features'].iloc[row_num][0][i].max()
        data_feats_val[str(b_max_columns[i])].iloc[row_num] = val['features'].iloc[row_num][1][i].max()
        data_feats_val[str(c_max_columns[i])].iloc[row_num] = val['features'].iloc[row_num][2][i].max()
        data_feats_val[str(d_max_columns[i])].iloc[row_num] = val['features'].iloc[row_num][3][i].max()
        data_feats_val[str(e_max_columns[i])].iloc[row_num] = val['features'].iloc[row_num][4][i].max()

        data_feats_val[str(a_mean_columns[i])].iloc[row_num] = val['features'].iloc[row_num][0][i].mean()
        data_feats_val[str(b_mean_columns[i])].iloc[row_num] = val['features'].iloc[row_num][1][i].mean()
        data_feats_val[str(c_mean_columns[i])].iloc[row_num] = val['features'].iloc[row_num][2][i].mean()
        data_feats_val[str(d_mean_columns[i])].iloc[row_num] = val['features'].iloc[row_num][3][i].mean()
        data_feats_val[str(e_mean_columns[i])].iloc[row_num] = val['features'].iloc[row_num][4][i].mean()

        data_feats_val[str(a_h_columns[i])].iloc[row_num] = get_h(val['features'].iloc[row_num][0][i])
        data_feats_val[str(b_h_columns[i])].iloc[row_num] = get_h(val['features'].iloc[row_num][1][i])
        data_feats_val[str(c_h_columns[i])].iloc[row_num] = get_h(val['features'].iloc[row_num][2][i])
        data_feats_val[str(d_h_columns[i])].iloc[row_num] = get_h(val['features'].iloc[row_num][3][i])
        data_feats_val[str(e_h_columns[i])].iloc[row_num] = get_h(val['features'].iloc[row_num][4][i])

gen_stat = val[["Gender", "Status"]]
gen_stat.reset_index(inplace=True, drop = True)
gen_stat["Gender"] = gen_stat["Gender"].replace({'M': 0, 'F': 1})
gen_stat["Status"] = gen_stat["Status"].replace({'G': 0, 'J': 1})
gen_stat

data_feats_val = pd.concat([gen_stat, data_feats_val], axis=1)
data_feats_val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Gender,Status,a_min0,b_min0,c_min0,d_min0,e_min0,a_max0,b_max0,c_max0,...,a_mean127,b_mean127,c_mean127,d_mean127,e_mean127,a_h127,b_h127,c_h127,d_h127,e_h127
0,0,1,0.0,0.0,0.0,0.0,0.000009,0.028093,0.026152,0.026152,...,7.460604e-08,2.868586e-08,2.868586e-08,1.433480e-08,5.175448e-08,0.614611,0.614611,0.355358,0.614611,1.0
1,0,0,0.0,0.0,0.0,0.0,0.000013,2.154893,1.993473,1.197511,...,4.788161e-07,3.817014e-07,3.817014e-07,1.907426e-07,2.398585e-07,0.614611,0.614611,0.355358,0.614611,1.0
2,0,1,0.0,0.0,0.0,0.0,0.000016,1.172296,1.601307,1.601307,...,2.002037e-08,1.600009e-08,1.600009e-08,7.995518e-09,1.001026e-08,0.614611,0.614611,0.355358,0.614611,1.0
3,0,0,0.0,0.0,0.0,0.0,0.000008,0.050211,0.048373,0.048373,...,1.117605e-07,8.899250e-08,8.899250e-08,4.447104e-08,5.603298e-08,0.614611,0.614611,0.355358,0.614611,1.0
4,1,1,0.0,0.0,0.0,0.0,0.000007,5.155371,3.487280,1.963730,...,1.873249e-06,1.476698e-06,1.476698e-06,7.379311e-07,9.461382e-07,0.614611,0.614611,0.355358,0.614611,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0,1,0.0,0.0,0.0,0.0,0.000022,12.641891,9.784947,1.749601,...,6.786004e-07,5.432862e-07,5.432862e-07,2.714893e-07,3.388533e-07,0.614611,0.614611,0.355358,0.614611,1.0
92,0,1,0.0,0.0,0.0,0.0,0.000010,4.014663,3.666741,3.666741,...,3.201263e-07,2.560602e-07,2.560602e-07,1.279577e-07,1.599627e-07,0.614611,0.614611,0.355358,0.614611,1.0
93,0,0,0.0,0.0,0.0,0.0,0.006230,28.584768,20.982738,17.673126,...,1.479576e-07,1.670907e-08,1.670906e-08,8.349810e-09,1.213994e-07,0.614611,0.614611,0.355358,0.614611,1.0
94,0,0,0.0,0.0,0.0,0.0,0.000327,26.246387,18.302452,6.459031,...,1.697727e-07,1.353102e-07,1.353102e-07,6.761677e-08,8.506051e-08,0.614611,0.614611,0.355358,0.614611,1.0


In [253]:
data_feats_val.to_csv('data_feats_val.csv', index=False)

In [254]:
data_Extraversion_y_val =         val["Extraversion"]
data_Agreeableness_y_val =        val["Agreeableness"]
data_Conscientiousness_y_val =    val["Conscientiousness"]
data_Neuroticism_y_val =          val["Neuroticism"]
data_Openness_y_val =             val["Openness"]

# features = ['Extraversion', 'Agreeableness', 'Conscientiousness', 'Neuroticism', 'Openness']

y_val_list = [data_Extraversion_y_val, 
               data_Agreeableness_y_val, 
               data_Conscientiousness_y_val,
               data_Neuroticism_y_val,
               data_Openness_y_val
               ]


In [255]:
val_accuracy_list = list()
val_precision_list = list()
val_recall_list = list()


for i in range(5):

    model = model_list[i]

    y = y_val_list[i]
    x = data_feats_val

    pred = pd.Series(model.predict(x))
    y = y.reset_index(drop=True)

    z = pd.concat([y, pred], axis=1)
    z.columns = ['True', 'Prediction']
    #z.head()

    accuracy = metrics.accuracy_score(y, pred)
    precision = metrics.precision_score(y, pred)
    recall = metrics.recall_score(y, pred)

    val_accuracy_list.append(accuracy)
    val_precision_list.append(precision)
    val_recall_list.append(recall)

    print(f"{features[i]}: Accuracy = \t{accuracy}")
    print(f"{features[i]}: Precision = \t{precision}")
    print(f"{features[i]}: Recall =    \t{recall}\n")


Extraversion: Accuracy = 	0.71875
Extraversion: Precision = 	0.7837837837837838
Extraversion: Recall =    	0.8405797101449275

Agreeableness: Accuracy = 	0.5625
Agreeableness: Precision = 	0.6470588235294118
Agreeableness: Recall =    	0.7096774193548387

Conscientiousness: Accuracy = 	0.8541666666666666
Conscientiousness: Precision = 	0.9404761904761905
Conscientiousness: Recall =    	0.8977272727272727

Neuroticism: Accuracy = 	0.71875
Neuroticism: Precision = 	0.5
Neuroticism: Recall =    	0.48148148148148145

Openness: Accuracy = 	0.59375
Openness: Precision = 	0.7164179104477612
Openness: Recall =    	0.7058823529411765



# Test dataset

In [256]:
data_feats_test = pd.DataFrame()

samples_count = test.shape[0]

for i in range(len(a_min_columns)):

    data_feats_test[str(a_min_columns[i])] = [0]*samples_count
    data_feats_test[str(b_min_columns[i])] = [0]*samples_count
    data_feats_test[str(c_min_columns[i])] = [0]*samples_count
    data_feats_test[str(d_min_columns[i])] = [0]*samples_count
    data_feats_test[str(e_min_columns[i])] = [0]*samples_count

    data_feats_test[str(a_max_columns[i])] = [0]*samples_count
    data_feats_test[str(b_max_columns[i])] = [0]*samples_count
    data_feats_test[str(c_max_columns[i])] = [0]*samples_count
    data_feats_test[str(d_max_columns[i])] = [0]*samples_count
    data_feats_test[str(e_max_columns[i])] = [0]*samples_count

    data_feats_test[str(a_mean_columns[i])] = [0]*samples_count
    data_feats_test[str(b_mean_columns[i])] = [0]*samples_count
    data_feats_test[str(c_mean_columns[i])] = [0]*samples_count
    data_feats_test[str(d_mean_columns[i])] = [0]*samples_count
    data_feats_test[str(e_mean_columns[i])] = [0]*samples_count

    data_feats_test[str(a_h_columns[i])] = [0]*samples_count
    data_feats_test[str(b_h_columns[i])] = [0]*samples_count
    data_feats_test[str(c_h_columns[i])] = [0]*samples_count
    data_feats_test[str(d_h_columns[i])] = [0]*samples_count
    data_feats_test[str(e_h_columns[i])] = [0]*samples_count

data_feats_test = data_feats_test.copy().astype('float64')

  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [257]:
for i in range(len(a_min_columns)):

    for row_num in range(samples_count):

        data_feats_test[str(a_min_columns[i])].iloc[row_num] = test['features'].iloc[row_num][0][i].min()
        data_feats_test[str(b_min_columns[i])].iloc[row_num] = test['features'].iloc[row_num][1][i].min()
        data_feats_test[str(c_min_columns[i])].iloc[row_num] = test['features'].iloc[row_num][2][i].min()
        data_feats_test[str(d_min_columns[i])].iloc[row_num] = test['features'].iloc[row_num][3][i].min()
        data_feats_test[str(e_min_columns[i])].iloc[row_num] = test['features'].iloc[row_num][4][i].min()

        data_feats_test[str(a_max_columns[i])].iloc[row_num] = test['features'].iloc[row_num][0][i].max()
        data_feats_test[str(b_max_columns[i])].iloc[row_num] = test['features'].iloc[row_num][1][i].max()
        data_feats_test[str(c_max_columns[i])].iloc[row_num] = test['features'].iloc[row_num][2][i].max()
        data_feats_test[str(d_max_columns[i])].iloc[row_num] = test['features'].iloc[row_num][3][i].max()
        data_feats_test[str(e_max_columns[i])].iloc[row_num] = test['features'].iloc[row_num][4][i].max()

        data_feats_test[str(a_mean_columns[i])].iloc[row_num] = test['features'].iloc[row_num][0][i].mean()
        data_feats_test[str(b_mean_columns[i])].iloc[row_num] = test['features'].iloc[row_num][1][i].mean()
        data_feats_test[str(c_mean_columns[i])].iloc[row_num] = test['features'].iloc[row_num][2][i].mean()
        data_feats_test[str(d_mean_columns[i])].iloc[row_num] = test['features'].iloc[row_num][3][i].mean()
        data_feats_test[str(e_mean_columns[i])].iloc[row_num] = test['features'].iloc[row_num][4][i].mean()

        data_feats_test[str(a_h_columns[i])].iloc[row_num] = get_h(test['features'].iloc[row_num][0][i])
        data_feats_test[str(b_h_columns[i])].iloc[row_num] = get_h(test['features'].iloc[row_num][1][i])
        data_feats_test[str(c_h_columns[i])].iloc[row_num] = get_h(test['features'].iloc[row_num][2][i])
        data_feats_test[str(d_h_columns[i])].iloc[row_num] = get_h(test['features'].iloc[row_num][3][i])
        data_feats_test[str(e_h_columns[i])].iloc[row_num] = get_h(test['features'].iloc[row_num][4][i])

gen_stat = test[["Gender", "Status"]]
gen_stat.reset_index(inplace=True, drop = True)
gen_stat["Gender"] = gen_stat["Gender"].replace({'M': 0, 'F': 1})
gen_stat["Status"] = gen_stat["Status"].replace({'G': 0, 'J': 1})

data_feats_test = pd.concat([gen_stat, data_feats_test], axis=1)
data_feats_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Gender,Status,a_min0,b_min0,c_min0,d_min0,e_min0,a_max0,b_max0,c_max0,...,a_mean127,b_mean127,c_mean127,d_mean127,e_mean127,a_h127,b_h127,c_h127,d_h127,e_h127
0,0,1,0.0,0.0,0.0,0.0,6.034248e-08,3.027572,2.487509,2.487509,...,5.474578e-12,4.212164e-12,4.209839e-12,2.104890e-12,2.812195e-12,0.614611,0.614611,0.355358,0.614611,1.0
1,1,0,0.0,0.0,0.0,0.0,2.231586e-06,0.129381,0.065930,0.027239,...,1.855899e-11,9.752030e-12,4.650845e-13,4.873254e-12,4.943878e-12,0.616584,0.616584,0.356454,0.616584,1.0
2,0,1,0.0,0.0,0.0,0.0,1.961518e-06,0.123729,0.126584,0.126584,...,1.646077e-08,8.644358e-09,8.644355e-09,4.319732e-09,1.033706e-08,0.614611,0.614611,0.355358,0.614611,1.0
3,0,0,0.0,0.0,0.0,0.0,5.800785e-05,25.363369,14.359472,3.623568,...,4.173860e-08,1.273760e-08,1.273760e-08,6.365191e-09,3.049680e-08,0.614611,0.614611,0.355358,0.614611,1.0
4,0,0,0.0,0.0,0.0,0.0,6.274464e-04,1.441465,0.872344,0.848939,...,1.623347e-08,1.276586e-08,1.276586e-08,6.379317e-09,8.213769e-09,0.614611,0.614611,0.355358,0.614611,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0,1,0.0,0.0,0.0,0.0,9.272754e-06,1.955786,1.420515,0.766407,...,1.134678e-08,8.820511e-09,8.820510e-09,4.407758e-09,5.789154e-09,0.614611,0.614611,0.355358,0.614611,1.0
92,0,0,0.0,0.0,0.0,0.0,7.992868e-06,1.453642,0.862053,0.862053,...,6.344408e-09,2.702655e-09,1.965791e-09,1.350562e-09,4.482363e-09,0.616659,0.616659,0.356790,0.616659,1.0
93,0,0,0.0,0.0,0.0,0.0,3.576697e-05,0.134436,0.089455,0.089455,...,1.584726e-07,1.267661e-07,1.267661e-07,6.334717e-08,7.918062e-08,0.614611,0.614611,0.355358,0.614611,1.0
94,0,0,0.0,0.0,0.0,0.0,5.731107e-06,1.521410,1.200336,0.260099,...,5.961029e-08,4.772384e-08,4.772384e-08,2.384841e-08,2.976594e-08,0.648789,0.648789,0.392955,0.648789,1.0


In [258]:
data_feats_test.to_csv('data_feats_test.csv', index=False)

In [259]:
data_Extraversion_y_test =         test["Extraversion"]
data_Agreeableness_y_test =        test["Agreeableness"]
data_Conscientiousness_y_test =    test["Conscientiousness"]
data_Neuroticism_y_test =          test["Neuroticism"]
data_Openness_y_test =             test["Openness"]

# features = ['Extraversion', 'Agreeableness', 'Conscientiousness', 'Neuroticism', 'Openness']
y_test_list = [data_Extraversion_y_test, 
               data_Agreeableness_y_test, 
               data_Conscientiousness_y_test,
               data_Neuroticism_y_test,
               data_Openness_y_test
               ]


In [261]:
test_accuracy_list = list()
test_precision_list = list()
test_recall_list = list()


for i in range(5):

    model = model_list[i]

    y = y_test_list[i]
    x = data_feats_test

    pred = pd.Series(model.predict(x))
    y = y.reset_index(drop=True)

    z = pd.concat([y, pred], axis=1)
    z.columns = ['True', 'Prediction']
    #z.head()

    accuracy = metrics.accuracy_score(y, pred)
    precision = metrics.precision_score(y, pred)
    recall = metrics.recall_score(y, pred)

    test_accuracy_list.append(accuracy)
    test_precision_list.append(precision)
    test_recall_list.append(recall)

    print(f"{features[i]}: Accuracy = \t{accuracy}")
    print(f"{features[i]}: Precision = \t{precision}")
    print(f"{features[i]}: Recall =    \t{recall}\n")


Extraversion: Accuracy = 	0.6145833333333334
Extraversion: Precision = 	0.7627118644067796
Extraversion: Recall =    	0.6617647058823529

Agreeableness: Accuracy = 	0.5520833333333334
Agreeableness: Precision = 	0.6081081081081081
Agreeableness: Recall =    	0.7627118644067796

Conscientiousness: Accuracy = 	0.8333333333333334
Conscientiousness: Precision = 	0.9390243902439024
Conscientiousness: Recall =    	0.875

Neuroticism: Accuracy = 	0.65625
Neuroticism: Precision = 	0.34782608695652173
Neuroticism: Recall =    	0.3076923076923077

Openness: Accuracy = 	0.6145833333333334
Openness: Precision = 	0.765625
Openness: Recall =    	0.6901408450704225



In [262]:
np.round(train_accuracy_list, 2)

array([0.86, 0.89, 1.  , 0.93, 0.85])

In [263]:
np.round(val_accuracy_list, 2)

array([0.72, 0.56, 0.85, 0.72, 0.59])

In [264]:
np.round(test_accuracy_list, 2)

array([0.61, 0.55, 0.83, 0.66, 0.61])