In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2
from scipy.stats import beta
import pickle
from scipy.stats import pearsonr
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error
from itertools import combinations
from tabulate import tabulate

In [3]:
df=pd.read_csv('CSV/Labelled_data.csv')
questions={
    'EI':[0,2,4,6,8,10,12,14,16],
    'WI':[27,30,33,36,38,41,44,47,50],
    'EC':[39,32,35,40,43,46,49,52,53],
    'WC':[1,5,9,13,17,19,21,23,25],
    'EA':[3,7,11,16,18,20,22,24,26],
    'WA':[28,31,34,37,39,42,45,48,51],
    'AM':[56,60,72],
    'LC':[54,59,64,67],
    'MC':[55,62,65,69],
    'ND':[63,66],
    'PfW':[58,68,71],
    'SE':[57,61,70,73],
    'E':[74,79,84,89,94,99,104,109,114,119],
    'A':[75,80,85,90,95,100,105,110,115,120],
    'C':[76,81,86,91,96,101,106,111,116,121],
    'N':[77,82,87,92,97,102,107,112,117,122],
    'O':[78,83,88,93,98,103,108,113,118,123]
}
def I_label(score):
    if(score==6 or score==7):
        return 0
    if(score>=2 and score <=5):
        return 1
    else:
        return 2
def C_label(score):
    if(score>=4 and score<=7):
        return 0
    elif(score==2 or score==3):
        return 1
    else:
        return 2
def A_label(score):
    if(score>=4 and score<=6):
        return 0
    if(score>=7 and score <=9):
        return 1
    else:
        return 2
def AM_LC_label(score):
    if(score>=4 and score<=5):
        return 0
    elif(score>=2.6 and score<=3.9):
        return 1
    else:
        return 2
def rest_label(score):
    if(score>=4 and score<=6):
        return 0
    elif(score>=2.6 and score<=3.9):
        return 1
    else:
        return 2
def EN_label(score):
    if(score>=0 and score<=25):
        return 0
    elif(score>=26 and score<=40):
        return 1
    else:
        return 2
def AO_label(score):
    if(score>=0 and score<=30):
        return 0
    elif(score>=31 and score<=40):
        return 1
    else:
        return 2
def C3_label(score):
    if(score>=36 and score<=50):
        return 0
    elif(score>=26 and score<=35):
        return 1
    else:
        return 2
def I_smoothing(x):
    return(((8-x)*math.exp(0.6*x)+221.406)/29.591)
def C_smoothing(x):
    return(((12-x)*math.exp(0.15*x)-11.572)/0.327)
def A_smoothing(x):
    return((2+((x-2)*math.exp(-0.32*x)))/0.261)
def identity(x):
    return x
functions={
    'EI':I_label,
    'WI':I_label,
    'EC':C_label,
    'WC':C_label,
    'EA':A_label,
    'WA':A_label,
    'AM':AM_LC_label,
    'LC':AM_LC_label,
    'MC':rest_label,
    'ND':rest_label,
    'PfW':rest_label,
    'SE':rest_label,
    'E':EN_label,
    'A':AO_label,
    'C':C3_label,
    'N':EN_label,
    'O':AO_label
}
smoothing={
    'EI':I_smoothing,
    'WI':I_smoothing,
    'EC':C_smoothing,
    'WC':C_smoothing,
    'EA':A_smoothing,
    'WA':A_smoothing,
    'AM':identity,
    'LC':identity,
    'MC':identity,
    'ND':identity,
    'PfW':identity,
    'SE':identity,
    'E':identity,
    'A':identity,
    'C':identity,
    'N':identity,
    'O':identity
}

In [4]:
def ml(X,labels):
    split_index = int(0.6*df.shape[0])
    labels=np.array(labels)
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = labels[:split_index], labels[split_index:]
    logistic_regression_classifier = LogisticRegression(max_iter=1000, random_state=42)
    logistic_regression_classifier.fit(X_train, y_train)
    y_pred = logistic_regression_classifier.predict(X_train)
    probabilities = logistic_regression_classifier.predict_proba(X_train)
    confidences = np.max(probabilities, axis=1)
    logistic_confidence_train=np.mean(confidences)
    acc_logistic_train=accuracy_score(y_train, y_pred)
    y_pred = logistic_regression_classifier.predict(X_test)
    probabilities = logistic_regression_classifier.predict_proba(X_test)
    confidences = np.max(probabilities, axis=1)
    logistic_confidence=np.mean(confidences)
    acc_logistic=accuracy_score(y_test, y_pred)
    svm_classifier = SVC(kernel='rbf', C=1.0, random_state=42)
    svm_classifier.fit(X_train, y_train)
    y_pred = svm_classifier.predict(X_train)
    confidence_scores = svm_classifier.decision_function(X_train)
    svm_confidence_train=np.mean(np.abs(confidence_scores))
    acc_svm_train=accuracy_score(y_train, y_pred)
    y_pred = svm_classifier.predict(X_test)
    confidence_scores = svm_classifier.decision_function(X_test)
    svm_confidence=np.mean(np.abs(confidence_scores))
    acc_svm=accuracy_score(y_test, y_pred)
    h1=len(X_train[0])
    mlp = MLPClassifier(hidden_layer_sizes=(2*h1,6), max_iter=500, random_state=42)
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_train)
    probabilities = mlp.predict_proba(X_train)
    confidences = np.max(probabilities, axis=1)
    mlp_confidence_train=np.mean(confidences)
    acc_mlp_train=accuracy_score(y_train, y_pred)
    y_pred = mlp.predict(X_test)
    probabilities = mlp.predict_proba(X_test)
    confidences = np.max(probabilities, axis=1)
    mlp_confidence=np.mean(confidences)
    acc_mlp=accuracy_score(y_test, y_pred)
    random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    random_forest_classifier.fit(X_train, y_train)
    y_pred = random_forest_classifier.predict(X_train)
    probabilities = random_forest_classifier.predict_proba(X_train)
    confidences = np.max(probabilities, axis=1)
    rf_confidence_train=np.mean(confidences)
    acc_rf_train=accuracy_score(y_train, y_pred)
    y_pred = random_forest_classifier.predict(X_test)
    probabilities = random_forest_classifier.predict_proba(X_test)
    confidences = np.max(probabilities, axis=1)
    rf_confidence=np.mean(confidences)
    acc_rf=accuracy_score(y_test, y_pred)
    data=[acc_logistic_train,acc_logistic,acc_svm_train,acc_svm,acc_mlp_train,acc_mlp,acc_rf_train,acc_rf]
    confidence=[logistic_confidence_train,logistic_confidence,svm_confidence_train,svm_confidence_train, svm_confidence,mlp_confidence_train,mlp_confidence,rf_confidence_train,rf_confidence]
    return confidence,data

In [8]:
def get_results(trait):
    if(trait=='all'):
        indices=[i for i in range(124)]
        labels=list(df['Final Label'])
    else:
        indices=questions[trait]
        labels=[functions[trait](i) for i in list(df[trait])]
    cols=[f'feature{k}' for k in indices]
    X=df[cols].values
    con,info=ml(X,labels)
    print(trait)
    print(info)
    return con,info

In [10]:
traits=['EI','WI','EC','WC','EA','WA','AM','LC','MC','ND','PfW','SE','E','A','C','N','O','all']
data=[]
confidence=[]
headers=['Logistic-Train','Logistic-Test','SVM-Train','SVM-Test','MLP-Train','MLP-Test','Random Forest-Train','Random Forest-Test']
for trait in traits:
    con,info=get_results(trait)
    data.append(info)
    confidence.append(con)
print("Accuracy")
print(tabulate(data, headers=headers, tablefmt="grid"))
print("Confidence")
print(tabulate(confidence, headers=headers, tablefmt="grid"))



EI
[0.7697103043637697, 0.8053875755909841, 0.8687202053538687, 0.8576140736668499, 0.8804547121378804, 0.8658603628367235, 0.9955995599559956, 0.8911489829576691]




WI
[0.5786578657865786, 0.35843870258383725, 0.9083241657499084, 0.8548653106102254, 0.9094242757609095, 0.8636613523914238, 1.0, 0.9620670698185816]




EC
[0.7348734873487349, 0.576690489279824, 0.8683535020168683, 0.8647608576140736, 0.8698203153648698, 0.868609125893348, 0.9996332966629996, 0.8933479934029687]




WC
[0.6153281994866153, 0.371632765255635, 0.8874220755408874, 0.8510170423309511, 0.8822882288228823, 0.8482682792743266, 1.0, 0.920285871357889]




EA
[0.8096809680968097, 0.8092358438702584, 0.9086908690869087, 0.8642111050027488, 0.9061239457279061, 0.863111599780099, 1.0, 0.9433754810335349]




WA
[0.8258159149248259, 0.7910940076965366, 0.9178584525119179, 0.8785046728971962, 0.9196919691969196, 0.8840021990104453, 1.0, 0.9406267179769104]
AM
[0.9977997799779978, 0.9972512369433755, 0.9988998899889989, 0.9934029686641012, 1.0, 1.0, 1.0, 0.9956019791094007]
LC
[0.9834983498349835, 0.9928532160527762, 0.9864319765309865, 0.994502473886751, 0.9996332966629996, 0.9972512369433755, 1.0, 0.9917537108301264]
MC
[0.9981664833149981, 1.0, 0.9904657132379905, 0.9967014843320505, 1.0, 1.0, 1.0, 0.9824079164376031]
ND
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
PfW
[0.9992665933259993, 0.9967014843320505, 0.9977997799779978, 1.0, 1.0, 1.0, 1.0, 0.9978009895547004]
SE
[0.9981664833149981, 1.0, 0.991932526585992, 0.9972512369433755, 1.0, 1.0, 1.0, 0.9747113798790544]
E
[1.0, 1.0, 0.9933993399339934, 0.9901044529961517, 1.0, 0.9989004947773502, 1.0, 0.9582188015393073]
A
[1.0, 0.9983507421660253, 0.9933993399339934, 0.9906542056074766, 1.0, 0.9967014843320505, 0.9996332966629996, 0.9741616272