In [7]:
import math
import os
import pickle
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import beta
from scipy.stats import chi2
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.svm import SVC
from tabulate import tabulate

In [8]:
# Load the dataset used by every later cell: the functions below read its
# `feature{i}` columns and one score column per trait name.
df=pd.read_csv('CSV/corrected.csv')

# Mapping from trait name to the question indices (the `i` in column
# `feature{i}`) that belong to that trait.
#
# Across all traits every index 0..123 is used exactly once. Two entries broke
# that partition by reusing an index owned by another trait while leaving a
# neighbouring index unused:
#   'EC' listed 39 (already in 'WA'); 29 was unused -> now 29
#   'EA' listed 16 (already in 'EI'); 15 was unused -> now 15
# NOTE(review): confirm both corrections against the questionnaire key.
questions={
    'EI':[0,2,4,6,8,10,12,14,16],
    'WI':[27,30,33,36,38,41,44,47,50],
    'EC':[29,32,35,40,43,46,49,52,53],
    'WC':[1,5,9,13,17,19,21,23,25],
    'EA':[3,7,11,15,18,20,22,24,26],
    'WA':[28,31,34,37,39,42,45,48,51],
    'AM':[56,60,72],
    'LC':[54,59,64,67],
    'MC':[55,62,65,69],
    'ND':[63,66],
    'PfW':[58,68,71],
    'SE':[57,61,70,73],
    'E':[74,79,84,89,94,99,104,109,114,119],
    'A':[75,80,85,90,95,100,105,110,115,120],
    'C':[76,81,86,91,96,101,106,111,116,121],
    'N':[77,82,87,92,97,102,107,112,117,122],
    'O':[78,83,88,93,98,103,108,113,118,123]
}

#functions to compute the labels 'Champion', 'Master' and 'Rookie' based on scores
def I_label(score):
    """Convert an 'EI'/'WI' score into a class label.

    Returns 0 when the score is exactly 6 or 7, 1 when it lies in [2, 5],
    and 2 for every other value (including non-integers between 5 and 6).
    """
    if score in (6, 7):
        return 0
    return 1 if 2 <= score <= 5 else 2
def C_label(score):
    """Convert an 'EC'/'WC' score into a class label.

    Returns 0 when the score lies in [4, 7], 1 when it is exactly 2 or 3,
    and 2 for every other value.
    """
    if 4 <= score <= 7:
        return 0
    if score in (2, 3):
        return 1
    return 2
def A_label(score):
    """Convert an 'EA'/'WA' score into a class label.

    Returns 0 when the score lies in [4, 6], 1 when it lies in [7, 9],
    and 2 for every other value.
    """
    if 4 <= score <= 6:
        return 0
    return 1 if 7 <= score <= 9 else 2
def AM_LC_label(score):
    """Convert an 'AM'/'LC' score into a class label.

    Returns 0 when the score lies in [4, 5], 1 when it lies in [2.6, 3.9],
    and 2 for every other value (including the gap between 3.9 and 4).
    """
    if 4 <= score <= 5:
        return 0
    if 2.6 <= score <= 3.9:
        return 1
    return 2
def rest_label(score):
    """Convert an 'MC'/'ND'/'PfW'/'SE' score into a class label.

    Returns 0 when the score lies in [4, 6], 1 when it lies in [2.6, 3.9],
    and 2 for every other value (including the gap between 3.9 and 4).
    """
    if 4 <= score <= 6:
        return 0
    if 2.6 <= score <= 3.9:
        return 1
    return 2
def EN_label(score):
    """Convert an 'E'/'N' score into a class label.

    Returns 0 when the score lies in [0, 25], 1 when it lies in [26, 40],
    and 2 for every other value (including non-integers between 25 and 26).
    """
    if 0 <= score <= 25:
        return 0
    if 26 <= score <= 40:
        return 1
    return 2
def AO_label(score):
    """Convert an 'A'/'O' score into a class label.

    Returns 0 when the score lies in [0, 30], 1 when it lies in [31, 40],
    and 2 for every other value (including non-integers between 30 and 31).
    """
    if 0 <= score <= 30:
        return 0
    if 31 <= score <= 40:
        return 1
    return 2
def C3_label(score):
    """Convert a 'C' score into a class label.

    Returns 0 when the score lies in [36, 50], 1 when it lies in [26, 35],
    and 2 for every other value (including non-integers between 35 and 36).
    """
    if 36 <= score <= 50:
        return 0
    if 26 <= score <= 35:
        return 1
    return 2
    
#functions to perform the non-linear transformation of the scores
def I_smoothing(x):
    """Non-linear transform applied to 'EI'/'WI' scores.

    Computes ``((8 - x) * exp(0.6 * x) + 221.406) / 29.591``.
    """
    growth = math.exp(0.6 * x)
    numerator = (8 - x) * growth + 221.406
    return numerator / 29.591
def C_smoothing(x):
    """Non-linear transform applied to 'EC'/'WC' scores.

    Computes ``((12 - x) * exp(0.15 * x) - 11.572) / 0.327``.
    """
    growth = math.exp(0.15 * x)
    numerator = (12 - x) * growth - 11.572
    return numerator / 0.327
def A_smoothing(x):
    """Non-linear transform applied to 'EA'/'WA' scores.

    Computes ``(2 + (x - 2) * exp(-0.32 * x)) / 0.261``.
    """
    decay = math.exp(-0.32 * x)
    numerator = 2 + ((x - 2) * decay)
    return numerator / 0.261
def identity(x):
    """Return ``x`` unchanged; the `smoothing` table uses it for traits whose score is not transformed."""
    return x
# Trait name -> labelling function that turns that trait's score into a class
# label (0, 1 or 2). Traits sharing the same thresholds share one function.
functions = {
    **dict.fromkeys(('EI', 'WI'), I_label),
    **dict.fromkeys(('EC', 'WC'), C_label),
    **dict.fromkeys(('EA', 'WA'), A_label),
    **dict.fromkeys(('AM', 'LC'), AM_LC_label),
    **dict.fromkeys(('MC', 'ND', 'PfW', 'SE'), rest_label),
    'E': EN_label,
    'A': AO_label,
    'C': C3_label,
    'N': EN_label,
    'O': AO_label,
}
# Trait name -> transform applied to that trait's score before it is
# correlated with the features. Only the first six traits are transformed;
# every other trait passes through `identity`.
smoothing = {
    **dict.fromkeys(('EI', 'WI'), I_smoothing),
    **dict.fromkeys(('EC', 'WC'), C_smoothing),
    **dict.fromkeys(('EA', 'WA'), A_smoothing),
    **dict.fromkeys(
        ('AM', 'LC', 'MC', 'ND', 'PfW', 'SE', 'E', 'A', 'C', 'N', 'O'),
        identity,
    ),
}

In [9]:
def _mean_top_probability(classifier, X_test):
    """Return the mean, over the test rows, of the highest predicted class probability."""
    return np.mean(np.max(classifier.predict_proba(X_test), axis=1))


#Machine Learning algorithm that returns confidence and accuracy
def ml(X,labels):
    """Fit four classifiers on the first 60% of the rows and score them on the rest.

    The split is positional (no shuffling): rows ``[:split]`` are used for
    training and rows ``[split:]`` for testing, with
    ``split = int(0.6 * len(X))``. The split is derived from ``X`` itself, so
    the function no longer depends on the length of the global ``df``.

    Parameters
    ----------
    X : 2-D array-like of shape (n_samples, n_features)
        Feature matrix.
    labels : sequence of length n_samples
        Class label for each row of ``X``.

    Returns
    -------
    confidence : list of 4 floats
        ``[logistic, svm, mlp, random_forest]``. For logistic regression, MLP
        and random forest this is the mean of the highest ``predict_proba``
        value per test row. For the SVM it is the mean absolute
        ``decision_function`` value, which is a margin score rather than a
        probability and is therefore not on the same scale as the other three.
    data : list of 4 floats
        Test accuracy of each classifier, in the same order.

    Raises
    ------
    ValueError
        If ``X`` and ``labels`` have different lengths.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    if len(X) != len(labels):
        raise ValueError(
            f"X has {len(X)} rows but labels has {len(labels)} entries"
        )

    # Split on the data that was actually passed in, not on a global frame.
    split_index = int(0.6 * len(X))
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = labels[:split_index], labels[split_index:]

    logistic_regression_classifier = LogisticRegression(max_iter=1000, random_state=42)
    logistic_regression_classifier.fit(X_train, y_train)
    acc_logistic = accuracy_score(y_test, logistic_regression_classifier.predict(X_test))
    logistic_confidence = _mean_top_probability(logistic_regression_classifier, X_test)

    svm_classifier = SVC(kernel='rbf', C=1.0, random_state=42)
    svm_classifier.fit(X_train, y_train)
    acc_svm = accuracy_score(y_test, svm_classifier.predict(X_test))
    # SVC is fitted without probability estimates, so the margin magnitude is
    # used as its confidence proxy.
    svm_confidence = np.mean(np.abs(svm_classifier.decision_function(X_test)))

    # First hidden layer is twice as wide as the number of input features.
    n_features = X_train.shape[1]
    mlp = MLPClassifier(hidden_layer_sizes=(2*n_features,6), max_iter=500, random_state=42)
    mlp.fit(X_train, y_train)
    acc_mlp = accuracy_score(y_test, mlp.predict(X_test))
    mlp_confidence = _mean_top_probability(mlp, X_test)

    random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    random_forest_classifier.fit(X_train, y_train)
    acc_rf = accuracy_score(y_test, random_forest_classifier.predict(X_test))
    rf_confidence = _mean_top_probability(random_forest_classifier, X_test)

    data = [acc_logistic, acc_svm, acc_mlp, acc_rf]
    confidence = [logistic_confidence, svm_confidence, mlp_confidence, rf_confidence]
    return confidence,data

In [10]:
# Ranks a trait's features by a correlation-based score, then calls the ML
# algorithms on the top-k features for k ranging from 1 to f.
def correlation_algo(trait):
    """Rank the features of ``trait`` and evaluate classifiers on growing top-k subsets.

    The ranking uses only the first 60% of the rows of ``df``:

    1. For each class label (0, 1, 2), the absolute Pearson correlation
       between every feature and the smoothed trait score is computed over the
       rows of that class. Within the class, features are ranked 1..f by that
       value (the largest gets rank f).
    2. Each feature's score is the rank-weighted mean of its per-class
       correlations, multiplied by ``f`` and divided by the sum of its Pearson
       correlations with all features of the trait (itself included).

    Features are then ordered by descending score, and for k = 1..f ``ml`` is
    run on the k best columns of the full ``df``.

    Parameters
    ----------
    trait : str
        Key of ``questions`` / ``functions`` / ``smoothing`` and name of the
        score column in ``df``.

    Returns
    -------
    confidence : list
        One entry per k, each the confidence list returned by ``ml``.
    data : list
        One entry per k, each the accuracy list returned by ``ml``.

    Notes
    -----
    NOTE(review): if a feature or the smoothed score is constant within a
    class, ``pearsonr`` yields NaN, which propagates into the scores and the
    ordering. This is left as it was.
    """
    truncate_index = int(len(df) * 0.6)
    truncated_df = df.iloc[:truncate_index]
    indices = questions[trait]
    f = len(indices)
    trait_scores = list(truncated_df[trait])
    y = [functions[trait](s) for s in trait_scores]
    values = [smoothing[trait](s) for s in trait_scores]
    X = [list(truncated_df[f'feature{i}']) for i in indices]
    n_rows = len(X[0])

    corr_matrix = np.zeros((f, 3))
    weight_matrix = np.zeros((f, 3))
    for label in range(3):
        rows = [k for k in range(n_rows) if y[k] == label]
        # pearsonr needs at least two observations; a class with fewer rows
        # contributes nothing (its columns stay at zero) instead of crashing.
        if len(rows) < 2:
            continue
        class_values = [values[k] for k in rows]
        for j in range(f):
            corr, _ = pearsonr([X[j][k] for k in rows], class_values)
            corr_matrix[j][label] = abs(corr)
        # Weakest correlation gets weight 1, strongest gets weight f.
        for rank, idx in enumerate(np.argsort(corr_matrix[:, label]), start=1):
            weight_matrix[idx][label] = rank

    # Signed feature-to-feature correlations, used as a redundancy penalty.
    correlation = np.zeros((f, f))
    for a in range(f):
        for b in range(f):
            corr, _ = pearsonr(X[a], X[b])
            correlation[a][b] = corr

    scores = []
    for j in range(f):
        corr_sum = sum(corr_matrix[j][c] * weight_matrix[j][c] for c in range(3))
        weight_sum = sum(weight_matrix[j][c] for c in range(3))
        scores.append((corr_sum * f) / (weight_sum * np.sum(correlation[j])))

    # Best feature first. The earlier loop ran f+1 times and wrapped around to
    # index -1, appending the best feature a second time at the end of the
    # list, so every printed discarded set carried a spurious trailing entry.
    rank_features = np.argsort(scores)
    correct_order = [indices[k] for k in rank_features[::-1]]

    labels = [functions[trait](s) for s in list(df[trait])]
    data = []
    confidence = []
    for k in range(1, f + 1):
        refined_set = correct_order[:k]
        discarded_set = correct_order[k:]
        print(f'Refined set :{refined_set}, Discarded set:{discarded_set}')
        columns = [f'feature{q}' for q in refined_set]
        con, info = ml(df[columns].values, labels)
        print(max(info))
        data.append(info)
        confidence.append(con)
    return confidence,data

In [11]:
#call reductions
def call_reductions(trait):
    """Run the feature-reduction experiment for one trait, then print and pickle the results.

    Calls ``correlation_algo(trait)``, prints the accuracy and confidence
    tables (one row per feature-subset size, one column per classifier), and
    writes both lists to ``trait-wise/corrected/{trait}_accuracy.pkl`` and
    ``trait-wise/corrected/{trait}_confidence.pkl``.

    Parameters
    ----------
    trait : str
        Trait key understood by ``correlation_algo``.

    Returns
    -------
    None
    """
    headers=['Logistic','SVM','MLP','Random Forest']
    output_dir='trait-wise/corrected'
    # Create the output directory before the long computation so the run
    # cannot fail at the final write and lose its results.
    os.makedirs(output_dir, exist_ok=True)
    confidence,data=correlation_algo(trait)
    print("Accuracy on augmented data")
    print(tabulate(data, headers=headers, tablefmt="grid"))
    print("Confidence on augmented data")
    print(tabulate(confidence, headers=headers, tablefmt="grid"))
    with open(f'{output_dir}/{trait}_accuracy.pkl','wb') as accuracy_file:
        pickle.dump(data,accuracy_file)
    with open(f'{output_dir}/{trait}_confidence.pkl','wb') as confidence_file:
        pickle.dump(confidence,confidence_file)

In [12]:
# Run the feature-reduction experiment for trait 'EI'; prints the per-step results and pickles them.
call_reductions('EI')

Refined set :[14], Discarded set:[8, 6, 12, 0, 10, 4, 2, 16]
0.4062671797691039
Refined set :[14, 8], Discarded set:[6, 12, 0, 10, 4, 2, 16]
0.6948873007146784
Refined set :[14, 8, 6], Discarded set:[12, 0, 10, 4, 2, 16]
0.7625068719076415
Refined set :[14, 8, 6, 12], Discarded set:[0, 10, 4, 2, 16]




0.8317757009345794
Refined set :[14, 8, 6, 12, 0], Discarded set:[10, 4, 2, 16]




0.8752061572292469
Refined set :[14, 8, 6, 12, 0, 10], Discarded set:[4, 2, 16]
0.8730071467839472
Refined set :[14, 8, 6, 12, 0, 10, 4], Discarded set:[2, 16]
0.8818031885651457
Refined set :[14, 8, 6, 12, 0, 10, 4, 2], Discarded set:[16]




0.8829026937877955




Accuracy on augmented data
+------------+----------+----------+-----------------+
|   Logistic |      SVM |      MLP |   Random Forest |
|   0.406267 | 0.406267 | 0.406267 |        0.406267 |
+------------+----------+----------+-----------------+
|   0.495877 | 0.694887 | 0.639912 |        0.693788 |
+------------+----------+----------+-----------------+
|   0.71083  | 0.750412 | 0.762507 |        0.75646  |
+------------+----------+----------+-----------------+
|   0.780099 | 0.831776 | 0.818582 |        0.820231 |
+------------+----------+----------+-----------------+
|   0.79989  | 0.875206 | 0.860913 |        0.868609 |
+------------+----------+----------+-----------------+
|   0.797691 | 0.86586  | 0.86696  |        0.873007 |
+------------+----------+----------+-----------------+
|   0.813634 | 0.873557 | 0.857064 |        0.881803 |
+------------+----------+----------+-----------------+
|   0.812534 | 0.869159 | 0.86696  |        0.882903 |
+------------+----------+----------+--

In [None]:
# Run the feature-reduction experiment for trait 'WI'; prints the per-step results and pickles them.
call_reductions('WI')

Refined set :[27], Discarded set:[47, 38, 30, 50, 44, 33, 36, 41]
0.3232545354590434
Refined set :[27, 47], Discarded set:[38, 30, 50, 44, 33, 36, 41]
0.7784496976360638
Refined set :[27, 47, 38], Discarded set:[30, 50, 44, 33, 36, 41]




0.7839472237493128
Refined set :[27, 47, 38, 30], Discarded set:[50, 44, 33, 36, 41]
0.7855964815832875
Refined set :[27, 47, 38, 30, 50], Discarded set:[44, 33, 36, 41]




0.7844969763606378
Refined set :[27, 47, 38, 30, 50, 44], Discarded set:[33, 36, 41]
0.7850467289719626
Refined set :[27, 47, 38, 30, 50, 44, 33], Discarded set:[36, 41]


In [None]:
# The cells from here on run the same experiment for each remaining trait key in `questions`.
call_reductions('EC')

In [None]:
call_reductions('WC')

In [None]:
call_reductions('EA')

In [None]:
call_reductions('WA')

In [None]:
call_reductions('E')

In [None]:
call_reductions('A')

In [None]:
call_reductions('C')

In [None]:
call_reductions('N')

In [None]:
call_reductions('O')

In [None]:
call_reductions('AM')

In [None]:
call_reductions('LC')

In [None]:
call_reductions('MC')

In [None]:
call_reductions('ND')

In [None]:
call_reductions('PfW')

In [None]:
call_reductions('SE')