In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2
from scipy.stats import beta
import pickle
from scipy.stats import pearsonr
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error
from itertools import combinations
from tabulate import tabulate

In [22]:
# Per trait code: groups of question (feature column) indices that
# reliability1() compares pairwise within each group.
similar_sets=dict(
    EI=[[2,4],[0,10],[6,15]],
    WI=[[30,33,38,44,47,50]],
    EC=[[29,46],[32,49,53],[35,52]],
    WC=[[1,17],[5,21,25],[9,23]],
    EA=[[3,7,20,26],[11,22],[18,24]],
    WA=[[34,39,45,51],[31,42,28]],
    E=[[79,99]],
    A=[[80,105]],
    N=[[92,122]],
    O=[[88,103],[98,123]],
)
# Trait codes in processing order; identical to the insertion order of similar_sets.
traits=list(similar_sets)
# Load the labelled responses. Later cells read its `feature<N>` columns
# (see reliability1) and reuse its column names for the augmented frame.
df=pd.read_csv('CSV/Labelled_data.csv')
# Scale code -> question (feature column) indices that belong to that scale.
# The scales partition the indices 0..123: every index appears in exactly one list.
# Two entries were corrected to restore that partition and to agree with
# similar_sets: 'EI' listed 16 (which belongs to 'EA') instead of 15, and
# 'EC' listed 39 (which belongs to 'WA') instead of 29.
questions={
    'EI':[0,2,4,6,8,10,12,14,15],
    'WI':[27,30,33,36,38,41,44,47,50],
    'EC':[29,32,35,40,43,46,49,52,53],
    'WC':[1,5,9,13,17,19,21,23,25],
    'EA':[3,7,11,16,18,20,22,24,26],
    'WA':[28,31,34,37,39,42,45,48,51],
    'AM':[56,60,72],
    'LC':[54,59,64,67],
    'MC':[55,62,65,69],
    'ND':[63,66],
    'PfW':[58,68,71],
    'SE':[57,61,70,73],
    'E':[74,79,84,89,94,99,104,109,114,119],
    'A':[75,80,85,90,95,100,105,110,115,120],
    'C':[76,81,86,91,96,101,106,111,116,121],
    'N':[77,82,87,92,97,102,107,112,117,122],
    'O':[78,83,88,93,98,103,108,113,118,123]
}
def I_label(score):
    """Return the class index for a score.

    6 or 7 -> 0; any value in [2, 5] -> 1; everything else -> 2.
    """
    if score in (6, 7):
        return 0
    return 1 if 2 <= score <= 5 else 2
def C_label(score):
    """Return the class index for a score.

    Any value in [4, 7] -> 0; exactly 2 or 3 -> 1; everything else -> 2.
    """
    if 4 <= score <= 7:
        return 0
    if score in (2, 3):
        return 1
    return 2
def A_label(score):
    """Return the class index for a score.

    Any value in [4, 6] -> 0; any value in [7, 9] -> 1; everything else -> 2.
    """
    for label, (low, high) in enumerate(((4, 6), (7, 9))):
        if low <= score <= high:
            return label
    return 2
def AM_LC_label(score):
    """Return the class index for a (possibly fractional) score.

    [4, 5] -> 0; [2.6, 4) -> 1; everything else -> 2.

    The middle band is half-open up to 4 rather than closed at 3.9, so a
    score such as 3.95 (or a float that is only approximately 3.9) lands in
    band 1 instead of falling through to 2.
    """
    if 4 <= score <= 5:
        return 0
    if 2.6 <= score < 4:
        return 1
    return 2
def rest_label(score):
    """Return the class index for a (possibly fractional) score.

    [4, 6] -> 0; [2.6, 4) -> 1; everything else -> 2.

    The middle band is half-open up to 4 rather than closed at 3.9, so a
    score such as 3.95 (or a float that is only approximately 3.9) lands in
    band 1 instead of falling through to 2.
    """
    if 4 <= score <= 6:
        return 0
    if 2.6 <= score < 4:
        return 1
    return 2
def EN_label(score):
    """Return the class index for a score.

    Any value in [0, 25] -> 0; any value in [26, 40] -> 1; everything else -> 2.
    """
    if 0 <= score <= 25:
        return 0
    return 1 if 26 <= score <= 40 else 2
def AO_label(score):
    """Return the class index for a score.

    Any value in [0, 30] -> 0; any value in [31, 40] -> 1; everything else -> 2.
    """
    for label, (low, high) in enumerate(((0, 30), (31, 40))):
        if low <= score <= high:
            return label
    return 2
def C3_label(score):
    """Return the class index for a score.

    Any value in [36, 50] -> 0; any value in [26, 35] -> 1; everything else -> 2.
    """
    if 36 <= score <= 50:
        return 0
    if 26 <= score <= 35:
        return 1
    return 2
def I_smoothing(x):
    """Evaluate ((8 - x) * exp(0.6 * x) + 221.406) / 29.591 for a scalar x."""
    growth = math.exp(0.6*x)
    numerator = (8-x)*growth+221.406
    return numerator/29.591
def C_smoothing(x):
    """Evaluate ((12 - x) * exp(0.15 * x) - 11.572) / 0.327 for a scalar x."""
    growth = math.exp(0.15*x)
    numerator = (12-x)*growth-11.572
    return numerator/0.327
def A_smoothing(x):
    """Evaluate (2 + (x - 2) * exp(-0.32 * x)) / 0.261 for a scalar x."""
    decay = math.exp(-0.32*x)
    numerator = 2+((x-2)*decay)
    return numerator/0.261
def identity(x):
    """Return the argument unchanged (the very same object)."""
    unchanged = x
    return unchanged
# Scale code -> function that converts that scale's score into a class index (0, 1 or 2).
functions=dict(
    EI=I_label,
    WI=I_label,
    EC=C_label,
    WC=C_label,
    EA=A_label,
    WA=A_label,
    AM=AM_LC_label,
    LC=AM_LC_label,
    MC=rest_label,
    ND=rest_label,
    PfW=rest_label,
    SE=rest_label,
    E=EN_label,
    A=AO_label,
    C=C3_label,
    N=EN_label,
    O=AO_label,
)
# Scale code -> smoothing curve applied to that scale's score.
# Only the six I/C/A scales use a non-trivial curve; every other scale maps to identity.
smoothing={
    code: curve
    for curve, codes in (
        (I_smoothing, ('EI','WI')),
        (C_smoothing, ('EC','WC')),
        (A_smoothing, ('EA','WA')),
        (identity, ('AM','LC','MC','ND','PfW','SE','E','A','C','N','O')),
    )
    for code in codes
}

In [23]:
def reliability1(data=None, trait_names=None, subsets_by_trait=None):
    """Score each row by how consistently it answers groups of related questions.

    For every group of question indices listed for a trait, each pair of
    questions in the group is compared on the columns ``feature<i>`` and
    ``feature<j>``. The per-row gap is ``|a - b|`` (or ``|a + b|`` for the
    pairs listed in ``reverse_keyed``), divided by the largest gap seen for
    that pair across all rows, and turned into an agreement ``1 - gap/max``.
    Agreements are averaged over the pairs of a group, and the group averages
    are then averaged over all groups of all traits.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame holding the ``feature<N>`` columns. Defaults to the
        module-level ``df``.
    trait_names : iterable of str, optional
        Traits to include, in order. Defaults to the module-level ``traits``.
    subsets_by_trait : mapping, optional
        Trait -> list of groups of question indices. Defaults to the
        module-level ``similar_sets``.

    Returns
    -------
    numpy.ndarray
        One value per row of ``data``.

    Raises
    ------
    ValueError
        If no group with at least two questions is available.
    """
    data = df if data is None else data
    trait_names = traits if trait_names is None else trait_names
    subsets_by_trait = similar_sets if subsets_by_trait is None else subsets_by_trait

    # Pairs whose gap is measured as |a + b| instead of |a - b| (both orders listed).
    reverse_keyed = {(80,105),(92,122),(88,103),(105,80),(122,92),(103,88)}

    n_rows = data.shape[0]
    group_scores = []
    for trait in trait_names:
        for ques in subsets_by_trait[trait]:
            pairs = list(combinations(ques, 2))
            if not pairs:
                # A single-question group has nothing to compare; averaging
                # over zero pairs would divide by zero.
                continue
            agreement = np.zeros(n_rows)
            for first, second in pairs:
                x1 = np.asarray(data[f'feature{first}'], dtype=float)
                x2 = np.asarray(data[f'feature{second}'], dtype=float)
                if (first, second) in reverse_keyed:
                    gap = np.abs(x1 + x2)
                else:
                    gap = np.abs(x1 - x2)
                # The maximum is computed once per pair, not once per row.
                peak = gap.max() if gap.size else 0.0
                if peak == 0:
                    # Every row agrees exactly on this pair: full agreement
                    # instead of the NaN that 0/0 would produce.
                    agreement += 1.0
                else:
                    agreement += 1 - gap / peak
            group_scores.append(agreement / len(pairs))

    if not group_scores:
        raise ValueError('reliability1 needs at least one group with two or more questions')
    # Rows are individuals, columns are question groups.
    return np.mean(np.column_stack(group_scores), axis=1)

In [24]:
# Reliability-weighted augmentation: each row is repeated between 1 and
# `augment_scale` times according to its min-max scaled reliability, separately
# for the first 60% of rows (train) and the remaining rows (test).
# NOTE(review): this cell reads `df`; a later cell rebinds `df` to the augmented
# frame, so reload the CSV before re-running this cell.
AUGMENT_SEED=0  # fixed seed so the shuffled, augmented dataset is reproducible
augment_rng=np.random.default_rng(AUGMENT_SEED)

X=df.values
rel=reliability1()
if not np.all(np.isfinite(rel)):
    raise ValueError('reliability1() returned non-finite scores; cannot derive replication counts')
l=X.shape[0]
limit=int(0.6*l)

# Min-max scale to [0, 1]. If every row has the same reliability there is no
# spread to scale by, so all scores are set to 0 (each row is kept once).
rel_span=np.max(rel)-np.min(rel)
if rel_span>0:
    scores=(rel-np.min(rel))/rel_span
else:
    scores=np.zeros_like(rel,dtype=float)
X_train=X[:limit]
X_test=X[limit:]
train_scores=scores[:limit]
test_scores=scores[limit:]
augment_scale=10


def _replicate_rows(rows,row_scores,scale):
    """Repeat each row int(1+(scale-1)*score) times, keeping the row order."""
    counts=(1+(scale-1)*np.asarray(row_scores,dtype=float)).astype(int)
    return np.repeat(rows,counts,axis=0)


new_train=_replicate_rows(X_train,train_scores,augment_scale)
augment_rng.shuffle(new_train)  # shuffles rows (axis 0) in place
new_test=_replicate_rows(X_test,test_scores,augment_scale)
augment_rng.shuffle(new_test)
X=np.vstack((new_train,new_test))
print(new_train.shape)
print(new_test.shape)

(13943, 149)
(10671, 149)


In [25]:
# Sanity check: shape of X as it stands when this cell runs.
print(X.shape)

(24614, 149)


In [26]:
# def add_noise(row, score, n_columns=124):
#     # Generate random noise with the same shape as the first n columns of the row
#     noise = np.random.normal(scale=score, size=row[:n_columns].shape)
#     # Add the noise to the first n columns of the row
#     row_with_noise = row.copy()
#     row_with_noise[:n_columns] += noise
#     return row_with_noise

In [27]:
# dissimilar=[(80,105),(92,122),(88,103),(105,80),(122,92),(103,88)]
# X=df.values
# for i in range(df.shape[0]):
#     for trait in similar_sets:
#        subsets=similar_sets[trait]
#        for ques in subsets:
#            n=len(ques)
#            s=0
#            if ques in dissimilar:
#                s=s+X[i][ques[0]]
#                s=s-X[i][ques[1]]
#                s=s/2
#                X[i][ques[0]]=s
#                X[i][ques[1]]=-s
#            else:
#                for j in range(n):
#                    s=s+X[i][ques[j]]
#                s=s/n
#                for j in range(n):
#                    X[i][ques[j]]=s                

In [28]:
# Wrap the array X in a DataFrame, reusing the current frame's column names.
# NOTE(review): this rebinds `df`, so any cell that reads `df` afterwards
# (including re-runs of earlier cells) sees this frame, not the loaded CSV.
df=pd.DataFrame(X,columns=df.columns)

In [29]:
# Write the current `df` to disk without the index column.
df.to_csv('CSV/augmented_data.csv',index=False)

In [30]:
# Display the current `df` as the cell output.
df

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,N,O,Test-1,Test-2,Test-3,BXI,CXI,TXI,Composite,Final Label
0,1.0,-0.2,-0.2,1.0,-0.2,-1.0,0.6,-0.2,-1.0,-1.0,...,25.0,23.0,52.0,42.0,48.0,146.0,152.0,122.0,64.721444,1.0
1,1.0,-0.2,0.2,1.0,-0.2,-0.2,0.2,1.0,-0.2,-0.2,...,36.0,30.0,52.0,52.0,44.0,156.0,148.0,122.0,67.390289,1.0
2,1.0,1.0,0.2,0.2,1.0,-0.2,0.2,0.2,1.0,1.0,...,24.0,34.0,54.0,46.0,46.0,154.0,154.0,124.0,70.596016,1.0
3,0.6,-0.2,-0.2,0.6,0.6,0.6,0.6,0.6,0.6,0.6,...,8.0,28.0,56.0,46.0,46.0,158.0,158.0,129.0,77.384843,2.0
4,0.2,0.6,0.6,0.6,-0.2,1.0,-0.2,1.0,-0.2,-0.2,...,17.0,21.0,52.0,44.0,46.0,148.0,150.0,121.0,64.071740,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24609,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,8.0,28.0,42.0,52.0,46.0,136.0,130.0,102.0,37.402739,0.0
24610,0.6,0.6,-0.2,0.6,-0.2,0.2,1.0,0.6,0.2,-0.6,...,21.0,23.0,56.0,46.0,46.0,158.0,158.0,129.0,77.384843,2.0
24611,-0.2,-0.2,0.2,-0.2,0.2,0.2,0.6,0.6,-0.6,0.2,...,20.0,21.0,50.0,36.0,46.0,136.0,146.0,116.0,53.579209,1.0
24612,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6,1.0,...,8.0,29.0,48.0,50.0,46.0,146.0,142.0,114.0,55.057579,1.0
