In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
from scipy.special import rel_entr
import pickle
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error
from itertools import combinations

In [36]:
# Groups of question indices, keyed by trait code.
# find_similarity(trait) looks a trait up here and compares, within each inner
# list, the DataFrame columns named `feature<index>`.
# NOTE(review): presumably each inner list holds questions that were judged to
# be similar to one another -- confirm against the questionnaire.
similar_sets = {
    'EI': [[2, 4], [0, 10], [6, 15]],
    'WI': [[30, 33, 38, 44, 47, 50]],
    'EC': [[29, 46], [32, 49, 53], [35, 52]],
    'WC': [[1, 17], [5, 21, 25], [9, 23]],
    'EA': [[3, 7, 20, 26], [11, 22], [18, 24]],
    'WA': [[34, 39, 45, 51], [31, 42, 28]],
    'E': [[79, 99]],
    'A': [[80, 105]],
    'C': [],  # no groups registered for this trait
    'N': [[92, 122]],
    'O': [[88, 103], [98, 123]],
}
# Load the labelled dataset into the module-level frame that find_similarity
# reads its `feature<N>` columns from. The path is relative, so it is resolved
# against the kernel's current working directory.
df=pd.read_csv('CSV/Labelled_data.csv')

In [37]:
def find_similarity(trait, data=None, question_sets=None):
    """Print pairwise similarity metrics for every question group of a trait.

    For each group of question indices registered under ``trait``, three
    symmetric ``l x l`` matrices (``l`` = number of questions in the group)
    are computed over the ``feature<index>`` columns and printed:

    * Pearson correlation coefficient (the p-value is discarded),
    * mean absolute difference between the two columns,
    * cosine similarity of the two columns.

    Parameters
    ----------
    trait : str
        Key into the question-set mapping.
    data : pandas.DataFrame, optional
        Frame holding the ``feature<index>`` columns. Defaults to the
        module-level ``df``.
    question_sets : mapping, optional
        Maps a trait key to a list of groups of question indices. Defaults to
        the module-level ``similar_sets``.

    Returns
    -------
    None
        Results are only printed. A trait with no groups prints nothing.

    Raises
    ------
    KeyError
        If ``trait`` is not in the mapping or a ``feature<index>`` column is
        missing from the frame.
    """
    frame = df if data is None else data
    groups = (similar_sets if question_sets is None else question_sets)[trait]

    for ques_set in groups:
        size = len(ques_set)

        # Extract every column and its norm once per group; they do not depend
        # on which pair is being compared.
        columns = [frame[f'feature{q}'].to_numpy() for q in ques_set]
        norms = [np.linalg.norm(col) for col in columns]

        pear_corr = np.zeros((size, size), dtype=np.float64)
        dist = np.zeros((size, size), dtype=np.float64)
        cos_sim = np.zeros((size, size), dtype=np.float64)

        # All three metrics are symmetric, so only the upper triangle
        # (diagonal included) is computed and then mirrored.
        for i in range(size):
            for j in range(i, size):
                first, second = columns[i], columns[j]

                corr, _ = pearsonr(first, second)
                mean_abs_diff = np.abs(first - second).mean()

                # A zero-length vector has no direction: report NaN explicitly
                # instead of relying on a 0/0 division warning.
                denom = norms[i] * norms[j]
                cosine = np.dot(first, second) / denom if denom else np.nan

                pear_corr[i, j] = pear_corr[j, i] = corr
                dist[i, j] = dist[j, i] = mean_abs_diff
                cos_sim[i, j] = cos_sim[j, i] = cosine

        print(ques_set)
        print(f'Correlation matrix {pear_corr}')
        print(f'Average Distance {dist}')
        print(f'Cosine similarity {cos_sim}')

In [38]:
# Similarity metrics for the three 'EI' question pairs: [2, 4], [0, 10], [6, 15].
find_similarity('EI')

[2, 4]
Correlation matrix [[1.         0.37915974]
 [0.37915974 1.        ]]
Average Distance [[0.         0.39155301]
 [0.39155301 0.        ]]
Cosine similarity [[1.         0.59286057]
 [0.59286057 1.        ]]
[0, 10]
Correlation matrix [[1.         0.31767717]
 [0.31767717 1.        ]]
Average Distance [[0.         0.39058513]
 [0.39058513 0.        ]]
Cosine similarity [[1.         0.65724471]
 [0.65724471 1.        ]]
[6, 15]
Correlation matrix [[1.         0.32029451]
 [0.32029451 1.        ]]
Average Distance [[0.         0.43625165]
 [0.43625165 0.        ]]
Cosine similarity [[1.         0.54737125]
 [0.54737125 1.        ]]


In [39]:
# 'WI' has a single group of six questions, so each printed matrix is 6x6.
find_similarity('WI')

[30, 33, 38, 44, 47, 50]
Correlation matrix [[1.         0.35926768 0.41755049 0.3781418  0.36733391 0.34893228]
 [0.35926768 1.         0.39806349 0.33695781 0.38997586 0.34781575]
 [0.41755049 0.39806349 1.         0.39402721 0.388694   0.41033173]
 [0.3781418  0.33695781 0.39402721 1.         0.42074583 0.46237444]
 [0.36733391 0.38997586 0.388694   0.42074583 1.         0.45378122]
 [0.34893228 0.34781575 0.41033173 0.46237444 0.45378122 1.        ]]
Average Distance [[0.         0.39046403 0.36541688 0.40802151 0.41349654 0.42947757]
 [0.39046403 0.         0.37247849 0.43512712 0.40047062 0.42851016]
 [0.36541688 0.37247849 0.         0.40252567 0.39801078 0.39885756]
 [0.40802151 0.43512712 0.40252567 0.         0.39132209 0.37404311]
 [0.41349654 0.40047062 0.39801078 0.39132209 0.         0.38190722]
 [0.42947757 0.42851016 0.39885756 0.37404311 0.38190722 0.        ]]
Cosine similarity [[1.         0.60361983 0.64171916 0.57630463 0.57510503 0.54392538]
 [0.60361983 1.       

In [40]:
# Similarity metrics for the 'EC' groups: [29, 46], [32, 49, 53], [35, 52].
find_similarity('EC')

[29, 46]
Correlation matrix [[1.         0.41127556]
 [0.41127556 1.        ]]
Average Distance [[0.         0.47776636]
 [0.47776636 0.        ]]
Cosine similarity [[1.         0.46512625]
 [0.46512625 1.        ]]
[32, 49, 53]
Correlation matrix [[1.         0.39804655 0.38099072]
 [0.39804655 1.         0.49410913]
 [0.38099072 0.49410913 1.        ]]
Average Distance [[0.         0.43717076 0.44218596]
 [0.43717076 0.         0.37898984]
 [0.44218596 0.37898984 0.        ]]
Cosine similarity [[1.         0.55334312 0.53637456]
 [0.55334312 1.         0.59380104]
 [0.53637456 0.59380104 1.        ]]
[35, 52]
Correlation matrix [[1.         0.34832401]
 [0.34832401 1.        ]]
Average Distance [[0.         0.44770826]
 [0.44770826 0.        ]]
Cosine similarity [[1.         0.53635987]
 [0.53635987 1.        ]]


In [41]:
# Similarity metrics for the 'WC' groups: [1, 17], [5, 21, 25], [9, 23].
find_similarity('WC')

[1, 17]
Correlation matrix [[1.       0.367933]
 [0.367933 1.      ]]
Average Distance [[0.        0.4695414]
 [0.4695414 0.       ]]
Cosine similarity [[1.         0.43938945]
 [0.43938945 1.        ]]
[5, 21, 25]
Correlation matrix [[1.         0.51784316 0.43060729]
 [0.51784316 1.         0.50271146]
 [0.43060729 0.50271146 1.        ]]
Average Distance [[0.         0.46095863 0.51558554]
 [0.46095863 0.         0.45630729]
 [0.51558554 0.45630729 0.        ]]
Cosine similarity [[1.         0.51800868 0.43075541]
 [0.51800868 1.         0.50486266]
 [0.43075541 0.50486266 1.        ]]
[9, 23]
Correlation matrix [[1.         0.52800536]
 [0.52800536 1.        ]]
Average Distance [[0.         0.45363169]
 [0.45363169 0.        ]]
Cosine similarity [[1.         0.52542273]
 [0.52542273 1.        ]]


In [42]:
# Similarity metrics for the 'EA' groups: [3, 7, 20, 26], [11, 22], [18, 24].
find_similarity('EA')

[3, 7, 20, 26]
Correlation matrix [[1.         0.43774878 0.38489712 0.35868848]
 [0.43774878 1.         0.41563505 0.44091307]
 [0.38489712 0.41563505 1.         0.46507863]
 [0.35868848 0.44091307 0.46507863 1.        ]]
Average Distance [[0.         0.37184338 0.38600314 0.41320638]
 [0.37184338 0.         0.3926493  0.39922842]
 [0.38600314 0.3926493  0.         0.34768816]
 [0.41320638 0.39922842 0.34768816 0.        ]]
Cosine similarity [[1.         0.66130425 0.66962057 0.62254905]
 [0.66130425 1.         0.6505042  0.64395592]
 [0.66962057 0.6505042  1.         0.68674818]
 [0.62254905 0.64395592 0.68674818 1.        ]]
[11, 22]
Correlation matrix [[1.         0.40379769]
 [0.40379769 1.        ]]
Average Distance [[0.         0.40780555]
 [0.40780555 0.        ]]
Cosine similarity [[1.         0.61942923]
 [0.61942923 1.        ]]
[18, 24]
Correlation matrix [[1.        0.3788138]
 [0.3788138 1.       ]]
Average Distance [[0.         0.47731984]
 [0.47731984 0.        ]]
Cosin

In [43]:
# Similarity metrics for the 'WA' groups: [34, 39, 45, 51] and [31, 42, 28].
# In the recorded output the first group's off-diagonal cosine similarities
# (about 0.15-0.39) are noticeably lower than the second group's (about 0.57-0.63).
find_similarity('WA')

[34, 39, 45, 51]
Correlation matrix [[1.         0.30372007 0.32141081 0.32199698]
 [0.30372007 1.         0.21001353 0.21031083]
 [0.32141081 0.21001353 1.         0.40487908]
 [0.32199698 0.21031083 0.40487908 1.        ]]
Average Distance [[0.         0.52123343 0.55094436 0.58767587]
 [0.52123343 0.         0.61473809 0.67483022]
 [0.55094436 0.61473809 0.         0.52952046]
 [0.58767587 0.67483022 0.52952046 0.        ]]
Cosine similarity [[1.         0.3715799  0.34000646 0.29064754]
 [0.3715799  1.         0.24366737 0.14555328]
 [0.34000646 0.24366737 1.         0.38884409]
 [0.29064754 0.14555328 0.38884409 1.        ]]
[31, 42, 28]
Correlation matrix [[1.         0.38534329 0.32816608]
 [0.38534329 1.         0.31761594]
 [0.32816608 0.31761594 1.        ]]
Average Distance [[0.         0.40089726 0.41215423]
 [0.40089726 0.         0.43825009]
 [0.41215423 0.43825009 0.        ]]
Cosine similarity [[1.         0.6329828  0.61820306]
 [0.6329828  1.         0.57217411]
 [0.6

In [44]:
# 'E' has a single question pair, [79, 99].
find_similarity('E')

[79, 99]
Correlation matrix [[1.         0.44677279]
 [0.44677279 1.        ]]
Average Distance [[0.        0.4679331]
 [0.4679331 0.       ]]
Cosine similarity [[1.         0.49536567]
 [0.49536567 1.        ]]


In [45]:
# 'A' has a single question pair, [80, 105].
# NOTE(review): the recorded output shows correlation ~0.02 and cosine
# similarity ~-0.04, so this pair does not look similar by these metrics --
# confirm the pairing.
find_similarity('A')

[80, 105]
Correlation matrix [[1.         0.01771221]
 [0.01771221 1.        ]]
Average Distance [[0.         0.83347998]
 [0.83347998 0.        ]]
Cosine similarity [[ 1.         -0.04174163]
 [-0.04174163  1.        ]]


In [46]:
# 'N' has a single question pair, [92, 122].
# NOTE(review): the recorded output shows a weak correlation (~0.17) and
# cosine similarity (~0.13) for this pair -- confirm the pairing.
find_similarity('N')

[92, 122]
Correlation matrix [[1.         0.16674323]
 [0.16674323 1.        ]]
Average Distance [[0.         0.70325561]
 [0.70325561 0.        ]]
Cosine similarity [[1.         0.12749442]
 [0.12749442 1.        ]]


In [47]:
# 'O' has two question pairs, [88, 103] and [98, 123].
# NOTE(review): in the recorded output [88, 103] is essentially uncorrelated
# (~0.003, cosine ~-0.08) while [98, 123] correlates at ~0.47 -- confirm the
# first pairing.
find_similarity('O')

[88, 103]
Correlation matrix [[1.         0.00325436]
 [0.00325436 1.        ]]
Average Distance [[0.        0.8324901]
 [0.8324901 0.       ]]
Cosine similarity [[ 1.         -0.07648418]
 [-0.07648418  1.        ]]
[98, 123]
Correlation matrix [[1.         0.46527531]
 [0.46527531 1.        ]]
Average Distance [[0.        0.4058513]
 [0.4058513 0.       ]]
Cosine similarity [[1.         0.61401793]
 [0.61401793 1.        ]]
