In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2
from scipy.stats import beta
import pickle
from scipy.stats import pearsonr
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error
from itertools import combinations
from tabulate import tabulate

In [2]:
split=13943

In [17]:
def ml(X,labels,split_index):
    labels=np.array(labels)
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = labels[:split_index], labels[split_index:]
    logistic_regression_classifier = LogisticRegression(max_iter=1000, random_state=42)
    logistic_regression_classifier.fit(X_train, y_train)
    y_pred = logistic_regression_classifier.predict(X_test)
    probabilities = logistic_regression_classifier.predict_proba(X_test)
    confidences = np.max(probabilities, axis=1)
    logistic_confidence=np.mean(confidences)
    acc_logistic=accuracy_score(y_test, y_pred)
    svm_classifier = SVC(kernel='rbf', C=1.0, random_state=42)
    svm_classifier.fit(X_train, y_train)
    y_pred = svm_classifier.predict(X_test)
    confidence_scores = svm_classifier.decision_function(X_test)
    svm_confidence=np.mean(np.abs(confidence_scores))
    acc_svm=accuracy_score(y_test, y_pred)
    h1=len(X_train[0])
    mlp = MLPClassifier(hidden_layer_sizes=(2*h1,6), max_iter=500, random_state=42)
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    probabilities = mlp.predict_proba(X_test)
    confidences = np.max(probabilities, axis=1)
    mlp_confidence=np.mean(confidences)
    acc_mlp=accuracy_score(y_test, y_pred)
    random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    random_forest_classifier.fit(X_train, y_train)
    y_pred = random_forest_classifier.predict(X_test)
    probabilities = random_forest_classifier.predict_proba(X_test)
    confidences = np.max(probabilities, axis=1)
    rf_confidence=np.mean(confidences)
    acc_rf=accuracy_score(y_test, y_pred)
    data=[acc_logistic,acc_svm,acc_mlp,acc_rf]
    print(data)
    confidence=[logistic_confidence,svm_confidence,mlp_confidence,rf_confidence]
    return confidence,data

In [19]:
def find_order():
    truncate_index = split
    truncated_df = df.iloc[:truncate_index]
    indices=[i for i in range(124)]
    f=len(indices)
    y= list(truncated_df['Final Label'])
    values=list(truncated_df['Composite'])
    X=[]
    for i in indices:
        X.append(list(truncated_df[f'feature{i}']))
    l=len(X[0])
    corr_matrix=np.zeros((f,3))
    weight_matrix=np.zeros((f,3))
    for i in range(3):
        c=[values[j] for j in range(l) if y[j]==i]
        if(len(c)>0):
            for j in range(f):
                x_temp=[X[j][k] for k in range(l) if y[k]==i]
                corr, _ = pearsonr(x_temp, c)
                corr_matrix[j][i]=abs(corr)
            x=list(corr_matrix[:,i])
            rank_indices=np.argsort(x)
            rank=1
            for idx in rank_indices:
                weight_matrix[idx][i]=rank
                rank=rank+1
    correlation=np.zeros((f,f))
    for i in range(f):
        for j in range(f):
            x1=X[i][:]
            x2=X[j][:]
            corr, _ = pearsonr(x1,x2)
            correlation[i][j]=corr
    scores=[]
    for i in range(f):
        weight_sum=0
        corr_sum=0
        for j in range(3):
            corr_sum=corr_sum+(corr_matrix[i][j]*weight_matrix[i][j])
            weight_sum=weight_sum+weight_matrix[i][j]
        scores.append((corr_sum*f)/(weight_sum*np.sum(correlation[i])))
    rank_features=np.argsort(scores)
    l=len(scores)-1
    correct_order=[]
    for i in range(f):
        correct_order.append(indices[rank_features[l]])
        l=l-1
    return correct_order
def correlation_algo(correct_order,index):
    labels=list(df['Final Label'])
    data=[]
    confidence=[]
    for i in range(1,124):
        refined_set=correct_order[:i]
        discarded_set=correct_order[i:]
        print(f'Number of features: {i}')
        columns=[f'feature{k}' for k in refined_set]
        X1=df[columns].values
        con,info=ml(X1,labels,index)
        print(max(info))
        data.append(info)
        confidence.append(con)
    return confidence,data

In [None]:
df=pd.read_csv('CSV/Labelled_data.csv')
headers=['Logistic','SVM','MLP','Random Forest']
indices=[i for i in range(124)]
f=len(indices)
labels=list(df['Final Label'])
l=len(labels)
# correct_order=find_order()
# print(correct_order)
confidence,data=correlation_algo(correct_order,int(0.6*df.shape[0]))
cols=[f'feature{k}' for k in indices]
X=df[cols].values
con,info=ml(X,labels,int(0.6*df.shape[0]))
data.append(info)
confidence.append(con)
print("Accuracy on augmented data")
print(tabulate(data, headers=headers, tablefmt="grid"))
print("Confidence on augmented data")
print(tabulate(confidence, headers=headers, tablefmt="grid"))
file_path1='combined_accuracy.pkl'
file_path2='combined_confidence.pkl'
with open(file_path1,'wb') as f:
    pickle.dump(data,f)
with open(file_path2,'wb') as f:
    pickle.dump(confidence,f)

Number of features: 1
[0.6190214403518417, 0.6190214403518417, 0.6190214403518417, 0.6190214403518417]
0.6190214403518417
Number of features: 2
[0.6190214403518417, 0.6190214403518417, 0.6190214403518417, 0.6190214403518417]
0.6190214403518417
Number of features: 3
[0.6190214403518417, 0.6190214403518417, 0.6190214403518417, 0.6063771302913689]
0.6190214403518417
Number of features: 4
[0.6190214403518417, 0.6190214403518417, 0.6190214403518417, 0.5898845519516218]
0.6190214403518417
Number of features: 5
[0.6190214403518417, 0.6190214403518417, 0.6382627817482133, 0.5981308411214953]
0.6382627817482133
Number of features: 6




[0.6190214403518417, 0.6190214403518417, 0.6443100604727873, 0.6091258933479934]
0.6443100604727873
Number of features: 7
[0.6190214403518417, 0.6190214403518417, 0.6542056074766355, 0.6278174821330401]
0.6542056074766355
Number of features: 8




[0.6195711929631665, 0.6190214403518417, 0.6525563496426608, 0.6377130291368884]
0.6525563496426608
Number of features: 9




[0.6201209455744915, 0.6190214403518417, 0.6542056074766355, 0.6608026388125343]
0.6608026388125343
Number of features: 10




[0.6201209455744915, 0.6212204507971413, 0.6586036283672347, 0.6536558548653106]
0.6586036283672347
Number of features: 11




[0.622869708631116, 0.6201209455744915, 0.6558548653106102, 0.645959318306762]
0.6558548653106102
Number of features: 12




[0.622869708631116, 0.6217702034084662, 0.6432105552501375, 0.6470588235294118]
0.6470588235294118
Number of features: 13




[0.622869708631116, 0.6234194612424409, 0.6437603078614623, 0.6553051126992854]
0.6553051126992854
Number of features: 14




[0.6245189664650908, 0.6327652556349642, 0.6432105552501375, 0.6635514018691588]
0.6635514018691588
Number of features: 15




[0.6278174821330401, 0.6349642660802639, 0.657504123144585, 0.708631115997801]
0.708631115997801
Number of features: 16




[0.6250687190764156, 0.6432105552501375, 0.6448598130841121, 0.7212754260582738]
0.7212754260582738
Number of features: 17




[0.6250687190764156, 0.6393622869708631, 0.6377130291368884, 0.7130291368884002]
0.7130291368884002
Number of features: 18




[0.6272677295217153, 0.6410115448048378, 0.6245189664650908, 0.7097306212204508]
0.7097306212204508
Number of features: 19




[0.6245189664650908, 0.6360637713029137, 0.5937328202308961, 0.6717976910390324]
0.6717976910390324
Number of features: 20




[0.6256184716877405, 0.6470588235294118, 0.5953820780648708, 0.6646509070918087]
0.6646509070918087
Number of features: 21




[0.6250687190764156, 0.6465090709180868, 0.576690489279824, 0.6701484332050577]
0.6701484332050577
Number of features: 22




[0.6250687190764156, 0.6437603078614623, 0.6151731720725674, 0.6602528862012095]
0.6602528862012095
Number of features: 23




[0.6250687190764156, 0.6454095656954371, 0.6091258933479934, 0.6657504123144585]
0.6657504123144585
Number of features: 24




[0.6294667399670149, 0.6531061022539857, 0.6036283672347443, 0.6668499175371083]
0.6668499175371083
Number of features: 25




[0.6283672347443651, 0.6558548653106102, 0.5887850467289719, 0.6602528862012095]
0.6602528862012095
Number of features: 26




[0.6289169873556899, 0.6591533809785597, 0.6261682242990654, 0.6706981858163826]
0.6706981858163826
Number of features: 27




[0.6283672347443651, 0.6613523914238593, 0.594282572842221, 0.6723474436503574]
0.6723474436503574
Number of features: 28




[0.6289169873556899, 0.6580538757559098, 0.5849367784496976, 0.6739967014843321]
0.6739967014843321
Number of features: 29




[0.6322155030236394, 0.65695437053326, 0.6041781198460693, 0.6673996701484332]
0.6673996701484332
Number of features: 30




[0.6322155030236394, 0.657504123144585, 0.5689939527212754, 0.6673996701484332]
0.6673996701484332
Number of features: 31




[0.6360637713029137, 0.662451896646509, 0.5926333150082463, 0.6690489279824079]
0.6690489279824079
Number of features: 32




[0.6311159978009896, 0.6602528862012095, 0.5684442001099506, 0.6734469488730072]
0.6734469488730072
Number of features: 33




[0.6305662451896646, 0.6602528862012095, 0.5816382627817482, 0.6706981858163826]
0.6706981858163826
Number of features: 34




[0.6322155030236394, 0.6580538757559098, 0.5667949422759758, 0.6690489279824079]
0.6690489279824079
Number of features: 35




[0.6327652556349642, 0.6564046179219352, 0.6014293567894448, 0.6712479384277076]
0.6712479384277076
Number of features: 36
[0.6311159978009896, 0.657504123144585, 0.5722924683892249, 0.6739967014843321]
0.6739967014843321
Number of features: 37
[0.6371632765255635, 0.662451896646509, 0.5948323254535459, 0.6690489279824079]
0.6690489279824079
Number of features: 38




[0.6349642660802639, 0.6641011544804838, 0.6047278724573941, 0.6690489279824079]
0.6690489279824079
Number of features: 39
[0.6338647608576141, 0.6646509070918087, 0.5854865310610226, 0.6734469488730072]
0.6734469488730072
Number of features: 40
[0.6333150082462892, 0.6668499175371083, 0.6102253985706432, 0.6734469488730072]
0.6734469488730072
Number of features: 41
[0.634414513468939, 0.6652006597031336, 0.5893347993402969, 0.6761957119296317]
0.6761957119296317
Number of features: 42
[0.634414513468939, 0.6641011544804838, 0.6107751511819681, 0.6739967014843321]
0.6739967014843321
Number of features: 43
[0.6294667399670149, 0.6602528862012095, 0.5948323254535459, 0.6745464540956569]
0.6745464540956569
Number of features: 44




[0.6322155030236394, 0.662451896646509, 0.5849367784496976, 0.6695986805937328]
0.6695986805937328
Number of features: 45
[0.6333150082462892, 0.663001649257834, 0.6091258933479934, 0.6679494227597581]
0.6679494227597581
Number of features: 46
[0.6327652556349642, 0.6663001649257834, 0.6047278724573941, 0.6739967014843321]
0.6739967014843321
Number of features: 47
[0.6316657504123144, 0.6652006597031336, 0.5992303463441452, 0.6706981858163826]
0.6706981858163826
Number of features: 48
[0.6305662451896646, 0.6635514018691588, 0.6239692138537658, 0.6701484332050577]
0.6701484332050577
Number of features: 49
[0.6311159978009896, 0.662451896646509, 0.5981308411214953, 0.6728971962616822]
0.6728971962616822
Number of features: 50
[0.6305662451896646, 0.6646509070918087, 0.6030786146234195, 0.6767454645409565]
0.6767454645409565
Number of features: 51
[0.6311159978009896, 0.6663001649257834, 0.582737768004398, 0.6745464540956569]
0.6745464540956569
Number of features: 52
[0.6305662451896646,

In [None]:
#replaced the similar questions with the average response and used that for the analysis
df=pd.read_csv('CSV/corrected.csv')
headers=['Logistic','SVM','MLP','Random Forest']
indices=[i for i in range(124)]
f=len(indices)
labels=list(df['Final Label'])
l=len(labels)
correct_order=find_order()
print(correct_order)
confidence,data=correlation_algo(correct_order,int(0.6*df.shape[0]))
cols=[f'feature{k}' for k in indices]
X=df[cols].values
con,info=ml(X,labels,int(0.6*df.shape[0]))
data.append(info)
confidence.append(con)
print("Accuracy on augmented data")
print(tabulate(data, headers=headers, tablefmt="grid"))
print("Confidence on augmented data")
print(tabulate(confidence, headers=headers, tablefmt="grid"))
file_path1='corrected_combined_accuracy.pkl'
file_path2='corrected_combined_confidence.pkl'
with open(file_path1,'wb') as f:
    pickle.dump(data,f)
with open(file_path2,'wb') as f:
    pickle.dump(confidence,f)

In [None]:
#using augmented data
df=pd.read_csv('CSV/augmented_data.csv')
headers=['Logistic','SVM','MLP','Random Forest']
indices=[i for i in range(124)]
f=len(indices)
labels=list(df['Final Label'])
l=len(labels)
correct_order=find_order()
print(correct_order)
confidence,data=correlation_algo(correct_order,split)
cols=[f'feature{k}' for k in indices]
X=df[cols].values
con,info=ml(X,labels,split)
data.append(info)
confidence.append(con)
print("Accuracy on augmented data")
print(tabulate(data, headers=headers, tablefmt="grid"))
print("Confidence on augmented data")
print(tabulate(confidence, headers=headers, tablefmt="grid"))
file_path1='augmented_combined_accuracy.pkl'
file_path2='augmented_combined_confidence.pkl'
with open(file_path1,'wb') as f:
    pickle.dump(data,f)
with open(file_path2,'wb') as f:
    pickle.dump(confidence,f)