In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2
from scipy.stats import beta
import pickle
from scipy.stats import pearsonr
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error
from itertools import combinations
from tabulate import tabulate

In [2]:
# Load the table from 'CSV/bert_data.csv'; the cells below read its
# 'feature<i>', 'Final Label' and 'Composite' columns through this global.
df = pd.read_csv('CSV/bert_data.csv')

In [3]:
def _mean_top_probability(classifier, X_test):
    """Return the mean over test rows of the largest predicted class probability."""
    return np.mean(np.max(classifier.predict_proba(X_test), axis=1))


def ml(X, labels, split_index=17838):
    """Fit four classifiers on a positional train/test split and score them.

    Rows ``[:split_index]`` are used for training and rows ``[split_index:]``
    for testing; the data is not shuffled. The classifiers, in reporting
    order, are logistic regression, an RBF-kernel SVM, an MLP with hidden
    layers ``(2 * n_features, 6)`` and a 100-tree random forest, all created
    with ``random_state=42``.

    Parameters
    ----------
    X : sequence of rows (e.g. a 2-D array)
        Feature matrix; must support slicing and have one row per label.
    labels : sequence
        Class label for each row of ``X``.
    split_index : int, default 17838
        Position of the first test row. The default is the boundary used
        throughout this notebook.

    Returns
    -------
    confidence : list of float
        ``[logistic, svm, mlp, random_forest]``. For the three probabilistic
        models this is the mean of the highest ``predict_proba`` value per
        test row. For the SVM it is the mean absolute ``decision_function``
        value, i.e. a margin magnitude that is not on a 0-1 probability
        scale and is not directly comparable with the other three entries.
    data : list of float
        Test accuracy of each model in the same order. This list is also
        printed as a side effect.

    Raises
    ------
    ValueError
        If ``X`` and ``labels`` differ in length, or if ``split_index`` would
        leave the training or the test partition empty.
    """
    labels = np.array(labels)
    if len(X) != len(labels):
        raise ValueError(
            f"X has {len(X)} rows but labels has {len(labels)} entries"
        )
    # Fail early with a clear message instead of an opaque estimator error
    # caused by an empty train or test partition.
    if not 0 < split_index < len(labels):
        raise ValueError(
            f"split_index={split_index} must lie strictly between 0 and "
            f"the number of rows ({len(labels)})"
        )

    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = labels[:split_index], labels[split_index:]

    # Logistic regression.
    logistic_regression_classifier = LogisticRegression(max_iter=1000, random_state=42)
    logistic_regression_classifier.fit(X_train, y_train)
    acc_logistic = accuracy_score(y_test, logistic_regression_classifier.predict(X_test))
    logistic_confidence = _mean_top_probability(logistic_regression_classifier, X_test)

    # RBF SVM. It is built without probability estimates, so the margin
    # magnitude from decision_function is used as the confidence figure.
    svm_classifier = SVC(kernel='rbf', C=1.0, random_state=42)
    svm_classifier.fit(X_train, y_train)
    acc_svm = accuracy_score(y_test, svm_classifier.predict(X_test))
    svm_confidence = np.mean(np.abs(svm_classifier.decision_function(X_test)))

    # MLP whose first hidden layer is twice as wide as the input.
    h1 = len(X_train[0])
    mlp = MLPClassifier(hidden_layer_sizes=(2 * h1, 6), max_iter=500, random_state=42)
    mlp.fit(X_train, y_train)
    acc_mlp = accuracy_score(y_test, mlp.predict(X_test))
    mlp_confidence = _mean_top_probability(mlp, X_test)

    # Random forest.
    random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    random_forest_classifier.fit(X_train, y_train)
    acc_rf = accuracy_score(y_test, random_forest_classifier.predict(X_test))
    rf_confidence = _mean_top_probability(random_forest_classifier, X_test)

    data = [acc_logistic, acc_svm, acc_mlp, acc_rf]
    print(data)
    confidence = [logistic_confidence, svm_confidence, mlp_confidence, rf_confidence]
    return confidence, data

In [4]:
def find_order(data=None, n_features=124, truncate_index=17838, n_classes=3):
    """Rank feature indices from highest to lowest relevance score.

    Only the first ``truncate_index`` rows of the frame are used. For every
    class label ``0 .. n_classes - 1`` that occurs in 'Final Label', the
    absolute Pearson correlation between each 'feature<i>' column and the
    'Composite' column is computed over the rows of that class, and the
    features are ranked ``1 .. n_features`` by it (1 = weakest). The score of
    a feature is::

        sum_c(|corr_c| * rank_c) * n_features
        -------------------------------------
        sum_c(rank_c) * sum_j corr(feature, feature_j)

    where the denominator's second factor is the sum of the feature's signed
    Pearson correlations with every feature (itself included).

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame holding the 'feature<i>', 'Final Label' and 'Composite'
        columns. When omitted, the notebook-level ``df`` is used.
    n_features : int, default 124
        Number of 'feature<i>' columns to rank (``feature0`` upward).
    truncate_index : int, default 17838
        Number of leading rows to use.
    n_classes : int, default 3
        Labels ``0 .. n_classes - 1`` are considered.

    Returns
    -------
    list of int
        Feature indices ordered by descending score.
    """
    if data is None:
        data = df  # fall back to the frame loaded earlier in the notebook

    truncated_df = data.iloc[:truncate_index]
    indices = list(range(n_features))
    f = len(indices)

    y = truncated_df['Final Label'].to_numpy()
    values = truncated_df['Composite'].to_numpy()
    X = [truncated_df[f'feature{i}'].to_numpy() for i in indices]

    # Per-class |correlation with Composite| and the matching rank weights.
    corr_matrix = np.zeros((f, n_classes))
    weight_matrix = np.zeros((f, n_classes))
    for label in range(n_classes):
        in_class = (y == label)
        if not in_class.any():
            # Class absent: its column stays all zero in both matrices.
            continue
        target = values[in_class]
        for j in range(f):
            corr, _ = pearsonr(X[j][in_class], target)
            corr_matrix[j][label] = abs(corr)
        # The weakest correlation gets weight 1, the strongest gets weight f.
        for rank, idx in enumerate(np.argsort(corr_matrix[:, label]), start=1):
            weight_matrix[idx][label] = rank

    # Feature-to-feature correlations. Pearson correlation is symmetric, so
    # each unordered pair is computed once and mirrored.
    correlation = np.zeros((f, f))
    for i in range(f):
        for j in range(i, f):
            corr, _ = pearsonr(X[i], X[j])
            correlation[i][j] = corr
            correlation[j][i] = corr

    scores = []
    for i in range(f):
        weight_sum = 0
        corr_sum = 0
        # Accumulate left to right so the floating-point result is stable.
        for j in range(n_classes):
            corr_sum = corr_sum + (corr_matrix[i][j] * weight_matrix[i][j])
            weight_sum = weight_sum + weight_matrix[i][j]
        scores.append((corr_sum * f) / (weight_sum * np.sum(correlation[i])))

    # argsort is ascending; reverse it so the best-scoring feature comes first.
    rank_features = np.argsort(scores)
    return [indices[k] for k in rank_features[::-1]]
def correlation_algo(correct_order):
    """Evaluate the classifiers on growing prefixes of a feature ranking.

    For ``i = 1 .. len(correct_order) - 1`` the first ``i`` feature indices
    of ``correct_order`` are selected as 'feature<k>' columns of the
    notebook-level ``df`` and passed to ``ml`` together with the
    'Final Label' column. The complete feature set is not evaluated here.

    Parameters
    ----------
    correct_order : list of int
        Feature indices, best first.

    Returns
    -------
    confidence : list of list
        One confidence list from ``ml`` per prefix length.
    data : list of list
        One accuracy list from ``ml`` per prefix length.

    Side effects: prints the prefix length and the best accuracy of each run
    (``ml`` additionally prints the accuracy list itself).
    """
    labels = list(df['Final Label'])
    data = []
    confidence = []
    # Stop one short of the full ranking: prefixes of length 1 .. n-1.
    for i in range(1, len(correct_order)):
        refined_set = correct_order[:i]
        print(f'Number of features: {i}')
        columns = [f'feature{k}' for k in refined_set]
        X1 = df[columns].values
        con, info = ml(X1, labels)
        print(max(info))
        data.append(info)
        confidence.append(con)
    return confidence, data

In [5]:
# Column titles for the result tables, in the order ml() returns its scores.
headers = ['Logistic', 'SVM', 'MLP', 'Random Forest']
indices = list(range(124))
f = len(indices)
labels = list(df['Final Label'])
l = len(labels)

# Rank the features, then score every proper prefix of that ranking.
correct_order = find_order()
print(correct_order)
confidence, data = correlation_algo(correct_order)

# Last row of both tables: all features, in their original column order.
cols = [f'feature{k}' for k in indices]
X = df[cols].values
con, info = ml(X, labels)
data.append(info)
confidence.append(con)

print("Accuracy on augmented data")
print(tabulate(data, headers=headers, tablefmt="grid"))
print("Confidence on augmented data")
print(tabulate(confidence, headers=headers, tablefmt="grid"))

# Persist both result tables for later use.
file_path1 = 'bert_accuracy.pkl'
file_path2 = 'bert_confidence.pkl'
with open(file_path1, 'wb') as f:
    pickle.dump(data, f)
with open(file_path2, 'wb') as f:
    pickle.dump(confidence, f)

[105, 102, 51, 91, 40, 111, 54, 117, 122, 14, 72, 25, 1, 45, 21, 5, 29, 9, 87, 17, 2, 101, 4, 95, 19, 15, 13, 77, 112, 3, 36, 81, 79, 16, 34, 24, 20, 23, 0, 53, 37, 113, 106, 22, 55, 65, 31, 12, 33, 83, 66, 46, 50, 97, 108, 32, 63, 27, 58, 73, 35, 38, 30, 56, 68, 42, 8, 70, 62, 49, 26, 52, 114, 67, 43, 11, 18, 7, 80, 64, 75, 28, 47, 10, 121, 71, 48, 61, 116, 39, 59, 6, 104, 44, 94, 110, 78, 107, 69, 60, 57, 86, 74, 90, 96, 120, 93, 99, 76, 109, 115, 92, 89, 98, 41, 88, 119, 100, 82, 84, 123, 118, 85, 103]
Number of features: 1
[0.6847241336169148, 0.6847241336169148, 0.6847241336169148, 0.6847241336169148]
0.6847241336169148
Number of features: 2
[0.6847241336169148, 0.6847241336169148, 0.6847241336169148, 0.6847241336169148]
0.6847241336169148
Number of features: 3
[0.6847241336169148, 0.6847241336169148, 0.6847241336169148, 0.6847241336169148]
0.6847241336169148
Number of features: 4
[0.6847241336169148, 0.6847241336169148, 0.6847241336169148, 0.6170399099211811]
0.6847241336169148
N



[0.6847241336169148, 0.6842236957337671, 0.6550731890404103, 0.6983610659326911]
0.6983610659326911
Number of features: 11
[0.6847241336169148, 0.6839734767921931, 0.6540723132741149, 0.6956086575753785]
0.6956086575753785
Number of features: 12
[0.6847241336169148, 0.6779682221944201, 0.6506943575628675, 0.7001125985237082]
0.7001125985237082
Number of features: 13
[0.6847241336169148, 0.676216689603403, 0.653696984861754, 0.6981108469911173]
0.6981108469911173
Number of features: 14
[0.6847241336169148, 0.6769673464281246, 0.5996496934817965, 0.6988615038158389]
0.6988615038158389
Number of features: 15
[0.6847241336169148, 0.6744651570123859, 0.62116852245715, 0.6894782935068184]
0.6894782935068184
Number of features: 16
[0.6847241336169148, 0.6665832603528087, 0.5922682347053672, 0.6958588765169523]
0.6958588765169523
Number of features: 17
[0.6847241336169148, 0.6662079319404479, 0.625297134993119, 0.700613036406856]
0.700613036406856
Number of features: 18




[0.6847241336169148, 0.6682096834730389, 0.5816339296884774, 0.7011134742900037]
0.7011134742900037
Number of features: 19




[0.6847241336169148, 0.6585762542224446, 0.5616164143625673, 0.6974852996371825]
0.6974852996371825
Number of features: 20
[0.6847241336169148, 0.6695858876516952, 0.5942699862379582, 0.6986112848742649]
0.6986112848742649
Number of features: 21
[0.6847241336169148, 0.6633304141123483, 0.5491054672838734, 0.7003628174652822]
0.7003628174652822
Number of features: 22
[0.6847241336169148, 0.663955961466283, 0.5917677968222195, 0.7001125985237082]
0.7001125985237082
Number of features: 23
[0.6847241336169148, 0.6674590266483172, 0.5726260477918178, 0.7044914300012511]
0.7044914300012511
Number of features: 24
[0.6837232578506193, 0.6738396096584511, 0.5850118853997247, 0.7023645689978731]
0.7023645689978731
Number of features: 25
[0.6837232578506193, 0.6782184411359939, 0.5897660452896284, 0.7023645689978731]
0.7023645689978731
Number of features: 26
[0.6837232578506193, 0.6874765419742275, 0.5913924684098586, 0.7022394595270862]
0.7022394595270862
Number of features: 27
[0.68297260102589