In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2
from scipy.stats import beta
import pickle
from scipy.stats import pearsonr
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error
from itertools import combinations
from tabulate import tabulate

In [2]:
# Load the dataset; the functions below read its 'feature{i}',
# 'Final Label' and 'Composite' columns through this module-level frame.
df = pd.read_csv('CSV/glove_data.csv')

In [3]:
def _mean_top_probability(model, X_test):
    """Return the mean, over the test rows, of the largest predicted class probability."""
    return np.mean(np.max(model.predict_proba(X_test), axis=1))


def _mean_abs_decision(model, X_test):
    """Return the mean absolute value of the model's decision_function output."""
    return np.mean(np.abs(model.decision_function(X_test)))


def _fit_and_score(model, X_train, y_train, X_test, y_test, confidence_fn):
    """Fit `model` on the training split and return (test accuracy, confidence)."""
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test))
    return accuracy, confidence_fn(model, X_test)


def ml(X, labels, split_index=8759):
    """Train four classifiers on a positional split and report accuracy and confidence.

    Rows before `split_index` form the training set and the remaining rows the
    test set; the data is not shuffled. The classifiers are, in order:
    logistic regression, an RBF-kernel SVM, an MLP with hidden layers
    (2 * n_features, 6), and a 100-tree random forest, all with random_state=42.

    Parameters
    ----------
    X : sequence of rows (e.g. a 2-D array)
        Feature matrix; must support slicing and `len(X[0])`.
    labels : sequence
        One class label per row of `X`.
    split_index : int, default 8759
        Number of leading rows used for training.

    Returns
    -------
    (confidence, data) : tuple of two lists of length 4
        `data` holds the test accuracies and `confidence` the mean confidence
        of each classifier, both in the order listed above.
        NOTE: the SVM entry of `confidence` is the mean absolute
        decision_function value, not a probability, so it is not on the same
        scale as the other three entries.

    Raises
    ------
    ValueError
        If the split leaves the training or the test set empty.

    Side effects
    ------------
    Prints the list of accuracies.
    """
    labels = np.array(labels)
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = labels[:split_index], labels[split_index:]
    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            f"split_index={split_index} leaves an empty train or test set "
            f"for {len(X)} rows"
        )

    # The first MLP hidden layer is sized relative to the number of input features.
    n_features = len(X_train[0])
    candidates = [
        (LogisticRegression(max_iter=1000, random_state=42), _mean_top_probability),
        (SVC(kernel='rbf', C=1.0, random_state=42), _mean_abs_decision),
        (
            MLPClassifier(hidden_layer_sizes=(2 * n_features, 6), max_iter=500, random_state=42),
            _mean_top_probability,
        ),
        (RandomForestClassifier(n_estimators=100, random_state=42), _mean_top_probability),
    ]

    data = []
    confidence = []
    for model, confidence_fn in candidates:
        accuracy, model_confidence = _fit_and_score(
            model, X_train, y_train, X_test, y_test, confidence_fn
        )
        data.append(accuracy)
        confidence.append(model_confidence)

    print(data)
    return confidence, data

In [4]:
def find_order():
    """Rank the 124 feature columns of `df` and return their indices, best first.

    Only the first 8759 rows of the module-level `df` are used. For each of
    the class labels 0, 1 and 2, every feature's absolute Pearson correlation
    with the 'Composite' column is computed on the rows of that class and
    converted to a rank (1 = weakest). A feature's score is its rank-weighted
    mean correlation multiplied by the number of features and divided by the
    sum of its (signed) Pearson correlations with every feature, itself
    included. Feature indices are returned in descending score order.
    """
    n_rows_used = 8759
    head = df.iloc[:n_rows_used]
    indices = list(range(124))
    f = len(indices)
    class_labels = list(head['Final Label'])
    composite = list(head['Composite'])
    feature_columns = [list(head[f'feature{i}']) for i in indices]

    corr_matrix = np.zeros((f, 3))
    weight_matrix = np.zeros((f, 3))
    for cls in range(3):
        target = [v for v, lab in zip(composite, class_labels) if lab == cls]
        if not target:
            # No rows with this label: its correlation/weight columns stay zero.
            continue
        for feat_pos, column in enumerate(feature_columns):
            within_class = [v for v, lab in zip(column, class_labels) if lab == cls]
            r, _ = pearsonr(within_class, target)
            corr_matrix[feat_pos][cls] = abs(r)
        # Ascending argsort: the weakest correlation gets rank 1, the strongest rank f.
        ascending = np.argsort(list(corr_matrix[:, cls]))
        for rank, feat_pos in enumerate(ascending, start=1):
            weight_matrix[feat_pos][cls] = rank

    # Full feature-by-feature Pearson correlation table (diagonal included).
    correlation = np.zeros((f, f))
    for a, col_a in enumerate(feature_columns):
        for b, col_b in enumerate(feature_columns):
            r, _ = pearsonr(col_a, col_b)
            correlation[a][b] = r

    scores = []
    for class_corrs, class_weights, feature_corrs in zip(corr_matrix, weight_matrix, correlation):
        weighted_total = 0
        weight_total = 0
        for c, w in zip(class_corrs, class_weights):
            weighted_total = weighted_total + (c * w)
            weight_total = weight_total + w
        scores.append((weighted_total * f) / (weight_total * np.sum(feature_corrs)))

    # argsort is ascending, so walk it backwards to list the highest score first.
    by_score = np.argsort(scores)
    return [indices[pos] for pos in by_score[::-1]]
def correlation_algo(correct_order):
    """Evaluate the classifiers on growing prefixes of a ranked feature list.

    For n = 1 .. len(correct_order) - 1, the first n entries of
    `correct_order` are selected, the matching 'feature{k}' columns are taken
    from the module-level `df`, and `ml` is run on them with the
    'Final Label' column as target. The complete feature set is deliberately
    not evaluated here; the caller runs it separately.

    Parameters
    ----------
    correct_order : sequence of int
        Feature indices, most important first.

    Returns
    -------
    (confidence, data) : tuple of two lists
        One entry per prefix length, each being the corresponding list
        returned by `ml`.

    Side effects
    ------------
    Prints the prefix length and the best accuracy for every step (in
    addition to whatever `ml` prints).
    """
    labels = list(df['Final Label'])
    data = []
    confidence = []
    # Bound derived from the input instead of a hard-coded 124, so the loop
    # always stops one short of the full feature set.
    for n_selected in range(1, len(correct_order)):
        refined_set = correct_order[:n_selected]
        print(f'Number of features: {n_selected}')
        columns = [f'feature{k}' for k in refined_set]
        X1 = df[columns].values
        con, info = ml(X1, labels)
        print(max(info))
        data.append(info)
        confidence.append(con)
    return confidence, data

In [None]:
# Table headers, in the order ml() reports its four classifiers.
headers = ['Logistic', 'SVM', 'MLP', 'Random Forest']
indices = list(range(124))
f = len(indices)
labels = list(df['Final Label'])
l = len(labels)

# Rank the features, then evaluate every proper prefix of that ranking.
correct_order = find_order()
print(correct_order)
confidence, data = correlation_algo(correct_order)

# Last row of both tables: all features, in their original column order.
cols = [f'feature{k}' for k in indices]
X = df[cols].values
con, info = ml(X, labels)
data.append(info)
confidence.append(con)

print("Accuracy on augmented data")
print(tabulate(data, headers=headers, tablefmt="grid"))
print("Confidence on augmented data")
print(tabulate(confidence, headers=headers, tablefmt="grid"))

# Persist both tables. The file handles get their own names so that they do
# not overwrite `f` (the feature count defined above).
file_path1 = 'glove_accuracy.pkl'
file_path2 = 'glove_confidence.pkl'
with open(file_path1, 'wb') as accuracy_file:
    pickle.dump(data, accuracy_file)
with open(file_path2, 'wb') as confidence_file:
    pickle.dump(confidence, confidence_file)

[85, 51, 103, 91, 102, 40, 111, 54, 105, 117, 72, 9, 14, 25, 4, 87, 17, 29, 3, 21, 101, 66, 122, 45, 31, 15, 37, 1, 55, 34, 16, 5, 97, 81, 112, 24, 27, 2, 95, 20, 13, 22, 30, 32, 113, 50, 19, 53, 56, 58, 65, 71, 80, 73, 63, 33, 8, 70, 0, 106, 77, 12, 18, 36, 104, 46, 62, 59, 52, 68, 11, 67, 121, 83, 28, 47, 7, 38, 42, 64, 26, 94, 23, 114, 10, 69, 57, 35, 107, 39, 61, 49, 60, 116, 115, 110, 6, 108, 48, 43, 90, 44, 74, 86, 75, 78, 120, 92, 119, 98, 76, 88, 96, 99, 41, 82, 89, 100, 84, 118, 79, 123, 109, 93]
Number of features: 1
[0.5577275503970678, 0.5577275503970678, 0.5577275503970678, 0.5577275503970678]
0.5577275503970678
Number of features: 2
[0.5577275503970678, 0.5577275503970678, 0.5577275503970678, 0.5577275503970678]
0.5577275503970678
Number of features: 3
[0.5577275503970678, 0.5577275503970678, 0.5577275503970678, 0.5733048259010385]
0.5733048259010385
Number of features: 4
[0.5577275503970678, 0.5577275503970678, 0.5829769904296477, 0.5451028303807779]
0.5829769904296477
N



[0.5577275503970678, 0.5893911626959886, 0.5553858684585624, 0.6013031969049073]
0.6013031969049073
Number of features: 13




[0.5577275503970678, 0.5967216452860924, 0.5554876807167582, 0.6017104459376909]
0.6017104459376909
Number of features: 14
[0.5577275503970678, 0.5874567297902668, 0.5373650987578904, 0.6001832620647526]
0.6001832620647526
Number of features: 15




[0.5579311749134596, 0.5950926491549583, 0.5634290368560375, 0.6130116065974344]
0.6130116065974344
Number of features: 16




[0.5579311749134596, 0.5992669517409897, 0.5672979026674811, 0.6131134188556302]
0.6131134188556302
Number of features: 17




[0.5579311749134596, 0.5987578904500102, 0.5519242516799022, 0.609753614335166]
0.609753614335166
Number of features: 18
[0.5579311749134596, 0.5966198330278966, 0.5302382406841784, 0.6069028711056811]
0.6069028711056811
Number of features: 19




[0.5582366116880473, 0.6127061698228466, 0.546731826511912, 0.6239055182243942]
0.6239055182243942
Number of features: 20




[0.5579311749134596, 0.6086336794950112, 0.5537568723274282, 0.6198330278965587]
0.6198330278965587
Number of features: 21
[0.5584402362044391, 0.6058847485237222, 0.5289146813276319, 0.6183058440236204]
0.6183058440236204
Number of features: 22




[0.5619018529830991, 0.6147424149867644, 0.5402158419873753, 0.6236000814498066]
0.6236000814498066
Number of features: 23




[0.5621054774994909, 0.6130116065974344, 0.5304418652005701, 0.6245163917735695]
0.6245163917735695
Number of features: 24




[0.561189167175728, 0.613317043372022, 0.5354306658521686, 0.62054571370393]
0.62054571370393
Number of features: 25
[0.5625127265322745, 0.6192221543473835, 0.5417430258603135, 0.6206475259621258]
0.6206475259621258
Number of features: 26
