In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2
from scipy.stats import beta
import pickle
from scipy.stats import pearsonr
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error
from itertools import combinations
from tabulate import tabulate

In [2]:
# Load the dataset used by every later cell. The functions below read the
# columns 'Final Label', 'Composite' and 'feature0' .. 'feature123' from it.
# NOTE(review): the path is relative to the kernel's working directory.
df=pd.read_csv('CSV/use_data.csv')

In [3]:
def ml(X,labels,split_index=17258):
    """Train four classifiers on a positional split and report accuracy/confidence.

    The first ``split_index`` rows of ``X``/``labels`` are the training set and
    the remaining rows are the test set (no shuffling). Logistic regression,
    an RBF SVM, an MLP and a random forest are fitted in that order.

    Parameters
    ----------
    X : sequence of rows (e.g. 2-D numpy array)
        Feature matrix; must support slicing and ``len(X[0])``.
    labels : sequence
        One class label per row of ``X``.
    split_index : int, default 17258
        Number of leading rows used for training. The default reproduces the
        boundary that was previously hard-coded in this function.

    Returns
    -------
    (confidence, data) : tuple of two lists, each ordered
        ``[logistic, svm, mlp, random_forest]``.
        ``data`` holds test accuracies. ``confidence`` holds the mean of the
        per-sample maximum class probability for the probabilistic models.

    Raises
    ------
    ValueError
        If the split leaves the training or the test part empty.

    Side effects
    ------------
    Prints the list of accuracies.
    """
    labels=np.array(labels)
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = labels[:split_index], labels[split_index:]
    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            f"split_index={split_index} leaves an empty train or test set "
            f"for {len(X)} rows"
        )

    def _probability_confidence(model):
        # Mean over test rows of the highest predicted class probability.
        return np.mean(np.max(model.predict_proba(X_test), axis=1))

    def _margin_confidence(model):
        # Mean absolute decision-function value.
        # NOTE(review): this is a margin magnitude, not a probability, so it is
        # not on the same scale as the other three confidence values.
        return np.mean(np.abs(model.decision_function(X_test)))

    # First hidden layer is twice as wide as the number of input features.
    h1=len(X_train[0])
    # Order matters: callers label the results Logistic, SVM, MLP, Random Forest.
    candidates=[
        (LogisticRegression(max_iter=1000, random_state=42), _probability_confidence),
        (SVC(kernel='rbf', C=1.0, random_state=42), _margin_confidence),
        (MLPClassifier(hidden_layer_sizes=(2*h1,6), max_iter=500, random_state=42), _probability_confidence),
        (RandomForestClassifier(n_estimators=100, random_state=42), _probability_confidence),
    ]

    data=[]
    confidence=[]
    for model, confidence_of in candidates:
        model.fit(X_train, y_train)
        data.append(accuracy_score(y_test, model.predict(X_test)))
        confidence.append(confidence_of(model))
    print(data)
    return confidence,data

In [4]:
def find_order(data=None, truncate_index=17258, n_features=124, n_classes=3):
    """Rank features from most to least useful by a relevance/redundancy score.

    Only the first ``truncate_index`` rows are used (the default equals the
    default training split of ``ml``).

    For every class label ``0 .. n_classes-1`` the absolute Pearson
    correlation between each feature and the 'Composite' column is computed on
    the rows of that class (relevance). Within a class the features are ranked
    by relevance (1 = weakest) and that rank is used as a weight. The score of
    feature ``i`` is::

        (sum_c relevance[i, c] * weight[i, c]) * n_features
        ----------------------------------------------------
        (sum_c weight[i, c]) * (sum_j corr(feature_i, feature_j))

    Constant features have an undefined Pearson correlation. They are given
    relevance 0, their undefined pairwise correlations are ignored in the
    denominator, and any feature whose score is still undefined is placed at
    the end of the ranking.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with columns 'Final Label', 'Composite' and
        'feature0' .. 'feature{n_features-1}'. Defaults to the notebook-level
        ``df``.
    truncate_index : int, default 17258
        Number of leading rows to use.
    n_features : int, default 124
        Number of ``feature{i}`` columns to rank.
    n_classes : int, default 3
        Labels ``0 .. n_classes-1`` are considered.

    Returns
    -------
    list of int
        Feature indices, best score first.
    """
    if data is None:
        data = df  # notebook-level frame
    truncated_df = data.iloc[:truncate_index]
    indices = list(range(n_features))
    f = len(indices)
    y = list(truncated_df['Final Label'])
    values = truncated_df['Composite'].to_numpy(dtype=float)
    # Rows are samples, columns are features.
    X = truncated_df[[f'feature{i}' for i in indices]].to_numpy(dtype=float)

    def _abs_pearson(a, b):
        # Pearson r is undefined when either input is constant; treat that as
        # "no linear relationship" instead of letting NaN leak into the ranks.
        if np.ptp(a) == 0 or np.ptp(b) == 0:
            return 0.0
        corr, _ = pearsonr(a, b)
        return 0.0 if np.isnan(corr) else abs(corr)

    corr_matrix = np.zeros((f, n_classes))
    weight_matrix = np.zeros((f, n_classes))
    for i in range(n_classes):
        mask = np.array([label == i for label in y], dtype=bool)
        # A correlation needs at least two samples; skip absent/singleton classes.
        if mask.sum() < 2:
            continue
        c = values[mask]
        for j in range(f):
            corr_matrix[j, i] = _abs_pearson(X[mask, j], c)
        # Weight = rank of the feature inside this class, 1 for the weakest.
        for rank, idx in enumerate(np.argsort(corr_matrix[:, i]), start=1):
            weight_matrix[idx, i] = rank

    # Feature-feature correlation in one call; constant columns yield NaN rows
    # and columns, which nansum then ignores.
    with np.errstate(invalid='ignore', divide='ignore'):
        correlation = np.atleast_2d(np.corrcoef(X, rowvar=False))
    # NOTE(review): this is a signed sum, so negative correlations cancel
    # positive ones and can make the denominator small or negative. If the
    # intent is "redundancy", an absolute sum may be what was meant - confirm.
    redundancy = np.nansum(correlation, axis=1)

    corr_sum = (corr_matrix * weight_matrix).sum(axis=1)
    weight_sum = weight_matrix.sum(axis=1)
    with np.errstate(invalid='ignore', divide='ignore'):
        scores = (corr_sum * f) / (weight_sum * redundancy)
    # argsort puts NaN last, which the descending read-out below would turn
    # into "best feature"; push undefined scores to the bottom instead.
    scores = np.where(np.isnan(scores), -np.inf, scores)

    rank_features = np.argsort(scores)[::-1]
    return [indices[k] for k in rank_features]
def correlation_algo(correct_order):
    """Evaluate the classifiers on growing prefixes of a feature ranking.

    For ``i = 1 .. len(correct_order) - 1`` the first ``i`` feature indices of
    ``correct_order`` are selected from the notebook-level ``df`` (columns
    ``feature{k}``) and passed to ``ml`` together with the 'Final Label'
    column. The complete feature set is intentionally not evaluated here.

    Parameters
    ----------
    correct_order : sequence of int
        Feature indices, most important first.

    Returns
    -------
    (confidence, data) : tuple of two lists
        One entry per prefix size, each entry being the corresponding list
        returned by ``ml``.

    Side effects
    ------------
    Prints the prefix size and the best accuracy for every step (``ml`` prints
    the full accuracy list as well).
    """
    labels=list(df['Final Label'])
    data=[]
    confidence=[]
    # Bound derived from the ranking itself; for 124 features this is 1..123.
    for i in range(1,len(correct_order)):
        refined_set=correct_order[:i]
        print(f'Number of features: {i}')
        columns=[f'feature{k}' for k in refined_set]
        X1=df[columns].values
        con,info=ml(X1,labels)
        print(max(info))
        data.append(info)
        confidence.append(con)
    return confidence,data

In [None]:
# Driver cell: rank the features, evaluate every prefix of the ranking, add the
# full feature set as the final row, print both tables and pickle the results.

# Column labels for the tables; same order as the lists returned by ml().
headers=['Logistic','SVM','MLP','Random Forest']
indices=[i for i in range(124)]
# NOTE(review): `f` and `l` are not used again in this cell; `f` is rebound to
# a file handle by the `with` statements at the bottom.
f=len(indices)
labels=list(df['Final Label'])
l=len(labels)
correct_order=find_order()
print(correct_order)
# One row per prefix size 1..123 of the ranking.
confidence,data=correlation_algo(correct_order)
# Final row: all 124 features in their original column order.
cols=[f'feature{k}' for k in indices]
X=df[cols].values
con,info=ml(X,labels)
data.append(info)
confidence.append(con)
print("Accuracy on augmented data")
print(tabulate(data, headers=headers, tablefmt="grid"))
print("Confidence on augmented data")
print(tabulate(confidence, headers=headers, tablefmt="grid"))
# Persist both result tables (lists of 4-element lists) for later use.
file_path1='use_accuracy.pkl'
file_path2='use_confidence.pkl'
with open(file_path1,'wb') as f:
    pickle.dump(data,f)
with open(file_path2,'wb') as f:
    pickle.dump(confidence,f)

[105, 102, 51, 91, 40, 111, 54, 117, 122, 72, 14, 25, 1, 45, 5, 21, 29, 101, 9, 87, 2, 17, 4, 19, 15, 95, 13, 3, 77, 79, 16, 112, 36, 81, 0, 34, 20, 24, 23, 37, 53, 65, 113, 22, 106, 31, 55, 12, 66, 33, 83, 46, 50, 108, 97, 63, 32, 27, 73, 58, 68, 35, 56, 30, 42, 8, 62, 38, 70, 49, 114, 52, 26, 11, 43, 67, 18, 80, 7, 64, 75, 28, 47, 10, 121, 48, 71, 61, 59, 116, 39, 6, 104, 110, 44, 69, 94, 78, 57, 86, 60, 107, 74, 90, 96, 120, 99, 109, 76, 98, 93, 115, 41, 92, 89, 88, 119, 82, 100, 84, 123, 118, 85, 103]
Number of features: 1
[0.6835280523815638, 0.6835280523815638, 0.6835280523815638, 0.6835280523815638]
0.6835280523815638
Number of features: 2
[0.6835280523815638, 0.6835280523815638, 0.6835280523815638, 0.6835280523815638]
0.6835280523815638
Number of features: 3
