In [9]:
import autosklearn.classification
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, LeaveOneOut, cross_val_score, KFold, cross_val_predict, StratifiedKFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from scipy import interp
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import roc_curve, auc, accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.utils import resample
import matplotlib.pyplot as plt
from scipy import interp
import sklearn.datasets
import sklearn.metrics
from autosklearn.metrics import accuracy, f1_macro, roc_auc, f1
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE, SelectKBest, SelectPercentile, f_regression
import warnings
import seaborn as sns
from scipy.stats import chi2
from yellowbrick.features import RFECV
warnings.filterwarnings('ignore')
%matplotlib inline

## Spearman

In [10]:
#Removing correlated features (spearman)

df = pd.read_csv("SCI.csv")
df.set_index('StudyID', inplace=True)
y = df.FU_SA
X = df.drop('FU_SA', axis=1)

#plt.figure(figsize=(40,40))
#sns.heatmap(X.corr(method='spearman'), annot=True, cmap='coolwarm')
#plt.tight_layout()

def spearcorr(dataset, threshold):
    corr_matrix = dataset.corr(method='pearson').abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return(to_drop)
    #print(dataset)

cols = spearcorr(X, 0.75)
#X = X.drop(cols, axis = 1)

['STS13', 'STS23', 'STS34', 'STS40', 'STS44', 'STS47']

# Logistic Regression + SMOTE

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y)
os = SMOTE()
X_train_os, y_train_os = os.fit_sample(X_train, y_train)
logreg = LogisticRegression()
logreg.fit(X_train_os, y_train_os)
lr_preds_os = logreg.predict(X_test)

In [12]:
print(classification_report(y_test, lr_preds_os))

              precision    recall  f1-score   support

           0       0.97      0.90      0.93       172
           1       0.05      0.17      0.08         6

   micro avg       0.87      0.87      0.87       178
   macro avg       0.51      0.53      0.51       178
weighted avg       0.94      0.87      0.90       178



In [13]:
probs = logreg.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.7558139534883721

## Logistic Regression + SMOTE + Removing correlated features

In [14]:
print(X_train.shape)
correlated_features = set()  
correlation_matrix = df.corr()  

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.75:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
            
X_train.drop(labels=correlated_features, axis=1, inplace=True)  
X_test.drop(labels=correlated_features, axis=1, inplace=True)  
print(correlated_features)

(413, 49)
{'STS40', 'STS44', 'STS23', 'STS34', 'STS47', 'STS13'}


In [326]:
X_train_os, y_train_os = os.fit_sample(X_train, y_train)
logreg = LogisticRegression()
logreg.fit(X_train_os, y_train_os)
lr_preds_os = logreg.predict(X_test)
print(classification_report(y_test, lr_preds_os))

              precision    recall  f1-score   support

           0       0.97      0.85      0.91       172
           1       0.07      0.33      0.12         6

   micro avg       0.84      0.84      0.84       178
   macro avg       0.52      0.59      0.52       178
weighted avg       0.94      0.84      0.88       178



In [330]:
probs = logreg.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.8110465116279071