# Modeling

In [7]:
#!pip install fuzzywuzzy

In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import pickle
from sklearn.feature_selection import VarianceThreshold
from fuzzywuzzy import process
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, log_loss

import os
# Mount Google Drive (Colab only) so files stored on the drive are reachable from this notebook
from google.colab import drive

drive.mount('/content/drive/')

# Folder on the mounted drive that the cells below read their CSV inputs from
# and write their result files / pickled models to.
root_dir = '/content/drive/MyDrive/DAP/python_scripts'


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [9]:
# Imports for grid-searching the key hyperparameters of logistic regression
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [10]:
def accuracy_measure(y_test, y_pred):
    """Print a summary of classification metrics for a set of predictions.

    Writes, in this order, to stdout: support-weighted recall ("sensitivity"),
    support-weighted precision, support-weighted F1, Cohen's kappa, the
    Matthews correlation coefficient and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned element-wise with ``y_test``.

    Returns
    -------
    None
        The function only prints; it does not return the computed values.
    """
    # average='weighted' combines the per-class scores weighted by class support.
    sensitivity_overall = recall_score(y_test, y_pred, average='weighted')
    print('Sensitivity Overall Data:',sensitivity_overall)

    precision_overall = precision_score(y_test, y_pred, average='weighted')
    print('Precision Overall Data:',precision_overall)

    f1_overall = f1_score(y_test, y_pred, average='weighted')
    print('F1 Score Overall Data:',f1_overall)

    # Cohen's Kappa statistic
    kappa = cohen_kappa_score(y_test, y_pred)
    print("Cohen's Kappa:", kappa)

    # Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_test, y_pred)
    print("Matthews Correlation Coefficient (MCC):", mcc)

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)


### Using various dimensionality reduction techniques

In [11]:
def low_var():
    """Load ``low_variance_features.csv`` and split it into train/test/validation.

    The CSV is read from ``root_dir``, missing values are replaced with 0, and
    the ``disease`` column is separated out as the target. Rows are split 70%
    train / 15% test / 15% validation using a fixed ``random_state`` of 42.

    Returns
    -------
    tuple
        ``(X, y, X_train, y_train, X_test, X_val, y_test, y_val)`` -- note that
        the two feature frames of the hold-out sets come before their targets.
    """
    df_low_variance = pd.read_csv(os.path.join(root_dir,"low_variance_features.csv")).fillna(0)

    X = df_low_variance.drop('disease', axis=1)
    y = df_low_variance['disease']

    # Hold out 30% of the rows, then halve that hold-out into test and validation.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    return X,y,X_train,y_train,X_test,X_val,y_test, y_val

def rand_forest(n_features=100):
    """Load the training data restricted to the listed features and split it.

    Reads ``training_dataset_final.csv`` and ``feature_importance_df.csv`` from
    ``root_dir`` (missing values replaced with 0), keeps the columns named in
    the first ``n_features`` rows of the ``Feature`` column, and uses the
    ``disease`` column as the target. Rows are split 70% train / 15% test /
    15% validation using a fixed ``random_state`` of 42.

    Parameters
    ----------
    n_features : int, default 100
        Number of leading rows of ``feature_importance_df.csv`` to take feature
        names from. NOTE(review): presumably that file is sorted by descending
        importance -- confirm against the notebook that writes it.

    Returns
    -------
    tuple
        ``(X, y, X_train, y_train, X_test, X_val, y_test, y_val)``.
    """
    df_training = pd.read_csv(os.path.join(root_dir,"training_dataset_final.csv")).fillna(0)
    df_feature = pd.read_csv(os.path.join(root_dir,"feature_importance_df.csv")).fillna(0)

    feature_list = df_feature['Feature'].head(n_features).tolist()

    # .assign builds a new frame; assigning a column onto the result of
    # df_training[feature_list] raises SettingWithCopyWarning on pandas < 3.
    df_training_important_Features = df_training[feature_list].assign(
        disease=df_training['disease']
    )
    df_training_important_Features.info()

    X = df_training_important_Features.drop(columns=['disease'])
    y = df_training_important_Features['disease']

    # Hold out 30% of the rows, then halve that hold-out into test and validation.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    return X,y,X_train,y_train,X_test,X_val,y_test, y_val



# Logistic Regression

In [12]:
def logistic_regression(X,y,X_train,y_train,X_test,X_val,y_test, y_val,pickle_file_name,var):
    """Fit, tune, evaluate and pickle a logistic-regression classifier.

    Steps:
      1. Fit a baseline (standardised features, ``max_iter=100``) on the
         training split and print its weighted F1 on the test split.
      2. Grid-search ``C``, ``solver`` and ``max_iter`` (L2 penalty) with
         3-fold cross-validation on the training split and write the full
         CV table to ``LR_hyperparamters_results_<var>.csv`` in ``root_dir``.
      3. Print the tuned model's metrics on the test and validation splits.
      4. Pickle the tuned model to ``pickle_file_name`` in ``root_dir``.

    Standardisation is part of the estimator (a ``Pipeline``), so the grid
    search and the final model see the same scaled features as the baseline,
    and the scaler is re-fitted inside every CV fold. As a consequence the
    pickled object is a ``Pipeline`` that accepts raw (unscaled) features, and
    the hyperparameter names in the CSV carry a ``classifier__`` prefix.

    Parameters
    ----------
    X, y : DataFrame, Series
        Complete feature table and target; only used for the whole-dataset
        accuracy printout.
    X_train, y_train : training split used for fitting and cross-validation.
    X_test, y_test : test split used for the reported test metrics.
    X_val, y_val : validation split used for the final validation metrics.
    pickle_file_name : str
        File name (relative to ``root_dir``) for the pickled model.
    var : str
        Suffix inserted into the CV-results CSV file name.

    Returns
    -------
    numpy.ndarray
        Predictions of the tuned model for ``X_test``.
    """
    # Imported locally: the notebook's import cells do not provide Pipeline.
    from sklearn.pipeline import Pipeline

    # --- Baseline: scaled features, default hyperparameters -----------------
    baseline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=100)),
    ])
    baseline.fit(X_train, y_train)
    y_pred = baseline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("F1 Score:", f1)

    # --- Hyperparameter search ----------------------------------------------
    logModel = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression()),
    ])
    param_grid = [
        {'classifier__penalty' : ['l2'],
        'classifier__C' : np.logspace(-4, 4, 20),
        'classifier__solver': ['lbfgs', 'liblinear'],
        'classifier__max_iter' : [20,50, 100,1000]
        }
    ]
    clf = GridSearchCV(logModel, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)
    # Warnings are not exceptions, so they cannot be caught with try/except;
    # genuine fitting errors are left to propagate instead of leaving
    # best_clf undefined for the code below.
    best_clf = clf.fit(X_train, y_train)

    cv_results = pd.DataFrame(best_clf.cv_results_)
    cv_results.to_csv(os.path.join(root_dir,'LR_hyperparamters_results_'+var+'.csv'))
    # Scored on the complete dataset (train + test + validation), so this
    # figure is optimistic; the per-split metrics below are the honest ones.
    print (f'Accuracy - : {best_clf.score(X,y):.3f}')
    print(best_clf)
    print("Best hyperparameters:", best_clf.best_params_)
    print("Best model:", best_clf.best_estimator_)

    # refit=True (the GridSearchCV default) has already retrained the winning
    # configuration on the whole training split, so no second fit is needed.
    model = best_clf.best_estimator_

    # --- Test-set evaluation ------------------------------------------------
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    accuracy_measure(y_test, y_pred)

    ### Validating the model
    print( "####### Validating the model" )
    predicted_class = model.predict(X_val)
    print("Predicted class:", predicted_class)

    accuracy = accuracy_score(y_val, predicted_class)
    print("Accuracy on the validation set:", accuracy)
    accuracy_measure(y_val, predicted_class)

    ### Saving the model with best parameters logistic regression
    print( "####### Saving the model with best parameters logistic regression" )
    # Save the model to a file using pickle
    with open(os.path.join(root_dir,pickle_file_name), 'wb') as file:
        pickle.dump(model, file)
    return y_pred


In [13]:
# Load the low-variance dataset and its train/test/validation split into the notebook namespace.
(
    X, y,
    X_train, y_train,
    X_test, X_val,
    y_test, y_val,
) = low_var()

In [14]:
# Record the feature names used for the low-variance model, one per line.
file_name = os.path.join(root_dir,'low_var_features_lr.txt')
with open(file_name, 'w') as fp:
    # Iterating a DataFrame yields its column labels, not its rows.
    fp.writelines("%s\n" % column_name for column_name in X_train)
    print('Done')

Done


In [15]:
# Keep the low-variance test labels under their own name; y_test is rebound by a later split.
y_test_lv = y_test

In [None]:
# Tune, evaluate and pickle the logistic regression on the low-variance features;
# the return value is the tuned model's prediction for X_test.
y_pred_lv = logistic_regression(
    X, y,
    X_train, y_train,
    X_test, X_val,
    y_test, y_val,
    'logistic_regression_model_lv.pkl',
    'lv',
)

F1 Score: 0.8932979676388365
Fitting 3 folds for each of 160 candidates, totalling 480 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Rebind the split variables to the dataset restricted to the features listed in feature_importance_df.csv.
(
    X, y,
    X_train, y_train,
    X_test, X_val,
    y_test, y_val,
) = rand_forest()

In [None]:
# Record the feature names used for this model, one per line.
file_name = os.path.join(root_dir,'rand_for_features_lr.txt')
with open(file_name, 'w') as fp:
    # Iterating a DataFrame yields its column labels, not its rows.
    fp.writelines("%s\n" % column_name for column_name in X_train)
    print('Done')

In [None]:
# Keep this split's test labels under their own name, mirroring y_test_lv.
y_test_rf = y_test

In [None]:
# Tune, evaluate and pickle the logistic regression on the current (rand_forest) split;
# the return value is the tuned model's prediction for X_test.
y_pred_rf = logistic_regression(
    X, y,
    X_train, y_train,
    X_test, X_val,
    y_test, y_val,
    'logistic_regression_model_rd.pkl',
    'rf',
)