In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier


In [3]:
df=pd.read_csv("../data/GMS_ProjectDataset_V2.csv")
df.head()

df = df.drop("person_id", axis = 1)

In [4]:
columns_with_nan = df.columns[df.isin(['', np.nan]).any()]
columns_with_nan

Index(['Glipizide_Final', 'Glimepiride_Final', 'Glyburide_Final',
       'Metformin_Final', 'Pioglitazone_Final', 'Rosiglitazone_Final',
       'Beta_Blockers_Final', 'ACE_Inhibitors_Final', 'ARB_Final',
       'Diuretics_Final', 'PPI_Final', 'Levothyroxine_Final', 'CCB_Final',
       'Vasodilators_Final', 'Alpha_Blockers_Final',
       'Centrally_Acting_Agents_Final', 'Statins_Final',
       'Anti_Platelets_Final', 'Anti_Coagulants_Final', 'Steroids_Final',
       'Heart_Disease_Final', 'Hypothyroid_Final', 'Anemia_Final',
       'Kidney_Disease_Final', 'GERD_Final', 'Neuropathy_Final',
       'Eye_Disorder_Final', 'Atherosclerosis_Final', 'Alzheimer_Final',
       'FootUlcer_Final', 'Abnormal_Glucose_Final', 'DMScreen_Final',
       'A1C_Final', 'GlucoseTest_Final', 'InsulinTest_Final',
       'Diabetes_Indicator'],
      dtype='object')

In [5]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Initialize the Linear Regression model
model = LinearRegression()

# Define the columns with missing values
columns_with_missing = columns_with_nan[:-1]

# Impute missing values using SimpleImputer
#imputer = SimpleImputer(strategy='KNN') 
imputer = KNNImputer(n_neighbors=5)
#for col in columns_with_missing:
for col in columns_with_missing:
    df[col] = imputer.fit_transform(df[[col]]).round().astype(int)


In [6]:
columns_with_nan = df.columns[df.isin(['', np.nan]).any()]
columns_with_nan

Index(['Diabetes_Indicator'], dtype='object')

In [7]:
def eval(y_true, y_pred, y_pred_proba):
    acc_score = accuracy_score(y_true, y_pred)
    conf_mat = confusion_matrix(y_true,y_pred)

    prec_score = precision_score(y_true, y_pred) 
    rec_score = recall_score(y_true, y_pred)
    F1_score = f1_score(y_true, y_pred)

    roc_auc = roc_auc_score(y_true, y_pred)
    roc_auc_predict_proba = roc_auc_score(y_true, y_pred_proba)

    print(f'Accuracy: {acc_score}')

    print("Precision_score :", prec_score)
    print("recall_score :", rec_score)
    print("f1_score:", F1_score)

    print(f'ROC AUC score: {roc_auc}')
    print(f'ROC AUC PROBA: {roc_auc_predict_proba}')

    print(conf_mat)

    print("==" * 25)

In [8]:
from sklearn.linear_model import LogisticRegression

# Define the column with missing values
column_with_missing = 'Diabetes_Indicator'

# Create a copy of the DataFrame for imputation
df_copy = df.copy()

# Split the data into features (X) and target (y)
X = df_copy.drop(columns=[column_with_missing])  # Features
y = df_copy[column_with_missing]  # Target column

# Identify rows with missing values in the target column
missing_rows = y.isna()

# Initialize the Logistic Regression model
logistic_model = LGBMClassifier(random_state = 42)

# Fit the logistic regression model on non-missing data
logistic_model.fit(X[~missing_rows], y[~missing_rows])
y_pred = logistic_model.predict(X[~missing_rows])
y_pred_proba = logistic_model.predict_proba(X[~missing_rows])[:,1]
eval(y[~missing_rows], y_pred, y_pred_proba)

# Predict and impute missing values for the target column
imputed_values = logistic_model.predict(X[missing_rows])
df.loc[missing_rows, column_with_missing] = imputed_values

# Now, the missing values in the 'Diabetes_Indicator' column have been imputed using logistic regression


[LightGBM] [Info] Number of positive: 771, number of negative: 159
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 134
[LightGBM] [Info] Number of data points in the train set: 930, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.829032 -> initscore=1.578784
[LightGBM] [Info] Start training from score 1.578784
Accuracy: 0.9956989247311828
Precision_score : 0.9948387096774194
recall_score : 1.0
f1_score: 0.9974126778783958
ROC AUC score: 0.9874213836477987
ROC AUC PROBA: 0.9999836853225004
[[155   4]
 [  0 771]]


In [9]:
columns_with_nan = df.columns[df.isin(['', np.nan]).any()]
columns_with_nan

Index([], dtype='object')

In [10]:
print(df['Diabetes_Indicator'].unique())

[0. 1.]


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Split the dataset into features (X) and target (y)
X = df.drop(columns=['Diabetes_Indicator'])
y = df['Diabetes_Indicator']

# Split the dataset into training (2/3) and testing (1/3) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [13]:
X_train.to_csv("../outputs/imputed_X_train.csv", index=False)
X_test.to_csv("../outputs/imputed_X_test.csv", index=False)
y_train.to_csv("../outputs/imputed_y_train.csv", index=False)
y_test.to_csv("../outputs/imputed_y_test.csv", index=False)