In [51]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix


In [52]:
df_bd = pd.read_csv('DiaBD_A Diabetes Dataset for Enhanced Risk Analysis and Research in Bangladesh.csv')
df_iq = pd.read_csv('Dataset of Diabetes .csv')

In [53]:
df_iq['source_country'] = 'Iraq'
df_bd['source_country'] = 'Bangladesh'

In [54]:
df_iq.rename(columns={
    'AGE': 'age',
    'Gender': 'gender',
    'BMI': 'bmi',
    'HbA1c': 'hba1c',
    'Chol': 'cholesterol',
    'CLASS': 'diabetic'
},inplace=True)

In [55]:
iraq_cols = set(df_iq.columns)
bd_cols = set(df_bd.columns)
missing_in_bd = iraq_cols - bd_cols
missing_in_iraq = bd_cols - iraq_cols

In [56]:
for col in missing_in_bd:
    df_bd[col] = None

In [57]:
for col in missing_in_iraq:
    df_iq[col] = None

In [58]:
df_bd = df_bd[df_iq.columns]

In [59]:
merged_df = pd.concat([df_iq, df_bd], ignore_index=True)

In [60]:
merged_df.head()

Unnamed: 0,ID,No_Pation,gender,age,Urea,Cr,hba1c,cholesterol,TG,HDL,...,family_hypertension,glucose,cardiovascular_disease,systolic_bp,family_diabetes,hypertensive,pulse_rate,stroke,height,weight
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,...,,,,,,,,,,
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,...,,,,,,,,,,
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,...,,,,,,,,,,
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,...,,,,,,,,,,
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,...,,,,,,,,,,


In [61]:
print("Column names:", merged_df.columns.tolist())


Column names: ['ID', 'No_Pation', 'gender', 'age', 'Urea', 'Cr', 'hba1c', 'cholesterol', 'TG', 'HDL', 'LDL', 'VLDL', 'bmi', 'diabetic', 'source_country', 'diastolic_bp', 'family_hypertension', 'glucose', 'cardiovascular_disease', 'systolic_bp', 'family_diabetes', 'hypertensive', 'pulse_rate', 'stroke', 'height', 'weight']


In [62]:
missing_summary = merged_df.isnull().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)
print("Missing values per column:")
print(missing_summary)


Missing values per column:
ID                        5288
VLDL                      5288
No_Pation                 5288
Urea                      5288
Cr                        5288
hba1c                     5288
cholesterol               5288
TG                        5288
HDL                       5288
LDL                       5288
weight                    1000
height                    1000
family_hypertension       1000
glucose                   1000
cardiovascular_disease    1000
systolic_bp               1000
family_diabetes           1000
hypertensive              1000
pulse_rate                1000
stroke                    1000
diastolic_bp              1000
dtype: int64


In [28]:
# fig, ax = plt.subplots(figsize=(10, 8))
# numeric_df = merged_df.select_dtypes(include='number')
# corr_matrix = numeric_df.corr()

# sns.heatmap(corr_matrix, ax=ax, annot=True, cmap='coolwarm', fmt=".2f", square=True)

# ax.set_title("Feature Correlation Heatmap", fontsize=14,)
# plt.tight_layout()

# plt.show()


In [63]:
merged_df.drop(columns=[
    'ID', 'No_Pation', 'Urea', 'Cr', 'VLDL', 'TG', 'HDL', 'LDL',
    'cholesterol', 'height', 'weight', 'pulse_rate',
    'family_diabetes', 'family_hypertension'
], inplace=True)


In [64]:
merged_df.columns

Index(['gender', 'age', 'hba1c', 'bmi', 'diabetic', 'source_country',
       'diastolic_bp', 'glucose', 'cardiovascular_disease', 'systolic_bp',
       'hypertensive', 'stroke'],
      dtype='object')

In [65]:
merged_df.dtypes

gender                     object
age                         int64
hba1c                      object
bmi                       float64
diabetic                   object
source_country             object
diastolic_bp               object
glucose                    object
cardiovascular_disease     object
systolic_bp                object
hypertensive               object
stroke                     object
dtype: object

In [66]:
merged_df['gender'] = merged_df['gender'].str.strip().str.upper()
merged_df['diabetic'] = merged_df['diabetic'].str.strip().str.title()

In [67]:
print("Gender values:", merged_df['gender'].unique())
print("Diabetic values:", merged_df['diabetic'].unique())


Gender values: ['F' 'M' 'FEMALE' 'MALE']
Diabetic values: ['N' 'P' 'Y' 'No' 'Yes']


In [68]:

merged_df['glucose'] = pd.to_numeric(merged_df['glucose'], errors='coerce')
merged_df['hba1c'] = pd.to_numeric(merged_df['hba1c'], errors='coerce')
merged_df['diastolic_bp'] = pd.to_numeric(merged_df['diastolic_bp'], errors='coerce')
merged_df['systolic_bp'] = pd.to_numeric(merged_df['systolic_bp'], errors='coerce')
merged_df['cardiovascular_disease'] = pd.to_numeric(merged_df['cardiovascular_disease'], errors='coerce')
merged_df['hypertensive'] = pd.to_numeric(merged_df['hypertensive'], errors='coerce')
merged_df['stroke'] = pd.to_numeric(merged_df['stroke'], errors='coerce')
merged_df['gender'] = merged_df['gender'].replace({'FEMALE': 'F', 'MALE': 'M'})
merged_df['diabetic'] = merged_df['diabetic'].str.strip().str.upper()
merged_df['diabetic'] = merged_df['diabetic'].replace({'YES': 1, 'NO': 0, 'Y': 1, 'N': 0, 'P': 1})

# merged_df['gender'] = merged_df['gender'].map({'M': 1, 'F': 0})  # binary encoding
# merged_df['diabetic'] = merged_df['diabetic'].map({'Yes': 1, 'No': 0})  # target encoding


In [69]:
merged_df.dtypes

gender                     object
age                         int64
hba1c                     float64
bmi                       float64
diabetic                    int64
source_country             object
diastolic_bp              float64
glucose                   float64
cardiovascular_disease    float64
systolic_bp               float64
hypertensive              float64
stroke                    float64
dtype: object

In [39]:
# merged_df['gender'] = merged_df['gender'].astype(int)
# merged_df['diabetic'] = merged_df['diabetic'].astype(int)
# merged_df['cardiovascular_disease'] = merged_df['cardiovascular_disease'].astype(int)
# merged_df['hypertensive'] = merged_df['hypertensive'].astype(int)
# merged_df['stroke'] = merged_df['stroke'].astype(int)
# merged_df['diastolic_bp'] = merged_df['diastolic_bp'].astype(int)
# merged_df['systolic_bp'] = merged_df['stroke'].astype(int)



In [70]:
from sklearn.impute import SimpleImputer

# Define numeric columns with missing values
numeric_cols = ['glucose', 'hba1c', 'diastolic_bp', 'systolic_bp']

# Initialize and apply median imputer
num_imputer = SimpleImputer(strategy='median')
merged_df[numeric_cols] = num_imputer.fit_transform(merged_df[numeric_cols])


In [47]:
# merged_df[binary_cols] = merged_df[binary_cols].replace({None: np.nan})

In [71]:
binary_cols = ['hypertensive', 'stroke', 'cardiovascular_disease']

# Initialize and apply mode imputer
bin_imputer = SimpleImputer(strategy='most_frequent')
merged_df[binary_cols] = bin_imputer.fit_transform(merged_df[binary_cols])


In [43]:
# Check for any remaining missing values
missing_check = merged_df[['glucose', 'hba1c', 'diastolic_bp', 'systolic_bp',
                           'hypertensive', 'stroke', 'cardiovascular_disease']].isnull().sum()

print("Missing values after imputation:")
print(missing_check)


Missing values after imputation:
glucose                   0
hba1c                     0
diastolic_bp              0
systolic_bp               0
hypertensive              0
stroke                    0
cardiovascular_disease    0
dtype: int64


In [72]:
print("Numeric column summaries:")
print(merged_df[['glucose', 'hba1c', 'diastolic_bp', 'systolic_bp']].describe())


Numeric column summaries:
           glucose        hba1c  diastolic_bp  systolic_bp
count  6288.000000  6288.000000   6288.000000  6288.000000
mean      7.463108     8.044714     82.034033   133.361005
std       2.710018     1.015329     11.452440    20.439487
min       0.000000     0.900000     45.000000    62.000000
25%       6.140000     8.000000     75.000000   121.000000
50%       6.930000     8.000000     81.000000   130.000000
75%       7.880000     8.000000     88.000000   143.000000
max      33.460000    16.000000    119.000000   231.000000


In [73]:
median_glucose = merged_df['glucose'].median()
merged_df.loc[merged_df['glucose'] == 0, 'glucose'] = median_glucose


In [74]:
merged_df.dtypes # checking datatypes

gender                     object
age                         int64
hba1c                     float64
bmi                       float64
diabetic                    int64
source_country             object
diastolic_bp              float64
glucose                   float64
cardiovascular_disease    float64
systolic_bp               float64
hypertensive              float64
stroke                    float64
dtype: object

In [75]:
binary_cols = ['cardiovascular_disease', 'stroke', 'hypertensive']
merged_df[binary_cols] = merged_df[binary_cols].astype(int)


In [76]:
merged_df.dtypes # checking after converting

gender                     object
age                         int64
hba1c                     float64
bmi                       float64
diabetic                    int64
source_country             object
diastolic_bp              float64
glucose                   float64
cardiovascular_disease      int32
systolic_bp               float64
hypertensive                int32
stroke                      int32
dtype: object

In [77]:
print("Gender values:", merged_df['gender'].unique())
print("Source country values:", merged_df['source_country'].unique())
print("Diabetic values:", merged_df['diabetic'].unique())


Gender values: ['F' 'M']
Source country values: ['Iraq' 'Bangladesh']
Diabetic values: [0 1]


In [78]:
merged_df['gender'] = (
    merged_df['gender'].astype(str).str.strip().str.upper().map({'M': 1, 'MALE': 1, 'F': 0, 'FEMALE': 0})
)



In [77]:
# merged_df['diabetic'] = merged_df['diabetic'].str.strip().str.upper().map({
#     'Y': 1, 'YES': 1,
#     'N': 0, 'NO': 0
# })




In [79]:
print("Gender values:", merged_df['gender'].unique())
print("Source country values:", merged_df['source_country'].unique())
print("Diabetic values:", merged_df['diabetic'].unique())

Gender values: [0 1]
Source country values: ['Iraq' 'Bangladesh']
Diabetic values: [0 1]


In [80]:
merged_df = merged_df.drop(columns=['source_country'])


In [81]:
merged_df.head()

Unnamed: 0,gender,age,hba1c,bmi,diabetic,diastolic_bp,glucose,cardiovascular_disease,systolic_bp,hypertensive,stroke
0,0,50,4.9,24.0,0,81.0,6.93,0,130.0,0,0
1,1,26,4.9,23.0,0,81.0,6.93,0,130.0,0,0
2,0,50,4.9,24.0,0,81.0,6.93,0,130.0,0,0
3,0,50,4.9,24.0,0,81.0,6.93,0,130.0,0,0
4,1,33,4.9,21.0,0,81.0,6.93,0,130.0,0,0


In [82]:
X = merged_df.drop(columns=['diabetic'])  # Features
y = merged_df['diabetic']                # Target

In [83]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: diabetic, dtype: int64

In [84]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [85]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1258 entries, 1017 to 2403
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  1258 non-null   int64  
 1   age                     1258 non-null   int64  
 2   hba1c                   1258 non-null   float64
 3   bmi                     1258 non-null   float64
 4   diastolic_bp            1258 non-null   float64
 5   glucose                 1258 non-null   float64
 6   cardiovascular_disease  1258 non-null   int32  
 7   systolic_bp             1258 non-null   float64
 8   hypertensive            1258 non-null   int32  
 9   stroke                  1258 non-null   int32  
dtypes: float64(5), int32(3), int64(2)
memory usage: 93.4 KB


In [86]:
continuous_cols = ['age', 'glucose', 'hba1c', 'bmi', 'diastolic_bp', 'systolic_bp']


In [87]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[continuous_cols] = scaler.fit_transform(X_train[continuous_cols])
X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[continuous_cols] = scaler.fit_transform(X_train[continuous_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])
A value is trying to be set 

In [88]:
X_train[continuous_cols].describe()


Unnamed: 0,age,glucose,hba1c,bmi,diastolic_bp,systolic_bp
count,5030.0,5030.0,5030.0,5030.0,5030.0,5030.0
mean,-2.3579e-16,-2.768936e-16,-5.410213e-16,1.692925e-16,1.159885e-16,-4.836754e-16
std,1.000099,1.000099,1.000099,1.000099,1.000099,1.000099
min,-2.045054,-2.704923,-7.007919,-2.374462,-3.232432,-2.657644
25%,-0.9051948,-0.4881371,-0.03812669,-0.3658751,-0.6135956,-0.6023312
50%,0.006692613,-0.2010452,-0.03812669,-0.119839,-0.08982834,-0.1619072
75%,0.6906082,0.1587281,-0.03812669,0.253457,0.5212335,0.4742609
max,2.514383,9.440155,7.81516,58.38265,3.227364,4.78063


In [89]:
model = LogisticRegression()
model.fit(X_train, y_train)


LogisticRegression()

In [90]:
y_pred = model.predict(X_test)


In [91]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8831478537360891
Precision: 0.8258064516129032
Recall: 0.5161290322580645
F1 Score: 0.6352357320099256
Confusion Matrix:
 [[983  27]
 [120 128]]


In [92]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)


RandomForestClassifier(random_state=42)

In [93]:
y_rf_pred = rf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_rf_pred))
print("Precision:", precision_score(y_test, y_rf_pred))
print("Recall:", recall_score(y_test, y_rf_pred))
print("F1 Score:", f1_score(y_test, y_rf_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_rf_pred))


Accuracy: 0.958664546899841
Precision: 0.9711538461538461
Recall: 0.8145161290322581
F1 Score: 0.8859649122807017
Confusion Matrix:
 [[1004    6]
 [  46  202]]
