In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the 'marshalpatel3558/diabetes-prediction-dataset' data source.
# The return value is presumably the local directory holding the downloaded
# files (as the variable name suggests) — confirm against the kagglehub docs.
marshalpatel3558_diabetes_prediction_dataset_path = kagglehub.dataset_download('marshalpatel3558/diabetes-prediction-dataset')

# Simple progress marker so the reader can see the download step finished.
print('Data source import complete.')


# **About Dataset**

The provided dataset, `diabetes_dataset.csv`, contains information related to diabetes risk factors and associated health metrics. Below is a detailed description of the dataset.

In [None]:
import requests

# Dataset card image that is displayed in the next cell.
url = "https://storage.googleapis.com/kaggle-datasets-images/3102947/5344155/d4f2d9d63736fff7b6ba10f73774752e/dataset-card.png?t=2023-04-08-06-42-24"
filename = "my_image.png"

# Bound the request so a stalled connection cannot hang the notebook, and
# raise on HTTP errors instead of saving an error page under a .png name.
response = requests.get(url, timeout=30)
response.raise_for_status()

with open(filename, 'wb') as f:
    f.write(response.content)


In [None]:
from IPython.display import Image, display

# Render the image file saved by the previous cell inline in the notebook.
dataset_card = Image(filename="my_image.png")
display(dataset_card)


# **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score ,classification_report



# **Load dataset**

In [None]:
# On Kaggle the dataset is available under /kaggle/input. In other
# environments that path does not exist, so fall back to the location
# returned by the kagglehub download cell at the top of the notebook.
try:
    df = pd.read_csv('/kaggle/input/diabetes-prediction-dataset/diabetes_dataset.csv')
except FileNotFoundError:
    df = pd.read_csv(f"{marshalpatel3558_diabetes_prediction_dataset_path}/diabetes_dataset.csv")

In [None]:
# Peek at the first five rows to check that the file parsed as expected.
df.head(n=5)

# **EDA**

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) using the
# default column selection of DataFrame.describe.
df.describe()

In [None]:
# Summary of the object-dtype (text) columns: count, unique, top, freq.
df.describe(include=["object"])

In [None]:
# Column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrence kept).
df.duplicated(keep='first').sum()

In [None]:
# Remove the Alcohol_Consumption column from the working frame.
df = df.drop(columns=['Alcohol_Consumption'])

In [None]:
# 'Unnamed: 0' is the name pandas assigns to a header-less CSV column
# (presumably a saved row index — confirm against the raw file).
# `columns=` already selects the axis, so `axis=1` is unnecessary, and
# errors='ignore' keeps this cell safe to re-run once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# **Target**

In [None]:
# Derive the binary target: 1 when either marker reaches its diagnostic
# cut-off, 0 otherwise. Both cut-offs are inclusive (fasting glucose >= 126,
# HbA1c >= 6.5); units are assumed to be mg/dL and % respectively —
# TODO confirm against the dataset description.
# NOTE: the label is a deterministic function of these two columns, so they
# must not be used as predictors when modelling `Diabetes`.
df['Diabetes'] = ((df['Fasting_Blood_Glucose'] >= 126) | (df['HbA1c'] >= 6.5)).astype(int)

In [None]:
# Bar chart of how many rows fall into each Diabetes class.
ax = sns.countplot(data=df, x='Diabetes')
ax.set_title("Target Class Distribution")
plt.show()

**Observation:** the data is imbalanced — the two `Diabetes` classes are not equally represented.

In [None]:
# Pairwise correlations between the numeric columns only; non-numeric
# columns are excluded before computing the matrix.
numeric_corr = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation (Numeric Only)")
plt.show()


In [None]:
# One histogram per numeric column, all sharing the same styling.
hist_style = dict(bins=20, figsize=(14, 10), color='skyblue', edgecolor='black')
df.hist(**hist_style)
plt.tight_layout()
plt.show()

In [None]:
# Mean of the 0/1 Diabetes flag per Sex, i.e. the share of rows labelled
# diabetic in each group. (Despite its name, `glucose_by_sex` holds this
# diabetes rate, not a glucose value.)
glucose_by_sex = df.groupby('Sex', as_index=False)['Diabetes'].mean()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=glucose_by_sex, x='Sex', y='Diabetes', ax=ax)
ax.set_title('Average Diabetes Rate by Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Average Diabetes Rate')
ax.set_ylim(0, 1)  # the rate is a proportion, so show the full 0-1 range
ax.grid(True)
plt.show()


In [None]:
plt.figure(figsize=(8, 6))
sns.boxplot(x='Diabetes', y='BMI', data=df, palette='Set2')
plt.title('Relationship Between BMI and Diabetes')
plt.xlabel('Diabetes')
plt.ylabel('BMI')
plt.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(8, 6))
sns.boxplot(x='Diabetes', y='Blood_Pressure_Systolic', data=df, palette='Set2')
plt.title('Relationship Between Blood_Pressure_Systolic and Diabetes')
plt.xlabel('Diabetes')
plt.ylabel('Blood_Pressure_Systolic')
plt.tight_layout()
plt.show()


In [None]:
# Bucket ages into decades: [0, 10), [10, 20), ..., [90, 100).
bins = range(0, 101, 10)
labels = [f'{i}-{i+9}' for i in bins[:-1]]
# right=False makes every bin left-closed, so e.g. age 30 is labelled '30-39'.
df['age_group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

# `age_group` is categorical. observed=False is passed explicitly so that
# every decade stays in the result (empty ones as NaN) regardless of the
# pandas version's default, and so that no deprecation warning is emitted.
age_group_diabetes = (
    df.groupby('age_group', observed=False)['Diabetes']
    .mean()
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.lineplot(x='age_group', y='Diabetes', data=age_group_diabetes, marker='o')
plt.title('Diabetes Rate by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Proportion with Diabetes')
plt.ylim(0, 1)  # the rate is a proportion, so show the full 0-1 range
plt.grid(True)
plt.show()


# **Modeling**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text columns that are replaced by integer codes before modelling.
categorical_cols = ['Sex', 'Ethnicity', 'Physical_Activity_Level', 'Smoking_Status']

# Fit one encoder per column and keep each of them, so every integer code
# can later be mapped back to its original label via
# label_encoders[col].inverse_transform(...). Re-fitting a single shared
# encoder would discard the mapping of every column except the last.
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

In [None]:
# `age_group` was only needed for the age plot; remove it before modelling.
df = df.drop(columns=['age_group'])

In [None]:
# `Diabetes` is computed directly from Fasting_Blood_Glucose and HbA1c, so
# keeping those two columns as features would leak the label into the model
# and make the evaluation meaningless. Drop them together with the target.
X = df.drop(columns=['Diabetes', 'Fasting_Blood_Glucose', 'HbA1c'])
y = df['Diabetes']

In [None]:
# Hold out 20% of the rows for evaluation. The target classes are
# imbalanced, so stratify on y to keep the same class proportions in both
# the training and the test split; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Learn the scaling statistics from the training split only, then apply
# the same transformation to both splits so that no test-set information
# reaches the fitted scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline classifier: Gaussian Naive Bayes trained on the scaled
# training features.
model = GaussianNB()
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predicted class for every row of the scaled held-out test set.
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Compute the metrics first, then print them: overall accuracy followed by
# the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:")
print(report_text)


# **SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE

# `Diabetes` is computed directly from Fasting_Blood_Glucose and HbA1c, so
# those two columns are dropped along with the target to avoid leaking the
# label into the features.
X = df.drop(columns=['Diabetes', 'Fasting_Blood_Glucose', 'HbA1c'])
y = df['Diabetes']

# Stratify so the class proportions are the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample only the training data; the test set is left untouched so the
# reported metrics are measured on the original class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

nb = GaussianNB()
nb.fit(X_train_resampled, y_train_resampled)
y_pred = nb.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
! pip install imbalanced-learn