In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the diabetes CSV into a DataFrame and echo it.
# NOTE(review): the path looks like a Colab upload location - adjust it when
# running the notebook elsewhere.
DATA_PATH = '/content/diabetes.csv'
df = pd.read_csv(DATA_PATH)
df

In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# In these measurement columns a literal 0 is treated as "not recorded":
# swap it for NaN so that a later fillna can impute it.
zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
measurements = df[zero_means_missing]
df[zero_means_missing] = measurements.replace(0, np.nan)
df[zero_means_missing]

In [None]:
# Fill every remaining NaN with the mean of its own column (modifies df in place).
# NOTE(review): the means are taken over all rows, i.e. before the train/test
# split further down, so rows that end up in the test set influence the
# imputed values.
column_means = df.mean()
df.fillna(value=column_means, inplace=True)
df

In [None]:
# Separate the target column from the predictors.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train, X_test, y_train, y_test)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# that same fit is reused for the test split, so the scaler never sees test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Only a cell's final expression is echoed, so show both arrays as one tuple
# (a bare `X_train_scaled` line placed before the last line displays nothing).
X_train_scaled, X_test_scaled

In [None]:
# Print a heading followed by the first five entries of each split.
previews = (
    ("Preprocessed X_train:", X_train_scaled),
    ("Preprocessed X_test:", X_test_scaled),
    ("y_train:", y_train),
    ("y_test:", y_test),
)
for heading, values in previews:
    print(heading)
    print(values[:5])

In [None]:
# Scatter matrix of every column pair, coloured by Outcome, with histograms
# on the diagonal.
sns.set(style="whitegrid")
pair_grid = sns.pairplot(
    data=df,
    hue='Outcome',
    diag_kind='hist',
)
# y slightly above 1 puts the title just over the figure's top edge.
plt.suptitle("Pairplot of Diabetes Dataset", y=1.02)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Age distribution for each Outcome class, drawn as side-by-side boxplots.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='Outcome', y='Age', ax=ax)
ax.set_title("Boxplot of Age vs. Outcome")
plt.show()


In [None]:
# BMI histograms (20 bins) with KDE curves, one colour per Outcome class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(
    data=df,
    x='BMI',
    hue='Outcome',
    kde=True,
    bins=20,
    palette='Set1',
    ax=ax,
)
ax.set_title("BMI Distribution for Diabetic and Non-Diabetic Individuals")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Rebuild the feature matrix and the target from df for the modelling cells.
X, y = df.drop(columns=['Outcome']), df['Outcome']

In [None]:
# 80/20 split with a fixed seed; these splits hold the unscaled feature columns.
split_kwargs = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
(X_train, X_test, y_train, y_test)

In [None]:
# Logistic-regression baseline.
# X_train / X_test carry the raw, unscaled columns, so standardisation is done
# inside the pipeline: the scaler is fitted on the training rows only and is
# re-applied automatically whenever model.predict() is called.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

In [None]:
# Predict the class label for every held-out row and echo the predictions.
y_pred = model.predict(X_test)
y_pred

In [None]:
# Score the logistic-regression predictions against the held-out labels.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:", report, sep="\n")

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random-forest comparison model: 100 trees, seeded for repeatable results.
r_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
r_model.fit(X_train, y_train)

In [None]:
# Evaluate the forest on the held-out rows; the cell echoes the accuracy.
r_y_pred = r_model.predict(X_test)
r_report = classification_report(y_test, r_y_pred)
r_accuracy = accuracy_score(y_test, r_y_pred)
r_accuracy

In [None]:
# Show the random forest's classification report under its heading.
print("Random Forest Classification Report:", r_report, sep="\n")