In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the Pima Indians Diabetes dataset through kagglehub and keep the value
# returned by dataset_download (presumably the local download directory —
# confirm against the kagglehub documentation).
organizations_uciml_pima_indians_diabetes_database_path = kagglehub.dataset_download('organizations/uciml/pima-indians-diabetes-database')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
# Silences every warning category for the rest of the session, which also hides
# deprecation notices raised by the plotting and modelling calls below.
warnings.filterwarnings("ignore")

In [None]:
# Load the diabetes table. On Kaggle the dataset is mounted under /kaggle/input;
# elsewhere fall back to the directory returned by the kagglehub download in the
# first cell, so the notebook also runs outside the Kaggle image.
kaggle_csv_path = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
if os.path.exists(kaggle_csv_path):
    diabetes_csv_path = kaggle_csv_path
else:
    diabetes_csv_path = os.path.join(
        organizations_uciml_pima_indians_diabetes_database_path, "diabetes.csv"
    )
df = pd.read_csv(diabetes_csv_path)

In [None]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
df.head()

In [None]:
# Per-column count of NaN entries (isna is the pandas alias of isnull).
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# (row count, column count) of the frame.
df.shape

In [None]:
# Column dtypes, non-null counts and memory footprint.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Column labels available for the plots and the model below.
df.columns

In [None]:
# Apply seaborn's white-grid look to this and every later figure
sns.set_style("whitegrid")

# 1. One histogram panel per numeric column
hist_options = {"figsize": (12, 8), "bins": 20, "edgecolor": 'black'}
df.hist(**hist_options)
plt.suptitle("Histograms of Numeric Features", fontsize=16)
plt.show()

In [None]:
# 2. Box-and-whisker summary of every numeric column on one shared axis
plt.figure(figsize=(12, 6))
df.boxplot()
plt.xticks(rotation=45)
plt.title("Boxplot of Features")
plt.show()

In [None]:
# 3. Pairwise feature relationships, coloured by Outcome, with KDE diagonals
sns.pairplot(
    df,
    hue='Outcome',
    diag_kind="kde",
    palette="coolwarm",
)
plt.show()


In [None]:
# 4. Annotated heatmap of the pairwise correlation matrix
correlations = df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(
    correlations,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# 5. Class balance of the target column
outcome_ax = sns.countplot(x=df["Outcome"], palette="coolwarm")
outcome_ax.set_title("Distribution of Diabetes Outcome")
outcome_ax.set_xlabel("Outcome (0 = No Diabetes, 1 = Diabetes)")
outcome_ax.set_ylabel("Count")
plt.show()

In [None]:
# 6-9. KDE Plots for numerical variables
# `features` is reused by the per-outcome box and violin plots in later cells.
features = ["Glucose", "BloodPressure", "BMI", "Age"]
for feature in features:
    plt.figure(figsize=(6, 4))
    # `fill` replaces kdeplot's deprecated `shade` keyword; the joint KDE plot
    # further down already uses `fill=True`.
    sns.kdeplot(df[feature], fill=True, color="blue")
    plt.title(f"Density Plot of {feature}")
    plt.xlabel(feature)
    plt.show()

In [None]:
# 10-13. One box plot per key feature, split by Outcome
for column_name in features:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=df, x="Outcome", y=column_name, palette="coolwarm")
    plt.title(f"{column_name} Distribution by Outcome")
    plt.show()

In [None]:
# 14-17. One violin plot per key feature, split by Outcome
for column_name in features:
    plt.figure(figsize=(6, 4))
    sns.violinplot(data=df, x="Outcome", y=column_name, palette="coolwarm")
    plt.title(f"Violin Plot of {column_name} by Outcome")
    plt.show()

In [None]:
# 18. Every BMI observation as a non-overlapping point, grouped by Outcome
plt.figure(figsize=(8, 5))
bmi_ax = sns.swarmplot(data=df, x="Outcome", y="BMI", palette="coolwarm")
bmi_ax.set_title("Swarmplot of BMI by Outcome")
plt.show()

In [None]:
# 19. Jittered Age observations, grouped by Outcome
plt.figure(figsize=(8, 5))
age_ax = sns.stripplot(
    data=df,
    x="Outcome",
    y="Age",
    jitter=True,
    palette="coolwarm",
)
age_ax.set_title("Stripplot of Age by Outcome")
plt.show()

In [None]:
# 20. Number of records for each Pregnancies value
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x="Pregnancies", palette="coolwarm")
plt.xticks(rotation=45)
plt.title("Countplot of Pregnancies")
plt.show()

In [None]:
# 21. Letter-value (boxen) plot of Insulin, split by Outcome
plt.figure(figsize=(8, 5))
insulin_ax = sns.boxenplot(data=df, x="Outcome", y="Insulin", palette="coolwarm")
insulin_ax.set_title("Boxenplot of Insulin Levels by Outcome")
plt.show()

In [None]:
# 22. Glucose against BMI with marginal distributions, coloured by Outcome
sns.jointplot(
    data=df,
    x="Glucose",
    y="BMI",
    kind="scatter",
    hue="Outcome",
    palette="coolwarm",
)
plt.show()

In [None]:
# 23. Filled bivariate KDE of Age against DiabetesPedigreeFunction
sns.jointplot(
    data=df,
    x="Age",
    y="DiabetesPedigreeFunction",
    kind="kde",
    fill=True,
    cmap="coolwarm",
)
plt.show()


In [None]:
# 24. Outcome counts stacked within each Pregnancies value
cross_tab = pd.crosstab(index=df['Pregnancies'], columns=df['Outcome'])
stacked_ax = cross_tab.plot(kind='bar', stacked=True, figsize=(10, 6), colormap="coolwarm")
stacked_ax.set_title("Stacked Bar Chart of Pregnancies vs. Outcome")
stacked_ax.set_ylabel("Count")
stacked_ax.set_xlabel("Pregnancies")
plt.show()

In [None]:
# 25. Share of each Outcome class, with the second slice pulled out
outcome_counts = df["Outcome"].value_counts()
plt.figure(figsize=(6, 6))
outcome_counts.plot.pie(
    autopct='%1.1f%%',
    colors=["skyblue", "salmon"],
    explode=[0, 0.1],
)
plt.title("Outcome Distribution")
plt.ylabel("")  # blank out the y-axis label on the pie
plt.show()

In [None]:
# 26. Age against Glucose as a figure-level scatter, coloured by Outcome
sns.relplot(
    data=df,
    x="Age",
    y="Glucose",
    hue="Outcome",
    kind="scatter",
    palette="coolwarm",
)
plt.title("Age vs. Glucose")
plt.show()

In [None]:
# 27. Cell-by-cell map of the frame's null mask (one row per record)
null_mask = df.isnull()
plt.figure(figsize=(10, 5))
sns.heatmap(null_mask, cmap="viridis", cbar=False)
plt.title("Missing Values Heatmap")
plt.show()

In [None]:
# 28. DiabetesPedigreeFunction against BMI, coloured by Outcome
pedigree_ax = sns.scatterplot(
    data=df,
    x="DiabetesPedigreeFunction",
    y="BMI",
    hue="Outcome",
    palette="coolwarm",
)
pedigree_ax.set_title("Scatterplot of DiabetesPedigreeFunction vs. BMI")
plt.show()

In [None]:
# 29. BloodPressure regressed on Age: translucent points, red fitted line
point_style = {'alpha': 0.5}
line_style = {"color": "red"}
sns.regplot(data=df, x="Age", y="BloodPressure", scatter_kws=point_style, line_kws=line_style)
plt.title("Regression Plot of Age vs. Blood Pressure")
plt.show()

In [None]:
# 30. Residuals of Insulin regressed on Age, with a LOWESS trend through them
sns.residplot(
    data=df,
    x="Age",
    y="Insulin",
    lowess=True,
    color="blue",
)
plt.title("Residual Plot for Insulin vs. Age")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import lightgbm as lgb


In [None]:
# Step 5: Define features (X) and target (y)
X = df.drop("Outcome", axis=1)  # Features: every column except the label
y = df["Outcome"]  # Target variable

# Step 6: Split the dataset into training and testing sets (80-20 split).
# The split happens BEFORE scaling so that the held-out rows never contribute
# to the scaler's mean/std (fitting on the full data leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 7: Normalize - fit on the training rows only, then apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so existing references to X_scaled keep working: the full feature
# matrix under the training-fitted scaling.
X_scaled = scaler.transform(X)

In [None]:
# Step 8: Initialize models
# Every estimator that accepts a seed gets the same one as the train/test split
# so repeated runs produce the same scores and the same "best" model.
# KNeighborsClassifier and GaussianNB take no seed.
model_seed = 42
models = {
    "Logistic Regression": LogisticRegression(random_state=model_seed),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=model_seed),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=model_seed),
    # probability=True enables predict_proba, which the AUC computation needs
    "Support Vector Machine": SVC(probability=True, random_state=model_seed),
    "Gradient Boosting": GradientBoostingClassifier(random_state=model_seed),
    "AdaBoost": AdaBoostClassifier(random_state=model_seed),
    "Naive Bayes": GaussianNB(),
    # `use_label_encoder` is deprecated in XGBoost and is dropped here;
    # the 0/1 integer labels need no encoding.
    "XGBoost": xgb.XGBClassifier(eval_metric="logloss", random_state=model_seed),
    "LightGBM": lgb.LGBMClassifier(random_state=model_seed)
}

In [None]:
# Step 9: Fit every candidate on the training split and score it on the test split
model_results = {}

for name, estimator in models.items():
    print(f"Training {name}...")

    # Steps 10-11: fit, then predict hard labels for the held-out rows
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    # Step 12: accuracy from the hard labels, ROC AUC from the positive-class probability
    positive_proba = estimator.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, positive_proba)
    model_results[name] = [accuracy, auc]

    # Step 13: per-class precision/recall/F1 report
    print(f"\n{name} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

# Step 14: one row per model, highest accuracy first
results_df = (
    pd.DataFrame(model_results, index=["Accuracy", "AUC"])
    .T
    .sort_values(by="Accuracy", ascending=False)
)


In [None]:
# Step 15: Display model performance comparison
print("\nModel Performance Comparison:\n", results_df)

# Steps 16-17: Bar plot for Accuracy on its own figure.
# This cell previously opened a 1x2 grid and filled only the left half,
# leaving a half-empty figure with no layout or show call.
plt.figure(figsize=(12, 6))
sns.barplot(x=results_df.index, y=results_df["Accuracy"], palette="coolwarm")
plt.xticks(rotation=45)
plt.title("Model Accuracy Comparison")

plt.tight_layout()  # keep the rotated model names inside the canvas
plt.show()

In [None]:
# Step 18: Bar plot for AUC
# Open an explicit figure for this cell: `plt.subplot(1, 1, 1)` drew on an
# implicit default-size figure and was not the second panel of any grid.
plt.figure(figsize=(12, 6))
sns.barplot(x=results_df.index, y=results_df["AUC"], palette="coolwarm")
plt.xticks(rotation=45)
plt.title("Model AUC Score Comparison")

plt.tight_layout()  # keep the rotated model names inside the canvas
plt.show()

In [None]:
# Step 19: Confusion matrix for the top row of results_df (highest accuracy)
best_model_name = results_df.index[0]
best_model = models[best_model_name]
y_pred_best = best_model.predict(X_test)

class_labels = ["No Diabetes", "Diabetes"]
best_confusion = confusion_matrix(y_test, y_pred_best)

plt.figure(figsize=(6, 4))
sns.heatmap(
    best_confusion,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f"Confusion Matrix for {best_model_name}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Step 20: Save the best model
import joblib

joblib.dump(best_model, "best_diabetes_model.pkl")
# The classifiers were fitted on StandardScaler output, so the fitted scaler is
# persisted next to the model; raw feature rows must pass through it before
# being handed to the reloaded model's predict().
joblib.dump(scaler, "best_diabetes_scaler.pkl")
print(f"\nBest Model ({best_model_name}) saved successfully!")

In [None]:
# Final look at the first rows of the frame (head() defaults to five rows).
df.head()



---

## 🧑🏻‍💻 About the Author  
**Name:** Arif Miah  

🎓 **Profession:** Machine Learning Engineer & Data Scientist  

---

### 🔭 **Career Objective**  
🚀 My goal is to contribute to groundbreaking advancements in artificial intelligence and data science, empowering companies and individuals with data-driven solutions. I strive to simplify complex challenges, craft innovative projects, and pave the way for a smarter and more connected future.  

🔍 As a **Machine Learning Engineer** and **Data Scientist**, I am passionate about using machine learning, deep learning, computer vision, and advanced analytics to solve real-world problems. My expertise lies in delivering impactful solutions by leveraging cutting-edge technologies.  

---

### 💻 **Skills**  
- 🤖 **Artificial Intelligence & Machine Learning**  
- 👁️‍🗨️ **Computer Vision & Predictive Analytics**  
- 🧠 **Deep Learning & Natural Language Processing (NLP)**  
- 🐍 **Python Programming & Automation**  
- 📊 **Data Visualization & Analysis**  
- 🚀 **End-to-End Model Development & Deployment**  

---

### 🚧 **Featured Projects**  

📊 **Lung Cancer Prediction with Deep Learning**  
Achieved 99% accuracy in a computer vision project using 12,000 medical images across three classes. This project involved data preprocessing, visualization, and model training to detect cancer effectively.  

🌾 **Ghana Crop Disease Detection Challenge**  
Developed a model using annotated images to identify crop diseases with bounding boxes, addressing real-world agricultural challenges and disease mitigation.  

🛡️ **Global Plastic Waste Analysis**  
Utilized GeoPandas, Matplotlib, and machine learning models like RandomForestClassifier and CatBoostClassifier to analyze trends in plastic waste management.  

🎵 **Twitter Emotion Classification**  
Performed exploratory data analysis and built a hybrid machine learning model to classify Twitter sentiments, leveraging text data preprocessing and visualization techniques.  

---

### ⚙️ **Technical Skills**  

- 💻 **Programming Languages:** Python 🐍, SQL 🗃️, R 📈  
- 📊 **Data Visualization Tools:** Matplotlib 📉, Seaborn 🌊, Tableau 📊, Power BI 📊  
- 🧠 **Machine Learning & Deep Learning:** Scikit-learn 🤖, TensorFlow 🔥, PyTorch 🧩  
- 🗂️ **Big Data Technologies:** Hadoop 🏗️, Spark ⚡  
- 🚀 **Model Deployment:** Flask 🌐, FastAPI ⚡, Docker 🐳  

---

### 🌐 **Connect with Me**  

📧 **Email:** arifmiahcse@gmail.com

🔗 **LinkedIn:** [www.linkedin.com/in/arif-miah-8751bb217](https://www.linkedin.com/in/arif-miah-8751bb217)  

🐱 **GitHub:** [https://github.com/Arif-miad](https://github.com/Arif-miad)  



