# Diabetes Prediction Homework

In [None]:
import numpy as np # linear algebra
import matplotlib.pyplot as plt # plotting library
import seaborn as sns # library built on top of matplotlib
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import warnings
# Suppress every warning raised from this point on. NOTE(review): this also
# hides deprecation and pandas/scikit-learn warnings that can point at real
# problems in the cells below.
warnings.filterwarnings('ignore')

In [None]:
# Input data files live in the read-only "../input/" directory.
# Running this cell prints the full path of every file found under it.

import os

input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

## Read Data

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame.
dataset = pd.read_csv("/kaggle/input/diabetes-dataset/diabetes.csv")

## EDA - Exploratory Data Analysis

In [None]:
#!pip install ydata-profiling

In [None]:
#from ydata_profiling import ProfileReport

In [None]:
#report = ProfileReport(dataset, title="Diabetes Dataset")

In [None]:
#report

## Check for missing data

In [None]:
# Per-column flag: True if the column contains at least one missing value.
dataset.isna().any()

## Verify if there are any duplicates in the dataset

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
dataset.duplicated().any()

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
dataset.info()

## Explore Data Balance

In [None]:
# Bar chart with the number of rows in each Outcome class.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x="Outcome", ax=ax)
plt.show()

In [None]:
# Share of positive cases. Assumes Outcome is coded 0/1 (as the original
# sum()/len() computation did), so the column mean is the fraction of patients
# with diabetes. Series.mean() is vectorised, unlike a Python-level sum() over
# the column, and yields NaN instead of raising on an empty frame.
diabetes_share = dataset['Outcome'].mean()
print(f"% of patients having diabetes: {diabetes_share * 100:.2f}%")

## Visualize data distributions

In [None]:
# Show the column names of the dataset.
dataset.columns

In [None]:
import math

# One histogram (with a KDE overlay) per column, laid out three per row.
# The number of rows is derived from the number of columns, so the cell keeps
# working when the frame has more than the 9 columns a fixed 3x3 grid can hold
# (plt.subplot raises once the index exceeds rows * cols).
GRID_COLS = 3
grid_rows = max(1, math.ceil(len(dataset.columns) / GRID_COLS))

# 4 inches of height per row keeps the original 12x12 size for a 3-row grid.
plt.figure(figsize=(12, 4 * grid_rows))
for i, col in enumerate(dataset.columns):
    plt.subplot(grid_rows, GRID_COLS, i + 1)
    sns.histplot(x=col, data=dataset, kde=True)
plt.show()

## Visualize Feature Correlations
### Any value above 0.5 or below -0.5 signals high correlation between features

In [None]:
# Pearson correlation coefficient between every pair of columns
# (Pearson is the default method of DataFrame.corr).
dataset.corr()

In [None]:
# Annotated heatmap of the pairwise correlation matrix, drawn with a
# diverging palette centred on 0.
corr_matrix = dataset.corr()
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0, vmin=-1.0, ax=ax)
plt.show()

## Data Train Validation Split

In [None]:
# Separate the label column from the features: y is the Outcome column,
# X is every other column.
TARGET = 'Outcome'
y = dataset[TARGET]
X = dataset.drop(columns=[TARGET])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation. stratify=y keeps the class ratio
# approximately equal in both parts; the fixed random_state makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Display the training features.
X_train

In [None]:
# Fraction of positive rows in the training split (assumes 0/1 labels, as the
# original sum()/len() did). Compare with the full-dataset share to confirm
# that the stratified split preserved the class balance. Series.mean() replaces
# the Python-level sum() over the Series.
y_train.mean()

## Data Preprocessing

## Glucose - 0 is not a valid glucose level
## BloodPressure - 0 is not a valid BP value
## SkinThickness - 0 is unlikely and often missing
## Insulin - 0 means missing (not tested)
## BMI - 0 is not valid (BMI cannot be zero)

In [None]:
# A zero in these columns is not a valid measurement, so it is treated as a
# missing value and imputed.
cols_to_fix = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in cols_to_fix:
    # Turn the placeholder zeros into NaN in both splits.
    train_col = X_train[col].replace(0, np.nan)
    val_col = X_val[col].replace(0, np.nan)
    # The fill value is computed on the training split only and reused for the
    # validation split, so validation rows do not influence the preprocessing.
    median = train_col.median()
    X_train[col] = train_col.fillna(median)
    X_val[col] = val_col.fillna(median)

## Training and Validation

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so it can be chained).
gnb = GaussianNB().fit(X_train, y_train)

# Class predictions for the rows the model was fitted on and for the
# held-out validation rows.
train_preds, val_preds = gnb.predict(X_train), gnb.predict(X_val)

# Accuracy on each split.
train_acc, val_acc = (
    accuracy_score(y_train, train_preds),
    accuracy_score(y_val, val_preds),
)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")


### Compute False Positive & False Negative Rate

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the validation predictions that were already computed.
# labels=[0, 1] pins the matrix to 2x2 even when one class is missing from
# y_val and val_preds; without it the matrix shrinks and the four-way
# unpacking below fails. Assumes Outcome is coded 0/1.
tn, fp, fn, tp = confusion_matrix(y_val, val_preds, labels=[0, 1]).ravel()

# False Positive Rate: share of actual negatives predicted as positive.
fpr = fp / (fp + tn)
# False Negative Rate: share of actual positives predicted as negative.
fnr = fn / (fn + tp)

print(f"False Positive Rate (FPR): {fpr:.4f}")
print(f"False Negative Rate (FNR): {fnr:.4f}")

## How to change the condition under which Naive Bayes predicts 1
### predict_proba returns the predicted probability of each class (0 and 1)
### By changing the 0.5 cut-off we modify the threshold for predicting 1

In [None]:
# Cut-off above which a row is labelled 1; change it to trade false positives
# against false negatives.
THRESHOLD = 0.5

# One probability column per class; column 1 is taken as the positive class
# (assumes the labels are 0/1).
gnb_probs_proba = gnb.predict_proba(X_train)
positive_proba = gnb_probs_proba[:, 1]
gnb_probs = (positive_proba >= THRESHOLD).astype(int)

In [None]:
# Display the thresholded 0/1 predictions for the training split.
gnb_probs

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Predicted probability of class 1 for every training row (1-D array).
gnb_probs_proba = gnb.predict_proba(X_train)[:, 1]

# Histogram (with a KDE overlay) of those probabilities; the dashed red line
# marks the 0.5 decision threshold.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(gnb_probs_proba, kde=True, bins=30, color='skyblue', ax=ax)

ax.axvline(x=0.5, color='red', linestyle='--', label='Threshold = 0.5')
ax.set_title("Distribuția probabilităților pentru clasa pozitivă (diabet = 1)")
ax.set_xlabel("Probabilitate prezisă (clasa 1)")
ax.set_ylabel("Frecvență")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Score the separate test file with the classifier fitted on the training
# split (`gnb`) and write the predictions to disk.
#
# The earlier version of this cell re-split the test file, fitted a new model
# on 80% of it and then predicted on all of it, so most predictions were made
# on rows that model had been trained on. It also imputed the zero placeholders
# with medians taken from the test file itself. Both are fixed here by reusing
# the training-time model and statistics. The notebook-level `dataset`, `X`,
# `y`, `X_train`, ... are deliberately left untouched.
results_df = pd.read_csv('/kaggle/input/test-set/diabetes_testset.csv')

# Zeros in these columns are placeholders for missing measurements.
cols_to_fix = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# X_train was already imputed with its own medians, which leaves those medians
# unchanged, so the training-time fill values can be read back from it.
train_medians = X_train[cols_to_fix].median()
for col in cols_to_fix:
    results_df[col] = results_df[col].replace(0, np.nan).fillna(train_medians[col])

# Select the feature columns by name, in the order `gnb` was fitted on; this
# also leaves out 'Outcome' if the test file carries it.
feature_cols = list(X_train.columns)
predictions = gnb.predict(results_df[feature_cols])

# Test rows (with imputed features) plus the predicted label.
results_df['Outcome_Predicted'] = predictions

results_df.to_csv("diabetes_predictions.csv", index=False)

In [None]:
# Display the test rows together with their predicted Outcome.
results_df