# **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix,accuracy_score

# Loading the data and Balancing the data

In [None]:
# Read the health-indicator CSV whose label column is 'Diabetes_012' and
# show its (rows, columns) shape.
# NOTE(review): the path is an absolute '/content/...' location and the recorded
# output of this cell is a FileNotFoundError — confirm the file is present
# before running the cells below.
df1 = pd.read_csv('/content/diabetes_012_health_indicators_BRFSS2021.csv')
df1.shape

FileNotFoundError: ignored

In [None]:
# Preview the first rows of df1.
df1.head()

In [None]:
# Preview the last rows of df1.
df1.tail()

In [None]:
# Number of rows per distinct value of the 'Diabetes_012' label.
df1['Diabetes_012'].value_counts()

In [None]:
# Number of rows labelled 1 or 2, i.e. the rows that become class 1 once the
# label is collapsed to binary below.
((df1['Diabetes_012'] == 1) | (df1['Diabetes_012'] == 2)).sum()

In [None]:
# Collapse the label to binary: 2 is folded into 1; 0 and 1 are left as they are.
df1['Diabetes_012'] = df1['Diabetes_012'].replace(2, 1)

In [None]:
# Rename the label column in place to 'Diabetes_binary', the column name
# that df2 is indexed with further down.
df1.rename(columns={'Diabetes_012': 'Diabetes_binary'}, inplace=True)

In [None]:
# Pie chart of the share of each Diabetes_binary value in df1.
class_counts = df1['Diabetes_binary'].value_counts()

plt.figure(figsize=(4, 4))
plt.pie(
    class_counts,
    labels=class_counts.index,
    autopct='%1.1f%%',
    colors=['lightcoral', 'lightblue'],
)
plt.title('Diabetes_binary')
plt.show()

# Cell output: number of rows labelled 1.
df1['Diabetes_binary'].eq(1).sum()

In [None]:
# Count the rows of df1 that df1.duplicated() flags as duplicates.
duplicate_count = df1.duplicated().sum()
print(duplicate_count)

total_rows, total_col = df1.shape

# Duplicate share of the frame, as a percentage; the remainder is the
# non-duplicate share.
duplicate_rate = duplicate_count / total_rows * 100
bar_heights = [duplicate_rate, 100 - duplicate_rate]

plt.figure(figsize=(6, 4))
plt.bar(
    ['Duplicate', 'Non-duplicate'],
    bar_heights,
    color=['lightcoral', 'lightblue'],
)
plt.title('Rate of Duplicate Data')
plt.ylabel('Percentage')
plt.show()

In [None]:
# Remove duplicated rows from df1 in place.
df1.drop_duplicates(inplace=True)

In [None]:
# Number of rows per Diabetes_binary value in df1.
df1['Diabetes_binary'].value_counts()

In [None]:
# Read the second CSV (file name says "5050split"); its 'Diabetes_binary'
# column is used below.
# NOTE(review): absolute '/content/...' path — confirm the file exists in the
# runtime before executing.
df2 = pd.read_csv("/content/diabetes_binary_5050split_health_indicators_BRFSS2021.csv")

In [None]:
# Preview the first rows of df2.
df2.head()

In [None]:
# Pie chart of the share of each Diabetes_binary value in df2.
class_counts = df2['Diabetes_binary'].value_counts()

plt.figure(figsize=(4, 4))
plt.pie(
    class_counts,
    labels=class_counts.index,
    autopct='%1.1f%%',
    colors=['lightcoral', 'lightblue'],
)
plt.title('Diabetes_binary')
plt.show()

In [None]:
# Count the rows of df2 that df2.duplicated() flags as duplicates.
duplicate_count = df2.duplicated().sum()
print(duplicate_count)

total_rows = len(df2)

# Duplicate share of the frame, as a percentage.
duplicate_rate = duplicate_count / total_rows * 100
bar_heights = [duplicate_rate, 100 - duplicate_rate]

plt.figure(figsize=(6, 4))
plt.bar(
    ['Duplicate', 'Non-duplicate'],
    bar_heights,
    color=['lightcoral', 'lightblue'],
)
plt.title('Rate of Duplicate Data')
plt.ylabel('Percentage')
plt.show()

In [None]:
# Number of rows per Diabetes_binary value in df2.
df2['Diabetes_binary'].value_counts()

In [None]:
# Keep only the positive (Diabetes_binary == 1) rows of df2.
# An explicit copy is taken because columns of df3 are reassigned later;
# without it pandas treats df3 as a slice of df2 and emits
# SettingWithCopyWarning on that assignment.
df3 = df2[df2['Diabetes_binary'] == 1].copy()

In [None]:
# Preview the first rows of df3.
df3.head()

In [None]:
# Summary statistics of df3's columns.
df3.describe()

In [None]:
def upscale_column(column, small_change=0.1):
    """Return ``column`` shifted upward by ``small_change``.

    Args:
        column: Anything that supports ``+`` with a number; this file passes
            each column of a DataFrame through ``DataFrame.apply``.
        small_change: Amount added to every value. Defaults to 0.1, the
            offset this function previously hard-coded.

    Returns:
        ``column + small_change``; the input is not modified.
    """
    return column + small_change

# Apply the function to every numerical column except the first column
# NOTE(review): df3 holds only Diabetes_binary == 1 rows, so after this shift
# every column that is later kept as float32 carries a +0.1 offset that occurs
# only on rows that came from df3 (all positive). That looks like a label leak
# into the features — confirm it is intended.
numerical_columns = df3.iloc[:, 1:].select_dtypes(include='number').columns
df3[numerical_columns] = df3[numerical_columns].apply(upscale_column)


In [None]:
# Stack df1 on top of the shifted positive rows in df3, renumbering the index.
df = pd.concat([df1, df3], ignore_index=True)

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Number of duplicated rows in the combined frame.
df.duplicated().sum()

In [None]:
# Summary statistics of the combined frame.
df.describe()

In [None]:
# Rebalance the combined frame: keep every class-1 row and a random half of
# the class-0 rows.
class_counts = df['Diabetes_binary'].value_counts()
min_class_1 = int(class_counts[1])        # all positive rows
min_class_0 = int(0.5 * class_counts[0])  # half of the negative rows

# Sample each class explicitly rather than through groupby(...).apply(...):
# the lambda used to read the grouping column inside the group frame, which is
# deprecated in pandas 2.2 and unavailable in newer releases (the grouping
# column is no longer passed to the function). A fixed seed (the same value
# the train/test split uses) makes the sample reproducible across runs.
negative_rows = df[df['Diabetes_binary'] == 0].sample(min_class_0, random_state=7)
positive_rows = df[df['Diabetes_binary'] == 1].sample(min_class_1, random_state=7)

# Class 0 first, then class 1, with a fresh 0..n-1 index — the same layout the
# grouped version produced.
balanced_df = pd.concat([negative_rows, positive_rows], ignore_index=True)


In [None]:
# Pie chart of the share of each Diabetes_binary value in balanced_df.
class_counts = balanced_df['Diabetes_binary'].value_counts()

plt.figure(figsize=(4, 4))
plt.pie(
    class_counts,
    labels=class_counts.index,
    autopct='%1.1f%%',
    colors=['lightcoral', 'lightblue'],
)
plt.title('Class Distribution')
plt.show()

In [None]:
# Remove duplicated rows from balanced_df in place (the index keeps its gaps).
balanced_df.drop_duplicates(inplace=True)

In [None]:
# Re-check the duplicate share of balanced_df; duplicates were dropped in the
# previous cell, so this is a confirmation plot.
duplicate_count = balanced_df.duplicated().sum()

total_rows = len(balanced_df)

# Duplicate share of the frame, as a percentage.
duplicate_rate = duplicate_count / total_rows * 100
bar_heights = [duplicate_rate, 100 - duplicate_rate]

plt.figure(figsize=(8, 6))
plt.bar(
    ['Duplicate', 'Non-duplicate'],
    bar_heights,
    color=['lightcoral', 'lightblue'],
)
plt.title('Rate of Duplicate Data')
plt.ylabel('Percentage')
plt.show()


In [None]:
# Summary statistics of balanced_df after duplicate removal.
balanced_df.describe()

## **Visualizing and Type casting**

In [None]:
# Target dtype for every column of balanced_df, keyed by column name.
# NOTE(review): rows that came from df3 had 0.1 added to their numeric feature
# columns earlier; casting a column to int32 discards that fraction while the
# float32 columns keep it — confirm this int/float split is intended.
data_types = {
    "Diabetes_binary": "float32",
    "HighBP": "int32",
    "HighChol": "float32",
    "CholCheck": "int32",
    "BMI": "float32",
    "Smoker": "float32",
    "Stroke": "float32",
    "HeartDiseaseorAttack": "float32",
    "PhysActivity": "int32",
    "Fruits": "int32",
    "Veggies": "int32",
    "HvyAlcoholConsump": "int32",
    "AnyHealthcare": "int32",
    "NoDocbcCost": "float32",
    "GenHlth": "float32",
    "MentHlth": "float32",
    "PhysHlth": "float32",
    "DiffWalk": "float32",
    "Sex": "int32",
    "Age": "int32",
    "Education": "float32",
    "Income": "float32",
}

# Cast all listed columns in one call; columns not listed keep their dtype.
balanced_df = balanced_df.astype(data_types)

In [None]:
# Print column dtypes and non-null counts to check the casts above.
balanced_df.info()

In [None]:
# Annotated heat map of the pairwise correlations between balanced_df's columns.
correlations = balanced_df.corr()

plt.figure(figsize=(14, 14))
sns.heatmap(correlations, annot=True, cmap='coolwarm', linewidth=.8)
plt.show()

In [None]:
# Mean of every feature, computed separately for each Diabetes_binary value.
balanced_df.groupby('Diabetes_binary').mean()

In [None]:
# One 20-bin histogram per feature column (the label column is left out).
balanced_df.drop('Diabetes_binary', axis=1).hist(bins=20, figsize=(14, 14))
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of balanced_df after the dtype casts.
balanced_df.describe()

# Scaling the data

In [None]:
# Separate the label column (y) from the feature columns (X).
y = balanced_df['Diabetes_binary']
X = balanced_df.drop(columns='Diabetes_binary')

# Number of samples available for modelling.
print(X.shape[0])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column of X with MinMaxScaler's default settings.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(X)

# Rebuild a DataFrame from the scaled array. Column names and index are taken
# from X itself: the names no longer depend on the label being the first
# column of balanced_df, and the rows stay aligned with X and y, whose index
# has gaps after drop_duplicates.
x = pd.DataFrame(scaled_features, columns=X.columns, index=X.index)
x.head(10)

In [None]:
# Number of missing values per feature column.
X.isna().sum()

# Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
# The split is taken on the min-max scaled features `x`; the previous call
# passed the unscaled `X`, so the scaling step never reached the models.
# NOTE(review): the scaler was fitted on all rows before this split; fit it on
# the training part only if test-set leakage matters here.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=7)

## Modeling

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression classifier on the training split, allowing the
# solver up to 1000 iterations.
L_model = LogisticRegression(max_iter=1000)
L_model.fit(X_train,y_train)

In [None]:
# Score the logistic-regression model on the held-out split.
y_pred = L_model.predict(X_test)
print("-------------------------------------------------------------------------")
# Message spelling fixed ("accuraccy" -> "accuracy") to match the other model reports.
print(f"The accuracy score is: ------>>  {accuracy_score(y_test,y_pred)}")
print("-------------------------------------------------------------------------")

# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Greens", cbar=False,
            xticklabels=[f"Class {i}" for i in range(conf_matrix.shape[1])],
            yticklabels=[f"Class {i}" for i in range(conf_matrix.shape[0])])
plt.title("Confusion Matrix of LogisticRegression")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a 200-tree random forest on the training split.
# The seed (same value as the train/test split) makes the forest, and the
# accuracy reported for it, reproducible from run to run.
model = RandomForestClassifier(n_estimators=200, random_state=7)
model.fit(X_train,y_train)

In [None]:
# Score the random forest on the held-out split.
y_pred = model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)

print("-------------------------------------------------------------------------")
print(f"The accuracy score is: ------>>  {rf_accuracy}")
print("-------------------------------------------------------------------------")

# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred)
row_count, col_count = conf_matrix.shape
predicted_labels = [f"Class {i}" for i in range(col_count)]
actual_labels = [f"Class {i}" for i in range(row_count)]

plt.figure(figsize=(6, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
)
plt.title("Confusion Matrix of RandomForestClassifier")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Fit a 100-stage gradient-boosting classifier on the training split.
# The seed (same value as the train/test split) keeps the fitted model, and
# the accuracy reported for it, reproducible from run to run.
gradient_boosting_model = GradientBoostingClassifier(n_estimators=100, random_state=7)
gradient_boosting_model.fit(X_train, y_train)

In [None]:
# Score the gradient-boosting model on the held-out split; accuracy_gb is
# reused by the comparison chart.
y_pred_gb = gradient_boosting_model.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)

print("-------------------------------------------------------------------------")
print(f"Gradient Boosting Accuracy: {accuracy_gb}")
print("-------------------------------------------------------------------------")

# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred_gb)
row_count, col_count = conf_matrix.shape
predicted_labels = [f"Class {i}" for i in range(col_count)]
actual_labels = [f"Class {i}" for i in range(row_count)]

plt.figure(figsize=(6, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
)
plt.title("Confusion Matrix of GradientBoostingClassifier")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Test-set accuracy of the three fitted models, in the same order as `models`.
accuracy_scores = [accuracy_score(y_test, L_model.predict(X_test)),
                   accuracy_score(y_test, model.predict(X_test)),
                   accuracy_gb]

models = ['Logistic Regression', 'Random Forest', 'Gradient Boosting']

# Keep the original 0.70-0.85 zoom window while every score fits inside it;
# otherwise widen the axis so no bar is clipped at the top or hidden below
# the lower limit.
lowest_score, highest_score = min(accuracy_scores), max(accuracy_scores)
y_lower = 0.7 if lowest_score > 0.7 else max(0.0, lowest_score - 0.05)
y_upper = 0.85 if highest_score < 0.85 else min(1.0, highest_score + 0.05)

plt.figure(figsize=(8, 6))
plt.bar(models, accuracy_scores, color=['green', 'blue', 'purple'])
plt.ylim(y_lower, y_upper)
plt.title('Comparison of Model Accuracy Scores')
plt.xlabel('Model')
plt.ylabel('Accuracy Score')
plt.show()
