In [1]:
import pandas as pd
import numpy as np

# Visualization
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objs as go
import matplotlib.pyplot as plt
colors = px.colors.sequential.Plasma_r

# Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc

# Warnings
import warnings 
warnings. filterwarnings('ignore')


In [2]:
heart = pd.read_csv('C:\\Users\\mohan\\OneDrive\\Documents\\Desktop\\big data\\datamining\\datamining (1).csv')

In [3]:
heart.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150,32.66,14.54,Yes,0,30,16,12
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165,77.11,28.29,No,0,30,0,4
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163,88.45,33.47,No,4,12,3,16
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180,93.44,28.73,No,0,30,30,8
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191,88.45,24.37,Yes,0,8,4,0


In [4]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799 entries, 0 to 798
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   General_Health                799 non-null    object 
 1   Checkup                       799 non-null    object 
 2   Exercise                      799 non-null    object 
 3   Heart_Disease                 799 non-null    object 
 4   Skin_Cancer                   799 non-null    object 
 5   Other_Cancer                  799 non-null    object 
 6   Depression                    799 non-null    object 
 7   Diabetes                      799 non-null    object 
 8   Arthritis                     799 non-null    object 
 9   Sex                           799 non-null    object 
 10  Age_Category                  799 non-null    object 
 11  Height_(cm)                   799 non-null    int64  
 12  Weight_(kg)                   799 non-null    float64
 13  BMI  

In [5]:
heart.shape

(799, 19)

In [6]:
# Checking if there are any null values in the dataset or not
heart.isnull().sum()

General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64

In [7]:
# Convering the column names into lower case and replacing the space with an underscore
heart.columns = heart.columns.str.lower().str.replace(" ", "_")

#Changing the name of a big column

heart.rename(columns = {'height_(cm)' : 'height', 'weight_(kg)' : 'weight', 'green_vegetables_consumption' : 'vegetables_consumption', 'friedpotato_consumption' : 'potato_consumption'}, inplace = True)

In [8]:
# With the help of for loop, we will now check if there are any typos in the categorical columns or not
for col in heart.select_dtypes(include = "object"):
    print(f"Column name: {col}")
    print(heart[col].unique())
    print('\n', '-'*80, '\n')

Column name: general_health
['Poor' 'Very Good' 'Good' 'Fair' 'Excellent']

 -------------------------------------------------------------------------------- 

Column name: checkup
['Within the past 2 years' 'Within the past year' '5 or more years ago'
 'Within the past 5 years' 'Never']

 -------------------------------------------------------------------------------- 

Column name: exercise
['No' 'Yes']

 -------------------------------------------------------------------------------- 

Column name: heart_disease
['No' 'Yes']

 -------------------------------------------------------------------------------- 

Column name: skin_cancer
['No' 'Yes']

 -------------------------------------------------------------------------------- 

Column name: other_cancer
['No' 'Yes']

 -------------------------------------------------------------------------------- 

Column name: depression
['No' 'Yes']

 -------------------------------------------------------------------------------- 

Column name:

In [9]:
heart['checkup'] = heart['checkup'].replace('Within the past 2 years', 'Past 2 years')
heart['checkup'] = heart['checkup'].replace('Within the past year', 'Past 1 year')
heart['checkup'] = heart['checkup'].replace('Within the past 5 years', 'Past 5 years')
heart['checkup'] = heart['checkup'].replace('5 or more years ago', 'More than 5 years')


heart['diabetes'] = heart['diabetes'].replace('No, pre-diabetes or borderline diabetes', 'No Pre Diabetes')
heart['diabetes'] = heart['diabetes'].replace('Yes, but female told only during pregnancy', 'Only during pregnancy')

heart['age_category'] = heart['age_category'].replace('18-24', 'Young')
heart['age_category'] = heart['age_category'].replace('25-29', 'Adult')
heart['age_category'] = heart['age_category'].replace('30-34', 'Adult')
heart['age_category'] = heart['age_category'].replace('35-39', 'Adult')
heart['age_category'] = heart['age_category'].replace('40-44', 'Mid-Aged')
heart['age_category'] = heart['age_category'].replace('45-49', 'Mid-Aged')
heart['age_category'] = heart['age_category'].replace('50-54', 'Mid-Aged')
heart['age_category'] = heart['age_category'].replace('55-59', 'Senior-Adult')
heart['age_category'] = heart['age_category'].replace('60-64', 'Senior-Adult')
heart['age_category'] = heart['age_category'].replace('65-69', 'Elderly')
heart['age_category'] = heart['age_category'].replace('70-74', 'Elderly')
heart['age_category'] = heart['age_category'].replace('75-79', 'Elderly')
heart['age_category'] = heart['age_category'].replace('80+', 'Elderly')

In [10]:
col = ['alcohol_consumption', 'fruit_consumption', 'vegetables_consumption', 'potato_consumption']

for i in col:
    heart[i] = heart[i].astype(int)

In [11]:
# Define BMI ranges and labels for each group
bmi_bins = [12.02, 18.3, 26.85, 31.58, 37.8, 100]
bmi_labels = ['Underweight', 'Normal weight', 'Overweight', 'Obese I', 'Obese II']
heart['bmi_group'] = pd.cut(heart['bmi'], bins=bmi_bins, labels=bmi_labels, right=False)

In [12]:
column_to_move = heart.pop('bmi_group')
heart.insert(14, 'bmi_group', column_to_move)

In [13]:
heart['bmi_group'] = heart['bmi_group'].astype('object')

In [14]:
heart.describe(include = 'O')

Unnamed: 0,general_health,checkup,exercise,heart_disease,skin_cancer,other_cancer,depression,diabetes,arthritis,sex,age_category,bmi_group,smoking_history
count,799,799,799,799,799,799,799,799,799,799,799,799,799
unique,5,5,2,2,2,2,2,4,2,2,5,5,2
top,Good,Past 1 year,Yes,No,No,No,No,No,Yes,Female,Elderly,Normal weight,No
freq,275,710,464,679,651,669,672,579,427,568,580,333,493


In [15]:
heart.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
height,799.0,167.702128,9.936333,135.0,160.0,165.0,175.0,206.0
weight,799.0,80.076133,19.241521,32.66,66.905,77.56,90.72,158.76
bmi,799.0,28.406496,6.175612,14.06,24.21,27.46,31.63,51.81
alcohol_consumption,799.0,2.406758,6.412981,0.0,0.0,0.0,0.0,30.0
fruit_consumption,799.0,26.118899,22.068182,0.0,8.0,30.0,30.0,120.0
vegetables_consumption,799.0,11.787234,12.276974,0.0,4.0,8.0,16.0,120.0
potato_consumption,799.0,5.434293,8.214107,0.0,1.0,4.0,8.0,120.0


#### 1. Health Assessment

In [16]:
"""
Generates a histogram plot of the 'general_health' feature from the 'heart' dataset, colored by the 'general_health' feature, using the Plotly Express library.

The plot is titled "1. Distribution of General Health" and the plot background is set to white.
"""
fig1 = px.histogram(heart, x="general_health", color = 'general_health', color_discrete_sequence = colors, title="1. Distribution of General Health")
fig1.update_layout(plot_bgcolor='white')
fig1.show()
print('\n', "="*80, '\n')

fig2 = px.histogram(heart, x="general_health", color = 'heart_disease', color_discrete_sequence = colors, barmode = 'group', title="2. General Health with respect to Heart Disease")
fig2.update_layout(plot_bgcolor='white')
fig2.show()
print('\n', "="*80, '\n')









#### 2. Demographic Analysis

In [17]:
sex_counts = heart['sex'].value_counts()
age_category_counts = heart['age_category'].value_counts()

fig1 = px.bar(x=sex_counts.index, y=sex_counts.values, color=sex_counts.index, color_discrete_sequence = colors, labels={'x': 'Sex', 'y': 'Count'})
fig1.update_layout(title="1. Distribution of gender in the Dataset", xaxis_title="", yaxis_title="Count", plot_bgcolor='white')
fig1.show()
print('\n', "="*80, '\n')

fig2 = px.histogram(heart, x="sex", color='heart_disease', barmode='group', color_discrete_sequence= colors, title="2. Checking which gender is more susceptible to Heart Disease?")
fig2.update_layout(xaxis_title="Gender", yaxis_title="Count", legend_title="Heart Disease", xaxis_showgrid=False, yaxis_showgrid=False, plot_bgcolor='white')
fig2.show()
print('\n', "="*80, '\n')

grouped_data = heart.groupby(['sex', 'heart_disease'], as_index=False)['bmi'].median()
fig = px.bar(grouped_data, x='sex', y='bmi', color='heart_disease', color_discrete_sequence = colors, barmode = 'group', title="3. Checking  gender and their average bmi based on heart disease?")
fig.update_layout(xaxis_title="Gender", yaxis_title="Average BMI", legend_title="Heart Disease", xaxis_showgrid=False, yaxis_showgrid=False, plot_bgcolor='white')
fig.show()









In [18]:
fig1 = px.bar(x=age_category_counts.index, y=age_category_counts.values, color=age_category_counts.index, color_discrete_sequence = colors ,labels={'x': 'Age Category', 'y': 'Count'})
fig1.update_layout(title="1. Distribution of Age Categories in the Dataset", xaxis_title="", yaxis_title="Count", plot_bgcolor='white')
fig1.show()
print('\n', "="*80, '\n')

fig2 = px.histogram(heart, x="age_category", color='heart_disease', barmode='group', color_discrete_sequence= colors, title="2. Checking which age group is more susceptible to Heart Disease?")
fig2.update_layout(xaxis_title="age_category", yaxis_title="Count", legend_title="Heart Disease", xaxis_showgrid=False, yaxis_showgrid=False, plot_bgcolor='white')
fig2.show()
print('\n', "="*80, '\n')

grouped_data = heart.groupby(['age_category', 'heart_disease'], as_index=False)['bmi'].median()
fig3 = px.bar(grouped_data, x='age_category', y='bmi', color='heart_disease', color_discrete_sequence = colors, barmode = 'group', title="3. Checking  age groups and their average bmi based on heart disease?")
fig3.update_layout(xaxis_title="Age Group", yaxis_title="Average BMI", legend_title="Heart Disease", xaxis_showgrid=False, yaxis_showgrid=False, plot_bgcolor='white')
fig3.show()









#### 3. Impact of Lifestyle Analysis

In [19]:
def create_bar_chart(data, x_col, y_col, color_col, title, x_label, y_label):
    grouped_data = data.groupby([x_col, color_col]).size().reset_index(name='count')
    fig = px.bar(grouped_data, x=x_col, y=y_col, color=color_col, color_discrete_sequence=colors, title=title, labels={x_col: x_label, y_col: y_label}, barmode='group', category_orders={x_col: ["No", "Yes"], color_col: ["No", "Yes"]} )
    fig.update_layout(plot_bgcolor='white')
    fig.show()
    print('\n', "=" * 80, '\n')
    
create_bar_chart(heart, 'exercise', 'count', 'heart_disease', '1. Impact of Exercise on Heart Disease', 'Exercise', 'Count')
create_bar_chart(heart, 'smoking_history', 'count', 'heart_disease', '2. Impact of Smoking on Heart Disease', 'Smoking History', 'Count')

columns = ['alcohol_consumption', 'fruit_consumption', 'vegetables_consumption', 'potato_consumption']
titles = ["Alcohol Consumption", "Fruit Consumption", "Vegetables Consumption", "Potato Consumption"]

for i, col in enumerate(columns):
    grouped_data = heart.groupby(['age_category', 'heart_disease'], as_index=False)[col].median()  # Use median here
    fig = px.bar(grouped_data, x='age_category', y=col, color='heart_disease', color_discrete_sequence=colors, barmode='group', title=f"{i + 4}. Impact of {titles[i]} on Heart Disease")
    fig.update_layout(xaxis_title="Age Group", yaxis_title=f"Median {titles[i]}", legend_title="Heart Disease", xaxis_showgrid=False, yaxis_showgrid=False, plot_bgcolor='white')
    fig.show()
    print('\n', "="*80, '\n')

























---
### 1. One-Hot Encoding
---

In [20]:
heart['heart_disease'] = heart['heart_disease'].map({'Yes':1, 'No':0})
cat=['sex', 'smoking_history']

OH_Encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH = OH_Encoder.fit_transform(heart[cat])
cols = OH_Encoder.get_feature_names_out(cat)
OH = pd.DataFrame(OH, columns=cols)
heart = heart.drop(cat,axis=1)
heart = pd.concat([heart, OH], axis =1) 

---
### 2. Label Encoding
---

In [21]:
categorical_columns = ['general_health', 'checkup', 'exercise', 'skin_cancer', 'other_cancer', 'depression', 'diabetes', 'arthritis', 'age_category', 'bmi_group']

# Initialize LabelEncoder

label_encoder = LabelEncoder()

# Apply label encoding to each ordinal categorical column

for col in categorical_columns:
    heart[col] = label_encoder.fit_transform(heart[col])

---
### 3. Class Imbalance
---

In [22]:
# Checking the class Imbalance

heart['heart_disease'].value_counts()

0    679
1    120
Name: heart_disease, dtype: int64

In [23]:
X = heart.drop("heart_disease", axis = 1)
y = heart['heart_disease']

In [24]:
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

---
### 4. Splitting into training and testing
---

In [25]:
# Splitting the data into training and testing sets for diabetes balanced

X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

---
### 5. Feature Scaling
---

In [26]:
# Feature Scaling on diabetes dataset

scaler_d = StandardScaler()
X_train_scaled = scaler_d.fit_transform(X_train)
X_test_scaled = scaler_d.transform(X_test)

In [27]:
# Create the models

lr_d = LogisticRegression()
rf_d = RandomForestClassifier()

# Fit the models

lr_d.fit(X_train_scaled, y_train)
rf_d.fit(X_train_scaled, y_train)

# Make predictions

lr_pred_d = lr_d.predict(X_test_scaled)
rf_pred_d = rf_d.predict(X_test_scaled)

In [28]:
def plot_classification_report(report, title):
    lines = report.split('\n')[2:-5]
    classes = []
    precision = []
    recall = []
    f1_score = []
    support = []
    for line in lines:
        row_data = line.split()
        classes.append(row_data[0])
        precision.append(float(row_data[1]))
        recall.append(float(row_data[2]))
        f1_score.append(float(row_data[3]))
        support.append(int(row_data[4]))

    fig = go.Figure()
    fig.add_trace(go.Bar(x=classes, y=precision, name='Precision', marker_color = colors[0]))
    fig.add_trace(go.Bar(x=classes, y=recall, name='Recall', marker_color = colors[1]))
    fig.add_trace(go.Bar(x=classes, y=f1_score, name='F1-Score', marker_color = colors[2]))

    fig.update_layout(title=title, xaxis_title='Class', yaxis_title='Score', barmode='group', xaxis={'categoryorder': 'total descending'}, plot_bgcolor='white')

    fig.show()

In [29]:
# Classification reports for different algorithms

lr_d_report = classification_report(y_test, lr_pred_d)

rf_d_report = classification_report(y_test, rf_pred_d)


In [30]:
# Plot classification reports

print("="*40, "Logistic regression report:", "="*45, '\n')
print(lr_d_report)
plot_classification_report(lr_d_report, "Logistic Regression Classification Report Visualization")


print("="*40, "Random forest report:", "="*45, '\n')
print(rf_d_report)
plot_classification_report(rf_d_report, "Random Forest Classification Report")


              precision    recall  f1-score   support

           0       0.76      0.73      0.74       201
           1       0.75      0.78      0.76       207

    accuracy                           0.75       408
   macro avg       0.76      0.75      0.75       408
weighted avg       0.76      0.75      0.75       408




              precision    recall  f1-score   support

           0       0.89      0.96      0.92       201
           1       0.96      0.88      0.92       207

    accuracy                           0.92       408
   macro avg       0.92      0.92      0.92       408
weighted avg       0.92      0.92      0.92       408



In [31]:
"""
Calculates the Receiver Operating Characteristic (ROC) curve and Area Under the Curve (AUC) for the Logistic Regression and Random Forest models.

The ROC curve is a graphical representation of the trade-off between the true positive rate (TPR) and false positive rate (FPR) at various decision thresholds. The AUC is a summary metric that represents the overall performance of the model, with a value of 1.0 indicating perfect classification and 0.5 indicating random guessing.

This code generates the ROC curve and calculates the AUC for both the Logistic Regression and Random Forest models, and then plots the ROC curves using Plotly.
"""
"""
Calculates the Receiver Operating Characteristic (ROC) curve and Area Under the Curve (AUC) for the Logistic Regression and Random Forest models.

The ROC curve is a graphical representation of the trade-off between the true positive rate (TPR) and false positive rate (FPR) at various decision thresholds. The AUC is a summary metric that represents the overall performance of the model, with a value of 1.0 indicating perfect classification and 0.5 indicating random guessing.

This code generates the ROC curve and calculates the AUC for both the Logistic Regression and Random Forest models, and then plots the ROC curves using Plotly.
"""
# Calculate ROC and AUC for each model

lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_pred_d)
lr_auc = auc(lr_fpr, lr_tpr)

rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_pred_d)
rf_auc = auc(rf_fpr, rf_tpr)

# Create ROC curve plot using Plotly

fig = go.Figure()
fig.add_trace(go.Scatter(x=lr_fpr, y=lr_tpr, mode='lines', name=f'Logistic Regression (AUC = {lr_auc:.2f})', line=dict(color=colors[1])))
fig.add_trace(go.Scatter(x=rf_fpr, y=rf_tpr, mode='lines', name=f'Random Forest (AUC = {rf_auc:.2f})', line=dict(color=colors[3])))

fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
fig.update_layout(title='Receiver Operating Characteristic (ROC) Curve for Diabetes', xaxis=dict(title='False Positive Rate'), yaxis=dict(title='True Positive Rate'), legend=dict(x=0.7, y=0.2), autosize=False, width=900, height=500, plot_bgcolor='white')
fig.show()