In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


**1. Perform preliminary data inspection and report the findings as the structure of the data, missing values, duplicates, etc.**

In [None]:
# Load the raw dataset
data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/Healthcare/Data/data.xlsx')

# Load the variable descriptions (data dictionary)
var_desc = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/Healthcare/Data/variable description.xlsx')

# Build a rename mapping from variable name to description (if both columns exist).
# Rows with a missing name or description are skipped so no column is renamed to NaN.
if 'Variable' in var_desc.columns and 'Description' in var_desc.columns:
    described = var_desc.dropna(subset=['Variable', 'Description'])
    rename_map = dict(zip(described['Variable'], described['Description']))
else:
    rename_map = {}

# Work on a renamed COPY so the original `data` stays untouched for the analysis cells below
cleaned_data = data.copy().rename(columns=rename_map)


def _looks_like_target(name):
    """Return True when a column label mentions 'disease' or 'target' (case-insensitive)."""
    label = str(name).lower()
    return 'disease' in label or 'target' in label


# Identify the target column. rename() keeps the column order, so raw and renamed
# labels can be paired by position. A match on the raw name wins; the renamed
# (description) label is only used as a fallback. `target_col` always holds the
# label as it appears in `cleaned_data`, so the lookups below cannot miss.
column_pairs = list(zip(data.columns, cleaned_data.columns))
target_col = next(
    (cleaned_name for raw_name, cleaned_name in column_pairs if _looks_like_target(raw_name)),
    None,
)
if target_col is None:
    target_col = next(
        (cleaned_name for _, cleaned_name in column_pairs if _looks_like_target(cleaned_name)),
        None,
    )

# Convert the target to numeric binary if needed
if target_col is not None:
    yes_no_map = {'yes': 1, 'y': 1, 'no': 0, 'n': 0}

    # Map textual yes/no answers (any casing, surrounding whitespace ignored) to 1/0;
    # every other value is passed through unchanged.
    mapped_target = cleaned_data[target_col].map(
        lambda value: yes_no_map.get(value.strip().lower(), value)
        if isinstance(value, str) else value
    )

    # Convert the MAPPED values (not the raw column) so the yes/no mapping is kept;
    # anything that still cannot be parsed becomes NaN.
    cleaned_data[target_col] = pd.to_numeric(mapped_target, errors='coerce')

    # Human-readable status: 1 -> 'Diseased', any other known value -> 'Healthy'.
    # Missing targets are reported as 'Unknown' instead of being counted as healthy.
    status = pd.Series(
        np.where(cleaned_data[target_col] == 1, 'Diseased', 'Healthy'),
        index=cleaned_data.index,
    )
    cleaned_data['CVD_Status'] = status.where(cleaned_data[target_col].notna(), 'Unknown')


# Store low-cardinality (< 10 distinct values) columns as strings.
# Note: astype(str) also turns missing values into the literal string 'nan'.
categorical_cols = [col for col in cleaned_data.columns if cleaned_data[col].nunique() < 10]
cleaned_data[categorical_cols] = cleaned_data[categorical_cols].astype(str)

# Save the cleaned file for Tableau
output_path = '/content/drive/MyDrive/Colab Notebooks/Healthcare/Data/healthcare_cleaned.csv'
cleaned_data.to_csv(output_path, index=False)

output_path

In [None]:
# Preview rows that exactly repeat an earlier row
data.loc[data.duplicated()]

In [None]:
# Preview the first five rows of the dataset
data.head(n=5)


In [None]:
# Concise summary of the data: column names, non-null counts and dtypes
data.info()

In [None]:
# List the column names of the dataset
data.columns

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# (number of rows, number of columns) of the loaded dataset
data.shape

In [None]:
# List the rows that exactly repeat an earlier row (the first occurrence is not flagged).
# Observation: the dataset has only 1 duplicated row.
data[data.duplicated(keep='first')]


**2. Based on the findings from the previous question, remove duplicates (if any) and treat missing values using an appropriate strategy.**

In [None]:
# Drop exact duplicate rows from the dataset.
# drop_duplicates() returns a NEW frame, so the result must be assigned back;
# otherwise `data` keeps the duplicate row and it leaks into the EDA and the model.
rows_before = len(data)
data = data.drop_duplicates().reset_index(drop=True)

# Report what was removed and how many missing values remain to be treated.
print(f"Removed {rows_before - len(data)} duplicate row(s); {len(data)} rows remain.")
print(f"Missing values remaining: {int(data.isnull().sum().sum())}")



**3. Get a preliminary statistical summary of the data. Explore the measures of central tendencies and the spread of the data overall.**


In [None]:
# Descriptive statistics (central tendency and spread) for every column;
# include='all' also summarises any non-numeric columns, not only the numerical ones
data.describe(include='all')

##  Performing EDA:

**Identify the data variables which might be categorical in nature. Describe and explore these variables using appropriate tools. For example: count plot.**


In [None]:
# A column is treated as categorical when it has fewer than 10 distinct values
# and is not stored as float64 (which filters out continuous measurements).
categorical_cols = []
for col in data.columns:
    has_few_levels = data[col].nunique() < 10
    is_float_column = data[col].dtype == 'float64'
    if has_few_levels and not is_float_column:
        categorical_cols.append(col)

print("Categorical Variables:")
print(categorical_cols)


In [None]:
# One count plot per categorical variable, with bars split by the target class

for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.countplot(data=data, x=col, hue='target', palette="mako_r", ax=ax)
    ax.set_title(f'Count Plot of {col} by Target')
    fig.tight_layout()
    plt.show()


In [None]:
# Number of rows in each target class (class balance)
data['target'].value_counts()

In [None]:
# Bar chart of the class balance of the target variable
sns.countplot(data=data, x="target", palette="mako_r")
plt.show()

**Study the Occurrence of CVD Across Different Ages**

In [None]:


# Cross-tabulate age against the target and draw the counts as grouped bars
age_by_target = pd.crosstab(data['age'], data['target'])
ax = age_by_target.plot(kind="bar", figsize=(20, 6), color=['g', 'r'])
ax.set_title('Heart Disease Distribution by Patient Age')
ax.set_xlabel('Age')
ax.set_ylabel('Counts')
plt.show()

**Detect heart attack based on anomalies in resting blood pressure of the patient?**




In [None]:
# Number of patients at each resting blood pressure value, split by target class
print('Detecting Heart attack based on Resting Blood Pressure')
fig, ax = plt.subplots(figsize=(18, 6))
sns.countplot(x=data['trestbps'], hue=data['target'], palette='bwr', ax=ax)
ax.set_xlabel("Resting Blood Pressure")
plt.show()

**Study the composition of overall patients w.r.t. gender.**





In [None]:

# Number of patients of each sex, split by target class
print('Analysing distribution of heart attack w.r.t gender/sex ')
ax = sns.countplot(x=data['sex'], hue=data['target'], palette='bwr')
ax.set_xlabel("Sex (0 = female, 1= male)")
plt.show()

**Performing EDA and Modelling**

Describe the relationship between cholesterol levels and our target variable.


In [None]:
# Distribution of cholesterol for each target class, drawn as box plots
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=data['target'], y=data['chol'], palette='Set2', ax=ax)
ax.set_xlabel('Heart Disease (0 = No, 1 = Yes)')
ax.set_ylabel('Cholesterol Level (mg/dl)')
ax.set_title('Cholesterol Level vs Heart Disease')
plt.show()

**What can be concluded about the relationship between peak exercising and occurrence of heart attack?**





In [None]:
# Distribution of `oldpeak` for each target class, split by sex, drawn as violins
ax = sns.violinplot(data=data, x="target", y="oldpeak", hue="sex")
ax.set_xlabel('Heart Disease (0 = No, 1 = Yes)')
ax.set_ylabel('Peak Exercise')
ax.set_title('Peak Exercise vs Heart Disease')
plt.show()

**Is thalassemia a major cause of CVD? How are the other factors determining the occurrence of CVD?**




In [None]:
# Mean `thal` value for each target class, split by sex (barplot aggregates with the mean).
# NOTE(review): `thal` looks like a categorical code, so its mean may be hard to
# interpret -- a count plot of `thal` by target could be clearer; confirm with the data dictionary.
ax = sns.barplot(x='target', y='thal', data=data,
                 hue='sex')
# Label the figure and render it explicitly, like the other plotting cells,
# so it can be read on its own and no Axes repr is left in the cell output.
ax.set_xlabel('Heart Disease (0 = No, 1 = Yes)')
ax.set_ylabel('Mean thal value')
ax.set_title('Thalassemia (thal) vs Heart Disease')
plt.show()

Use a pair plot to understand the relationship between all the given variables.




In [None]:
# Pairwise relationships between all columns of the dataset.
sns.pairplot(data)
# Render explicitly, like the other plotting cells, so the PairGrid repr is not printed.
plt.show()

**5. Perform logistic regression, predict the outcome for test data, and validate the results by using the confusion matrix.**






In [None]:
# Correlation of every column with the target, followed by the full correlation heatmap
corr_matrix = data.corr()
print(corr_matrix['target'])
sns.heatmap(corr_matrix)
plt.show()

In [None]:
# Annotated correlation heatmap on a larger canvas
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Prepare the modelling inputs:
# `y` is the target column and `X` holds every other column as features
y = data['target']
X = data.drop(columns='target')

In [None]:
# Train/test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Standardise the features with sklearn.preprocessing.StandardScaler

from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only,
# then apply the same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [None]:
# Train a logistic regression classifier on the scaled training split
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)


In [None]:
# Predict the class of every row in the scaled test split
y_pred = model.predict(X=X_test_scaled)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Compute every evaluation result first ...
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# ... then report them: confusion matrix, per-class metrics, overall accuracy
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)
print("Accuracy:", accuracy)


In [None]:

# Draw the confusion matrix as an annotated heatmap (rows = actual, columns = predicted)
class_labels = ['No CVD', 'CVD']
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()
