# Loading Dataset

In [None]:
from google.colab import files
import pandas as pd

# Open the Colab upload dialog; the result maps each uploaded filename to its bytes.
uploaded = files.upload()

# A single Excel file is expected. If several are uploaded, each one is read
# in turn and `df` ends up holding the last one.
for filename in uploaded:
    df = pd.read_excel(filename)


# Exploratory Data Analysis (EDA)
***This section has six steps (EDA.01–EDA.06).***

**EDA.01**

In [None]:
# 📊 EDA step 1: high-level overview of the dataset held in `df`.

# 1. Shape (row and column counts)
print("🔹 Dataset Shape:")
print(f"Rows: {df.shape[0]}, columns: {df.shape[1]}\n")

# 2. Column names with their pandas dtypes
print("🔹 Column Names and Data Types:")
print(df.dtypes)
print("\n")

# 3. Missing values: report only the columns that contain at least one null
print("🔹 Missing Values per Column:")
null_counts = df.isnull().sum()
missing = null_counts[null_counts > 0]
if missing.empty:
    print("No missing values found.")
else:
    print(missing)
print("\n")

# 4. Basic statistics: one row per numerical feature, restricted to the
#    four summary columns of interest
print("🔹 Summary Statistics for Numerical Features:")
summary_columns = ['mean', 'std', 'min', 'max']
per_feature_stats = df.describe().T
print(per_feature_stats[summary_columns])

**EDA.02**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Name of the target column analysed in this step
target = 'num'

# Absolute number of rows per target class
class_counts = df[target].value_counts()
print("🔹 Target Value Counts:")
print(class_counts)

# Share of rows per target class (fractions summing to 1)
class_proportions = df[target].value_counts(normalize=True)
print("\n🔹 Target Class Proportions:")
print(class_proportions)

# Bar chart of the class counts, drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x=target, palette='coolwarm', ax=ax)
ax.set_title("Distribution of Target Variable: Heart Disease Severity")
ax.set_xlabel("Heart Disease Class (num)")
ax.set_ylabel("Count")
plt.xticks(rotation=0)
fig.tight_layout()
plt.show()

**EDA.03**

In [None]:
# Identify Feature Types ####################

# Work on the predictors only; 'num' is the target column.
features = df.drop(columns=['num'])

# Split predictors by dtype: 64-bit numeric columns vs. object (string) columns
numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = features.select_dtypes(include='object').columns

print("🔹 Numerical Features:", list(numerical_cols))
print("🔹 Categorical Features:", list(categorical_cols))


# Plot Numerical Features ####################

import math

import matplotlib.pyplot as plt
import seaborn as sns

# Histograms: one panel per numerical feature
features[numerical_cols].hist(figsize=(15, 10), bins=20, color='skyblue', edgecolor='black')
plt.suptitle("Histograms of Numerical Features", fontsize=16)
plt.tight_layout()
plt.show()

# Boxplots: one panel per numerical feature, five panels per row.
# The row count grows with the number of features so that more than 15
# numerical columns no longer overflow a fixed 3x5 grid (which raises a
# ValueError in plt.subplot). With 15 or fewer features the layout is the
# same 3x5 grid on a 15x8 figure as before.
boxplots_per_row = 5
n_rows = max(3, math.ceil(len(numerical_cols) / boxplots_per_row))
plt.figure(figsize=(15, 8 * n_rows / 3))
for i, col in enumerate(numerical_cols, 1):
    plt.subplot(n_rows, boxplots_per_row, i)
    sns.boxplot(y=features[col], color='lightgreen')
    plt.title(col)
plt.tight_layout()
plt.show()




# Plot Categorical Features ####################

# For every categorical feature: print its frequency table, then draw a
# bar plot of the same counts in its own figure.
for col in categorical_cols:
    print(f"\n🔹 Frequency Table for {col}")
    print(features[col].value_counts())

    plt.figure(figsize=(6, 4))
    sns.countplot(data=features, x=col, palette='pastel')
    plt.title(f"Bar Plot of {col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


**EDA.04**

In [None]:
# Identify Feature Types ####################
# Numerical predictors (target 'num' removed) and object-typed predictors
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('num')
categorical_cols = df.select_dtypes(include='object').columns


# Numerical vs Target – Boxplots & T-tests ####################
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns

for col in numerical_cols:
    # Distribution of the feature within each target class
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=df, x='num', y=col, palette='coolwarm')
    plt.title(f"{col} vs num")
    plt.tight_layout()
    plt.show()

    # Two-sample t-test comparing rows with num == 0 against all rows with
    # num != 0; NaNs are dropped from each group by nan_policy='omit'.
    group0 = df[df['num'] == 0][col]
    group1 = df[df['num'] != 0][col]
    t_stat, p_val = ttest_ind(group0, group1, nan_policy='omit')
    print(f"T-test for {col}: t-stat = {t_stat:.2f}, p-value = {p_val:.4f}")


# Categorical vs Target – Stacked Bar Plots ####################
# This loop runs once, after the numerical loop, rather than inside it:
# nesting it there redrew every categorical chart for each numerical
# feature and reused the `col` loop variable. Title, label and show() are
# issued per chart so that each figure is labelled and displayed.
for col in categorical_cols:
    # Row-normalised crosstab: share of each target class within a category
    ct = pd.crosstab(df[col], df['num'], normalize='index')
    ct.plot(kind='bar', stacked=True, figsize=(6, 4), colormap='Set2')
    plt.title(f"{col} vs num")
    plt.ylabel("Proportion")
    plt.tight_layout()
    plt.show()

**EDA.05**

In [None]:
# Correlation Matrix ####################
import matplotlib.pyplot as plt
import seaborn as sns

# Select numerical columns including target
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Pairwise correlation of all numerical columns, shown as an annotated heatmap
plt.figure(figsize=(12, 10))
corr = df[numerical_cols].corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title("Correlation Matrix of Numerical Features")
plt.tight_layout()
plt.show()



# Pairplot (Optional but Insightful) ####################
# Use the first five numerical features (column order, not a ranking) plus
# the target. The target is excluded before slicing so that 'num' can never
# appear twice in `selected`; a duplicated column would break hue='num'.
feature_cols = [c for c in numerical_cols if c != 'num']
selected = feature_cols[:5] + ['num']
sns.pairplot(df[selected], hue='num', palette='husl')
plt.suptitle("Pairplot of Selected Features", y=1.02)
plt.show()



**EDA.06**

In [None]:
# Skewness of Numerical Features ####################
from scipy.stats import skew

# Numerical predictors only; the target column 'num' is excluded
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('num')

print("🔹 Skewness of Numerical Features:")
for col in numerical_cols:
    # Nulls are removed before computing the skewness
    non_null_values = df[col].dropna()
    sk = skew(non_null_values)
    print(f"{col}: skewness = {sk:.2f}")


# Outlier Detection Using IQR ####################
print("\n🔹 Outlier Counts (IQR Method):")
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    # Rows strictly outside the 1.5 * IQR fences count as outliers
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    is_outlier = df[col].lt(lower_fence) | df[col].gt(upper_fence)
    outliers = df[is_outlier]
    print(f"{col}: {len(outliers)} outliers")


# Spot Invalid Values ####################
# Count negative entries in two columns, reported as invalid-value checks
print("\n🔹 Invalid Value Checks:")
print("Negative values in 'chol':", df['chol'].lt(0).sum())
print("Negative values in 'trestbps':", df['trestbps'].lt(0).sum())




# Preprocessing

In [None]:
# Step 1: Check for missing values and impute them when present
missing = df.isnull().sum()
missing = missing[missing > 0]

if not missing.empty:
    print("🔹 Missing Values Found:")
    print(missing)

    # Imputation belongs in this branch: it previously sat under `else`,
    # so it only ran when there was nothing to fill.
    # Numerical columns: fill nulls with the column median.
    df.fillna(df.median(numeric_only=True), inplace=True)

    # Object (string) columns: fill nulls with the most frequent value.
    for col in df.select_dtypes(include='object').columns:
        modes = df[col].mode()
        # mode() is empty when the column is entirely null; leave such a
        # column untouched instead of failing on an empty lookup.
        if not modes.empty:
            # Assign back rather than calling fillna(inplace=True) on the
            # column selection, which does not reliably update `df`.
            df[col] = df[col].fillna(modes.iloc[0])
else:
    print("✅ No missing values detected.")