In [None]:
!pip install seaborn

# Preliminary EDA

In [None]:
# Exploratory Data Analysis for “Heart Prediction Quantum Dataset.csv”
# All figures are saved inside the local folder `output/`

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# ------------------------------------------------------------------
# 0. Configuration
# ------------------------------------------------------------------
# Input file and output folder; every artefact produced below is written
# inside OUT_DIR.
CSV_PATH = Path("Heart Prediction Quantum Dataset.csv")
OUT_DIR = Path("output")
OUT_DIR.mkdir(exist_ok=True)  # no error if the folder is already there

# Plot appearance: apply the seaborn theme first, then set the default
# figure size on matplotlib's rcParams.
sns.set(style="whitegrid", font_scale=1.1)
plt.rcParams.update({"figure.figsize": (8, 5)})

# ------------------------------------------------------------------
# 1. Load dataset
# ------------------------------------------------------------------
# Read the whole CSV into memory; all later sections work on this frame.
df = pd.read_csv(CSV_PATH)

# ------------------------------------------------------------------
# 2. Basic inspection
# ------------------------------------------------------------------
head_preview = df.head()
print("\n--- Head --------------------------------------------------------")
print(head_preview, "\n")

# DataFrame.info() prints its report itself, so it is not wrapped in print().
print("\n--- Info --------------------------------------------------------")
df.info()

missing_counts = df.isna().sum()  # number of NaN entries per column
print("\n--- Missing values ---------------------------------------------")
print(missing_counts, "\n")

# ------------------------------------------------------------------
# 3. Descriptive statistics
# ------------------------------------------------------------------
# Transpose describe() so each row is a column of the dataset and each
# column a statistic, then append the inter-quartile range (75% - 25%).
desc = df.describe().T.assign(iqr=lambda stats: stats["75%"] - stats["25%"])

print("\n--- Descriptive statistics -------------------------------------")
print(desc, "\n")

# Keep a copy on disk next to the figures.
desc.to_csv(OUT_DIR / "descriptive_statistics.csv")

# ------------------------------------------------------------------
# 4. Target column
# ------------------------------------------------------------------
if "HeartDisease" in df.columns:
    target = "HeartDisease"
else:
    # Expected label column is absent: fall back to the last column.
    target = df.columns[-1]

# ------------------------------------------------------------------
# 5. Univariate histograms + KDE
# ------------------------------------------------------------------
# Numeric columns drive the plots below; the target is split off so it is
# not drawn as a feature.
num_cols = df.select_dtypes(include=np.number).columns.tolist()
num_cols_no_target = [c for c in num_cols if c != target]

# Grid layout: three histograms per row, as many rows as needed.
cols = 3
rows = int(np.ceil(len(num_cols_no_target) / cols))

# Guard: with no numeric features there is nothing to draw, and
# plt.subplots() rejects a grid with zero rows.
if num_cols_no_target:
    fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 4 * rows))
    axes = axes.ravel()

    # One histogram with a KDE overlay per feature; NaNs are dropped first.
    for ax, col in zip(axes, num_cols_no_target):
        sns.histplot(df[col].dropna(), kde=True, ax=ax, color="steelblue")
        ax.set_title(col)

    # Remove the grid cells that did not receive a plot. Slicing by the
    # number of features avoids depending on a leftover loop index.
    for ax in axes[len(num_cols_no_target):]:
        fig.delaxes(ax)

    fig.tight_layout()
    fig.savefig(OUT_DIR / "histograms.png", dpi=300)
    plt.close(fig)

# ------------------------------------------------------------------
# 6. Boxplots by target
# ------------------------------------------------------------------
# One figure per numeric feature, with the feature's distribution split by
# the value of the target column.
for feature in num_cols_no_target:
    fig, ax = plt.subplots()
    # NOTE(review): newer seaborn releases appear to warn when `palette` is
    # passed without `hue` - confirm against the installed version.
    sns.boxplot(data=df, x=target, y=feature, palette="Set2", ax=ax)
    ax.set_title(f"{feature} by {target}")
    fig.tight_layout()
    out_file = OUT_DIR / f"boxplot_{feature}.png"
    fig.savefig(out_file, dpi=300)
    plt.close(fig)  # release the figure before creating the next one

# ------------------------------------------------------------------
# 7. Correlation heatmap
# ------------------------------------------------------------------
# Pairwise correlations of all numeric columns (DataFrame.corr() with its
# default method); the target is included when it is numeric.
corr = df[num_cols].corr()
# True marks cells to hide: the diagonal and everything above it, so only
# the lower triangle is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data=corr, mask=mask, annot=True, center=0, cmap="vlag", ax=ax)
ax.set_title("Correlation Matrix")
fig.tight_layout()
fig.savefig(OUT_DIR / "correlation_heatmap.png", dpi=300)
plt.close(fig)

# ------------------------------------------------------------------
# 8. Pairplot (comment out if dataset is very large)
# ------------------------------------------------------------------
# pairplot() looks the `hue` variable up in the frame it receives, so the
# target has to be part of the selection. When the target is not numeric
# (e.g. a string label picked by the last-column fallback) it is missing
# from num_cols and must be added explicitly; otherwise the call fails on
# the unknown `hue` column. For a numeric target the selection is unchanged.
pair_cols = num_cols if target in num_cols else num_cols + [target]

pairplot = sns.pairplot(df[pair_cols], hue=target, diag_kind="kde", corner=True)
# y > 1 places the title above the grid instead of over the top panel.
pairplot.fig.suptitle("Pairplot", y=1.02)
pairplot.savefig(OUT_DIR / "pairplot.png", dpi=300)
plt.close("all")

# ------------------------------------------------------------------
# 9. Simple outlier detection (1.5 * IQR rule)
# ------------------------------------------------------------------
# Collect, per numeric feature, the values lying outside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; one report line per feature with outliers.
outlier_lines = []
for col in num_cols_no_target:
    values = df[col]
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Comparisons with NaN are False, so missing values are never flagged.
    outliers = values[(values < low) | (values > high)]
    if not outliers.empty:
        # tolist() yields plain Python scalars that are written in full on
        # a single line. Formatting the NumPy array directly would wrap
        # long output over several lines and abbreviate arrays of more
        # than 1000 elements with "...", silently dropping values.
        outlier_lines.append(f"{col}: {outliers.tolist()}")

# Explicit encoding so column names with non-ASCII characters are written
# the same way regardless of the platform's default locale.
with open(OUT_DIR / "outliers.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(outlier_lines) if outlier_lines else "No outliers found")

print(f"\nEDA complete. All outputs saved to: {OUT_DIR.resolve()}")


# Different Correlation Measures

In [None]:
"""
Complete EDA script for “Heart Prediction Quantum Dataset.csv”
-----------------------------------------------------------------
Adds:  • Pearson, Spearman and Kendall-Tau correlation matrices,
       • Heat-maps saved as images,
       • Automatic comment on which correlation measure is likely
         most appropriate for this dataset (written to `best_corr.txt`).
Images and text outputs are placed inside ./output/
"""

# ─────────────────────────────────────────────────────────────────
# Imports & configuration
# ─────────────────────────────────────────────────────────────────
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Input file and output folder for the correlation heat-maps and summary.
CSV_PATH = Path("Heart Prediction Quantum Dataset.csv")
OUT_DIR = Path("output")
OUT_DIR.mkdir(exist_ok=True)  # no error if the folder is already there

# Apply the seaborn theme first, then set the default figure size.
sns.set(style="whitegrid", font_scale=1.1)
plt.rcParams.update({"figure.figsize": (8, 5)})

# ─────────────────────────────────────────────────────────────────
# Load data
# ─────────────────────────────────────────────────────────────────
df = pd.read_csv(CSV_PATH)

# Label column: "HeartDisease" when present, otherwise the last column.
if "HeartDisease" in df.columns:
    target = "HeartDisease"
else:
    target = df.columns[-1]

# ─────────────────────────────────────────────────────────────────
# Function to compute & save correlation heat-map
# ─────────────────────────────────────────────────────────────────
def save_corr_heatmap(data: pd.DataFrame, method: str, fname: str) -> pd.DataFrame:
    """
    Compute a correlation matrix and save it as a lower-triangular heat-map.

    Only numeric and boolean columns take part in the computation; any
    other column of `data` is ignored.

    Args:
        data:   Frame whose columns are correlated with each other.
        method: Correlation method forwarded to `DataFrame.corr`
                (this script uses "pearson", "spearman" and "kendall").
        fname:  File name of the image; it is created inside `OUT_DIR`.

    Returns:
        The full (unmasked) correlation matrix.
    """
    # Select the usable columns explicitly: since pandas 2.0,
    # DataFrame.corr() no longer drops non-numeric columns on its own and
    # raises when it meets one (e.g. a string label column).
    numeric = data.select_dtypes(include=["number", "bool"])
    corr = numeric.corr(method=method)

    # True marks cells to hide: the diagonal and the upper triangle, which
    # only repeat the information shown in the lower triangle.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, mask=mask, annot=True, cmap="vlag", center=0, ax=ax)
    ax.set_title(f"{method.capitalize()} Correlation Matrix")
    fig.tight_layout()
    fig.savefig(OUT_DIR / fname, dpi=300)
    plt.close(fig)  # free the figure; the function is called repeatedly
    return corr

# ─────────────────────────────────────────────────────────────────
# 1. Pearson – linear relationships
# ─────────────────────────────────────────────────────────────────
# Build the three matrices in one pass, in this order; each call also
# writes its heat-map as "heatmap_<method>.png" into the output folder.
#   pearson  - linear relationships
#   spearman - monotonic (rank) relationships
#   kendall  - concordance measure (robust, good for small N)
pearson_corr, spearman_corr, kendall_corr = (
    save_corr_heatmap(df, method, f"heatmap_{method}.png")
    for method in ("pearson", "spearman", "kendall")
)

# ─────────────────────────────────────────────────────────────────
# 4. Decide which correlation measure is “best”
#    Criteria (rule-of-thumb, can be refined):
#       • Non-normal features or small sample  -> prefer Spearman / Kendall
#       • Many ties (discrete vars)            -> prefer Kendall
#       • Purely linear behaviour & sizeable N -> Pearson
# ─────────────────────────────────────────────────────────────────
n_rows = len(df)

# Rough normality proxy: every numeric column has |skewness| below 1.
skewness = df.skew(numeric_only=True)
approx_normal = (skewness.abs() < 1).all()

# Pearson needs both a sizeable sample and roughly normal columns; in every
# other case Spearman is suggested. Kendall is never selected by these rules.
if n_rows >= 30 and approx_normal:
    best = "Pearson (looks roughly linear & normal enough)"
elif n_rows >= 30:
    best = "Spearman (data not approximately normal)"
else:
    best = "Spearman (small sample; less sensitive to outliers / non-normality)"

# ─────────────────────────────────────────────────────────────────
# 5. Save verdict and correlation-with-target table
# ─────────────────────────────────────────────────────────────────
# Text report: one correlation-with-target table per method, followed by
# the suggested metric.
summary_lines = ["Correlation with target (“{}”)".format(target)]

for label, matrix in (
    ("PEARSON", pearson_corr),
    ("SPEARMAN", spearman_corr),
    ("KENDALL", kendall_corr),
):
    # A non-numeric target is not part of the correlation matrix; report
    # that instead of failing with a KeyError on the column lookup.
    if target in matrix.columns:
        table = matrix[target].to_string()
    else:
        table = "(target column is not part of the correlation matrix)"
    summary_lines.append(f"\n{label}\n{table}")

summary_lines.append("\n>>> Suggested best correlation metric:\n" + best)

# The header contains non-ASCII quotation marks, so the encoding is fixed
# to UTF-8 rather than left to the platform's locale default, which may be
# unable to encode them.
with open(OUT_DIR / "best_corr.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(summary_lines))

print("Correlation heat-maps & summary saved to:", OUT_DIR.resolve())
