# XGBoost Classification Tutorial
**Dataset:** Heart Disease (Cleveland subset)

## Step 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import xgboost as xgb
import warnings
# Silence every Python warning for the rest of the session so library
# notices do not clutter the cell output.
# NOTE(review): a blanket 'ignore' also hides warnings that may point at real
# problems — consider narrowing the filter by category or module.
warnings.filterwarnings('ignore')
# Apply seaborn's "whitegrid" style to all subsequent plots.
sns.set(style='whitegrid')

## Step 2: Load and Preview Dataset

In [None]:
# Read the CSV from the current working directory into a DataFrame and
# preview its first five rows (the notebook renders the last expression).
df = pd.read_csv(filepath_or_buffer="heart.csv")
df.head(n=5)

## Step 3: Exploratory Data Analysis (EDA)

### 3.1 Data Overview

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

### 3.2 Missing Value Check

In [None]:
# Count the missing entries in each column (one total per column).
df.isna().sum()

### 3.3 Target Distribution

In [None]:
# Bar chart of how many rows fall into each value of the raw 'num' column.
# The Axes returned by seaborn is labelled directly instead of going through
# pyplot's implicit "current axes".
count_ax = sns.countplot(data=df, x='num', palette='pastel')
count_ax.set_title("Distribution of Heart Disease Diagnosis")
count_ax.set_xlabel("Target (0 = No Disease, >0 = Disease)")
count_ax.set_ylabel("Count")
plt.show()

### 3.4 Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap.
# 'id' is excluded: the preprocessing step drops it before modelling, so it is
# not a feature and its correlations would only add noise to the chart.
# errors='ignore' keeps this cell re-runnable after 'id' has been dropped.
corr = df.drop(columns=['id'], errors='ignore').corr(numeric_only=True)

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Feature Correlation Heatmap")
plt.show()

### 3.5 Feature Distributions

In [None]:
# Histogram with a KDE overlay for each selected feature, laid out on a grid.
features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']

# Derive the grid from the number of features so the layout keeps working if
# the list above changes (5 features -> 2 rows x 3 columns, 15 x 8 inches).
n_cols = 3
n_rows = -(-len(features) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
panels = axes.ravel()

for ax, col in zip(panels, features):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')

# The grid can have more cells than features; hide the unused panels so no
# empty axes frame is rendered.
for ax in panels[len(features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## Step 4: Preprocessing

In [None]:
# Remove the columns that must not be used as model inputs.
# NOTE(review): 'id' looks like a row identifier and 'dataset' like a
# source label — confirm against the CSV.
df = df.drop(['id', 'dataset'], axis=1)

# Collapse 'num' into a binary target: 1 when num > 0, otherwise 0.
df['target'] = (df['num'] > 0).astype(int)
df = df.drop('num', axis=1)

# Integer-encode every object/bool column. Values are cast to str first, so a
# missing value becomes its own category ('nan') instead of raising.
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col].astype(str))

X = df.drop('target', axis=1)
y = df['target']

# Split BEFORE scaling: fitting the scaler on all rows would let test-set
# statistics leak into the training features. The split depends only on y,
# the row count and random_state, so the same rows land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn mean/std from the training rows only, then apply them to both parts.
# The results are wrapped back into DataFrames so the column names survive and
# downstream plots can show real feature names instead of positional ones.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Whole feature matrix under the train-fitted scaling, kept under its original
# name for any later cell that still refers to it.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

## Step 5: Train XGBoost Classifier

In [None]:
# Fit an XGBoost classifier with default hyperparameters, using log-loss as
# the evaluation metric and a fixed seed for reproducibility.
# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# argument that recent releases no longer recognise, and the target built in
# preprocessing is already encoded as 0/1 integers.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

## Step 6: Evaluation Metrics

In [None]:
# Score the held-out predictions, then report each metric.
accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_mat)
print("Classification Report:\n", report)

## Step 7: Feature Importance Plot

In [None]:
# Horizontal bar chart of the ten features with the highest gain-based
# importance. The Axes returned by XGBoost is titled and laid out directly
# rather than through pyplot's implicit current figure.
importance_ax = xgb.plot_importance(
    model,
    importance_type='gain',
    max_num_features=10,
    height=0.5,
)
importance_ax.set_title("Top 10 Feature Importances (Gain)")
importance_ax.figure.tight_layout()
plt.show()