## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name so older installs do not fail with OSError
# on an unknown style.
plot_style = 'seaborn-v0_8-darkgrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-darkgrid'
plt.style.use(plot_style)
sns.set_palette('husl')
# NOTE: this silences every warning for the rest of the session, including
# deprecation warnings from pandas/matplotlib.
warnings.filterwarnings('ignore')

# Display settings: show every column, cap printed rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully!")

## 2. Load Dataset

In [None]:
# Load the dataset.
# The CSV has to be downloaded from Kaggle beforehand, e.g.:
#   kaggle datasets download -d redwankarimsony/heart-disease-data
# NOTE(review): confirm the extracted file is really named 'heart.csv';
# rename the file or adjust the path below if the archive uses another name.

df = pd.read_csv('heart.csv')

# df.shape is (rows, columns); unpack it once for the two count lines.
row_count, col_count = df.shape
print(f"Shape of dataset: {df.shape}")
print(f"\nJumlah baris: {row_count}")
print(f"Jumlah kolom: {col_count}")

## 3. Eksplorasi Data Awal

In [None]:
# Show the first 5 rows. The call is left as a bare trailing expression so
# that a notebook renders the returned frame as the cell output.
print("5 Baris Pertama:")
df.head()

In [None]:
# Dataset overview: print a heading, then the summary written by df.info()
print("\nInformasi Dataset:")
df.info()

In [None]:
# Check for missing values: per-column count and percentage of rows.
print("\nMissing Values:")
missing = df.isnull().sum()
# Reuse the counts instead of scanning the whole frame a second time.
missing_pct = (missing / len(df)) * 100
missing_table = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})

if missing.sum() == 0:
    # Nothing is missing: print the confirmation only, rather than an empty
    # table followed by the message.
    print("\n✓ Tidak ada missing values!")
else:
    # List only the columns that actually contain missing values.
    print(missing_table[missing_table['Missing Count'] > 0])

In [None]:
# Count duplicated rows as flagged by DataFrame.duplicated().
duplicates = df.duplicated().sum()
print(f"\nJumlah baris duplikat: {duplicates}")

# The percentage line is only printed when at least one duplicate exists.
if duplicates > 0:
    duplicate_pct = (duplicates / len(df)) * 100
    print(f"Persentase duplikat: {duplicate_pct:.2f}%")

## 4. Statistik Deskriptif

In [None]:
# Descriptive statistics from df.describe(), transposed (.T) so that each
# described column becomes one row. Left as a bare trailing expression so a
# notebook renders the table as the cell output.
print("Statistik Deskriptif - Variabel Numerik:")
df.describe().T

In [None]:
# Split the columns by dtype: numeric columns vs. object-dtype columns.
numeric_frame = df.select_dtypes(include=np.number)
object_frame = df.select_dtypes(include='object')

numeric_cols = numeric_frame.columns.tolist()
categorical_cols = object_frame.columns.tolist()

print(f"\nKolom Numerik ({len(numeric_cols)}): {numeric_cols}")
print(f"\nKolom Kategorikal ({len(categorical_cols)}): {categorical_cols}")

In [None]:
# Locate the target column by name and show its class distribution.
# A column qualifies when its lower-cased name contains one of the keywords
# below (e.g. 'target', 'output', 'HeartDisease').
target_keywords = ('target', 'output', 'disease', 'diagnosis')
target_cols = [
    name for name in df.columns
    if any(keyword in name.lower() for keyword in target_keywords)
]

if not target_cols:
    print("\nTarget variable tidak teridentifikasi. Menampilkan semua kolom:")
    print(df.columns.tolist())
else:
    # When several columns match, the first one in column order is used.
    target_col = target_cols[0]
    print(f"\nDistribusi Target Variable '{target_col}':")
    print(df[target_col].value_counts())
    print(f"\nPersentase:")
    print(df[target_col].value_counts(normalize=True) * 100)

## 5. Visualisasi Data

In [None]:
# Bar chart of the target class counts (skipped when no target was detected)
if target_cols:
    # Computed once and shared by the bars and their labels.
    class_counts = df[target_col].value_counts()

    plt.figure(figsize=(8, 5))
    class_counts.plot(kind='bar', color=['#FF6B6B', '#4ECDC4'])
    plt.title(f'Distribusi {target_col}', fontsize=14, fontweight='bold')
    plt.xlabel(target_col)
    plt.ylabel('Jumlah')
    plt.xticks(rotation=0)
    plt.grid(axis='y', alpha=0.3)

    # Write each count slightly above its bar
    for position, count in enumerate(class_counts):
        plt.text(position, count + 5, str(count), ha='center', fontweight='bold')

    plt.tight_layout()
    plt.show()

In [None]:
# Histogram of every numeric column on a 3-column grid of subplots
if numeric_cols:
    n_cols = 3
    n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division

    # squeeze=False always returns a 2-D array of Axes, so a single ravel()
    # gives a flat sequence regardless of the number of rows.
    fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
    axes = grid.ravel()

    for ax, col in zip(axes, numeric_cols):
        ax.hist(df[col].dropna(), bins=30, color='skyblue', edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribusi {col}', fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Frekuensi')
        ax.grid(alpha=0.3)

    # Hide the unused cells of the grid
    for ax in axes[len(numeric_cols):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Box plot of every numeric column (outlier inspection) on a 3-column grid
if numeric_cols:
    n_cols = 3
    n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division

    # squeeze=False keeps the Axes container 2-D; ravel() flattens it.
    fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
    axes = grid.ravel()

    for ax, col in zip(axes, numeric_cols):
        ax.boxplot(df[col].dropna(), vert=True)
        ax.set_title(f'Box Plot - {col}', fontweight='bold')
        ax.set_ylabel(col)
        ax.grid(alpha=0.3)

    # Hide the unused cells of the grid
    for ax in axes[len(numeric_cols):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## 6. Analisis Korelasi

In [None]:
# Correlation heatmap over the numeric columns (needs at least two of them)
if len(numeric_cols) > 1:
    plt.figure(figsize=(12, 10))
    correlation_matrix = df[numeric_cols].corr()

    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Correlation Matrix - Variabel Numerik', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

In [None]:
# Correlation of every numeric feature with the target column.
# The target itself must be numeric: df[numeric_cols].corr() only contains
# the numeric columns, so indexing it with a non-numeric target (for example
# a Yes/No label column) would raise KeyError. In that case the cell is
# skipped, like the other target-dependent cells.
if target_cols and target_col in numeric_cols and len(numeric_cols) > 1:
    target_correlation = df[numeric_cols].corr()[target_col].sort_values(ascending=False)
    # Drop the target's correlation with itself (always 1.0) once and reuse it.
    feature_correlation = target_correlation.drop(target_col)

    plt.figure(figsize=(10, 6))
    feature_correlation.plot(kind='barh', color='coral')
    plt.title(f'Korelasi dengan {target_col}', fontsize=14, fontweight='bold')
    plt.xlabel('Korelasi')
    plt.ylabel('Variabel')
    # Reference line separating negative from positive correlation
    plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\nKorelasi dengan target (sorted):")
    print(feature_correlation)

## 7. Analisis Berdasarkan Target (jika ada)

In [None]:
# Overlaid histograms of each numeric feature, one histogram per target value
if target_cols and len(numeric_cols) > 1:
    # Every numeric column except the target itself
    numeric_features = [name for name in numeric_cols if name != target_col]

    if numeric_features:
        n_cols = 3
        n_rows = -(-len(numeric_features) // n_cols)  # ceiling division

        # squeeze=False keeps the Axes container 2-D; ravel() flattens it.
        fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
        axes = grid.ravel()

        for ax, col in zip(axes, numeric_features):
            for target_val in df[target_col].unique():
                in_class = df[target_col] == target_val
                data = df.loc[in_class, col].dropna()
                ax.hist(data, bins=20, alpha=0.6, label=f'{target_col}={target_val}')

            ax.set_title(f'{col} by {target_col}', fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frekuensi')
            ax.legend()
            ax.grid(alpha=0.3)

        # Hide the unused cells of the grid
        for ax in axes[len(numeric_features):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

## 8. Analisis Kategorikal (jika ada)

In [None]:
# Frequency and percentage tables for each categorical column.
# Iterating an empty list is a no-op, so no explicit length check is needed.
for col in categorical_cols:
    value_counts = df[col].value_counts()
    value_percentages = df[col].value_counts(normalize=True) * 100

    print(f"\n{'='*50}")
    print(f"Distribusi {col}:")
    print(f"{'='*50}")
    print(value_counts)
    print(f"\nPersentase:")
    print(value_percentages)

In [None]:
# Bar chart of the value counts of each categorical column, 2-column grid
if categorical_cols:
    n_cols = 2
    n_rows = -(-len(categorical_cols) // n_cols)  # ceiling division

    # squeeze=False keeps the Axes container 2-D; ravel() flattens it.
    fig, grid = plt.subplots(n_rows, n_cols, figsize=(14, n_rows * 5), squeeze=False)
    axes = grid.ravel()

    for ax, col in zip(axes, categorical_cols):
        df[col].value_counts().plot(kind='bar', ax=ax, color='teal', alpha=0.7)
        ax.set_title(f'Distribusi {col}', fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Jumlah')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', alpha=0.3)

    # Hide the unused cells of the grid
    for ax in axes[len(categorical_cols):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## 9. Insight dan Kesimpulan

In [None]:
# Final text summary of the exploration.
print("="*70)
print("RINGKASAN ANALISIS DATA HEART DISEASE")
print("="*70)

# Section 1: dataset-level facts
print(f"\n1. INFORMASI DATASET:")
print(f"   - Total sampel: {len(df)}")
print(f"   - Total fitur: {len(df.columns)}")
print(f"   - Fitur numerik: {len(numeric_cols)}")
print(f"   - Fitur kategorikal: {len(categorical_cols)}")
print(f"   - Missing values: {df.isnull().sum().sum()}")
print(f"   - Data duplikat: {df.duplicated().sum()}")

if target_cols:
    # Section 2: class balance of the detected target column
    print(f"\n2. TARGET VARIABLE ({target_col}):")
    for val, count in df[target_col].value_counts().items():
        pct = (count / len(df)) * 100
        print(f"   - {val}: {count} ({pct:.2f}%)")

    # Section 3: five strongest absolute correlations with the target.
    # Only possible when the target is numeric; df[numeric_cols].corr() has
    # no entry for a non-numeric target and indexing it would raise KeyError.
    if target_col in numeric_cols and len(numeric_cols) > 1:
        print(f"\n3. TOP KORELASI DENGAN TARGET:")
        top_corr = df[numeric_cols].corr()[target_col].drop(target_col).abs().sort_values(ascending=False).head(5)
        for feature, corr in top_corr.items():
            print(f"   - {feature}: {corr:.3f}")

# Section 4: basic statistics for the first five numeric columns
print(f"\n4. STATISTIK FITUR NUMERIK:")
for col in numeric_cols[:5]:
    print(f"   - {col}:")
    print(f"     Mean: {df[col].mean():.2f}, Std: {df[col].std():.2f}")
    print(f"     Min: {df[col].min():.2f}, Max: {df[col].max():.2f}")

print("\n" + "="*70)

## 10. Rekomendasi Langkah Selanjutnya

Berdasarkan analisis di atas, berikut beberapa langkah yang bisa dilakukan:

1. **Data Preprocessing**:
   - Handle missing values (jika ada)
   - Handle outliers
   - Feature scaling/normalization
   - Encoding variabel kategorikal

2. **Feature Engineering**:
   - Buat fitur baru dari kombinasi fitur existing
   - Feature selection berdasarkan korelasi

3. **Modeling**:
   - Logistic Regression
   - Random Forest
   - XGBoost/LightGBM
   - Neural Networks

4. **Evaluation**:
   - Cross-validation
   - Metrics: Accuracy, Precision, Recall, F1-Score, ROC-AUC
   - Confusion Matrix
   - Feature importance analysis