In [None]:
# ==========================================
# CHEAT SHEET: END-TO-END CLASSIFICATION
# ==========================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:

# 1. LOAD DATA
df = pd.read_csv('data.csv')

# 2. DATA CLEANING (WRANGLING)
# ------------------------------------------
# A. Numeric columns: replace missing values with that column's median.
# NOTE(review): the medians (and the mode below) are computed on the full
# dataset, i.e. before the train/test split done in the modeling cell.
for numeric_col in ('Age', 'Fare'):
    column_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(column_median)

# B. Categorical column: replace missing values with the mode.
# .mode() returns a Series of modal values; [0] takes its first entry.
embarked_modes = df['Embarked'].mode()
mode_embarked = embarked_modes[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

In [None]:

# 3. FEATURE ENGINEERING (DERIVING NEW COLUMNS)
# ------------------------------------------
# A. Extract the title from Name: the run of letters that follows a space and
#    is terminated by a period. The pattern is a raw string so that `\.` reaches
#    the regex engine as an escaped dot rather than being an invalid Python
#    string escape (which triggers a warning on recent Python versions).
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Group the titles into numeric codes.
title_mapping = {
    "Mr": 0,
    "Miss": 1, "Mlle": 1, "Ms": 1, "Mme": 1,
    "Mrs": 2,
    "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Countess": 3,
    "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona": 3, "Capt": 3, "Sir": 3,
}

# Titles absent from the mapping (and rows where no title was extracted) come
# out of .map() as NaN; they are filled with 0, which is the same code as "Mr".
# The result is cast to int so the column is integer-coded like the other
# encoded features instead of being left as float by the NaN round-trip.
df['Title_Code'] = df['Title'].map(title_mapping).fillna(0).astype(int)

# B. Family size & IsAlone
# FamilySize counts siblings/spouses + parents/children + the passenger.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# IsAlone is 1 when the passenger is the only member of the family, else 0.
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

In [None]:

# 4. PRE-PROCESSING (ENCODING & SELECTION)
# ------------------------------------------
# A. Encoding (text -> integer codes)
# Each mapping is applied only while the column is still non-numeric. Without
# this guard, re-running the cell would map the already-encoded integers to
# NaN and the following .astype(int) would raise.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
if not pd.api.types.is_numeric_dtype(df['Embarked']):
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# B. Feature selection (drop columns that are not used as model inputs)
# Keep the target (y) aside first; this assumes the frame is training data
# that contains a 'Survived' column.
target = df['Survived']
features = df.drop(
    columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin', 'Title']
)

In [None]:
# 5. MODELING (TRAINING & EVALUATION)
# ------------------------------------------
# A. Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# B. Fit a 100-tree random forest on the training portion
#    (.fit() returns the fitted estimator itself)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# C. Evaluate: accuracy on the held-out rows, reported as a percentage
prediksi = model.predict(X_test)
akurasi = accuracy_score(y_test, prediksi)
print(f"Akurasi: {akurasi * 100:.2f}%")

# D. Inspect feature importances, largest first, labelled by feature name
feature_importance = pd.Series(
    data=model.feature_importances_,
    index=features.columns,
)
print(feature_importance.sort_values(ascending=False))