<a href="https://colab.research.google.com/github/bjungweapon/mjc.ai.ml/blob/BDU/BDU.animation.8.3.SVM.titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import HTML, display
from matplotlib.animation import FuncAnimation
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Preprocessing: drop columns that are not used as features and fill missing
# values (median for Age and Fare, most frequent value for Embarked)
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Encode categorical columns: Sex as 0/1, Embarked as one-hot dummy columns
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat([df, embarked_dummies], axis=1)
df = df.drop('Embarked', axis=1)

# Separate the target variable from the features
X = df.drop('Survived', axis=1)
y = df['Survived']

# Keep only two features (Age and Fare) so the model can be drawn in 2-D
X_selected = X[['Age', 'Fare']]

# Split BEFORE scaling: fitting the scaler on every row would let the mean and
# variance of the held-out rows influence the training features (data leakage).
# The same test_size / random_state keep the row partition unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training statistics; kept so the
# X_scaled name remains available to later code
X_scaled = scaler.transform(X_selected)


# Factory for an SVM classifier fitted on the training split
def create_svm_model(C_value, kernel_type='linear'):
    """Build an ``SVC`` and fit it on the module-level ``X_train`` / ``y_train``.

    Args:
        C_value: Value passed to ``SVC`` as ``C``.
        kernel_type: Value passed to ``SVC`` as ``kernel``.

    Returns:
        The fitted ``SVC`` instance.
    """
    classifier = SVC(kernel=kernel_type, C=C_value, gamma='auto')
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(X_train, y_train)


# Decision-boundary plotting helper
def plot_decision_boundary(ax, model, X, y, h=0.02):
    """Draw a fitted classifier's decision regions, the samples and support vectors.

    Args:
        ax: Matplotlib axes to draw on.
        model: Fitted classifier; ``predict`` and ``C`` are used, and
            ``support_vectors_`` is drawn when the attribute exists.
        X: Two-column feature array, indexed as ``X[:, 0]`` and ``X[:, 1]``.
        y: Labels aligned with the rows of ``X``; compared against 0 and 1.
        h: Step size of the prediction mesh.
    """
    # Mesh grid covering the data with a margin of 1 on every side
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Predicted class at every mesh point, reshaped back onto the grid
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Filled regions: light blue for class 0 and light red for class 1, so each
    # region uses the same hue as the points of that class drawn below.
    # Explicit levels give exactly one band per class label (0 and 1).
    ax.contourf(xx, yy, Z, levels=[-0.5, 0.5, 1.5], alpha=0.3,
                cmap=ListedColormap(['#AAAAFF', '#FFAAAA']))
    # A single level halfway between the two labels draws the boundary once
    # instead of stacking several automatically chosen levels on the same line.
    ax.contour(xx, yy, Z, levels=[0.5], colors='k', linestyles=['-'], linewidths=2)

    # Samples: class 0 as blue crosses, class 1 as red circles
    for label_idx, marker, color in zip([0, 1], ['x', 'o'], ['blue', 'red']):
        ax.scatter(
            X[y == label_idx, 0], X[y == label_idx, 1],
            c=color, marker=marker, s=50, alpha=0.7,
            label='Deceased' if label_idx == 0 else 'Survived'
        )

    # Ring the support vectors when the model exposes them
    if hasattr(model, 'support_vectors_'):
        ax.scatter(
            model.support_vectors_[:, 0], model.support_vectors_[:, 1],
            s=100, linewidth=1, facecolors='none', edgecolors='green',
            label='SUPPORT VECTOR'
        )

    # Axes limits, labels, title and legend
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel('Age (scaled)')
    ax.set_ylabel('Fare (scaled)')
    ax.set_title(f'SVM (C={model.C})')
    ax.legend(loc='upper right')


# Animation of how the SVM model changes with the value of C
fig, ax = plt.subplots(figsize=(10, 8))
# fig is the figure just created, so adjust it directly
fig.subplots_adjust(bottom=0.2)

# C values swept by the animation: 20 log-spaced points from 1e-2 to 1e2
C_values = np.logspace(start=-2, stop=2, num=20)

# Initial (empty) text object in the lower-left corner of the axes
text = ax.text(
    0.02, 0.02, '',
    transform=ax.transAxes,
    fontsize=12,
    bbox={'facecolor': 'white', 'alpha': 0.8},
)

# Animation callback for the C sweep
def update(frame):
    """Redraw ``ax`` for one frame of the C sweep.

    Fits a model with ``C_values[frame]`` via ``create_svm_model``, draws its
    decision regions on the training data, and overlays the train/test accuracy
    and the number of support vectors.

    Args:
        frame: Index into ``C_values``.

    Returns:
        Tuple of ``ax`` and the text artist holding the metrics box.
    """
    ax.clear()
    C = C_values[frame]

    # Fit an SVM with the current C value
    model = create_svm_model(C)

    # Draw the decision regions and the training samples
    plot_decision_boundary(ax, model, X_train, y_train)

    # Accuracy on both splits
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))

    # Metrics box in the lower-left corner
    info_text = (
        f'C: {C:.2f}\n'
        f'Training accuracy: {train_acc:.2f}\n'
        f'Test accuracy: {test_acc:.2f}\n'
        f'Number of support vectors: {len(model.support_vectors_)}'
    )
    info_box = ax.text(0.02, 0.02, info_text, transform=ax.transAxes, fontsize=12,
                       bbox=dict(facecolor='white', alpha=0.8))

    # Margin hint. Regularization strength is inversely proportional to C, so
    # a small C means STRONG regularization and a large C means WEAK
    # regularization.
    title_text = f'SVM Decision Boundary (C={C:.2f})'
    if C < 0.1:
        title_text += " - Large margin, strong regularization (underfitting risk)"
    elif C > 10:
        title_text += " - Small margin, weak regularization (overfitting risk)"
    else:
        title_text += " - Balanced margin"

    # Replaces the shorter title set by plot_decision_boundary
    ax.set_title(title_text)

    return ax, info_box

# Build the C-sweep animation
ani = FuncAnimation(fig, update, frames=len(C_values), interval=500, blit=False)

# Save a standalone HTML copy, then embed the animation in the notebook output
html_animation = animation.HTMLWriter(fps=2)
ani.save('svm_animation.html', writer=html_animation)
# A bare HTML(...) expression is only rendered when it is the last statement
# of a cell; display() renders it wherever this statement sits.
display(HTML(ani.to_jshtml()))

# SVM animation across different kernels
fig2, ax2 = plt.subplots(figsize=(10, 8))
plt.subplots_adjust(bottom=0.2)

# Kernels to compare and their display titles (parallel lists)
kernels = ['linear', 'poly', 'rbf']
titles = ['Linear Kernel', 'Polynomial Kernel', 'RBF Kernel']

# Initial (empty) text object
text2 = ax2.text(0.02, 0.02, '', transform=ax2.transAxes, fontsize=12,
                bbox=dict(facecolor='white', alpha=0.8))

# Kernel animation callback
def update_kernel(frame):
    """Redraw ``ax2`` for one frame, cycling through the ``kernels`` list.

    Fits an ``SVC`` with ``C=1.0`` and the kernel selected by ``frame`` on the
    training split, draws its decision regions, and overlays train/test
    accuracy and the support-vector count.

    Args:
        frame: Frame number; taken modulo ``len(kernels)`` to pick the kernel.

    Returns:
        Tuple of ``ax2`` and the text artist holding the metrics box.
    """
    ax2.clear()

    # Frame numbers wrap around the kernel list
    position = frame % len(kernels)
    kernel, title = kernels[position], titles[position]

    # Fixed C so that only the kernel differs between frames
    model = SVC(kernel=kernel, C=1.0, gamma='auto')
    model.fit(X_train, y_train)

    # Decision regions and training samples
    plot_decision_boundary(ax2, model, X_train, y_train)

    # Accuracy on both splits
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))

    # Metrics box, assembled piece by piece
    pieces = [
        f'Kernel: {title}\n',
        f'Training accuracy: {train_acc:.2f}\n',
        f'Test accuracy: {test_acc:.2f}\n',
    ]
    if hasattr(model, 'support_vectors_'):
        pieces.append(f'Number of support vectors: {len(model.support_vectors_)}')
    info_box = ax2.text(
        0.02, 0.02, ''.join(pieces),
        transform=ax2.transAxes,
        fontsize=12,
        bbox={'facecolor': 'white', 'alpha': 0.8},
    )

    # One-line description per kernel; empty for a kernel without an entry
    explanations = {
        'linear': "Linear Kernel: Straight decision boundary suitable for simple classification problems",
        'poly': "Polynomial Kernel: Non-linear decision boundary capable of capturing complex patterns",
        'rbf': "RBF Kernel: Highly flexible decision boundary suitable for complex data",
    }
    kernel_explanation = explanations.get(kernel, "")

    ax2.set_title(f'SVM {title} (C=1.0)\n{kernel_explanation}')

    return ax2, info_box

# Build the kernel animation
ani2 = FuncAnimation(fig2, update_kernel, frames=9, interval=1000, blit=False)

# Save a standalone HTML copy, then embed the animation in the notebook output
html_animation2 = animation.HTMLWriter(fps=1)
ani2.save('svm_kernel_animation.html', writer=html_animation2)
# A bare HTML(...) expression is only rendered when it is the last statement
# of a cell; display() renders it wherever this statement sits.
display(HTML(ani2.to_jshtml()))

# Extra: SVM on a PCA projection of a larger feature set
# More features are included and reduced to two dimensions for plotting
numerical_cols = ['Age', 'Fare', 'Sex', 'Pclass', 'SibSp', 'Parch']
X_more = X[numerical_cols]

# Split BEFORE scaling and PCA so that neither transform is fitted on the
# held-out rows (data leakage). The same test_size / random_state keep the
# row partition unchanged.
X_more_train, X_more_test, y_train_pca, y_test_pca = train_test_split(
    X_more, y, test_size=0.2, random_state=42
)

# Dedicated scaler: refitting the shared `scaler` here would overwrite the
# statistics it holds from its earlier fit
pca_scaler = StandardScaler()
pca = PCA(n_components=2)

# Fit scaler and PCA on the training rows only, then apply them to the test rows
X_pca_train = pca.fit_transform(pca_scaler.fit_transform(X_more_train))
X_pca_test = pca.transform(pca_scaler.transform(X_more_test))

# Full-data versions built with the training-fitted transforms; kept so the
# X_more_scaled and X_pca names remain available to later code
X_more_scaled = pca_scaler.transform(X_more)
X_pca = pca.transform(X_more_scaled)

# Figure for the SVM animation on the PCA data
fig3, ax3 = plt.subplots(figsize=(10, 8))
plt.subplots_adjust(bottom=0.2)

# Initial (empty) text object
text3 = ax3.text(0.02, 0.02, '', transform=ax3.transAxes, fontsize=12,
                bbox=dict(facecolor='white', alpha=0.8))

# PCA animation callback
def update_pca(frame):
    """Redraw ``ax3`` for one frame of the C sweep on the PCA-projected data.

    Fits an RBF-kernel ``SVC`` with ``C_values[frame]`` on the PCA training
    split, draws its decision regions, and overlays train/test accuracy and
    the support-vector count.

    Args:
        frame: Index into ``C_values``.

    Returns:
        Tuple of ``ax3`` and the text artist holding the metrics box.
    """
    ax3.clear()
    C = C_values[frame]

    # Fit an RBF SVM with the current C value
    model = SVC(kernel='rbf', C=C, gamma='auto')
    model.fit(X_pca_train, y_train_pca)

    # Decision regions and training samples
    plot_decision_boundary(ax3, model, X_pca_train, y_train_pca)

    # Accuracy on both splits
    train_acc = accuracy_score(y_train_pca, model.predict(X_pca_train))
    test_acc = accuracy_score(y_test_pca, model.predict(X_pca_test))

    # Metrics box, assembled piece by piece
    pieces = [
        f'C: {C:.2f} (PCA applied data)\n',
        f'Training accuracy: {train_acc:.2f}\n',
        f'Test accuracy: {test_acc:.2f}\n',
    ]
    if hasattr(model, 'support_vectors_'):
        pieces.append(f'Number of support vectors: {len(model.support_vectors_)}')
    info_box = ax3.text(
        0.02, 0.02, ''.join(pieces),
        transform=ax3.transAxes,
        fontsize=12,
        bbox={'facecolor': 'white', 'alpha': 0.8},
    )

    # Replace the Age/Fare labels and title set by plot_decision_boundary
    ax3.set_xlabel('First Principal Component')
    ax3.set_ylabel('Second Principal Component')
    ax3.set_title(f'SVM on PCA Reduced Data (C={C:.2f})')

    return ax3, info_box

# Build the PCA animation
ani3 = FuncAnimation(fig3, update_pca, frames=len(C_values), interval=500, blit=False)

# Save a standalone HTML copy, then embed the animation in the notebook output
html_animation3 = animation.HTMLWriter(fps=2)
ani3.save('svm_pca_animation.html', writer=html_animation3)
# A bare HTML(...) expression is only rendered when it is the last statement
# of a cell; display() renders it wherever this statement sits.
display(HTML(ani3.to_jshtml()))

# Extra: animation illustrating SVM hyperparameter tuning
fig4, ax4 = plt.subplots(figsize=(12, 10))
plt.subplots_adjust(bottom=0.15)

# C and gamma are varied together: 5 log-spaced values each
C_values_tune = np.logspace(-2, 2, 5)
gamma_values = np.logspace(-3, 1, 5)

# Every (C, gamma) combination, with gamma varying fastest
param_combinations = [(c, g) for c in C_values_tune for g in gamma_values]

# Initial (empty) text object
text4 = ax4.text(0.02, 0.02, '', transform=ax4.transAxes, fontsize=12,
                bbox=dict(facecolor='white', alpha=0.8))

# Hyperparameter-tuning animation callback
def update_hyperparams(frame):
    """Redraw ``ax4`` for one (C, gamma) combination.

    Fits an RBF-kernel ``SVC`` with the pair selected by ``frame`` on the
    training split, draws its decision regions, and overlays train/test
    accuracy and the support-vector count.

    Args:
        frame: Frame number; taken modulo ``len(param_combinations)``.

    Returns:
        Tuple of ``ax4`` and the text artist holding the metrics box.
    """
    ax4.clear()
    C, gamma = param_combinations[frame % len(param_combinations)]

    # Fit an RBF SVM with the current parameters
    model = SVC(C=C, kernel='rbf', gamma=gamma)
    model.fit(X_train, y_train)

    # Decision regions and training samples
    plot_decision_boundary(ax4, model, X_train, y_train)

    # Accuracy on both splits
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))

    # Metrics box in the lower-left corner
    info_text = f'C: {C:.2f}, gamma: {gamma:.4f}\n'
    info_text += f'Training accuracy: {train_acc:.2f}\n'
    info_text += f'Test accuracy: {test_acc:.2f}\n'

    if hasattr(model, 'support_vectors_'):
        info_text += f'Number of support vectors: {len(model.support_vectors_)}'

    info_box = ax4.text(0.02, 0.02, info_text, transform=ax4.transAxes, fontsize=12,
                        bbox=dict(facecolor='white', alpha=0.8))

    # Collect the hints that apply and join them afterwards, so the " / "
    # separator only appears between two hints and never dangles at the end
    # when C is extreme but gamma is in its middle range.
    hints = []
    if C < 0.1:
        hints.append("Low C: Large margin, better generalization")
    elif C > 10:
        hints.append("High C: Small margin, focus on training data")

    if gamma < 0.01:
        hints.append("Low gamma: Wider influence range, smoother boundary")
    elif gamma > 1:
        hints.append("High gamma: Narrower influence range, complex boundary")

    param_explanation = " / ".join(hints)

    ax4.set_title(f'SVM Hyperparameter Tuning (C={C:.2f}, gamma={gamma:.4f})\n{param_explanation}')

    return ax4, info_box

# Build the hyperparameter-tuning animation
ani4 = FuncAnimation(fig4, update_hyperparams, frames=len(param_combinations), interval=500, blit=False)

# Save a standalone HTML copy, then embed the animation in the notebook output
html_animation4 = animation.HTMLWriter(fps=2)
ani4.save('svm_hyperparams_animation.html', writer=html_animation4)
# This statement is followed by print(), so a bare HTML(...) expression would
# be discarded; display() renders it explicitly.
display(HTML(ani4.to_jshtml()))

print("Animation creation complete! Please check the animations above.")
