# Support Vector Machine (SVM) and XGBoost

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from evaluation import *
from feature_scaler import *
from xgboost import XGBClassifier

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

SVMs can handle non-linear data by mapping it into a higher-dimensional space through the kernel trick. This transformation enables SVMs to find hyperplanes in the new space, effectively handling complex decision boundaries.

Here are common types of kernels used in Support Vector Machines (SVMs), each explained in one line:

1. **Linear Kernel**: It creates a linear decision boundary and works well for linearly separable data.
2. **Polynomial Kernel**: It allows for non-linear decision boundaries and is useful when data has polynomial relationships.
3. **Radial Basis Function (RBF) Kernel**: It is versatile and suitable for various data types, offering non-linear decision boundaries based on the similarity to data points.
4. **Sigmoid Kernel**: It can model sigmoidal decision boundaries and is often used in binary classification problems.
5. **Custom Kernels**: Kernels can be customized to match the specific characteristics of your data, providing flexibility for unique scenarios.

## Data Pre-processing

In [2]:
df = pd.read_csv('./data/breast-cancer.csv')

encoder = LabelEncoder()
df['diagnosis'] = encoder.fit_transform(df['diagnosis'])
df = df.drop(columns=["id"], axis=1)

In [3]:
# Data split and scaling
X = df.drop(columns=["diagnosis"])
y = df['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_ss, X_test_ss = standard_scaler(X_train, X_test) # Standard Scaler
X_train_mms, X_test_mms = min_max_scaler(X_train, X_test) # Min-Max Scaler

### Removing features when the correlation between them exceeds a certain threshold

In [4]:
X_corr = X.corr(numeric_only=True).round(3)
correlation_threshold = 0.9
upper_matrix = X_corr.where(np.triu(np.ones(X_corr.shape), k=1).astype(bool))
features_to_drop = [x for x in upper_matrix.columns if any(upper_matrix[x] > correlation_threshold)]
X_corr = X.drop(X[features_to_drop], axis=1)

X_corr_train, X_corr_test = train_test_split(X_corr, test_size=0.2, random_state=42)
X_corr_train_ss, X_corr_test_ss = standard_scaler(X_corr_train, X_corr_test) # Standard Scaler
X_corr_train_mms, X_corr_test_mms = min_max_scaler(X_corr_train, X_corr_test) # Min-Max Scaler

### XGBoost

In [8]:
score = []

# With Feature Scaling
xgb_classifier_scaled = XGBClassifier()
xgb_classifier_scaled.fit(X_train_ss, y_train)
y_pred = xgb_classifier_scaled.predict(X_test_ss)
score.append(evaluate(y_test, y_pred))

xgb_classifier_scaled = XGBClassifier()
xgb_classifier_scaled.fit(X_corr_train_ss, y_train)
y_pred = xgb_classifier_scaled.predict(X_corr_test_ss)
score.append(evaluate(y_test, y_pred))

In [9]:
tmp_pca = [0,0,0,0,0,0]
tmp_pca2 = [0,0,0,0,0,0]

import warnings
for n in range(2, 20):
    pca = PCA(n_components=n)
    pca.fit(X_train)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_train_ss)
    X_test_pca = pca.transform(X_test_ss)

    model = XGBClassifier()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca[0] < tmp[0]:
        tmp_pca = tmp

    pca = PCA(n_components=n)
    pca.fit(X_corr_train_ss)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_corr_train_ss)
    X_test_pca = pca.transform(X_corr_test_ss)

    model = XGBClassifier()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca2[0] < tmp[0]:
        tmp_pca2 = tmp

score.append(tmp_pca)
score.append(tmp_pca2)

In [10]:
pd.DataFrame(score,
    index=['Standard Scaler', 'Standard Scaler Corr','PCA Standard Scaler', 'PCA Standard Scaler Corr'], 
    columns=['Accuracy', 'F1', 'Recall', 'Precision', 'AUC', 'Confusion Matrix'])

Unnamed: 0,Accuracy,F1,Recall,Precision,AUC,Confusion Matrix
Standard Scaler,0.95614,0.956036,0.95614,0.956088,0.951032,"[[69, 2], [3, 40]]"
Standard Scaler Corr,0.973684,0.973742,0.973684,0.973958,0.974288,"[[69, 2], [1, 42]]"
PCA Standard Scaler,0.982456,0.982369,0.982456,0.982937,0.976744,"[[71, 0], [2, 41]]"
PCA Standard Scaler Corr,0.938596,0.93845,0.938596,0.938457,0.932362,"[[68, 3], [4, 39]]"


### SVC

In [11]:
score = [] 
model = SVC()
model.fit(X_train_mms, y_train)
y_pred = model.predict(X_test_mms)
score.append(np.array(evaluate(y_test, y_pred, False)))

model = SVC()
model.fit(X_corr_train_mms, y_train)
y_pred = model.predict(X_corr_test_mms)
score.append(np.array(evaluate(y_test, y_pred, False)))

In [12]:
model = SVC()
model.fit(X_train_ss, y_train)
y_pred = model.predict(X_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

model = SVC()
model.fit(X_corr_train_ss, y_train)
y_pred = model.predict(X_corr_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

In [13]:
tmp_pca = [0,0,0,0,0,0]
tmp_pca2 = [0,0,0,0,0,0]

import warnings
for n in range(2, 20):
    pca = PCA(n_components=n)
    pca.fit(X_train)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_train_mms)
    X_test_pca = pca.transform(X_test_mms)

    model = SVC()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca[0] < tmp[0]:
        tmp_pca = tmp

    pca = PCA(n_components=n)
    pca.fit(X_corr_train_ss)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_corr_train_mms)
    X_test_pca = pca.transform(X_corr_test_mms)

    model = SVC()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca2[0] < tmp[0]:
        tmp_pca2 = tmp

score.append(tmp_pca)
score.append(tmp_pca2)


In [14]:
tmp_pca = [0,0,0,0,0,0]
tmp_pca2 = [0,0,0,0,0,0]

import warnings
for n in range(2, 20):
    pca = PCA(n_components=n)
    pca.fit(X_train)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_train_ss)
    X_test_pca = pca.transform(X_test_ss)

    model = SVC()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca[0] < tmp[0]:
        tmp_pca = tmp

    pca = PCA(n_components=n)
    pca.fit(X_corr_train_ss)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_corr_train_ss)
    X_test_pca = pca.transform(X_corr_test_ss)

    model = SVC()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca2[0] < tmp[0]:
        tmp_pca2 = tmp

score.append(tmp_pca)
score.append(tmp_pca2)

In [15]:
pd.DataFrame(score,
    index=['Min-Max Scaler', 'Min-Max Scaler Corr', 'Standard Scaler', 'Standard Scaler Corr', 'PCA Min-Max Scaler', 'PCA Min-MAx Scaler Corr', 'PCA Standard Scaler', 'PCA Standard Scaler Corr'], 
    columns=['Accuracy', 'F1', 'Recall', 'Precision', 'AUC', 'Confusion Matrix'])

Unnamed: 0,Accuracy,F1,Recall,Precision,AUC,Confusion Matrix
Min-Max Scaler,0.973684,0.973621,0.973684,0.973719,0.969702,"[[70, 1], [2, 41]]"
Min-Max Scaler Corr,0.938596,0.93845,0.938596,0.938457,0.932362,"[[68, 3], [4, 39]]"
Standard Scaler,0.982456,0.982369,0.982456,0.982937,0.976744,"[[71, 0], [2, 41]]"
Standard Scaler Corr,0.947368,0.947368,0.947368,0.947368,0.94399,"[[68, 3], [3, 40]]"
PCA Min-Max Scaler,0.622807,0.478046,0.622807,0.387889,0.5,"[[71, 0], [43, 0]]"
PCA Min-MAx Scaler Corr,0.973684,0.973621,0.973684,0.973719,0.969702,"[[70, 1], [2, 41]]"
PCA Standard Scaler,0.622807,0.478046,0.622807,0.387889,0.5,"[[71, 0], [43, 0]]"
PCA Standard Scaler Corr,0.95614,0.956237,0.95614,0.956488,0.955617,"[[68, 3], [2, 41]]"
