# SVM

### Import libraries

In [2]:
import matplotlib.colors as colors
import matplotlib.pyplot as plt # graphs
import numpy as np
import pandas as pd # one-hot encoding
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.model_selection import train_test_split # split to training and testing datasets
from sklearn.preprocessing import StandardScaler # fit scaling on train, reuse on test
from sklearn.preprocessing import scale # scale and center data
from sklearn.svm import SVC # support vector classifier
from sklearn.utils import resample # downsample dataset

### Import data

In [None]:
# Load the raw comments and tidy the columns.
# NOTE(review): 'xxx', 'yyy' and 'aaa' look like placeholder column names — confirm.
df = pd.read_csv("yt_comments.csv")
df.head()
df.rename(columns={'xxx': 'yyy'}, inplace=True)  # give the column its working name
df.head()
df.drop(columns='aaa', inplace=True)  # discard a column that is not needed
df.head()

In [None]:
# Sanity checks: column types, the distinct values in 'yyy', and missing data.
df.dtypes
df['yyy'].unique()
print(f"There are {len(df)} rows in the data set")
# Rows whose 'yyy' value is NaN count as missing. If the dataset marks missing
# values with a sentinel instead (e.g. 0 or '?'), compare against that value here.
missing_rows = df.loc[df['yyy'].isna()]
print(f"There are {len(missing_rows)} containing missing values")

### Downsample (in case our dataset is big enough)
SVMs work well on small datasets, and their training time grows quickly with the number of samples, so we downsample to keep fitting fast.

In [None]:
# Draw a random subset without replacement. The sample size is capped at the
# number of available rows because resample(replace=False) raises a ValueError
# when asked for more samples than the data contains.
n_downsampled = min(1000, len(df))
df_downsampled = resample(df, replace=False, n_samples=n_downsampled, random_state=42)
print(f"There are {len(df_downsampled)} rows in the downsampled dataset")

### Combine English and translated German comments

In [None]:
# Stack the English and German frames row-wise into one dataframe.
# NOTE(review): df_downsampled_en / df_downsampled_de are not created in the
# cells visible here — confirm where they are defined.
language_frames = [df_downsampled_en, df_downsampled_de]
df_downsample = pd.concat(language_frames)
len(df_downsample)

### Split the data

In [None]:
# Separate the features (X) from the label to predict (y).
# NOTE: 'target' is a placeholder, like 'xxx'/'yyy' elsewhere in this notebook;
# replace it with the real label column before running.
X = df_downsample.drop(columns='target').copy()
y = df_downsample['target'].copy()

### One-Hot encoding

In [None]:
# Expand the categorical feature columns into one indicator column per category.
categorical_columns = ['yyy', 'zzz']
X_encoded = pd.get_dummies(X, columns=categorical_columns)
X_encoded.head()

### Centering and scaling
After scaling, each column of the training data should have a mean of 0 and a standard deviation of 1.

In [None]:
# Hold out a third of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.33,  # default is 0.25
    random_state=42,
    shuffle=True,
    stratify=y,  # keep the class proportions the same in both splits
)

# Learn the centering/scaling statistics from the training data only and reuse
# them for the test data, so both sets go through the same transformation and
# no information from the test set leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### SVM

In [None]:
# Baseline model: a support vector classifier with default hyperparameters,
# trained on the scaled features and their known labels (supervised learning).
clf_svm = SVC(random_state=42)
clf_svm.fit(X=X_train_scaled, y=y_train)

### Optimize parameters using cross validation

In [None]:
# Candidate hyperparameters for the search. Every list must be non-empty,
# otherwise GridSearchCV rejects the grid with a ValueError.
params = [
    {
        'C': [0.5, 1, 10, 100],  # regularization strength, must be > 0
        'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf'],
    },
]

# Try every combination with 5-fold cross validation, ranking by accuracy.
optimal_params = GridSearchCV(
    SVC(),
    params,
    cv=5,
    scoring='accuracy',
    verbose=0,
)

optimal_params.fit(X_train_scaled, y_train)
print(f"Optimal parameters are {optimal_params.best_params_}")

In [None]:
# Final classifier with its hyperparameters spelled out in one place.
# NOTE(review): presumably these should be replaced with the values reported
# by the grid search — confirm.
svc_settings = {
    'C': 1.0,
    'kernel': 'rbf',
    'degree': 3,
    'gamma': 'scale',
    'tol': 0.001,
    'random_state': 42,
}
clf_svm = SVC(**svc_settings)

clf_svm.fit(X=X_train_scaled, y=y_train)

### Evaluation

In [None]:
# Confusion matrix of the fitted classifier on the held-out test data.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator with the same arguments.
plot_confusion_matrix(
    clf_svm,
    X_test_scaled,
    y_test,
    values_format='d',  # show raw integer counts in each cell
    display_labels=['Y', 'N'],
)

# Overall share of test rows that were classified correctly.
predicted = clf_svm.predict(X_test_scaled)
print(f"Accuracy is {accuracy_score(y_test, predicted)}")

### SVM decision boundary?