# Support Vector Machine #

In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline
import sys
import matplotlib

# Report the interpreter and library versions this notebook was run with,
# one "label version" line per component.
for label, version in (
    ('python:', sys.version),
    ('Numpy:', np.__version__),
    ('Pandas:', pd.__version__),
    ('Scikitlearn:', sklearn.__version__),
    ('Seaborn: ', sns.__version__),
    ('matplotlib:', matplotlib.__version__),
):
    print(label, version)


Invented in 1963 by Vladimir N. Vapnik. <br>
In 1992, he suggested a way to create non-linear classifiers by applying the kernel trick to maximum-margin hyperplanes.<br>
The current standard incarnation (soft margin) was published in 1995 by Corinna Cortes and Vladimir Vapnik.<br>
It's a supervised machine learning method used for classification, regression and outlier detection.<br>
In classification, the goal is to find the best way to separate the classes; however, there are infinitely many lines (2D) or hyperplanes (3D and higher) that can be used to separate 2 classes. <br>
In SVM, the separating line is the one that allows for the largest margin between the 2 classes (or more), and it is placed in the middle of that margin (maximum margin); in other words, the algorithm optimizes and locates the hyperplane that maximises the margin between the classes. The samples that are closest to the hyperplane are called <b> Support Vectors </b>.

# I. Linear SVM Classification: #

<ul>
<li>Calculate support vectors.</li>
<li>Separate with straight line (Linearly separable).</li>
<li>Hard margin classification: strictly based on the samples that lie at the margin between the 2 classes; however, it is sensitive to outliers.</li>
<li>Soft margin classification: widens the margin and allows for violations; with scikit-learn you can control the width of the margin with the C hyperparameter:<ul><li>A smaller C leads to a wider street but more margin violations.</li>
<li>A higher C means a smaller margin and fewer margin violations.</li></ul></li>
</ul>
<br>SVMs are sensitive to feature scaling.
    

In [None]:
from sklearn import svm

# Load the Iris data set from the project's relative Data/ directory (the same
# folder the housing data is read from later in this notebook) so the notebook
# is not tied to one machine's absolute Windows path.
Iris=pd.read_csv('Data/Iris.csv')
Iris.head()


In [None]:
# Keep only the two petal measurements plus the target label.
col=['PetalLengthCm','PetalWidthCm','Species']
Iris_df=Iris.loc[:,col]
# Print the per-class sample counts: as a bare expression in the middle of the
# cell the result was computed and then silently discarded.
print(np.unique(Iris['Species'],return_counts=True))
# Preview the first rows instead of rendering the entire frame.
Iris_df.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Integer code assigned to each species name.
Spec_to_num = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}
# Features: every column from PetalLengthCm through PetalWidthCm.
X = Iris_df.loc[:, 'PetalLengthCm':'PetalWidthCm']
# Target: species names replaced by their integer codes.
Y = Iris_df['Species'].map(Spec_to_num)
# 80/20 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=0, train_size=0.8
)
X_test, Y_test


<ul>
<li><b>LinearSVC: </b>similar to SVC with parameter kernel='linear', but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions, and should scale better to large numbers of samples. </li> 
<li><b>SVC: </b>C-Support Vector Classification; the implementation is based on libsvm. The fit time complexity is more than quadratic in the number of samples, which makes it hard to scale to datasets with more than a few tens of thousands of samples. </li>
</ul>


In [None]:
# Penalty parameter passed to the classifier below.
C = 0.001
# Alternative estimators to experiment with:
#   clf = svm.LinearSVC(C=C, loss='hinge')
#   clf = svm.SVC(kernel='poly', degree=3, C=C)
#   clf = svm.SVC(kernel='rbf', gamma=0.7, C=C)
clf = svm.SVC(C=C, kernel='linear')
# Train on the full (unscaled) petal features.
clf.fit(X, Y)


In [None]:
# Classify a single sample with feature values (6, 2).
clf.predict(np.array([[6, 2]]))


In [None]:
# Build a dense grid over the two petal features for drawing decision regions.
# Each axis range must come from its own feature column: the previous code took
# the vertical range from the class labels Y and the horizontal range from both
# features flattened together.
Xv=X.values
h=0.02  # grid step
pad=1   # margin around the data so no sample sits on the plot border
X_min,X_max=Xv[:,0].min()-pad,Xv[:,0].max()+pad
Y_min,Y_max=Xv[:,1].min()-pad,Xv[:,1].max()+pad
xx,yy=np.meshgrid(np.arange(X_min,X_max,h), np.arange(Y_min,Y_max,h))
# Show the grid dimensions rather than dumping the full coordinate array.
xx.shape

In [None]:
# Classify every grid node, then fold the flat predictions back into the grid shape.
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
z = clf.predict(grid_points).reshape(xx.shape)
fig = plt.figure(figsize=(10, 6))
# Filled contours show the predicted class regions ...
ax = plt.contourf(xx, yy, z, cmap='afmhot', alpha=0.3)
# ... and the samples are drawn on top, coloured by their label.
samples = X.values
plt.scatter(samples[:, 0], samples[:, 1], c=Y, s=80, alpha=0.9, edgecolors='g')
plt.show()


## 1) Implementation : ##

In [None]:
#Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=0, train_size=0.8
)
# Learn the scaling statistics on the training rows, then standardise them.
sc_x = StandardScaler()
sc_x.fit(X_train)
X_st_train = sc_x.transform(X_train)
X_st_train


In [None]:
# Linear-kernel classifier trained on the standardised training features.
C = 1.0
clf = svm.SVC(C=C, kernel='linear')
clf.fit(X_st_train, Y_train)


In [None]:
#Cross validation
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score,cross_val_predict

# 10-fold cross-validated accuracy on the standardised training set.
res = cross_val_score(clf, X_st_train, Y_train, scoring='accuracy', cv=10)
acc_mean = np.mean(res)
acc_std = np.std(res)
print(f'Accuracy= \t %.4f +/- %.4f'% (acc_mean, acc_std))
# Per-fold scores.
res


In [None]:
# Out-of-fold predictions for every training sample (3-fold CV).
Y_train_pred=cross_val_predict(clf,X_st_train, Y_train, cv=3)
# Print the matrix explicitly: it is not the last expression of the cell, so
# as a bare expression its value was computed and then discarded.
print(confusion_matrix(Y_train,Y_train_pred ))

# Class-weighted precision / recall / F1 of the out-of-fold predictions.
print(f'Precision Score= %.4f \n Recall Score= %.4f \n F1 Score= %.4f'% (precision_score(Y_train, Y_train_pred, average='weighted'),
recall_score(Y_train, Y_train_pred, average='weighted'),f1_score(Y_train, Y_train_pred, average='weighted')))


In [None]:
#Cross validation with test set 
# Evaluation on the held-out test set.
# Scale the test rows with the statistics already learned from the training
# data (transform, not fit_transform) and score the classifier trained above.
# Re-fitting the scaler and cross-validating on the test rows would train new
# models on test data instead of evaluating the trained one.
X_st_test=sc_x.transform(X_test)
Y_test_pred=clf.predict(X_st_test)
confusion_matrix(Y_test,Y_test_pred )


In [None]:
# Class-weighted scores of the test-set predictions.
test_precision = precision_score(Y_test, Y_test_pred, average='weighted')
test_recall = recall_score(Y_test, Y_test_pred, average='weighted')
test_f1 = f1_score(Y_test, Y_test_pred, average='weighted')
print(f'Precision Score= %.4f \n Recall Score= %.4f \n F1 Score= %.4f'% (test_precision, test_recall, test_f1))
# Same F1 value again, formatted with str.format.
print('F1 Score 2nd display = {0:.4f}'.format(test_f1))


## 2)Polynomial Kernel: ##

In [None]:
# Degree-3 polynomial-kernel classifier trained on the raw petal features.
C=1
clf=svm.SVC(kernel='poly', degree=3,C=C, gamma='auto')
clf.fit(X,Y)
# Grid over the two petal features for drawing decision regions. Each axis
# range must come from its own feature column: the previous code took the
# vertical range from the class labels Y and the horizontal range from both
# features flattened together.
Xv=X.values
h=0.02  # grid step
pad=1   # margin around the data so no sample sits on the plot border
X_min,X_max=Xv[:,0].min()-pad,Xv[:,0].max()+pad
Y_min,Y_max=Xv[:,1].min()-pad,Xv[:,1].max()+pad
xx,yy=np.meshgrid(np.arange(X_min,X_max,h), np.arange(Y_min,Y_max,h))
# Show the grid dimensions rather than dumping the full coordinate array.
xx.shape


In [None]:
# Predict a class for each grid node and restore the 2-D grid layout.
mesh_nodes = np.column_stack((xx.ravel(), yy.ravel()))
z = clf.predict(mesh_nodes)
z = z.reshape(xx.shape)
fig = plt.figure(figsize=(10, 6))
# Predicted regions as filled contours, samples overlaid and coloured by label.
ax = plt.contourf(xx, yy, z, alpha=0.3, cmap='afmhot')
petal = X.values
plt.scatter(petal[:, 0], petal[:, 1], c=Y, edgecolors='g', s=80, alpha=0.9)
plt.show()


### Implementation: ###

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=0, train_size=0.8
)
# Standardise the training features with statistics learned from them.
sc_x = StandardScaler()
sc_x.fit(X_train)
X_st_train = sc_x.transform(X_train)
# Degree-10 polynomial-kernel classifier on the standardised training set.
C = 1.0
clf = svm.SVC(C=C, kernel='poly', degree=10, gamma='auto')
clf.fit(X_st_train, Y_train)


### Cross Validation with Train set ###

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score,cross_val_predict

# 10-fold cross-validated accuracy on the standardised training set.
res = cross_val_score(clf, X_st_train, Y_train, scoring='accuracy', cv=10)
acc_mean, acc_std = np.mean(res), np.std(res)
print(f'Average Accuracy= %.4f +/- %.4f \n' % (acc_mean, acc_std))


In [None]:
# Out-of-fold predictions for each training sample (10-fold CV) ...
Y_train_pred = cross_val_predict(estimator=clf, X=X_st_train, y=Y_train, cv=10)
# ... compared against the true labels.
confusion_matrix(y_true=Y_train, y_pred=Y_train_pred)


In [None]:
# Class-weighted scores of the out-of-fold training predictions.
train_scores = (
    precision_score(Y_train, Y_train_pred, average='weighted'),
    recall_score(Y_train, Y_train_pred, average='weighted'),
    f1_score(Y_train, Y_train_pred, average='weighted'),
)
print(f'Precision Score= %.4f \n Recall Score= %.4f \n F1 Score= %.4f'% train_scores)


### Cross Validation with Test set ###

In [None]:
# Evaluate the trained classifier on the held-out test set. The test rows are
# scaled with the statistics learned from the training data (transform, not
# fit_transform); re-fitting the scaler and cross-validating on the test rows
# would train new models on test data instead of evaluating the trained one.
Y_test_pred=clf.predict(sc_x.transform(X_test))
confusion_matrix(Y_test,Y_test_pred)


In [None]:
# Class-weighted scores of the test-set predictions.
test_scores = (
    precision_score(Y_test, Y_test_pred, average='weighted'),
    recall_score(Y_test, Y_test_pred, average='weighted'),
    f1_score(Y_test, Y_test_pred, average='weighted'),
)
print(f'Precision Score= %.4f \n Recall Score= %.4f \n F1 Score= %.4f'% test_scores)


## 3) Gaussian Radial Basis Function: ##

The kernel function can be any of the following:
<ul>
<li>Linear: $$\langle x, x' \rangle$$
<li>Polynomial: $$(\gamma \langle x, x' \rangle + r)^d $$  <br>d is specified by keyword degree<br>r by coef0
<li>rbf: $$\exp(-\gamma \|x-x'\|^2) $$<br>$\gamma$ is specified by keyword gamma, must be > 0
<li>Sigmoid: $$ \tanh(\gamma \langle x, x' \rangle + r)$$ <br>where r is specified by coef0
</ul>


In [None]:
from sklearn.preprocessing import StandardScaler

# 80/20 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=0, train_size=0.8
)
# Standardise the training features with statistics learned from them.
sc_x = StandardScaler()
sc_x.fit(X_train)
X_sts_train = sc_x.transform(X_train)
# RBF-kernel classifier on the standardised training set.
C = 1
clf = svm.SVC(C=C, kernel='rbf', gamma=0.7)
clf.fit(X_sts_train, Y_train)


In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score,cross_val_predict

# 10-fold cross-validated accuracy on the standardised training set.
res = cross_val_score(clf, X_sts_train, Y_train, scoring='accuracy', cv=10)
acc_mean, acc_std = np.mean(res), np.std(res)
print(f'Average Accuracy= %.4f +/- %.4f \n' % (acc_mean, acc_std))


In [None]:
# X_sts_train is already standardised, so it is passed as-is. Running
# sc_x.fit_transform on it again would re-fit the scaler on scaled data and
# overwrite the training statistics that sc_x holds.
Y_train_pred=cross_val_predict(clf,X_sts_train, Y_train, cv=3)
confusion_matrix(Y_train,Y_train_pred)


In [None]:
# Class-weighted scores of the out-of-fold training predictions.
train_scores = (
    precision_score(Y_train, Y_train_pred, average='weighted'),
    recall_score(Y_train, Y_train_pred, average='weighted'),
    f1_score(Y_train, Y_train_pred, average='weighted'),
)
print(f'Precision Score= %.4f \nRecall Score=\t %.4f \nF1 Score=\t %.4f'% train_scores)


### Grid Search ###

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split , GridSearchCV#grid search cross validation

# Standardise inside the pipeline: the other sections of this notebook train
# their SVMs on scaled features, and with the scaler as a pipeline step each CV
# fold is scaled using its own training part only. The fitted search can then
# be applied directly to raw (unscaled) feature values.
pipeline=Pipeline([('scaler', StandardScaler()),
                   ('clf', svm.SVC(kernel='rbf', C=1, gamma=0.1))])
# Candidate values for the SVC step; keys use the '<step>__<param>' form.
params={'clf__C': (0.1,0.5,1,2,5,10,20), 'clf__gamma':(0.001,0.01, 0.1,0.25,0.5,0.75,1)}

# 3-fold grid search over every C/gamma combination, scored by accuracy.
svm_grid_rbf=GridSearchCV(pipeline, params, n_jobs=-1, cv=3, verbose=1, scoring='accuracy')

svm_grid_rbf.fit(X_train,y=Y_train)


In [None]:

# Parameters of the best pipeline found by the search.
best_model = svm_grid_rbf.best_estimator_
best = best_model.get_params()
# Score reported by the search for that best pipeline.
svm_grid_rbf.best_score_


In [None]:
# List the tuned hyperparameters in alphabetical order with their best values.
for k in sorted(params):
    print(f'{k} : \t{best[k]}')


### Conducting validation on the test set: ###

In [None]:
# Predict the held-out test rows with the fitted grid search ...
Y_test_pred = svm_grid_rbf.predict(X_test)
# ... and compare the predictions against the true labels.
confusion_matrix(y_true=Y_test, y_pred=Y_test_pred)


In [None]:
# Class-weighted scores of the test-set predictions.
test_scores = (
    precision_score(Y_test, Y_test_pred, average='weighted'),
    recall_score(Y_test, Y_test_pred, average='weighted'),
    f1_score(Y_test, Y_test_pred, average='weighted'),
)
print(f'Precision Score= %.4f \nRecall Score=\t %.4f \nF1 Score=\t %.4f'% test_scores)


In [None]:
# Grid over the two petal features for drawing decision regions. Each axis
# range must come from its own feature column: the previous code took the
# vertical range from the class labels Y and the horizontal range from both
# features flattened together.
Xv=X.values
h=0.02  # grid step
pad=1   # margin around the data so no sample sits on the plot border
X_min,X_max=Xv[:,0].min()-pad,Xv[:,0].max()+pad
Y_min,Y_max=Xv[:,1].min()-pad,Xv[:,1].max()+pad
xx,yy=np.meshgrid(np.arange(X_min,X_max,h), np.arange(Y_min,Y_max,h))
# Classify every grid node with the fitted grid search and restore the grid shape.
z=svm_grid_rbf.predict(np.c_[xx.ravel(),yy.ravel()])
z=z.reshape(xx.shape)
fig=plt.figure(figsize=(10,6))
# Predicted regions as filled contours, samples overlaid and coloured by label.
ax=plt.contourf(xx,yy,z,cmap='afmhot',alpha=0.3)
plt.scatter(X.values[:,0], X.values[:,1],c=Y, s=80,alpha=0.9, edgecolors='g')
plt.show()


# II. Linear SVM Regression: #

In [None]:

# Read the whitespace-delimited housing file, which has no header row.
# sep=r'\s+' replaces delim_whitespace=True, which newer pandas releases
# deprecate; both split fields on runs of whitespace.
Boston=pd.read_csv('Data/housing.data',sep=r'\s+', header=None)
Boston.columns=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX',
                'PTRATIO','B','LSTAT','MEDV']
#Boston.to_csv('Boston.csv')
# Single feature (LSTAT, as a NumPy array) and target (MEDV, as a Series).
X=Boston['LSTAT'].values
Y=Boston['MEDV']
# Summary statistics go last so they are actually displayed; mid-cell the
# result was discarded, and the cell dumped the whole target column instead.
Boston.describe()


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR

# SVR with its default kernel, fitted on the single feature as a column vector.
svr = SVR(gamma='auto')
svr.fit(X.reshape(-1, 1), Y)
# Indices that order the samples by increasing feature value.
sort_idx = np.argsort(X.flatten())


In [None]:
plt.figure(figsize=(10,6))
# sort_idx holds positions, so select the targets positionally with .iloc:
# plain Y[sort_idx] is label-based on a Series and only works while the index
# labels happen to equal the row positions.
plt.scatter(X[sort_idx], Y.iloc[sort_idx])
# Fitted curve, evaluated at the samples in increasing feature order.
plt.plot(X[sort_idx],svr.predict(X[sort_idx].reshape(-1,1)), color='k') 
plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.show()


In [None]:
# 70/30 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, train_size=0.7
)


### Linear Kernel ###

In [None]:
svr=SVR(kernel='linear')
# Fit on the training split only, as the polynomial and rbf sections do.
# Fitting on the full X/Y let the model see the test rows, so the reported
# test scores were not a genuine held-out evaluation.
svr.fit(X_train.reshape(-1,1), Y_train)
Y_train_pred=svr.predict(X_train.reshape(-1,1))
Y_test_pred=svr.predict(X_test.reshape(-1,1))
# Mean squared error and R² on both splits.
print(f'MSE Train= %.4f , MSE Test= %.4f'% (mean_squared_error(Y_train,Y_train_pred),mean_squared_error(Y_test,Y_test_pred) ))
print(f'R² score Train= %.4f , R² score Test= %.4f'% (r2_score(Y_train,Y_train_pred),r2_score(Y_test,Y_test_pred)))


### Polynomial Kernel ###

In [None]:
# Degree-2 polynomial-kernel SVR fitted on the training split.
svr = SVR(kernel='poly', degree=2, C=1e3, gamma='auto')
train_col = X_train.reshape(-1, 1)
test_col = X_test.reshape(-1, 1)
svr.fit(train_col, Y_train)
Y_train_pred = svr.predict(train_col)
Y_test_pred = svr.predict(test_col)
# Mean squared error and R² on both splits.
mse_train = mean_squared_error(Y_train, Y_train_pred)
mse_test = mean_squared_error(Y_test, Y_test_pred)
r2_train = r2_score(Y_train, Y_train_pred)
r2_test = r2_score(Y_test, Y_test_pred)
print(f'MSE Train= %.4f , MSE Test= %.4f'% (mse_train, mse_test))
print(f'R² score Train= %.4f , R² score Test= %.4f'% (r2_train, r2_test))


### rbf Kernel ###

In [None]:
# RBF-kernel SVR fitted on the training split.
svr = SVR(kernel='rbf', gamma=0.1, C=1e3)
train_col = X_train.reshape(-1, 1)
test_col = X_test.reshape(-1, 1)
svr.fit(train_col, Y_train)
Y_train_pred = svr.predict(train_col)
Y_test_pred = svr.predict(test_col)
# Mean squared error and R² on both splits.
mse_train = mean_squared_error(Y_train, Y_train_pred)
mse_test = mean_squared_error(Y_test, Y_test_pred)
r2_train = r2_score(Y_train, Y_train_pred)
r2_test = r2_score(Y_test, Y_test_pred)
print(f'MSE Train= %.4f , MSE Test= %.4f'% (mse_train, mse_test))
print(f'R² score Train= %.4f , R² score Test= %.4f'% (r2_train, r2_test))


# III. Advantages and disadvantages : #
The advantages are:
<ul>
<li>Effective in high dimensional spaces </li>
<li>Uses only a subset of training points (support vectors) in the decision function. </li>
<li>Many different kernel functions can be specified for the decision function. </li>
<ul><li>Linear </li>
<li>Polynomial </li>
<li>RBF </li>
<li>Sigmoid </li>
<li>Custom </li>
</ul>
</ul>
The disadvantages of support vector machine include:
<ul>
<li>Beware of overfitting when num_features > num_samples. </li>
<li>Choice of kernel and Regularization can have a large impact on performance. </li>
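<li>SVMs do not directly provide probability estimates; in scikit-learn they are obtained through an expensive internal cross-validation (<code>probability=True</code>). </li>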
</ul>



| Class | Out of Core Support | Kernel Trick |
|:-----------|:----------------------------:|:----------------:|
| `SGDClassifier` | Yes | No |
| `LinearSVC` | No | No |
| `SVC` | No | Yes |
