<a href="https://colab.research.google.com/github/nguyenminhduc0233/NguyenMinhDuc_ML_2023/blob/main/Lab_5_20130233_NguyenMinhDuc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab is to deal with **SVM** to classification tasks and compare its performance with other competitive algorithms. In general, **SVM** is one of the most popular and widely used supervised machine learning algorithms.

*   **Deadline: 23:59, 17/03/2023**



# Import libraries

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from prettytable import PrettyTable

#Task 1. 
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   1.1.	Apply SVM algorithm to above dataset using linear kernel.
*   1.2.	Compare the obtained results with other competitive algorithms (Logistic Regression, Decision Tree, kNN) based on metrics: accuracy, precision, recall, f1 measures.



In [2]:
from IPython.core.prefilter import MacroChecker
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_smv = round((metrics.accuracy_score(y_test, y_pred)),4)
precision_smv = round((metrics.precision_score(y_test, y_pred, average = 'macro')),4)
f1_smv = round((metrics.f1_score(y_test, y_pred, average = 'macro')),4)
recall_smv = round((metrics.recall_score(y_test, y_pred, average = 'macro')),4)

classifier = LogisticRegression(random_state = 0, max_iter=10000)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
acc_logistic = round((metrics.accuracy_score(y_test, y_pred)),4)
precision_logistic = round((metrics.precision_score(y_test, y_pred, average = 'macro')),4)
f1_logistic = round((metrics.f1_score(y_test, y_pred, average = 'macro')),4)
recall_logistic = round((metrics.recall_score(y_test, y_pred, average = 'macro')),4)

k_range = range(1, 30, 2)
accs = []
precision = []
recall = []
f1 = []
for k in k_range:
  KNN = KNeighborsClassifier(n_neighbors = k)
  KNN.fit(X_train, y_train)
  y_pred = KNN.predict(X_test)
  accs.append(metrics.accuracy_score(y_test, y_pred))
  precision.append(metrics.precision_score(y_test, y_pred, average = 'micro'))
  recall.append(metrics.recall_score(y_test, y_pred, average = 'micro'))
  f1.append(metrics.f1_score(y_test, y_pred, average = 'micro'))
acc_knn = round(max(accs),4)
precision_knn = round(max(precision),4)
f1_knn = round(max(f1),4)
recall_knn = round(max(recall),4)

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
acc_dtree = round((metrics.accuracy_score(y_test, y_pred)),4)
precision_dtree = round((metrics.precision_score(y_test, y_pred, average = 'macro')),4)
f1_dtree = round((metrics.f1_score(y_test, y_pred, average = 'macro')),4)
recall_dtree = round((metrics.recall_score(y_test, y_pred, average = 'macro')),4)

t = PrettyTable(['','acc','precision','recall','f1'])
t.add_row(['kNN',acc_knn,precision_knn,recall_knn,f1_knn])
t.add_row(['Decision Tree',acc_dtree,precision_dtree,recall_dtree,f1_dtree])
t.add_row(['Logistic Regression',acc_logistic,precision_logistic,recall_logistic,f1_logistic])
t.add_row(['SMV',acc_smv,precision_smv,recall_smv,f1_smv])
print(t)

print(metrics.classification_report(y_test, y_pred))

+---------------------+--------+-----------+--------+--------+
|                     |  acc   | precision | recall |   f1   |
+---------------------+--------+-----------+--------+--------+
|         kNN         | 0.9357 |   0.9357  | 0.9357 | 0.9357 |
|    Decision Tree    | 0.9474 |   0.9615  | 0.9286 | 0.9415 |
| Logistic Regression | 0.9474 |   0.948   | 0.9385 | 0.9429 |
|         SMV         | 0.9532 |   0.9563  | 0.9431 | 0.949  |
+---------------------+--------+-----------+--------+--------+
              precision    recall  f1-score   support

           0       1.00      0.86      0.92        63
           1       0.92      1.00      0.96       108

    accuracy                           0.95       171
   macro avg       0.96      0.93      0.94       171
weighted avg       0.95      0.95      0.95       171



#Task 2. 

*   1.1.	Perform SVM algorithm to **Iris dataset** using **linear kernel**.
*   1.2.	Compare the obtained results in 1.1 with SVM using other kernels (**Polynomial Kernel, Gaussian Kernel, Sigmoid Kernel, Radial Basis Function Kernel**). Some metrics could be used: accuracy, precision, recall, f1 measures





In [3]:
from scipy.sparse import data
data_iris = datasets.load_iris()
X = data_iris.data
y = data_iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_linear = metrics.accuracy_score(y_test, y_pred)
precision_linear = metrics.precision_score(y_test, y_pred, average='micro')
recall_linear = metrics.recall_score(y_test, y_pred, average='micro')
f1_linear = metrics.f1_score(y_test, y_pred, average='micro')

In [4]:
clf = svm.SVC(kernel='poly', degree=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_poly = round(metrics.accuracy_score(y_test, y_pred),4)
precision_poly = round(metrics.precision_score(y_test, y_pred, average='micro'),4)
recall_poly = round(metrics.recall_score(y_test, y_pred, average='micro'),4)
f1_poly = round(metrics.f1_score(y_test, y_pred, average='micro'),4)

clf = svm.SVC(kernel='sigmoid', degree=4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_sigmoid = round(metrics.accuracy_score(y_test, y_pred),4)
precision_sigmoid = round(metrics.precision_score(y_test, y_pred, average='micro'),4)
recall_sigmoid = round(metrics.recall_score(y_test, y_pred, average='micro'),4)
f1_sigmoid = round(metrics.f1_score(y_test, y_pred, average='micro'),4)

clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_rbf = round(metrics.accuracy_score(y_test, y_pred),4)
precision_rbf = round(metrics.precision_score(y_test, y_pred, average='micro'),4)
recall_rbf = round(metrics.recall_score(y_test, y_pred, average='micro'),4)
f1_rbf = round(metrics.f1_score(y_test, y_pred, average='micro'),4)

t = PrettyTable(['','acc','precision','recall','f1'])
t.add_row(['linear',acc_linear,precision_linear,recall_linear,f1_linear])
t.add_row(['sigmoid',acc_sigmoid,precision_sigmoid,recall_sigmoid,f1_sigmoid])
t.add_row(['poly',acc_poly,precision_poly,recall_poly,f1_poly])
t.add_row(['rbf',acc_rbf,precision_rbf,recall_rbf,f1_rbf])
print(t)

+---------+--------+-----------+--------+--------+
|         |  acc   | precision | recall |   f1   |
+---------+--------+-----------+--------+--------+
|  linear |  1.0   |    1.0    |  1.0   |  1.0   |
| sigmoid | 0.2889 |   0.2889  | 0.2889 | 0.2889 |
|   poly  | 0.9778 |   0.9778  | 0.9778 | 0.9778 |
|   rbf   | 0.9778 |   0.9778  | 0.9778 | 0.9778 |
+---------+--------+-----------+--------+--------+


#Task 3. 
Compare the performance of selected classification algorithms (Decision Tree, kNN, Logistic Regression) and SVM (using different kernels) with mnist dataset based on accuracy, precision, recall, f1 measures.


In [5]:
data_mnist = datasets.load_digits()
X = data_mnist.data
y = data_mnist.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_svm = metrics.accuracy_score(y_test, y_pred)
precision_svm = metrics.precision_score(y_test, y_pred, average='micro')
recall_svm = metrics.recall_score(y_test, y_pred, average='micro')
f1_svm = metrics.f1_score(y_test, y_pred, average='micro')

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
acc_dtree = round((metrics.accuracy_score(y_test, y_pred)),4)
precision_dtree = round((metrics.precision_score(y_test, y_pred, average = 'macro')),4)
f1_dtree = round((metrics.f1_score(y_test, y_pred, average = 'macro')),4)
recall_dtree = round((metrics.recall_score(y_test, y_pred, average = 'macro')),4)

k_range = range(1, 30, 2)
accs = []
precision = []
recall = []
f1 = []
for k in k_range:
  KNN = KNeighborsClassifier(n_neighbors = k)
  KNN.fit(X_train, y_train)
  y_pred = KNN.predict(X_test)
  accs.append(metrics.accuracy_score(y_test, y_pred))
  precision.append(metrics.precision_score(y_test, y_pred, average = 'micro'))
  recall.append(metrics.recall_score(y_test, y_pred, average = 'micro'))
  f1.append(metrics.f1_score(y_test, y_pred, average = 'micro'))
acc_knn = round(max(accs),4)
precision_knn = round(max(precision),4)
f1_knn = round(max(f1),4)
recall_knn = round(max(recall),4)

classifier = LogisticRegression(random_state = 0, max_iter=10000)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
acc_logistic = round((metrics.accuracy_score(y_test, y_pred)),4)
precision_logistic = round((metrics.precision_score(y_test, y_pred, average = 'macro')),4)
f1_logistic = round((metrics.f1_score(y_test, y_pred, average = 'macro')),4)
recall_logistic = round((metrics.recall_score(y_test, y_pred, average = 'macro')),4)

t = PrettyTable(['','acc','precision','recall','f1'])
t.add_row(['kNN',acc_knn,precision_knn,recall_knn,f1_knn])
t.add_row(['Decision Tree',acc_dtree,precision_dtree,recall_dtree,f1_dtree])
t.add_row(['Logistic Regression',acc_logistic,precision_logistic,recall_logistic,f1_logistic])
t.add_row(['SMV',acc_smv,precision_smv,recall_smv,f1_smv])
print(t)


+---------------------+--------+-----------+--------+--------+
|                     |  acc   | precision | recall |   f1   |
+---------------------+--------+-----------+--------+--------+
|         kNN         | 0.9907 |   0.9907  | 0.9907 | 0.9907 |
|    Decision Tree    |  0.85  |   0.8506  | 0.8519 |  0.85  |
| Logistic Regression | 0.9685 |   0.9673  | 0.9681 | 0.9674 |
|         SMV         | 0.9532 |   0.9563  | 0.9431 | 0.949  |
+---------------------+--------+-----------+--------+--------+


#Task 4. 
Compare the performance of selected classification algorithms (Decision Tree, kNN, Logistic Regression) and SVM (using different kernels) with **credit card dataset** based on accuracy, precision, recall, f1 measures.

*   Give some comments on the obtained results
*   Identify issues with dataset, and propose the solutions to these issues



In [6]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/ML_2023/Lab5'

Mounted at /content/gdrive
/content/gdrive/MyDrive/ML_2023/Lab5


In [7]:
import pandas as pd
dataset = pd.read_csv("creditcard.csv", sep=",")
data_credit = dataset.head(10000)
X = data_credit[['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']]
y = data_credit[['Class']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

In [8]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_svm = metrics.accuracy_score(y_test, y_pred)
precision_svm = metrics.precision_score(y_test, y_pred, average='micro')
recall_svm = metrics.recall_score(y_test, y_pred, average='micro')
f1_svm = metrics.f1_score(y_test, y_pred, average='micro')

  y = column_or_1d(y, warn=True)


In [9]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
acc_dtree = round((metrics.accuracy_score(y_test, y_pred)),4)
precision_dtree = round((metrics.precision_score(y_test, y_pred, average = 'macro')),4)
f1_dtree = round((metrics.f1_score(y_test, y_pred, average = 'macro')),4)
recall_dtree = round((metrics.recall_score(y_test, y_pred, average = 'macro')),4)

k_range = range(1, 30, 2)
accs = []
precision = []
recall = []
f1 = []
for k in k_range:
  KNN = KNeighborsClassifier(n_neighbors = k)
  KNN.fit(X_train, y_train)
  y_pred = KNN.predict(X_test)
  accs.append(metrics.accuracy_score(y_test, y_pred))
  precision.append(metrics.precision_score(y_test, y_pred, average = 'micro'))
  recall.append(metrics.recall_score(y_test, y_pred, average = 'micro'))
  f1.append(metrics.f1_score(y_test, y_pred, average = 'micro'))
acc_knn = round(max(accs),4)
precision_knn = round(max(precision),4)
f1_knn = round(max(f1),4)
recall_knn = round(max(recall),4)

classifier = LogisticRegression(random_state = 0, max_iter=10000)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
acc_logistic = round((metrics.accuracy_score(y_test, y_pred)),4)
precision_logistic = round((metrics.precision_score(y_test, y_pred, average = 'macro')),4)
f1_logistic = round((metrics.f1_score(y_test, y_pred, average = 'macro')),4)
recall_logistic = round((metrics.recall_score(y_test, y_pred, average = 'macro')),4)

t = PrettyTable(['','acc','precision','recall','f1'])
t.add_row(['kNN',acc_knn,precision_knn,recall_knn,f1_knn])
t.add_row(['Decision Tree',acc_dtree,precision_dtree,recall_dtree,f1_dtree])
t.add_row(['Logistic Regression',acc_logistic,precision_logistic,recall_logistic,f1_logistic])
t.add_row(['SMV',acc_smv,precision_smv,recall_smv,f1_smv])
print(t)

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


+---------------------+--------+-----------+--------+--------+
|                     |  acc   | precision | recall |   f1   |
+---------------------+--------+-----------+--------+--------+
|         kNN         | 0.997  |   0.997   | 0.997  | 0.997  |
|    Decision Tree    | 0.9983 |   0.8497  | 0.8884 | 0.868  |
| Logistic Regression | 0.9993 |   0.9997  | 0.8889 | 0.9373 |
|         SMV         | 0.9532 |   0.9563  | 0.9431 | 0.949  |
+---------------------+--------+-----------+--------+--------+


#Finally,
Save a copy in your Github. Remember renaming the notebook.