# SVM Exercises


- dataset: 'Iris.csv'


In [109]:
# Data
import pandas as pd

# Model
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Timer
import time

In [110]:
dataset = pd.read_csv('Iris.csv')

### Data knowledge

In [111]:
#show BASIC INFORMATION: max, min, mean của các columns trong dataset
dataset.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


### Data processing

In [112]:
# T number of instances (rows) that belong to each class. 
dataset.groupby('Species').size()                 # Iris.csv

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

In [113]:
# Data instances
X = dataset.iloc[:, :-1]
y = dataset.iloc[:,-1]

In [114]:
# Scale the data to be between -1 and 1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [115]:
# Labels are categorical variables. Therefore, we have to transform them into a numeric format as KNeighborsClassifier does not accept string labels

# Iris-setosa correspond to 0
# Iris-versicolor correspond to 1
# Iris-virginica correspond to 2
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
yl = le.fit_transform(y)

In [116]:
# Spliting dataset into training set and test set
# training set: to build classifier
# test set: to evaluate classifier  
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, yl, test_size = 0.2, random_state = 0)
print(X_train.shape) # 80% rows dùng để train
print(y_train.shape)
print(X_test.shape) # 20% rows dùng để test
print(y_test.shape)

(120, 5)
(120,)
(30, 5)
(30,)


### Use Support Vector Machine to classify the Iris dataset


In [117]:
svc=SVC(kernel='linear')

svc.fit(X_train,y_train)
accuracy = accuracy_score(y_train, svc.predict(X_train))*100
print('Train accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')

Train accuracy of our model is equal 100.0 %.


In [118]:
# Predicting on the test set
start = time.time()
y_pred=svc.predict(X_test)
print("Elapsed time = ", time.time() - start)
y_pred

Elapsed time =  0.0006451606750488281


array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0])

In [119]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[11,  0,  0],
       [ 0, 13,  0],
       [ 0,  0,  6]], dtype=int64)

In [120]:
#Sử dụng hàm accuracy_score cho test set
accuracy = accuracy_score(y_test, y_pred)*100
print('Test accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')

Test accuracy of our model is equal 100.0 %.


In [121]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         6

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [122]:
cv = StratifiedKFold(n_splits=10)            # Desired number of Cross Validation folds
fold_accuracy = list()

for train_fold, valid_fold in cv.split(X, y):
    f_train = dataset.loc[train_fold] # Extract train data with cv indices
    f_valid = dataset.loc[valid_fold] # Extract valid data with cv indices

    model = svc.fit(X = f_train.drop(['Species'], axis=1), 
                            y = f_train["Species"]) # We fit the model with the fold train data
    valid_acc = svc.score(X = f_valid.drop(['Species'], axis=1), 
                            y = f_valid["Species"])# We calculate accuracy with the fold validation data
    fold_accuracy.append(valid_acc)

avg = sum(fold_accuracy)/len(fold_accuracy)
print(avg)

0.9600000000000002
