# Support Vector Machines
You should build a machine learning pipeline using a support vector machine model. In particular, you should do the following:
- Load the `mnist` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
- Conduct data exploration, data preprocessing, and feature engineering if necessary.
- Train and test a support vector machine model using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html).
- Check the documentation to identify the most important hyperparameters, attributes, and methods of the model. Use them in practice.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score



In [None]:
# Location of the MNIST CSV file that this notebook analyses.
MNIST_CSV_URL = "https://raw.githubusercontent.com/m-mahdavi/teaching/refs/heads/main/datasets/mnist.csv"

# Download the CSV and parse it into a DataFrame (needs network access).
df = pd.read_csv(MNIST_CSV_URL)

In [None]:
# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,31953,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34452,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60897,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,36953,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1981,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Splitting the Data

In [None]:
# Hold out 25% of the rows for testing (the scikit-learn default, made explicit).
# A fixed random_state makes the split -- and every score computed from it --
# reproducible across kernel restarts. Stratifying on "class" keeps the label
# proportions the same in the training and test partitions.
df_train, df_test = train_test_split(
    df,
    test_size=0.25,
    random_state=42,
    stratify=df["class"],
)

# Sanity check: the two partitions together must account for every row.
assert len(df_train) + len(df_test) == len(df)

print("df size:", df.shape)
print("df_train size:", df_train.shape)
print("df_test size:", df_test.shape)

df size: (4000, 786)
df_train size: (3000, 786)
df_test size: (1000, 786)


## Data Exploration

In [None]:
# Count the missing values in each column of the training partition
# (isna is the pandas alias of isnull).
df_train.isna().sum()

Unnamed: 0,0
id,0
class,0
pixel1,0
pixel2,0
pixel3,0
...,...
pixel780,0
pixel781,0
pixel782,0
pixel783,0


In [None]:
# Show the dtype of every column in the training partition.
df_train.dtypes

Unnamed: 0,0
id,int64
class,int64
pixel1,int64
pixel2,int64
pixel3,int64
...,...
pixel780,int64
pixel781,int64
pixel782,int64
pixel783,int64


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column of the
# training partition.
df_train.describe()

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,34446.490667,4.436333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.088667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,20465.942694,2.876115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.926601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16489.75,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,34665.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,51907.75,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,69998.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,125.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Data Preprocessing

In [None]:
def _split_features_target(frame, target="class", ignored=("id",)):
    """Return ``(features, labels)`` for one partition of the dataset.

    ``features`` is ``frame`` without the ``target`` column and without the
    ``ignored`` columns (the row identifier is not used as a predictor).
    ``labels`` is the ``target`` column of ``frame``.
    """
    features = frame.drop(columns=[*ignored, target])
    return features, frame[target]


# Apply the same separation to both partitions so they stay column-aligned.
X_train, Y_train = _split_features_target(df_train)
X_test, Y_test = _split_features_target(df_test)

for label, part in (
    ("X_train size:", X_train),
    ("Y_train size:", Y_train),
    ("X_test size:", X_test),
    ("Y_test size:", Y_test),
):
    print(label, part.shape)


X_train size: (3000, 784)
Y_train size: (3000,)
X_test size: (1000, 784)
Y_test size: (1000,)


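## Feature Engineering - Scaling

No scaling step is applied in this notebook; the models below are trained on the raw pixel values.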


## Model Preparation

In [None]:
# Baseline: an SVC with scikit-learn's default hyperparameters, trained on the
# full set of pixel features. fit() returns the estimator itself, so the
# construction and training can be chained.
model = SVC().fit(X_train, Y_train)
model  # display the fitted estimator as the cell output

## Model Prediction

In [None]:
# Predict a class label for every row of the held-out test features.
Y_predict = model.predict(X_test)

## Evaluation Metrics - Accuracy Metric

In [None]:
# Fraction of test rows whose predicted class equals the true class.
accuracy = accuracy_score(Y_test, Y_predict)

# Report the score as a percentage with two decimals. An explicit ".2f" spec
# avoids the " f" form, whose space sign flag and default six-digit precision
# print a stray leading space and meaningless trailing zeros.
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy:  94.300000


## Dimensionality Reduction

In [None]:
# Fit PCA on the training features only, keeping as many components as are
# needed to explain 95% of their variance. The test features are then projected
# with that same fitted transform, so no test information enters the fit.
pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

print(f"Reduced X_train shape: {X_train_reduced.shape}")

Reduced X_train shape: (3000, 146)


In [None]:
# Refit a default SVC on the PCA-reduced features. This rebinds `model`, so
# from here on it expects inputs with the reduced feature dimension.
model = SVC().fit(X_train_reduced, Y_train)
model  # display the fitted estimator as the cell output

In [None]:
# Predict a class label for every row of the PCA-projected test features.
Y_predict = model.predict(X_test_reduced)

In [None]:
# Fraction of test rows whose predicted class equals the true class.
accuracy = accuracy_score(Y_test, Y_predict)

# Report the score as a percentage with two decimals. An explicit ".2f" spec
# avoids the " f" form, whose space sign flag and default six-digit precision
# print a stray leading space and meaningless trailing zeros.
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy:  94.600000
