## Support Vector Machine

Submitted by Nahari Terena

In [1]:
import numpy as np                  
import pandas as pd                 
import seaborn as sns               
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

The dataset was obtained from the UCI Machine Learning Repository.

In [2]:
# Read the three per-artist comment files into separate DataFrames.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/Youtube02-KatyPerry.csv",
        "dataset/Youtube04-Eminem.csv",
        "dataset/Youtube05-Shakira.csv",
    )
)

Now we can merge all the datasets into a single DataFrame.

In [3]:
# Stack the per-artist frames; the keyed variant adds an outer index level
# naming the artist each row came from.
keys = ["Katy", "Eminem", "Shakira"]
frames = [df1, df2, df3]
df_merged = pd.concat(frames)
df_with_keys = pd.concat(frames, keys=keys)
dataset = df_with_keys

# Quick summary: total number of cells, (rows, columns), and column labels.
print(dataset.size, dataset.shape, dataset.keys(), sep="\n")

5840
(1168, 5)
Index(['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS'], dtype='object')


#### Data Preprocessing

In [4]:
# Keep only the comment text and its label; the other columns are not used.
dataset = dataset.loc[:, ["CONTENT", "CLASS"]]

# Predictor (comment text) and target (class label) attributes
dataset_X, dataset_y = dataset["CONTENT"], dataset["CLASS"]

In [5]:
# Feature extraction: represent every comment as a TF-IDF weighted vector
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
corpus = dataset_X

# NOTE(review): the vectorizer is fitted on the whole corpus here, i.e. before
# the train/test split performed later in the notebook, so the IDF weights and
# vocabulary also see the test comments. Consider fitting on the training
# portion only.
tfidf_matrix = cv.fit_transform(corpus)
X = tfidf_matrix.toarray()          # dense array, one row per comment

In [6]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    dataset_y,
    test_size=0.3,
    random_state=0,
)

# Display (number of samples, number of features) of the full feature matrix
X.shape

(1168, 3432)

#### Building a model

In [7]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the training split.
classifier = SVC(random_state=0, kernel="linear")
classifier.fit(X=X_train, y=y_train)

In [8]:
# Predict a class label for every held-out sample and show the raw label array
y_pred = classifier.predict(X=X_test)
print(y_pred)

[0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 1
 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1 0 0
 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 1
 1 1 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1
 1 1 1 0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0
 0 1 1 1 0 0 0 0 1 0 1 1 0 1 0 0 1 1 1 1 0 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0 0
 0 1 1 1 0 0 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 1 1 0
 1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 0 0 0 1 1 0 1 0 1 0 1
 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 1 0 0 1 1 0 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 1
 1 0 0 1 0 0 0 1 1 0 0 1 1 0 0 1 0 0]


#### Evaluating the result

We first inspect the confusion matrix and then compute the other metrics.

In [9]:
# Import the function under an alias: binding the result to the name
# `confusion_matrix` would otherwise overwrite the imported function, and
# re-running this cell would fail with "'numpy.ndarray' object is not callable".
from sklearn.metrics import confusion_matrix as compute_confusion_matrix

# The result keeps its original name so any code reading `confusion_matrix`
# as an array continues to work.
confusion_matrix = compute_confusion_matrix(y_test, y_pred)
print(confusion_matrix)

# For a binary problem scikit-learn lays the matrix out as
# [[TN, FP],
#  [FN, TP]]   (rows = true class, columns = predicted class),
# so flattening it yields the four counts in this order.
TN, FP, FN, TP = confusion_matrix.ravel()

[[153   3]
 [ 15 180]]


In [10]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_absolute_error,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Each entry pairs the printed label with its value. FP and TN are the
# confusion-matrix counts computed in the previous cell.
# NOTE(review): roc_auc_score is given the hard 0/1 predictions rather than
# continuous decision scores - confirm that this is the intended ROC figure.
metric_report = [
    ('Accuracy Score:', accuracy_score(y_test, y_pred)),
    ('Precision Score:', precision_score(y_test, y_pred)),
    # True positive rate, also called sensitivity or recall
    ('True positive Rate:', recall_score(y_test, y_pred)),
    # False positive rate = FP / (TN + FP)
    ('False positive Rate', (FP / float(TN + FP))),
    # F1 score, also called F-measure or F-score
    ('F1 Score:', (f1_score(y_test, y_pred))),
    # Specificity = TN / (TN + FP)
    ('Specificity:', (TN / (TN + FP))),
    ('Mean Absolute Error:', mean_absolute_error(y_test, y_pred)),
    ('ROC Area:', roc_auc_score(y_test, y_pred)),
]

for label, value in metric_report:
    print(label, value)

Accuracy Score: 0.9487179487179487
Precision Score: 0.9836065573770492
True positive Rate: 0.9230769230769231
False positive Rate 0.019230769230769232
F1 Score: 0.9523809523809524
Specificity: 0.9807692307692307
Mean Absolute Error: 0.05128205128205128
ROC Area: 0.951923076923077


#### Save and load the model

As the metrics are satisfactory, we can save the model and load it back.

In [11]:
import pickle               # pickle serializes and de-serializes Python object structures

# The context manager closes the file even if pickle.dump raises, so the
# handle is never leaked and the data is flushed to disk on exit.
# NOTE(review): only the classifier is persisted; the fitted TfidfVectorizer
# `cv`, which is needed to transform new comments, is not saved.
with open("model.pkl", "wb") as Support_Vector_Machine:      # open the file for writing
    pickle.dump(classifier, Support_Vector_Machine)          # dump the trained model to the file

In [12]:
# Load the model
# The context manager guarantees the file handle is closed after reading
# (the handle was previously left open).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("model.pkl", "rb") as ytb_model:   # open the file for reading
    new_model = pickle.load(ytb_model)       # load the object from the file into new_model
new_model

In [13]:
# Classify a new comment with the reloaded model
comment = ["Hey Music Fans I really appreciate all of you,but see this song too"]

# Vectorize with the TF-IDF vectorizer fitted earlier in the notebook,
# then convert to a dense array before predicting.
comment_tfidf = cv.transform(comment)
vect = comment_tfidf.toarray()
new_model.predict(vect)

array([1], dtype=int64)

In [14]:
# Translate the predicted class into a readable label: 1 -> "Spam", otherwise "Ham".
verdict = "Spam" if new_model.predict(vect) == 1 else "Ham"
print(verdict)

Spam
