Mike Cresswell: Support Vector Machine Implementation

SVM Classifying on TF-IDF Vectorized Text and Source

In [None]:
#Code Classifies on Text and Source
#Using Linear SVM Kernel
import pandas as pd
import numpy as np
import time
import io
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import KFold 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score

#Get Processed Data
url="https://raw.githubusercontent.com/mgcresswell/TCSS555-Project/main/deceptive-opinion_processed.csv"
response = requests.get(url, timeout=30)
#Fail fast on an HTTP error instead of parsing an error page as CSV
response.raise_for_status()
s = response.content
Corpus = pd.read_csv(io.StringIO(s.decode('utf-8')))

#Set Y
y = Corpus['deceptive']
#Drop unused X columns
X = Corpus.drop(['id','deceptive','hotel','polarity'], axis=1)

#Encode Class Labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

#One hot encode source label
#A separate LabelEncoder is used here so label_encoder keeps the mapping for y
sourceEncoded = LabelEncoder().fit_transform(X['source']).reshape(-1, 1)
#The encoder's default output is a sparse matrix, so it is densified explicitly
#instead of passing sparse=False (that keyword was renamed in newer scikit-learn)
onehot_encoder = OneHotEncoder()
sourceOneHot = onehot_encoder.fit_transform(sourceEncoded).toarray()
source_frame = pd.DataFrame(
    sourceOneHot,
    index=X.index,
    columns=[f"source_{i}" for i in range(sourceOneHot.shape[1])],
)

#Vectorize text using tfidf vector
Tfidf_vect = TfidfVectorizer(max_features=1288)
Text_Idf = Tfidf_vect.fit_transform(X['text'])
text_dense = Text_Idf.toarray()
text_frame = pd.DataFrame(
    text_dense,
    index=X.index,
    columns=[f"tfidf_{i}" for i in range(text_dense.shape[1])],
)

#Build the feature matrix with one column per one-hot / TF-IDF feature.
#A 2-D matrix cannot be stored in the single 'source' or 'text' column
#(pandas either raises or keeps only part of it, depending on the version),
#so the raw columns are replaced by the expanded feature frames.
X = pd.concat([X.drop(['source','text'], axis=1), source_frame, text_frame], axis=1)
#NOTE(review): the vectorizer is fitted on the full corpus before splitting,
#so test documents influence the IDF weights - consider fitting on train only.
#NOTE(review): 'source' may be a strong proxy for the label - verify before
#relying on the scores reported by this cell.

#Train/Test Split
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
#n_splits=1, so the generator yields exactly one (train, test) pair
train_index, test_index = next(split.split(X, y))
#The splitter returns positional indices, hence iloc rather than loc
Train_X, Test_X = X.iloc[train_index], X.iloc[test_index]
Train_Y, Test_Y = y[train_index], y[test_index]

#Initialize model class
SVM = svm.SVC(kernel='linear')
start = time.perf_counter()
SVM.fit(Train_X, Train_Y)
stop = time.perf_counter()

#Test Accuracy
y_pred = SVM.predict(Test_X)
test_Accuracy = accuracy_score(Test_Y, y_pred)*100

#Train Accuracy
y_pred = SVM.predict(Train_X)
train_Accuracy = accuracy_score(Train_Y, y_pred)*100

#measure cross validation (mean accuracy over 10 folds, as a percentage)
crossvalMean = cross_val_score(SVM, X, y, cv=10).mean()*100

#get training time
curTime = stop - start
print(f"Training Time = {curTime:0.8f} Seconds")
print(f"Test Accuracy = {test_Accuracy}")
print(f"Train Accuracy = {train_Accuracy}")
print(f"Cross Validation Mean = {crossvalMean}")

Training Time = 0.00520728 Seconds
Test Accuracy = 100.0
Train Accuracy = 100.0
Cross Validation Mean = 100.0


In [None]:
#Code Only Classifies on Text
#Using Linear SVM Kernel
import pandas as pd
import numpy as np
import time
import io
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import KFold 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score

#Get Processed Data
url="https://raw.githubusercontent.com/mgcresswell/TCSS555-Project/main/deceptive-opinion_processed.csv"
response = requests.get(url, timeout=30)
#Fail fast on an HTTP error instead of parsing an error page as CSV
response.raise_for_status()
s = response.content
Corpus = pd.read_csv(io.StringIO(s.decode('utf-8')))

#Labels and raw review text
y = Corpus['deceptive']
X =  np.array(Corpus['text'])

#Encode Class Labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

#Vectorize text using tfidf vector; X becomes a dense (documents x terms) array
Tfidf_vect = TfidfVectorizer(max_features=1288)
Text_Idf = Tfidf_vect.fit_transform(X)
X = Text_Idf.toarray()
#NOTE(review): the vectorizer is fitted on the full corpus before splitting,
#so test documents influence the IDF weights - consider fitting on train only.

#Train/Test Split
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
#n_splits=1, so the generator yields exactly one (train, test) pair
train_index, test_index = next(split.split(X, y))
Train_X, Test_X = X[train_index], X[test_index]
Train_Y, Test_Y = y[train_index], y[test_index]

#Initialize model class and time only the fit call
SVM = svm.SVC(kernel='linear')
start = time.perf_counter()
SVM.fit(Train_X, Train_Y)
stop = time.perf_counter()

#Test Accuracy
y_pred = SVM.predict(Test_X)
test_Accuracy = accuracy_score(Test_Y, y_pred)*100

#Train Accuracy
y_pred = SVM.predict(Train_X)
train_Accuracy = accuracy_score(Train_Y, y_pred)*100

#measure cross validation; scaled to a percentage so every accuracy
#figure printed by this cell uses the same unit
crossvalMean = cross_val_score(SVM, X, y, cv=10).mean()*100

#get training time
curTime = stop - start
print(f"Training Time = {curTime:0.8f} Seconds")
print(f"Test Accuracy = {test_Accuracy}")
print(f"Train Accuracy = {train_Accuracy}")
print(f"Cross Validation Mean = {crossvalMean}")

Training Time = 1.71617123 Seconds
Test Accuracy = 87.91666666666667
Train Accuracy = 96.78571428571429
Cross Validation Mean = 0.8674999999999999


In [None]:
#Uses RBF Kernel & Feature Engineering
#Features used for classification determined by results
#from SVM_RBF_Tuning functions
import pandas as pd
import numpy as np
import time
import io
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import KFold 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score

#Get Processed Data (text used for TF-IDF)
url="https://raw.githubusercontent.com/mgcresswell/TCSS555-Project/main/deceptive-opinion_processed.csv"
response = requests.get(url, timeout=30)
#Fail fast on an HTTP error instead of parsing an error page as CSV
response.raise_for_status()
s = response.content
processed = pd.read_csv(io.StringIO(s.decode('utf-8')))

#Get Raw Data (text used for the hand-made features)
url="https://raw.githubusercontent.com/mgcresswell/TCSS555-Project/main/deceptive-opinion.csv"
response = requests.get(url, timeout=30)
response.raise_for_status()
s = response.content
raw = pd.read_csv(io.StringIO(s.decode('utf-8')))

y = processed['deceptive']
X = processed.drop(['id','deceptive','source','hotel','polarity'], axis=1)

#Feature Engineering
#NOTE(review): the raw features are attached to X by index, which assumes both
#files list the same reviews in the same order - verify.
#Single characters counted as punctuation. The previous list held '}' twice and
#no ']', plus a two-character '""' entry that can never equal one character.
punc = set('`~!()_-{[}]:;",.?/')
X['word_count'] = raw["text"].apply(lambda x: len(str(x).split(" ")))
X['sentence_count'] = raw["text"].apply(lambda x: len(str(x).split(".")))
X['num_unique_words'] = raw['text'].apply(lambda x: len(set(x.split())))
X['avg_sentence_length'] = X['word_count'] / X['sentence_count']
X['punc_count'] = raw['text'].apply(lambda x: sum(1 for a in x if a in punc))
X['capitals'] = raw['text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
X['num_question_marks'] = raw['text'].apply(lambda x: x.count('?'))
X['num_symbols'] = raw['text'].apply(lambda x: sum(x.count(w) for w in '*&$%'))
X['words_vs_unique'] = X['num_unique_words'] / X['word_count']

#The raw counts are only needed to derive the ratios above
X = X.drop(['word_count','sentence_count','num_unique_words'], axis=1)

#Encode Class Labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

#Vectorize text using tfidf vector
Tfidf_vect = TfidfVectorizer(max_features=3500)
Text_Idf = Tfidf_vect.fit_transform(X['text'])
text_dense = Text_Idf.toarray()
text_frame = pd.DataFrame(
    text_dense,
    index=X.index,
    columns=[f"tfidf_{i}" for i in range(text_dense.shape[1])],
)

#Replace the raw 'text' column with one column per TF-IDF feature.
#A 2-D matrix cannot be stored in the single 'text' column (pandas either
#raises or keeps only part of it, depending on the version).
X = pd.concat([X.drop(['text'], axis=1), text_frame], axis=1)
#NOTE(review): the count features are on a much larger scale than the TF-IDF
#weights; consider standardising the columns before using the RBF kernel.
#NOTE(review): the vectorizer is fitted on the full corpus before splitting,
#so test documents influence the IDF weights - consider fitting on train only.

#Train/Test Split
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
#n_splits=1, so the generator yields exactly one (train, test) pair
train_index, test_index = next(split.split(X, y))
#The splitter returns positional indices, hence iloc rather than loc
Train_X, Test_X = X.iloc[train_index], X.iloc[test_index]
Train_Y, Test_Y = y[train_index], y[test_index]

#Initialize model class and time only the fit call
SVM = svm.SVC(kernel='rbf', gamma='scale')
start = time.perf_counter()
SVM.fit(Train_X, Train_Y)
stop = time.perf_counter()

#Test Accuracy
y_pred = SVM.predict(Test_X)
test_Accuracy = accuracy_score(Test_Y, y_pred)*100

#Train Accuracy
y_pred = SVM.predict(Train_X)
train_Accuracy = accuracy_score(Train_Y, y_pred)*100

#measure cross validation; scaled to a percentage so every accuracy
#figure printed by this cell uses the same unit
crossvalMean = cross_val_score(SVM, X, y, cv=10).mean()*100

#get training time
curTime = stop - start
print(f"Training Time = {curTime:0.8f} Seconds")
print(f"Test Accuracy = {test_Accuracy}")
print(f"Training Accuracy = {train_Accuracy}")
print(f"Cross Validation Mean = {crossvalMean}")

Training Time = 0.04814170 Seconds
Test Accuracy = 71.45833333333333
Training Accuracy = 67.05357142857142
Cross Validation Mean = 0.675
