Mike Cresswell: SVM RBF Tuning

In [1]:
import pandas as pd
import numpy as np
import io
import requests
import time
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from itertools import combinations

In [3]:
def _read_csv_from_url(url, timeout=30):
    """Download a UTF-8 encoded CSV file and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        Location of the CSV file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be handed to ``pd.read_csv`` and silently parsed.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))


# Pre-processed corpus: source of the model features and the target label.
url = "https://raw.githubusercontent.com/mgcresswell/TCSS555-Project/main/deceptive-opinion_processed.csv"
Corpus = _read_csv_from_url(url)

# Unprocessed reviews: used later to derive surface statistics from the text.
url = "https://raw.githubusercontent.com/mgcresswell/TCSS555-Project/main/deceptive-opinion.csv"
raw = _read_csv_from_url(url)

# Target label and feature frame (row id and label removed from the features).
y = Corpus['deceptive']
X = Corpus.drop(['id', 'deceptive'], axis=1)

In [4]:
#feature engineering
# Surface statistics computed from the unprocessed review text in `raw` and
# stored as new columns of `X`.
# NOTE(review): the assignments align `raw` and `X` by index, which assumes both
# CSV files list the reviews in the same order - confirm.

# Single characters counted as punctuation by `punc_count`. The closing square
# bracket was previously missing (a second '}' stood in its place), and a
# two-character '""' entry could never match a single character.
punc = ['`','~','!','(',')','_','-','{','[','}',']',':',';','"',',','.','?','/']

# Each feature is computed exactly once; the column order below is the order in
# which the columns are first created.
X['char_count'] = raw["text"].apply(lambda x: sum(len(word) for word in str(x).split(" ")))
X['total_length'] = raw['text'].apply(len)
X['punc_count'] = raw['text'].apply(lambda x : len([a for a in x if a in punc]))
X['word_count'] = raw["text"].apply(lambda x: len(str(x).split(" ")))
# Number of '.'-separated pieces (always at least 1, so safe as a divisor).
X['sentence_count'] = raw["text"].apply(lambda x: len(str(x).split(".")))
X['avg_word_length'] = X['char_count'] / X['word_count']
X['avg_sentence_length'] = X['word_count'] / X['sentence_count']
# The +1 keeps the denominator non-zero for texts without non-space characters.
X['word_density'] = X['word_count'] / (X['char_count'] + 1)
X['capitals'] = raw['text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
X['num_exclamation_marks'] = raw['text'].apply(lambda x: x.count('!'))
X['num_question_marks'] = raw['text'].apply(lambda x: x.count('?'))
X['num_punctuation'] = raw['text'].apply(lambda x: sum(x.count(w) for w in '.,;:'))
X['num_symbols'] = raw['text'].apply(lambda x: sum(x.count(w) for w in '*&$%'))
# Unique tokens are split on any whitespace, whereas word_count splits on
# single spaces only.
X['num_unique_words'] = raw['text'].apply(lambda x: len(set(w for w in x.split())))
# The next two columns carry the same ratio, once as a fraction and once as a
# percentage.
X['words_vs_unique'] = X['num_unique_words'] / X['word_count']
X["word_unique_percent"] =  X["num_unique_words"]*100/X['word_count']

In [5]:
#preprocessing
# Integer-encode the target and the categorical columns. The same encoder is
# refit on every call, so after this cell it holds the classes of the last
# column fitted ('source'), not those of y.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
hotelEncoded = label_encoder.fit_transform(X['hotel']).reshape(-1, 1)
polarityEncoded = label_encoder.fit_transform(X['polarity']).reshape(-1, 1)
sourceEncoded = label_encoder.fit_transform(X['source']).reshape(-1, 1)

# OneHotEncoder returns a sparse matrix by default; it is densified with
# .toarray() rather than via the `sparse=False` keyword, which newer
# scikit-learn releases renamed to `sparse_output` and then removed.
#
# FIXME(review): each feature below must fit into ONE DataFrame column, because
# the tuning loop enumerates X.columns by name. A one-hot matrix has one column
# per category, so only its first indicator column is stored. The original code
# assigned the whole 2-D matrix to the column; newer pandas rejects that, and
# older releases appear to have kept the first column silently - the explicit
# [:, 0] makes that behaviour visible and version-independent. A complete
# encoding needs a wide feature matrix and a matching change to the tuning loop.
onehot_encoder = OneHotEncoder()
X['hotel'] = onehot_encoder.fit_transform(hotelEncoded).toarray()[:, 0]
X['polarity'] = onehot_encoder.fit_transform(polarityEncoded).toarray()[:, 0]
# NOTE(review): 'source' may reveal the label directly (possible target
# leakage) - verify before trusting accuracies of feature sets containing it.
X['source'] = onehot_encoder.fit_transform(sourceEncoded).toarray()[:, 0]

# TF-IDF over the processed review text. The vectorizer is fitted on the whole
# corpus, i.e. before the train/test split below.
Tfidf_vect = TfidfVectorizer(max_features=2300)
Tfidf_vect.fit(Corpus['text'])
Text_Idf = Tfidf_vect.transform(X['text'])
# FIXME(review): same single-column limitation as above - only the weight of the
# first vocabulary term is stored; the full matrix stays available as Text_Idf.
X['text'] = Text_Idf[:, 0].toarray().ravel()

# Single stratified 70/30 split. The splitter yields positional indices, so the
# rows are selected with .iloc (and plain indexing for the label array).
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(split.split(X, y))
Train_X, Test_X = X.iloc[train_index], X.iloc[test_index]
Train_Y, Test_Y = y[train_index], y[test_index]

In [7]:
# RBF-kernel feature search: fit one SVC for every single feature and every
# pair of features, recording test-set accuracy and the time spent in fit().
tunningData = []
for subset_size in range(1, 3):
    for feature_subset in combinations(X.columns, subset_size):
        feature_names = list(feature_subset)
        train_features = Train_X[feature_names]
        test_features = Test_X[feature_names]

        SVM = svm.SVC(kernel='rbf', gamma='scale')

        # Only the call to fit() is timed; prediction is excluded.
        fit_started = time.perf_counter()
        SVM.fit(train_features, Train_Y)
        fit_seconds = time.perf_counter() - fit_started

        y_pred = SVM.predict(test_features)
        tunningData.append({
            'features': ",".join(feature_names),
            'accuracy': accuracy_score(Test_Y, y_pred),
            'time': fit_seconds,
        })


# Rank the subsets: highest accuracy first, ties broken by the shortest fit time.
df = pd.DataFrame(data=tunningData, columns=['features','accuracy','time'])
df = df.sort_values(by=['accuracy', 'time'], ascending=[False, True])
print(df)

                           features  accuracy      time
57                      source,text  1.000000  0.001494
65              source,word_density  1.000000  0.001535
39                  polarity,source  1.000000  0.001693
21                     hotel,source  1.000000  0.001724
2                            source  1.000000  0.001878
..                              ...       ...       ...
104  char_count,word_unique_percent  0.485417  0.043151
12                         capitals  0.483333  0.040514
5                      total_length  0.483333  0.041629
4                        char_count  0.481250  0.041968
7                        word_count  0.479167  0.043675

[210 rows x 3 columns]
