Michael Li (ml5803) and Kaixuan Zhou (kz1005)

Support Vector Machine

Text Classification : Toxic, Information, Sports, Religious, and Advertisement

Step 1: Imports and Authentication

In [0]:
import warnings
warnings.simplefilter("ignore")

In [0]:
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

In [0]:
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

Step 2: Import Data


In [4]:
#import libraries
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

worksheet = gc.open('output_copy').sheet1
records = 30000
rows = worksheet.get_all_values()
header = rows[0]

data = rows[1:records]

print("Total number of data:", len(data))
print("data header:")
print(rows[0])

Total number of data: 21115
data header:
['Handle', 'TweetID', 'Tweet', 'Toxic', 'Information', 'Sports', 'Religious', 'Advertisement', 'Classification']


In [0]:
# # Uncomment for Balanced Data
# balanced_data = []
# count = [0, 0, 0, 0, 0]
# for tweet in data:
#     count[int(tweet[-1])] += 1

# new_count = [0, 0, 0, 0, 0]
# min_count = min(count)
# for tweet in data:
#     if new_count[int(tweet[-1])] < min_count:
#         balanced_data.append(tweet)
#         new_count[int(tweet[-1])] += 1

# data = balanced_data
# print("Total number of data:", len(data))

In [0]:
#shuffle df
df = pd.DataFrame(data, columns = header) 
df = df.sample(frac= 1).reset_index(drop=True)

In [0]:
labels = ['Toxic', 'Information', 'Sports', 'Religious', 'Advertisement']

In [0]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

TFID Vectorizer to create matrix representation of words

In [0]:
# TFID Vectorizer
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, stop_words='english')

Start training using SVM

In [0]:
from sklearn import svm

# Create a classifier: a support vector classifier
# svc = svm.SVC(probability=False,  kernel="rbf", C=2.8, gamma=.0073,verbose=10)
svc = svm.SVC(probability=False,  kernel="linear", C=2.8, gamma=.0073,verbose=10)

Step 3: K-Fold Cross Validation

In [11]:
kf = KFold(n_splits=5)
iteration = 1
import pickle

accuracies = { label:[] for label in labels}

for train_index, test_index in kf.split(df):
    print("Iteration:", iteration)
    iteration += 1

    for model in labels:
        Xtr, Xts = df["Tweet"][train_index], df["Tweet"][test_index]
        ytr, yts = df[model][train_index], df[model][test_index]
        Xtr = vec.fit_transform(Xtr)
        Xts = vec.transform(Xts)
        print("model:", model)
        with open( model + str(iteration) + "svc.p", "wb" ) as fp:
            pickle.dump( [svc, Xtr, ytr], fp)
        with open( model + str(iteration) + "svc.p", "rb" ) as fp:
            svc, Xtr, ytr = pickle.load(fp)
        print("fitting the svm")
        svc.fit(Xtr, ytr)
        yhat_ts = svc.predict(Xts)
        acc = np.mean(yhat_ts == yts)
        print("Accuracy:", acc)
        accuracies[model].append(acc)

Iteration: 1
model: Toxic
fitting the svm
[LibSVM]Accuracy: 0.995264030310206
model: Information
fitting the svm
[LibSVM]Accuracy: 0.9152261425526876
model: Sports
fitting the svm
[LibSVM]Accuracy: 0.9689793985318494
model: Religious
fitting the svm
[LibSVM]Accuracy: 0.939379587970637
model: Advertisement
fitting the svm
[LibSVM]Accuracy: 0.9571394743073645
Iteration: 2
model: Toxic
fitting the svm
[LibSVM]Accuracy: 0.9936064409187781
model: Information
fitting the svm
[LibSVM]Accuracy: 0.9171205304286053
model: Sports
fitting the svm
[LibSVM]Accuracy: 0.9670850106559318
model: Religious
fitting the svm
[LibSVM]Accuracy: 0.9400899834241061
model: Advertisement
fitting the svm
[LibSVM]Accuracy: 0.9486147288657353
Iteration: 3
model: Toxic
fitting the svm
[LibSVM]Accuracy: 0.995264030310206
model: Information
fitting the svm
[LibSVM]Accuracy: 0.9154629410371774
model: Sports
fitting the svm
[LibSVM]Accuracy: 0.9680322045938906
model: Religious
fitting the svm
[LibSVM]Accuracy: 0.93511721

### Testing

In [12]:
#Build model for testing

models = {}
for model in labels:
    X = vec.fit_transform(df["Tweet"])
    Y = df[model]
    temp = svm.SVC(probability=False,  kernel="linear", C=2.8, gamma=.0073,verbose=10)
    temp.fit(X,Y)
    models[model] = temp

models

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

{'Advertisement': SVC(C=2.8, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma=0.0073, kernel='linear',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=10),
 'Information': SVC(C=2.8, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma=0.0073, kernel='linear',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=10),
 'Religious': SVC(C=2.8, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma=0.0073, kernel='linear',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=10),
 'Sports': SVC(C=2.8, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma=0.0073, kernel='linear',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=10),
 

In [0]:
def test(tweet):
    test_data = vec.transform([tweet])
    json = {}
    #model predicting: 1 is in that category, 0 is not
    for model in models.keys():
        result = models[model].predict(test_data)
        json[model] = result[0]
    return json

In [14]:
print(test("Fuck all of you"))

{'Toxic': '1', 'Information': '0', 'Sports': '0', 'Religious': '0', 'Advertisement': '0'}


In [15]:
for elem in accuracies:
    temp = np.array(accuracies[elem])
    temp_avg = np.average(temp)
    print(elem, ":", temp_avg)

Toxic : 0.9957849869760833
Information : 0.9177835661851764
Sports : 0.9672744494435236
Religious : 0.9403741416054938
Advertisement : 0.9529718209803457


In [16]:
print(test("The Sun is 150 million kilometers away from the Moon."))

{'Toxic': '0', 'Information': '1', 'Sports': '0', 'Religious': '0', 'Advertisement': '0'}


In [17]:
print(test("The Latest: Chiefs, Patriots getting chippy in New England"))

{'Toxic': '0', 'Information': '0', 'Sports': '1', 'Religious': '0', 'Advertisement': '0'}


In [18]:
print(test("Trust in the LORD with all your heart, and do not lean on your own understanding. In all your ways acknowledge him, and he will make straight your paths."))

{'Toxic': '0', 'Information': '0', 'Sports': '0', 'Religious': '1', 'Advertisement': '0'}


In [19]:
print(test("Christmas special: Buy 1 get 1 free."))

{'Toxic': '0', 'Information': '0', 'Sports': '0', 'Religious': '0', 'Advertisement': '1'}
