# Support Vector Machine for Hate Speech Detection

### Import Libraries

In [6]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from evaluate_classification import EvaluateBinaryClassification

### Initialise Random Seed

In [2]:
# Fixed seed applied to NumPy's global random number generator.
SEED = 123
np.random.seed(SEED)

### Loading Data

In [3]:
# Directory prefix that is concatenated with a file name to build each CSV path.
# NOTE(review): hard-coded absolute Windows path — adjust for the local machine.
BASE = 'D:\\ResearchDataGtx1060\\SentimentData\\Hate\\'
# Training / test file names; `track` is the index used to pick one entry from each list.
fins_train = ['random_hate_train.csv']
fins_test = ['eastasian_hate_test.csv']
track = 0

In [4]:
# Only non-ASCII stripping is applied here because the data is already preprocessed
def cleanNonAscii(text):
    '''
    Remove non-ASCII characters from a piece of text.

    Missing values (None or a float NaN) are mapped to an empty string
    instead of raising TypeError, so the function can be applied to a
    column that contains empty cells.

    Arguments:
        text: str (or None / NaN for a missing value)
    returns:
        text: str made only of the characters whose code point is below 128
    '''
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return ''.join(ch for ch in text if ord(ch) < 128)

In [7]:
# Read the training CSV selected by `track` and preview its first rows.
df_train = pd.read_csv(f'{BASE}{fins_train[track]}')
df_train.head(n=5)

Unnamed: 0,label,text
0,1,<user> if you are one of the <number> mil <has...
1,0,best <hashtag> law of attraction </hashtag> <h...
2,1,<hashtag> michelle obama </hashtag> is the mos...
3,0,smiling because life is good rite now ! <repea...
4,0,ã ¢ â  â ¤ ã ¯ â ¸ â  ã ¢ â  â ¤ ã ¯ â ¸ â ...


In [8]:
# Count the non-null entries of each column per class label.
df_train.groupby(by='label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,2242
1,2242


In [9]:
# Strip non-ASCII characters from every training text, then preview the result.
df_train['text'] = df_train['text'].map(cleanNonAscii)
df_train.head()

Unnamed: 0,label,text
0,1,<user> if you are one of the <number> mil <has...
1,0,best <hashtag> law of attraction </hashtag> <h...
2,1,<hashtag> michelle obama </hashtag> is the mos...
3,0,smiling because life is good rite now ! <repea...
4,0,<hashtag> ...


In [10]:
# Pull the training texts and their labels out of the frame as arrays.
X_train = df_train['text'].values
y_train = df_train['label'].values

In [11]:
# Read the test CSV selected by `track` and preview its first rows.
df_test = pd.read_csv(f'{BASE}{fins_test[track]}')
df_test.head(n=5)

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [12]:
# Count the non-null entries of each column per class label.
df_test.groupby(by='label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,3898
1,3898


In [13]:
# Strip non-ASCII characters from every test text, then preview the result.
df_test['text'] = df_test['text'].map(cleanNonAscii)
df_test.head()

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [14]:
# Pull the test texts and their labels out of the frame as arrays.
X_test = df_test['text'].values
y_test = df_test['label'].values

### Transforming the data into a format suitable for the model

In [15]:
# Fit the vectorizer on the training texts and materialise the counts as a dense array.
# (The fitted vocabulary can be inspected via vectorizer.vocabulary_.)
vectorizer = CountVectorizer()
Xtrain = vectorizer.fit_transform(X_train).toarray()

In [16]:
# Encode the test texts with the vectorizer fitted on the training texts (dense array).
Xtest = vectorizer.transform(X_test).toarray()

### Define linear SVM and fit the model

In [18]:
# Linear SVM.
# random_state=SEED pins the estimator's own random number generator, so
# re-running this cell produces the same model instead of depending on the
# current state of NumPy's global RNG.
clf_LinearSVC = LinearSVC(random_state=SEED)
clf_LinearSVC.fit(Xtrain, y_train)



LinearSVC()

In [20]:
# Ground-truth test labels and the linear SVM's predictions for the test matrix.
actual = y_test
predicted = np.array(clf_LinearSVC.predict(Xtest))

### Evaluating the model with test dataset

In [21]:
# Build the evaluation helper from labels and predictions, then print its full report.
ebc = EvaluateBinaryClassification(
    gnd_truths=actual,
    predictions=predicted,
)
print(ebc.get_full_report())

EvaluateBinaryClassification Object Created

Total Samples	7796
Positive Samples	3898
Negative Samples	3898
True Positive	2291
True Negative	2064
False Positive	1834
False Negative	1607
Accuracy	0.5586198050282196
Precision	0.5553939393939394
Recall	0.5877373011800924
F1 Measure	0.5711080643150942
Cohen Kappa Score	0.11723961005643924
Area Under Curve	0.5586198050282196

              precision    recall  f1-score   support

           0       0.56      0.53      0.55      3898
           1       0.56      0.59      0.57      3898

    accuracy                           0.56      7796
   macro avg       0.56      0.56      0.56      7796
weighted avg       0.56      0.56      0.56      7796



In [22]:
# Save the linear SVM report.
# NOTE(review): `path` is a hard-coded absolute Windows location — adjust for the local machine.
ebc.save_full_report(
    model_name='Linear SVM',
    path='C:\\Users\\User\\JupyterPythonPredator\\COVID19\\domain_adaptation_rerun_randomhate_eastasianhate_',
)

### Define Kernel SVM and Fit the Model

In [23]:
# Hyper-parameter grid: an RBF kernel searched over gamma and C, and a linear kernel searched over C.
param_grid = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000],
    },
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000],
    },
]
# Name of the metric handed to the grid search as its `scoring` argument.
score = 'accuracy'

In [25]:
# Fit the model on the training data: 10-fold cross-validated search over param_grid.
grid_svm = GridSearchCV(SVC(), param_grid, cv=10, scoring=score)
grid_svm.fit(Xtrain, y_train)

# Report the winning parameter combination followed by the search configuration.
print("Best parameters set found on development set:", end="\n\n")
print(grid_svm.best_params_, end="\n\n")
print(grid_svm)

Best parameters set found on development set:

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}

GridSearchCV(cv=10, estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
             scoring='accuracy')


### Evaluating the Kernel SVM model with test dataset

In [27]:
# Ground-truth test labels and the grid-searched SVM's predictions for the test matrix.
actual = y_test
predicted = grid_svm.predict(Xtest)

In [28]:
# Build the evaluation helper from labels and predictions, then print its full report.
ebc = EvaluateBinaryClassification(
    gnd_truths=actual,
    predictions=predicted,
)
print(ebc.get_full_report())

EvaluateBinaryClassification Object Created

Total Samples	7796
Positive Samples	3898
Negative Samples	3898
True Positive	2539
True Negative	1964
False Positive	1934
False Negative	1359
Accuracy	0.577603899435608
Precision	0.5676279901632014
Recall	0.6513596716264751
F1 Measure	0.6066180862501492
Cohen Kappa Score	0.15520779887121605
Area Under Curve	0.577603899435608

              precision    recall  f1-score   support

           0       0.59      0.50      0.54      3898
           1       0.57      0.65      0.61      3898

    accuracy                           0.58      7796
   macro avg       0.58      0.58      0.58      7796
weighted avg       0.58      0.58      0.58      7796



In [29]:
# Save the kernel SVM report.
# NOTE(review): `path` is a hard-coded absolute Windows location — adjust for the local machine.
ebc.save_full_report(
    model_name='Kernel SVM',
    path='C:\\Users\\User\\JupyterPythonPredator\\COVID19\\domain_adaptation_rerun_randomhate_eastasianhate_',
)