# Ridge Classifier for Hate Speech Detection

### Import Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from evaluate_classification import EvaluateBinaryClassification

### Initialise Random Variables

In [3]:
SEED = 123
np.random.seed(SEED)

### Loading Data

In [4]:
BASE = 'D:\\ResearchDataGtx1060\\SentimentData\\Hate\\'
fins_train = ['random_hate_train.csv']
fins_test = ['eastasian_hate_test.csv']
track = 0

In [5]:
# We apply only this preprocessing because our data is already preprocessed
def cleanNonAscii(text):
    '''
    Remove Non ASCII characters from the dataset.
    Arguments:
        text: str
    returns: 
        text: str
    '''
    return ''.join(i for i in text if ord(i) < 128)

In [6]:
df_train = pd.read_csv(BASE+fins_train[track])
df_train.head()

Unnamed: 0,label,text
0,1,<user> if you are one of the <number> mil <has...
1,0,best <hashtag> law of attraction </hashtag> <h...
2,1,<hashtag> michelle obama </hashtag> is the mos...
3,0,smiling because life is good rite now ! <repea...
4,0,ã ¢ â  â ¤ ã ¯ â ¸ â  ã ¢ â  â ¤ ã ¯ â ¸ â ...


In [7]:
df_train.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,2242
1,2242


In [8]:
df_train['text'] = df_train['text'].apply(cleanNonAscii)
df_train.head(5)

Unnamed: 0,label,text
0,1,<user> if you are one of the <number> mil <has...
1,0,best <hashtag> law of attraction </hashtag> <h...
2,1,<hashtag> michelle obama </hashtag> is the mos...
3,0,smiling because life is good rite now ! <repea...
4,0,<hashtag> ...


In [9]:
X_train, y_train = df_train['text'].values, df_train['label'].values

In [10]:
df_test = pd.read_csv(BASE+fins_test[track])
df_test.head()

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [11]:
df_test.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,3898
1,3898


In [12]:
df_test['text'] = df_test['text'].apply(cleanNonAscii)
df_test.head(5)

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [13]:
X_test, y_test = df_test['text'].values, df_test['label'].values

### Transforming data suitable for model format

In [14]:
count_vectorizer = CountVectorizer(binary=False)
count_vectorizer.fit(X_train)
#count_vectorizer.vocabulary_

CountVectorizer()

In [15]:
train_vectors = count_vectorizer.transform(X_train)
X_train_vectors = train_vectors.toarray()
test_vectors = count_vectorizer.transform(X_test)
X_test_vectors = test_vectors.toarray()

### set hyper parameters to search¶

In [27]:
param_grid = {'alpha': [0.9, 1.0],
              'solver': ['lsqr']
             }

### Define and fit the model

In [28]:
clf_RidgeClassifier = RidgeClassifier()
keys = clf_RidgeClassifier.get_params().keys()
print(keys)
grid_rc = GridSearchCV(clf_RidgeClassifier, param_grid, cv=10)
grid_rc.fit(X_train_vectors, y_train)

dict_keys(['alpha', 'class_weight', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'random_state', 'solver', 'tol'])


Traceback (most recent call last):
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py", line 945, in fit
    super().fit(X, Y, sample_weight=sample_weight)
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py", line 574, in fit
    sample_weight=sample_weight, return_mean=True)
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\linear_model\_base.py", line 160, in _preprocess_data
    X_offset = np.average(X, axis=0, weights=sample_weight)
  File "<__array_function__ internals>", line 6, in average
  File "C:\Users\User\anaconda3\lib\site-packages\numpy\lib\function_base.py", line 425, in average
    avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
MemoryError: Unable to allocate 291. MiB for an array with shape (4035, 9447) and data type float6

GridSearchCV(cv=10, estimator=RidgeClassifier(),
             param_grid={'alpha': [0.9, 1.0], 'solver': ['lsqr']})

In [29]:
print("Best parameters set found on development set:")
print(grid_rc.best_params_)

Best parameters set found on development set:
{'alpha': 0.9, 'solver': 'lsqr'}


In [22]:
y_true, y_pred = y_train, grid_rc.predict(X_train_vectors)
print(EvaluateBinaryClassification(gnd_truths = y_true, predictions = y_pred).get_full_report())

EvaluateBinaryClassification Object Created

Total Samples	4484
Positive Samples	2242
Negative Samples	2242
True Positive	2228
True Negative	2238
False Positive	4
False Negative	14
Accuracy	0.995985727029438
Precision	0.9982078853046595
Recall	0.9937555753791257
F1 Measure	0.9959767545820295
Cohen Kappa Score	0.991971454058876
Area Under Curve	0.9959857270294381

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2242
           1       1.00      0.99      1.00      2242

    accuracy                           1.00      4484
   macro avg       1.00      1.00      1.00      4484
weighted avg       1.00      1.00      1.00      4484



### Evaluating the model with test dataset

In [24]:
actual, predicted = y_test, grid_rc.predict(X_test_vectors)

In [25]:
ebc = EvaluateBinaryClassification(gnd_truths = actual, predictions = predicted)
print(ebc.get_full_report())

EvaluateBinaryClassification Object Created

Total Samples	7796
Positive Samples	3898
Negative Samples	3898
True Positive	1725
True Negative	2432
False Positive	1466
False Negative	2173
Accuracy	0.5332221652129298
Precision	0.540582889376371
Recall	0.4425346331452027
F1 Measure	0.4866694879390605
Cohen Kappa Score	0.0664443304258594
Area Under Curve	0.5332221652129298

              precision    recall  f1-score   support

           0       0.53      0.62      0.57      3898
           1       0.54      0.44      0.49      3898

    accuracy                           0.53      7796
   macro avg       0.53      0.53      0.53      7796
weighted avg       0.53      0.53      0.53      7796



In [26]:
ebc.save_full_report(model_name='RC', path='C:\\Users\\User\\JupyterPythonPredator\\COVID19\\domain_adaptation_rerun_randomhate_eastasianhate_')