In [1]:
#!pip install normalise

In [2]:
import numpy as np
import multiprocessing as mp
import pandas as pd
import string
import spacy 
from sklearn.base import TransformerMixin, BaseEstimator


# Load the small English spaCy pipeline once at module level; the
# TextPreprocessor class below uses this object as the default value of its
# `nlp` constructor argument, so it must be defined before the class.
nlp = spacy.load("en_core_web_sm")

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    scikit-learn transformer that normalises raw text with a spaCy pipeline.

    Every input string is passed through ``nlp``; tokens made up only of
    punctuation characters and tokens flagged as stop words are dropped, and
    the lemmas of the remaining tokens are joined with single spaces.
    """

    # Per-character lookup table. A set is used on purpose: testing
    # ``text in string.punctuation`` is a *substring* test, which keeps tokens
    # such as "..." or "!!" and drops any token that happens to be a
    # contiguous slice of ``string.punctuation``.
    _PUNCT_CHARS = frozenset(string.punctuation)

    def __init__(self,
                 nlp = nlp,
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Punctuation removal
            2. Stop words removal
            3. Lemmatization

        nlp  - spacy model
        n_jobs - parallel jobs to run: -1 (or lower) uses every core,
                 0 or 1 runs in the current process, any other positive
                 value is capped at the number of cores
        """
        self.nlp = nlp
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X, *_):
        """
        Preprocess every text in ``X``.

        X - pandas Series of strings (any other 1-D sequence is wrapped in a
            Series first)

        Returns a pandas Series of cleaned strings carrying the same index
        as the input.
        """
        texts = X if isinstance(X, pd.Series) else pd.Series(X)

        workers = self._n_workers(len(texts))
        if workers <= 1:
            # Serial path: no pool start-up and no pickling of the spaCy model.
            return texts.apply(self._preprocess_text)

        # Split by *position* so the chunking works for any index and does not
        # depend on numpy being able to slice a pandas object directly.
        position_chunks = np.array_split(np.arange(len(texts)), workers)
        parts = [texts.iloc[chunk] for chunk in position_chunks]

        # NOTE(review): with the "spawn" start method the worker processes
        # must be able to import this class; a class that only lives in a
        # notebook cell may not be usable from workers - confirm on the target
        # platform or move the class into a .py module.
        # The context manager releases the workers even if a task raises.
        with mp.Pool(workers) as pool:
            processed = pool.map(self._preprocess_part, parts)

        return pd.concat(processed)

    def _n_workers(self, n_rows):
        """Translate ``n_jobs`` into a worker count, never above ``n_rows``."""
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            requested = cores
        elif self.n_jobs <= 0:
            requested = 1
        else:
            requested = min(self.n_jobs, cores)
        # More workers than rows would only create empty partitions.
        return max(1, min(requested, n_rows))

    def _preprocess_part(self, part):
        """Apply the per-text preprocessing to one partition (a Series)."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Run one string through spaCy and return the cleaned, lemmatised text."""
        doc = self.nlp(text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _remove_punct(self, doc):
        """Lazily yield the tokens that are not made up solely of punctuation."""
        punct = self._PUNCT_CHARS
        return (t for t in doc if not all(ch in punct for ch in t.text))

    def _remove_stop_words(self, doc):
        """Lazily yield the tokens whose ``is_stop`` flag is not set."""
        return (t for t in doc if not t.is_stop)

    def _lemmatize(self, doc):
        """Join the ``lemma_`` of every remaining token with single spaces."""
        return ' '.join(t.lemma_ for t in doc)

In [3]:
!pip install datasets
import datasets
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')
df = dataset['train'].to_pandas()
df.describe()



Using custom data configuration ucberkeley-dlab--measuring-hate-speech-b2914663eaad033d
Found cached dataset parquet (C:/Users/pedro/.cache/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-b2914663eaad033d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,hatespeech,hate_speech_score,infitms,outfitms,annotator_severity,std_err,annotator_infitms,annotator_outfitms,hypothesis,annotator_age
count,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,...,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135451.0
mean,23530.416138,5567.097812,1.281352,2.954307,2.828875,2.56331,2.278638,2.698575,1.846211,1.052045,...,0.744733,-0.567428,1.034322,1.001052,-0.018817,0.300588,1.007158,1.011841,0.014589,37.910772
std,12387.194125,3230.508937,1.023542,1.231552,1.309548,1.38983,1.370876,0.8985,1.402372,1.345706,...,0.93226,2.380003,0.496867,0.791943,0.487261,0.23638,0.269876,0.675863,0.613006,11.641276
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-8.34,0.1,0.07,-1.82,0.02,0.39,0.28,-1.578693,18.0
25%,18148.0,2719.0,0.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,...,0.0,-2.33,0.71,0.56,-0.38,0.03,0.81,0.67,-0.341008,29.0
50%,20052.0,5602.5,1.0,3.0,3.0,3.0,3.0,3.0,2.0,0.0,...,0.0,-0.34,0.96,0.83,-0.02,0.34,0.97,0.85,0.110405,35.0
75%,32038.25,8363.0,2.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,...,2.0,1.41,1.3,1.22,0.35,0.42,1.17,1.13,0.449555,45.0
max,50070.0,11142.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,2.0,6.3,5.9,9.0,1.36,1.9,2.01,9.0,0.987511,81.0


In [4]:

# Preview the two columns used for modelling: the label and the raw text.
df.loc[:, ['hatespeech', 'text']]

Unnamed: 0,hatespeech,text
0,0.0,Yes indeed. She sort of reminds me of the elde...
1,0.0,The trans women reading this tweet right now i...
2,2.0,Question: These 4 broads who criticize America...
3,0.0,It is about time for all illegals to go back t...
4,2.0,For starters bend over the one in pink and kic...
...,...,...
135551,0.0,عاجل سماحة #السيد_عبدالملك_بدرالدين_الحوثي نص...
135552,0.0,Millions of #Yemen-is participated in mass ral...
135553,0.0,@AbeShinzo @realDonaldTrump @shinzoabe 独裁者は行きま...
135554,0.0,Millions of #Yemen-is participated in mass ral...


In [5]:
# Distribution of the label column as loaded from the dataset.
df.loc[:, "hatespeech"].value_counts()

0.0    80624
2.0    46021
1.0     8911
Name: hatespeech, dtype: int64

In [6]:
# Collapse the label to binary: any value above 0 becomes 1, everything else 0.
# Vectorised comparison instead of a per-row Python lambda; the explicit int64
# keeps the dtype independent of the platform's default integer size.
df['hatespeech'] = (df['hatespeech'] > 0).astype('int64')

In [7]:
# Share of each class in the label column.
df['hatespeech'].value_counts(normalize=True)

0    0.594765
1    0.405235
Name: hatespeech, dtype: float64

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import  LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import normalize
from sklearn.model_selection import KFold

X = df["text"]
y = df['hatespeech']

# Stratified 70/30 split with a fixed seed so the same rows land in each part
# on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42, test_size= .3, stratify= y)

# Pipeline: spaCy-based text normalisation -> uni/bi-gram TF-IDF -> logistic
# regression whose regularisation strength is picked by 5-fold CV.
# random_state pins the data shuffling done by the 'saga' solver so repeated
# runs give the same model.
clf  = Pipeline(steps=[
        ('normalize', TextPreprocessor(n_jobs=-1)), 
        ('features', TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
        ('classifier', LogisticRegressionCV(cv=5,solver='saga',scoring='accuracy', n_jobs=-1, verbose=0, random_state=42))
    ])

clf.fit(X_train, y_train)
y_pred_log = clf.predict(X_test)

# Accuracy on the held-out test rows, computed from the predictions made just
# above. Scoring on (X_train, y_train) would report how well the model fits
# data it was trained on and would push the training texts through the whole
# preprocessing pipeline a second time.
clf1_score = float((y_pred_log == y_test.to_numpy()).mean())

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# `plot_confusion_matrix` was removed from scikit-learn in 1.2 and the
# `sklearn.metrics._plot` module is private. Keep the same helper name (later
# cells call it) but route it through the public API that is available.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Draw the confusion matrix of `estimator` on (X, y_true).

        Thin wrapper around ConfusionMatrixDisplay.from_estimator; returns
        the ConfusionMatrixDisplay it creates.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)
else:
    # Older scikit-learn releases without from_estimator still ship the
    # public helper of the same name.
    from sklearn.metrics import plot_confusion_matrix

plot_confusion_matrix(clf, X_test, y_test)

In [None]:
def cross_validation_pip(X_train, y_train, estimator, num_split = 5):
    """
    K-fold cross-validate `estimator` on the data passed in.

    X_train   - features to split into folds (pandas object or array-like)
    y_train   - labels aligned row-by-row with X_train
    estimator - unfitted or fitted scikit-learn estimator; it is cloned for
                every fold, so the object passed in is left untouched
    num_split - number of folds

    Returns a dict with the mean accuracy over the folds:
    {'train': <mean score on the fitting part>,
     'validation': <mean score on the held-out part>}
    """
    # Local import so this cell does not depend on another cell's imports.
    from sklearn.base import clone

    def _take(data, positions):
        # KFold yields positional indices. A pandas object indexed with []
        # would treat them as labels, so use .iloc when it is available.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    kf = KFold(n_splits=num_split)

    score_val_list = []
    score_train_list = []

    # Split the data that was passed in, not notebook-level globals, so that
    # rows outside X_train can never leak into the folds.
    for fit_idx, val_idx in kf.split(X_train):
        X_fit, X_val = _take(X_train, fit_idx), _take(X_train, val_idx)
        y_fit, y_val = _take(y_train, fit_idx), _take(y_train, val_idx)

        fold_model = clone(estimator)
        fold_model.fit(X_fit, y_fit)

        score_train_list.append(fold_model.score(X_fit, y_fit))
        score_val_list.append(fold_model.score(X_val, y_val))

    return {'train': np.mean(score_train_list), 'validation': np.mean(score_val_list)}

In [None]:
# Same preprocessing and TF-IDF features as the first pipeline, with a
# multinomial naive Bayes classifier on top.
clf2  = Pipeline(steps=[
        ('normalize', TextPreprocessor(n_jobs=-1)), 
        ('features', TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
        ('classifier', MultinomialNB(alpha=1.0, fit_prior=True))])

# Cross-validate BEFORE the final fit: if the helper refits the estimator it
# receives, running it last would leave clf2 trained on a single CV fold when
# the confusion matrix is drawn further down.
clf2_score = cross_validation_pip(X_train, y_train, clf2, num_split = 3)

# Final model: fitted on the whole training split, then used for predictions.
clf2.fit(X_train, y_train)

y_pred_NB = clf2.predict(X_test)

In [None]:
# from sklearn.metrics import classification_report


# clf2_class_report = classification_report(y_test, y_pred_NB, output_dict= True)

In [None]:
# clf2_class_report['weighted avg']

In [None]:
# Confusion matrix of the naive Bayes pipeline on the held-out test split.
plot_confusion_matrix(clf2, X=X_test, y_true=y_test)

In [None]:
from sklearn.svm import LinearSVC

# Same preprocessing and TF-IDF features as the other pipelines, with a
# linear support-vector classifier on top (seeded for repeatable results).
clf3  = Pipeline(steps=[
        ('normalize', TextPreprocessor(n_jobs=-1)), 
        ('features', TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
        ('classifier', LinearSVC(random_state=0, tol=1e-5) )] )

# Cross-validate BEFORE the final fit: if the helper refits the estimator it
# receives, running it last would leave clf3 trained on a single CV fold when
# the confusion matrix is drawn further down.
clf3_score = cross_validation_pip(X_train, y_train, clf3, num_split = 3)

# Final model: fitted on the whole training split, then used for predictions.
clf3.fit(X_train, y_train)

y_pred_SVM = clf3.predict(X_test)

In [None]:
# Confusion matrix of the linear SVM pipeline on the held-out test split.
plot_confusion_matrix(clf3, X=X_test, y_true=y_test)

In [None]:
# Side-by-side summary of the three models' scores ("Logistic" was misspelled
# in the printed label).
print(f"Logistic Regression accuracy score {clf1_score} \nMultiNomialNB accuracy score {clf2_score} \nSVM score is {clf3_score}")