### Set up library


In [123]:
%%capture
!pip install nlpaug
!pip install nltk>=3.4.5

In [153]:
import pandas as pd
import re
from scipy.sparse import hstack
from scipy import sparse

import nlpaug.augmenter.word as naw

import nltk
from tqdm import trange
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2

from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


### load dataset

In [125]:
df = pd.read_parquet("hf://datasets/tdavidson/hate_speech_offensive/data/train-00000-of-00001.parquet")

### preprocessing

In [126]:
def clean(sample):
    sample = sample.lower()
    sample = re.sub(r'@\w+', '', sample)
    sample = re.sub('[^a-z A-Z 0-9-]+', '', sample)
    sample = " ".join([word for word in sample.split() if word not in stopwords.words('english') and word != 'rt'])

    return sample

In [127]:
df['tweet'] = df['tweet'].apply(lambda x : clean(x))

In [128]:
X = df.drop(columns='class')
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = df['class'])

Handling imbalanced dataset
- augmenting the minority classes
- oversampling
- SMOTE


Augmenting

- Replace synonyms in samples from class 0 and 2

In [129]:
def augmenting_data(row):
    aug = naw.SynonymAug(aug_src='wordnet')
    augmented_list = aug.augment(row['tweet'])
    row['tweet'] = augmented_list[0]  # Lấy bản đầu tiên
    return row

In [130]:
X_train = pd.concat([X_train, y_train], axis=1)
augmented_data = X_train[(X_train['class'] == 0) | (X_train['class'] == 2)].copy()
augmented_data.shape

(4474, 6)

In [131]:
augmented_tweet = augmented_data.apply(augmenting_data, axis=1)
X_train = pd.concat([X_train, augmented_tweet])

Oversampling

- Randomly selecting 1200 samples from class 0

In [132]:
oversampling_data = X_train[X_train['class'] == 0].copy()
oversampling_data.shape

(2288, 6)

In [133]:
sampled_data = oversampling_data.sample(n=1200, random_state=42)
X_train = pd.concat([X_train, sampled_data])

In [134]:
X_train['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,15352
2,6660
0,3488


Tf-idf vetorization

In [135]:
tweets = X_train['tweet'].values

vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(tweets)

In [136]:
y_train = X_train['class']
X_train = X_train.drop(columns=['tweet', 'class'])

X_train = sparse.csr_matrix(X_train)
X_train = hstack([X_train, X_train_tfidf])

In [149]:
X_test_numeric = X_test.drop(columns=['tweet'])

X_test_numeric_sparse = sparse.csr_matrix(X_test_numeric.values)

X_test_tfidf = vectorizer.transform(X_test['tweet'])

X_test = hstack([X_test_numeric_sparse, X_test_tfidf])


SMOTE

In [142]:
sampling_strategy = {0: 5000, 2: 8000}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)

X_train, y_train = smote.fit_resample(X_train, y_train)

Feature selection

- choose top 1000 best features

In [145]:
selector = SelectKBest(chi2, k=1000)
X_train = selector.fit_transform(X_train, y_train)

In [151]:
X_test = selector.transform(X_test)

### Training

In [152]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# 2. Dự đoán
y_pred = nb_model.predict(X_test)

# 3. Đánh giá
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       286
           1       1.00      0.99      1.00      3838
           2       1.00      1.00      1.00       833

    accuracy                           0.99      4957
   macro avg       0.97      1.00      0.99      4957
weighted avg       1.00      0.99      0.99      4957



In [154]:
nb_model = LogisticRegression()
nb_model.fit(X_train, y_train)

# 2. Dự đoán
y_pred = nb_model.predict(X_test)

# 3. Đánh giá
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       286
           1       1.00      1.00      1.00      3838
           2       1.00      1.00      1.00       833

    accuracy                           1.00      4957
   macro avg       1.00      1.00      1.00      4957
weighted avg       1.00      1.00      1.00      4957

