# <span style="font-family:Verdana ; color:red; font-size:36px; font-weight:bold; font-style:italic">Hate Speech Detection Using Machine Learning</span>

# <span style="font-family:Verdana ; color:green; font-size:26px; font-weight:bold; font-style:italic">1. Importing Necessary Packages</span>

In [None]:
# importing the necessary libraries

import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fetch the NLTK resources for this notebook (no-op if already present).
# 'stopwords' backs the English stop-word list used when cleaning tweets.
# NOTE(review): 'punkt' (tokenizer models) is not referenced by any visible
# cell -- clean() splits on spaces -- confirm whether it is still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# <span style="font-family:Verdana ; color:green; font-size:26px; font-weight:bold; font-style:italic">2. Importing the Dataset</span>

In [None]:
# Read the labelled tweets into a DataFrame and preview the first ten rows.
# NOTE(review): the absolute path looks like a Colab upload location -- adjust
# DATA_PATH when running elsewhere.
DATA_PATH = '/content/train.csv'

df = pd.read_csv(DATA_PATH)
print(df.head(10))

   count  hate_speech_count  offensive_language_count  neither_count  class  \
0      3                  0                         0              3      2   
1      3                  0                         3              0      1   
2      3                  0                         3              0      1   
3      3                  0                         2              1      1   
4      6                  0                         6              0      1   
5      3                  1                         2              0      1   
6      3                  0                         3              0      1   
7      3                  0                         3              0      1   
8      3                  0                         3              0      1   
9      3                  1                         2              0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17

# <span style="font-family:Verdana ; color:green; font-size:26px; font-weight:bold; font-style:italic">3. Map Class IDs to Label Names</span>

In [None]:
# Translate the numeric `class` column into readable label names.
# Any class id missing from the mapping becomes NaN in `labels`.
CLASS_LABELS = {0: 'Hate Speech', 1: 'Offensive Language', 2: 'Normal'}

df['labels'] = df['class'].map(CLASS_LABELS)
print(df.head(10))

   count  hate_speech_count  offensive_language_count  neither_count  class  \
0      3                  0                         0              3      2   
1      3                  0                         3              0      1   
2      3                  0                         3              0      1   
3      3                  0                         2              1      1   
4      6                  0                         6              0      1   
5      3                  1                         2              0      1   
6      3                  0                         3              0      1   
7      3                  0                         3              0      1   
8      3                  0                         3              0      1   
9      3                  1                         2              0      1   

                                               tweet              labels  
0  !!! RT @mayasolovely: As a woman you shouldn't...   

# <span style="font-family:Verdana; color:green; font-size:26px; font-weight:bold; font-style:italic;">4. Selecting the Relevant Columns</span>

In [None]:
# Keep only the tweet text and its label; the vote-count columns are not used
# as features.
# .copy() detaches the subset from the original frame so that assigning to its
# columns later does not trigger pandas' SettingWithCopyWarning.
df = df[['tweet', 'labels']].copy()
print(df.head())

                                               tweet              labels
0  !!! RT @mayasolovely: As a woman you shouldn't...              Normal
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  Offensive Language
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  Offensive Language
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  Offensive Language
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  Offensive Language


# <span style="font-family:Verdana; color:green; font-size:26px; font-weight:bold; font-style:italic;">5. Cleaning the Tweet Text</span>

In [None]:
# cleaning the text

stemmer = PorterStemmer()

# Store the stop words under their own name: rebinding `stopwords` would shadow
# the imported NLTK corpus reader and make this cell raise AttributeError when
# it is run a second time. A set gives constant-time membership tests.
STOP_WORDS = set(stopwords.words('english'))

# Substitutions applied in this order, each replacing its match with ''.
# Raw strings keep `\[`, `\S`, `\w` and `\d` from being parsed as (invalid)
# Python string escapes.
_CLEAN_PATTERNS = [
    re.compile(r'\[.*?\]'),                             # text inside square brackets
    re.compile(r'https?://\S+|www\.\S+'),               # URLs
    re.compile(r'<.*?>+'),                              # angle-bracketed tags
    re.compile('[%s]' % re.escape(string.punctuation)), # ASCII punctuation
    re.compile('\n'),                                   # newlines
    re.compile(r'\w*\d\w*'),                            # tokens containing a digit
]


def clean(text):
    """Normalise one piece of text for bag-of-words vectorisation.

    Steps, in order: convert to ``str`` and lowercase; strip bracketed text,
    URLs, angle-bracketed tags, punctuation, newlines and digit-bearing
    tokens; split on single spaces; drop English stop words; Porter-stem the
    remaining tokens.

    Args:
        text: Value to clean. Non-string input is converted with ``str()``.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    text = str(text).lower()
    for pattern in _CLEAN_PATTERNS:
        text = pattern.sub('', text)
    # Punctuation is already gone at this point, so tokens are compared to the
    # stop-word set without apostrophes.
    # NOTE(review): contractions such as "shouldn't" arrive here as "shouldnt";
    # confirm whether they are meant to be filtered.
    tokens = [stemmer.stem(word) for word in text.split(' ') if word not in STOP_WORDS]
    return ' '.join(tokens)


# Build a new frame instead of assigning into a column of what may be a slice
# of the original data (avoids pandas' SettingWithCopyWarning).
df = df.assign(tweet=df['tweet'].apply(clean))

# <span style="font-family:Verdana; color:green; font-size:26px; font-weight:bold; font-style:italic;">6. Train a Decision Tree Classifier</span>

In [None]:
# Split the data into train / validation / test sets.
# First hold out 20% for testing; then take 25% of the remaining 80%
# (i.e. 20% of the full data) for validation, giving a 60/20/20 split.
X, y = (np.array(df[column]) for column in ('tweet', 'labels'))

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

In [None]:
# Turn the cleaned text into bag-of-words count matrices.
# The vocabulary is learned from the training split only; validation and test
# texts are encoded with that same fitted vectorizer.
cv = CountVectorizer()
X_train = cv.fit_transform(X_train)
X_val, X_test = (cv.transform(texts) for texts in (X_val, X_test))

In [None]:
# Train the model.
# Seed the classifier: scikit-learn permutes candidate features at each split,
# so an unseeded tree can differ from run to run. 42 matches the seed used for
# the train/validation/test splits.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Evaluate on the validation set, then on the test set.
#
# The targets are strings, and classification_report orders classes by sorted
# label value: 'Hate Speech', 'Normal', 'Offensive Language'. `target_names`
# only renames rows positionally, so supplying names in a different order
# attaches the wrong class name to a row. Request the row order via `labels=`
# instead; each row is then named after the label it was actually computed for.
REPORT_LABELS = ['Hate Speech', 'Offensive Language', 'Normal']

for split_name, X_split, y_split in (("Validation", X_val, y_val), ("Test", X_test, y_test)):
    # y_pred is left holding the test-set predictions after the loop.
    y_pred = clf.predict(X_split)
    print("{} Report:".format(split_name))
    print(classification_report(y_split, y_pred, labels=REPORT_LABELS))

Validation Report:
                    precision    recall  f1-score   support

       Hate Speech       0.38      0.38      0.38       288
Offensive Language       0.79      0.84      0.81       814
            Normal       0.93      0.92      0.92      3855

          accuracy                           0.87      4957
         macro avg       0.70      0.71      0.70      4957
      weighted avg       0.88      0.87      0.87      4957

Test Report:
                    precision    recall  f1-score   support

       Hate Speech       0.36      0.34      0.35       290
Offensive Language       0.80      0.84      0.82       835
            Normal       0.93      0.92      0.92      3832

          accuracy                           0.87      4957
         macro avg       0.70      0.70      0.70      4957
      weighted avg       0.87      0.87      0.87      4957



# <span style="font-family:Verdana; color:green; font-size:26px; font-weight:bold; font-style:italic;">7. Test the Model on a Sample Text</span>

In [None]:
# Text to classify
sample = 'nigga' # test with for instance: kill, dog, idiot, hello

# Run the sample through the same pipeline as the training data:
# clean -> vectorize with the fitted CountVectorizer -> predict.
sample_processed = clean(sample)
sample_vector = cv.transform([sample_processed])
sample_prediction = clf.predict(sample_vector)

# Report the single predicted label alongside the raw input
message_template = "Prediction for sample text '{}': {}"
print(message_template.format(sample, sample_prediction[0]))

Prediction for sample text 'nigga': Offensive Language
