In [13]:
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

import pandas as pd
import string
import random

In [14]:
# Fixed seed so the same rows are drawn on every Restart & Run All; without
# it the sample (and every result derived from it) changes on each run.
SAMPLE_SEED = 42

# Draw 5000 rows, then drop rows with missing values and exact duplicate rows.
# Cleaning happens after sampling, so the resulting frame may hold fewer than
# 5000 rows.
df = (
    pd.read_csv('HateSpeechDatasetBalanced.csv')
    .sample(n=5000, random_state=SAMPLE_SEED)
    .dropna()
    .drop_duplicates()
)
df

Unnamed: 0,Content,Label
317840,father day o,0
371775,who authorized you question my authority gogo ...,0
90822,pm met nicola sturgeon in edinburgh today the ...,0
618920,yours are dumb because most other subway stati...,1
701429,the tree angela see the seal guide to house re...,1
...,...,...
228868,none of the two maps are suitable imo if any m...,0
211548,by a person with no life and takes it up the ass,1
628753,do they get the same old warning or... is this...,1
470816,jimmy these are not for whatever be some pieta...,1


In [15]:
def get_wordnet_pos(tag):
    """Map a POS tag string to the single-letter POS code used for lemmatizing.

    The decision is made on the tag's leading letter:
    'J' -> 'a', 'V' -> 'v', 'N' -> 'n', 'R' -> 'r'.
    Any other tag (including an empty string) falls back to 'n'.

    Args:
        tag: POS tag string, e.g. as paired with each word by ``pos_tag``.

    Returns:
        One of 'a', 'v', 'n', 'r'.
    """
    prefix_to_pos = (
        ('J', 'a'),
        ('V', 'v'),
        ('N', 'n'),
        ('R', 'r'),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return 'n'

def preprocess_words(words):
    """Filter and normalise a list of tokens.

    Steps, in order:
      1. Drop tokens whose lower-cased form is an NLTK English stopword and
         tokens that are not purely alphabetic (``str.isalpha``).
      2. POS-tag the remaining tokens and lemmatize each one with the WordNet
         lemmatizer, mapping its tag through ``get_wordnet_pos``.
      3. Stem every lemma with the Porter stemmer.

    Args:
        words: list of token strings.

    Returns:
        List of stemmed tokens in input order; it may be shorter than the
        input because filtered tokens are removed.
    """
    # Build the stopword set once. Evaluating stopwords.words('english')
    # inside the comprehension re-fetched the list for every token and then
    # scanned it linearly; a set gives the same membership answers in O(1).
    english_stopwords = set(stopwords.words('english'))

    # isalpha() already rejects empty strings and any token containing a
    # punctuation character, so a separate string.punctuation check would
    # never remove anything further.
    words = [
        word for word in words
        if word.isalpha() and word.lower() not in english_stopwords
    ]

    # Tagging runs on the filtered sequence, so each tag is assigned in the
    # context of the surviving tokens only.
    word_tag = pos_tag(words)

    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in word_tag]

    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return words

In [16]:
tweets = df['Content']
labels = df['Label']

# Seed for the train/test shuffle so the split is the same on every run.
SPLIT_SEED = 42

# Flatten every tweet into one token list and preprocess it in a single call.
word_list = []

for sentence in tweets:
    word_list.extend(word_tokenize(sentence))

word_list = preprocess_words(word_list)

# word_list contains one entry per token occurrence. Feature dicts only need
# each word once, so keep the first occurrence of every word (same key order
# the dict would end up with anyway) instead of re-assigning duplicates for
# every tweet.
# NOTE(review): the vocabulary is built from all rows, including those that
# later land in the test split — confirm this is intended.
vocabulary = list(dict.fromkeys(word_list))

labeled_data = zip(tweets, labels)

feature_sets = []

for tweet, label in labeled_data:
    check_words = word_tokenize(tweet)
    check_words = preprocess_words(check_words)

    # Set membership is O(1); the resulting booleans are identical to testing
    # against the list.
    tweet_words = set(check_words)

    # One boolean per vocabulary word: does this tweet contain it?
    feature = {word: word in tweet_words for word in vocabulary}

    feature_sets.append((feature, label))

# Shuffle with a private, seeded generator: reproducible split without
# altering the state of the global `random` module.
random.Random(SPLIT_SEED).shuffle(feature_sets)

# 80 % of the examples train the model, the remaining 20 % evaluate it.
train_count = int(len(feature_sets) * 0.8)
train_set = feature_sets[:train_count]
test_set = feature_sets[train_count:]

classifier = NaiveBayesClassifier.train(train_set)
print(f"Accuracy: {accuracy(classifier, test_set) * 100 :.2f}%")

Accuracy: 66.90%


In [17]:
import pickle

# Persist the trained classifier to disk. The with-block closes the file even
# if pickling raises, so the data is flushed and the handle is not leaked.
with open('model.pickle', 'wb') as file:
    pickle.dump(classifier, file)