In [37]:
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

import pandas as pd
import string
import random

In [38]:
df = pd.read_csv('HateSpeechDatasetBalanced.csv').sample(n=3000)
df = df.dropna()
df = df.drop_duplicates()
df

Unnamed: 0,Content,Label
723695,and btw eloy are your stink bomb right fucking,1
361137,further the block what a slut placed long hour...,0
633408,i have not made uncivil comments it is not til...,1
467431,fascist religion what a slut directly related ...,1
75059,baseball with for her birthday for some reason...,0
...,...,...
696457,see svo my zoo i tragedies still piss people o...,1
629034,please stop moron user joule wilburn who is a ...,1
132900,retweet only sluts have skin and hair proper l...,0
706503,go on school several stooges block people,1


In [39]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    else:
        return 'n'

def preprocess_words(words):
    words = [word for word in words if word.lower() not in stopwords.words('english')]
    words = [word for word in words if word.lower() not in string.punctuation]
    words = [word for word in words if word.isalpha()]

    word_tag = pos_tag(words)

    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in word_tag]

    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return words

In [40]:
tweets = df['Content']
labels = df['Label']

word_list = []

for sentence in tweets:
    check_words = word_tokenize(sentence)
    for word in check_words:
        word_list.append(word)

word_list = preprocess_words(word_list)

labeled_data = zip(tweets, labels)

feature_sets = []

for tweet, label in labeled_data:
    feature = {}

    check_words = word_tokenize(tweet)
    check_words = preprocess_words(check_words)

    for word in word_list:
        feature[word] = word in check_words

    feature_sets.append((feature, label))

random.shuffle(feature_sets)

train_count = int(len(feature_sets) * 0.8)
train_set = feature_sets[:train_count]
test_set = feature_sets[train_count:]

classifier = NaiveBayesClassifier.train(train_set)
print(f"Accuracy: {accuracy(classifier, test_set) * 100 :.2f}%")

Accuracy: 68.83%


In [41]:
import pickle
file = open('model.pickle', 'wb')
pickle.dump(classifier, file)