In [2]:
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

import pandas as pd
import string
import random

In [3]:
# Fixed seed so that "Restart Kernel -> Run All" draws the same rows every
# time; without it each run trains and evaluates on a different sample.
SAMPLE_SEED = 42

# Draw a 5000-row working sample from the CSV, then discard rows with missing
# values and exact duplicate rows (the resulting frame may therefore hold
# fewer than 5000 rows).
df = pd.read_csv('HateSpeechDatasetBalanced.csv').sample(n=5000, random_state=SAMPLE_SEED)
df = df.dropna()
df = df.drop_duplicates()
df

Unnamed: 0,Content,Label
468035,a very old woman called you myself... jdjsskak...,1
66372,omen have mouths and anuses just as appealing ...,1
172143,gamergate why why go to all this effort when y...,0
449616,beto mira,1
539261,as a mexican and now naturalized american i fo...,1
...,...,...
492017,let us talk here about opposite of concepts li...,1
171613,that s right wikipedia is a collaborative sic ...,0
331740,suck my cock motherfuckers why do not you fuck...,1
341795,i disagree there is a clear geographic compone...,0


In [4]:
def get_wordnet_pos(tag):
    """Map a POS tag string to a one-letter part-of-speech code.

    The first matching tag prefix decides the result:
    'J' -> 'a', 'V' -> 'v', 'N' -> 'n', 'R' -> 'r'.
    Any tag matching none of these prefixes falls back to 'n'.

    Args:
        tag: POS tag string (e.g. 'JJ', 'VBD').

    Returns:
        One of 'a', 'v', 'n', 'r'.
    """
    prefix_to_pos = (
        ('J', 'a'),
        ('V', 'v'),
        ('N', 'n'),
        ('R', 'r'),
    )
    for prefix, pos_code in prefix_to_pos:
        if tag.startswith(prefix):
            return pos_code
    # Unrecognised tags are treated the same as nouns.
    return 'n'

def preprocess_words(words):
    """Filter, lemmatize and stem a sequence of tokens.

    Steps, in order:
      1. Drop tokens whose lower-cased form is an English stop word.
      2. Drop tokens that are not purely alphabetic (``str.isalpha``).
      3. POS-tag the surviving tokens and lemmatize each one using the
         part of speech returned by ``get_wordnet_pos``.
      4. Porter-stem every lemma.

    Args:
        words: iterable of token strings.

    Returns:
        list[str]: one stem per surviving token, in the original order.
    """
    # Build the stop-word collection once per call. Evaluating
    # stopwords.words('english') inside the comprehension re-fetched the whole
    # list for every single token; a set also makes each lookup O(1).
    stop_words = set(stopwords.words('english'))

    # A single pass replaces the three former filter passes. The isalpha()
    # check already rejects every token the earlier string.punctuation
    # substring test removed, so that test is not repeated here.
    words = [
        word for word in words
        if word.lower() not in stop_words and word.isalpha()
    ]

    word_tag = pos_tag(words)

    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in word_tag]

    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return words

In [5]:
tweets = df['Content']
labels = df['Label']

# Fixed seed so the train/test split, and therefore the reported accuracy,
# is reproducible across runs.
SPLIT_SEED = 42

# Tokenize and preprocess every tweet exactly once. The same per-tweet tokens
# feed both the vocabulary and the feature dicts, so a vocabulary entry can
# never differ from the form it is matched against (POS tagging one
# corpus-wide token stream can yield different lemmas than tagging per tweet).
tweet_tokens = [preprocess_words(word_tokenize(tweet)) for tweet in tweets]

# Vocabulary: every distinct processed token, in first-seen order.
# De-duplicating avoids re-assigning the same feature key over and over.
word_list = list(dict.fromkeys(word for tokens in tweet_tokens for word in tokens))

feature_sets = []

for tokens, label in zip(tweet_tokens, labels):
    # Set membership is O(1); the token list would be scanned per lookup.
    present = set(tokens)

    # One boolean per vocabulary word: does the tweet contain it?
    feature = {word: word in present for word in word_list}

    feature_sets.append((feature, label))

# Seeded, private RNG: deterministic shuffle without touching the global
# `random` state.
random.Random(SPLIT_SEED).shuffle(feature_sets)

# 80 % of the shuffled examples for training, the remainder for evaluation.
train_count = int(len(feature_sets) * 0.8)
train_set = feature_sets[:train_count]
test_set = feature_sets[train_count:]

classifier = NaiveBayesClassifier.train(train_set)
print(f"Accuracy: {accuracy(classifier, test_set) * 100 :.2f}%")

Accuracy: 71.00%


In [6]:
import pickle

# Persist the trained classifier to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises; the previous bare
# open() left the handle open for the lifetime of the kernel.
with open('model.pickle', 'wb') as file:
    pickle.dump(classifier, file)