In [37]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
import os

In [38]:
# read the data from the csv 
data = pd.read_csv('hate_speech_data.csv')

# making sure that the non-hate data and hate data both have 100,000 records 
label_data = data.loc[data['Label'] == '0']
non_hate_data = label_data.sample(100000)
hate_data = data.loc[data['Label'] == '1']

# creating it into a new csv
frames = [hate_data, non_hate_data]
new_data = pd.concat(frames)
new_data.to_csv("updated_hate_speech.csv")

In [39]:
# read the new csv 
updated_hate_speech = pd.read_csv('updated_hate_speech.csv')

In [38]:
import glob
import os
import csv

# combine all the text files that contain the hate and non-hate speech into one csv 
# set the path of the folder containing the files
folder_path = 'hate_speech_vicom/'

# set the name of the new file to be created
new_file_name = 'combined.csv'

# get a list of all the files in the folder
file_list = glob.glob(folder_path + '*.txt')

# open the new file for writing
with open(new_file_name, 'w', newline='') as outfile:
    # create a CSV writer object
    writer = csv.writer(outfile)
    # write the header row with column names
    writer.writerow(['file_id', 'contents'])
    # loop through all the files in the folder
    for filename in file_list:
        # get the name of the file without the path and extension
        file_title = os.path.splitext(os.path.basename(filename))[0]
        # open each file for reading
        with open(filename, 'r') as infile:
            # read the contents of the file
            contents = infile.read().strip()
            # write the file title and contents to the new CSV file
            writer.writerow([file_title, contents])


In [39]:
# read the csv file 
new = pd.read_csv('combined.csv')

In [40]:
# read the csv that contains whether something is considered hate or non hate 
label = pd.read_csv('labels.csv')

In [41]:
# join the two csvs on the file id
merged_df = pd.merge(new, label, on='file_id')

In [42]:
# change noHate to 0 and  hate 1 to support the previous models
merged_df.loc[merged_df["label"] == "noHate", "label"] = 0
merged_df.loc[merged_df["label"] == "hate", "label"] = 1

In [43]:
# drop the rows that have labels that aren't hate or non hate 
merged_df.drop(merged_df[merged_df['label'] == 'idk/skip'].index, inplace = True)
merged_df.drop(merged_df[merged_df['label'] == 'relation'].index, inplace = True)

In [44]:
# change the type of the label to int
merged_df['label'] = merged_df['label'].astype('int64')

In [47]:
# making sure that the non-hate data and hate data both have 1196 records 
label_data = merged_df.loc[merged_df['label'] == 0]
non_hate_data = label_data.sample(1196)
hate_data = merged_df.loc[merged_df['label'] == 1]
frames = [hate_data, non_hate_data]
merged_df = pd.concat(frames)

In [48]:
# creating the dataframe to a csv 
merged_df.to_csv('merged_hate.csv', index=False)

In [50]:
# read the csv 
pd.read_csv('merged_hate.csv')

Unnamed: 0,file_id,contents,user_id,subforum_id,num_contexts,label
0,33222250_1,I would not be surprised if this woman was spe...,572710,1346,2,1
1,13481672_2,If they were jew pieces of garbage who the hel...,584956,1345,0,1
2,13456820_1,Put that fat Black idiot to work and she would...,572059,1347,0,1
3,13456898_1,"I know exactly what you 're saying , I just ha...",572496,1396,0,1
4,14678603_1,I am White and I believe all White people know...,572073,1371,0,1
...,...,...,...,...,...,...
2387,13494421_1,The reasons : Alcoholism takes around 150000 l...,584433,1395,0,0
2388,13457473_2,YouTube - BLACK WOMAN GOES BALLISTIC ON REPORT...,572158,1347,0,0
2389,31708555_1,oh definitely do push ups and sit ups for the ...,573774,1363,0,0
2390,14112998_3,Camie,572043,1381,0,0


In [54]:
import pandas as pd

# read the csv file from the Davidson Hate Speech data set 
davidson_data = pd.read_csv("labeled_data.csv")

# drop the unnecessary rows 
davidson_data.drop(davidson_data[davidson_data['class'] == 1].index, inplace = True)

In [55]:
# re-number the columns so that it follows the same numbering as our original data set 
davidson_data.loc[davidson_data["class"] == 0, "class"] = 1
davidson_data.loc[davidson_data["class"] == 2, "class"] = 0

# making sure that the non-hate data and hate data both have 1430 records 
label_data = davidson_data.loc[davidson_data['class'] == 0]
non_hate_data = label_data.sample(1430)
hate_data = davidson_data.loc[davidson_data['class'] == 1]

# merging them into a dataframe and then saving it as a csv 
frames = [hate_data, non_hate_data]
new_data = pd.concat(frames)
new_data.to_csv("davidson_data.csv")

In [56]:
davidson_data

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,0,!!! RT @mayasolovely: As a woman you shouldn't...
40,40,3,0,1,2,0,""" momma said no pussy cats inside my doghouse """
63,63,3,0,0,3,0,"""@Addicted2Guys: -SimplyAddictedToGuys http://..."
66,66,3,0,1,2,0,"""@AllAboutManFeet: http://t.co/3gzUpfuMev"" woo..."
67,67,3,0,1,2,0,"""@Allyhaaaaa: Lemmie eat a Oreo &amp; do these..."
...,...,...,...,...,...,...,...
24767,25280,3,0,1,2,0,"you know what they say, the early bird gets th..."
24776,25289,3,3,0,0,1,you're all niggers
24777,25290,3,2,1,0,1,you're such a retard i hope you get type 2 dia...
24779,25292,3,0,1,2,0,"you've gone and broke the wrong heart baby, an..."


In [35]:
# remove the index 
davidson_data.to_csv('davidson_data.csv', index=False)