This is a brief exploration of the Davidson dataset, a coded corpus of tweets. Several human coders (the number of coders for each tweet is given in the "count" column) classified each tweet as hate speech (labeled 0 in the "class" column), offensive language (labeled 1), or neither hate speech nor offensive language (labeled 2). Since we are collecting Reddit comments that are likely to be either hate speech or offensive language, should we eliminate the tweets labeled as neither hate speech nor offensive language from the original data set?

This notebook characterizes the frequency and percentage of each type of language in the original data set and in a data set that includes just hate speech and offensive language.

In [1]:
import pandas as pd

# Load the Davidson corpus and remove its first column, an unnamed column
# that merely duplicates the row index.
labeled_df = (
    pd.read_csv('Davidson_data.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

# Preview the first rows of the cleaned frame.
display(labeled_df.head())

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [2]:
# count = number of human coders who labeled this tweet
# hate_speech = number of coders who identified this tweet as hate speech
# offensive_language = number of coders who identified this tweet as offensive language
# neither = number of coders who identified this tweet as neither hate speech nor offensive language
# class: 
# 0 = hate speech
# 1 = offensive
# 2 = neither




In [3]:
# Tally the number of tweets assigned to each class code
# (0 = hate speech, 1 = offensive, 2 = neither).
(
    labeled_df
    .groupby(by='class')
    .size()
)

class
0     1430
1    19190
2     4163
dtype: int64

In [4]:
# Split the corpus into one frame per class code
# (0 = hate speech, 1 = offensive, 2 = neither).
hate = labeled_df[labeled_df['class'].eq(0)]
offensive = labeled_df[labeled_df['class'].eq(1)]
neither = labeled_df[labeled_df['class'].eq(2)]

# Number of rows in each per-class frame.
hate_count = hate['class'].count()
offensive_count = offensive['class'].count()
neither_count = neither['class'].count()

# Show each label followed by its count, in class-code order.
display(
    'hate', hate_count,
    'offensive', offensive_count,
    'neither', neither_count,
)

'hate'

1430

'offensive'

19190

'neither'

4163

In [5]:
# Share of the original data set taken up by each class.
# Note: despite the "_percentage" names, the values are fractions (0-1), not 0-100.
total = hate_count + offensive_count + neither_count

hate_percentage, offensive_percentage, neither_percentage = (
    class_count / total
    for class_count in (hate_count, offensive_count, neither_count)
)

display(
    'hate', hate_percentage,
    'offensive', offensive_percentage,
    'neither', neither_percentage,
)

'hate'

0.05770084332001776

'offensive'

0.7743211072105879

'neither'

0.16797804946939435

In [6]:
# How much of the original data survives if the "neither" class is removed?
# The result is a fraction (0-1) of the original row count.
total_wo_neither = hate_count + offensive_count
percent_original_df = total_wo_neither / (total_wo_neither + neither_count)

display(
    'total rows without neither category', total_wo_neither,
    'percent original df', percent_original_df,
)

'total rows without neither category'

20620

'percent original df'

0.8320219505306057

In [7]:
# Class balance once only hate speech and offensive language remain:
# each class count divided by the combined hate + offensive total.
# First value shown is the hate-speech share, second the offensive share.
hate_perc_hate, offensive_perc_hate = (
    kept_count / total_wo_neither
    for kept_count in (hate_count, offensive_count)
)

display(hate_perc_hate, offensive_perc_hate)

0.06935014548981572

0.9306498545101843

In [8]:
# Build a frame containing only hate-speech and offensive tweets,
# i.e. every row whose class code is not 2 ("neither").
ho_df = labeled_df.loc[labeled_df['class'].ne(2)]
display(ho_df.head())

# Save the trimmed frame; index=False keeps the row labels out of the CSV
# so no extra index column is written.
ho_df.to_csv('Davidson_trimmed.csv', index=False)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."


In [9]:
# Reload the trimmed data from disk and preview it; because the CSV was
# written without an index column, the reloaded frame gets a new default index.
ho_df = pd.read_csv(filepath_or_buffer='Davidson_trimmed.csv')
display(ho_df.head(n=5))

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
1,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
2,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
3,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
4,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
