### This notebook addresses research questions 3 and 6.1

##### Requirements:
* comments.csv
* reference.csv

##### Generates:
* top_tokens_hateful.csv
* top_tokens_reference.csv

In [1]:
import pandas as pd
import re
from collections import Counter, defaultdict
from tqdm import tqdm
from gensim.parsing.preprocessing import (preprocess_string, strip_tags, strip_multiple_whitespaces, 
                                          strip_non_alphanum, strip_punctuation, remove_stopword_tokens, 
                                          STOPWORDS, strip_numeric
                                         )

In [2]:
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')



In [3]:
c = pd.read_csv('../../data/comments.csv')
r = pd.read_csv('../../data/reference.csv')

In [4]:
CUSTOM_FILTERS = [strip_tags, 
                strip_punctuation, 
                strip_multiple_whitespaces, 
                strip_non_alphanum, 
                strip_numeric
            ]

my_stopwords = list(STOPWORDS)
my_stopwords.extend(['ve', 'll', 'com', 'https'])

In [5]:
male_words = ['amigo','ape','apes','bastard','bastards','bear',
                'beast','beefcake','bloke','blokes','boy','boyfriend','boyfriends','boys','brah',
                'bro','brother','brothers','bruh','brute','buck','bud','buddies','buddy',
                'buds','bugger','bull','chad','chairman','chap','chum','dad','dads','dawg',
                'dick','dickhead','dickheads','dilf','dude','fag','faggot','father','father',
                'fella','fellow','fiance','fucker','gent','gent','gentleman','gentlemen',
                'grandfather','grandfathers','grandpa','grandpas','grandson','grandsons',
                'groom','grooms','guy','guys','he',"he'd","he's",'hero','him','himself',
                'his','hombre','hombres','hubby','hunk','husband','husbands','incel','incels',
                'king','kings','lad','lads','macho','machos','male','man','mankind','master','men',
                "men's",'mensch','mister','mr','muscle','nephew','nephews','papa','pop','prick',
                'pricks','priest','prince','run-on','sir','son','sonny','sons','stud','thug',
                'thugs','uncle','uncles','virile','waiter','waiters','wanker','widower','widowers']


female_words = ['actress','actresses','amiga','amigas','aunt','aunts','babe','babes','biddy','bimbo',
                'bimbos','bride','brides','butch','chairwoman','chick','chicks','cow','cows',
                'cunt','cunts','dame','daughter','daughters','doe','dudess','dudette','dudine',
                'dyke','dykes','fag hag','fem','female','feme','femme','fiancee','floozy',
                'fusby','gal','girl','girlfriend','girlfriends','girls','goddess','goddesses',
                'granddaughter','granddaughters','grandma','grandmas','grandmother','grandmothers',
                'hag','hags','hay bag','hay-bag','her','heroine','hers','herself','hoe',
                'hoe bag','hoes','honey','hooker','hookers','hussy','jugs','karen','ladies',
                'lady','lost rib','lubra','mama','mamas','manhole','milf','mistress','mistresses',
                'mivvy','mom','moms','mother','mothers','mrs','ms','muff','niece','nieces',
                'priestess','princess','prostitute','prostitutes','pussies','pussy','queen',
                'queens','quim','scow','she',"she'd","she's",'shickster','sister','sisters',
                'skank','skanks','slut','sluts','spokeswoman','strumpet','tar leather',
                'tit','toots','tootsie','tramp','waitress','whore','whores','widow','widows',
                'wife','wimmin','wives','woman','womankind','women','women',"women's",'womyn']

f_df = pd.DataFrame({'identifier': female_words})
m_df = pd.DataFrame({'identifier': male_words})
f_df['gender'] = 'female'
m_df['gender'] = 'male'
gender_id_df = f_df.append(m_df, ignore_index=True)

In [6]:
def get_tokens(data):
    '''
    tokenize all comments and remove stopwords
    outputs a single list of tokens
    '''
    tokens = []
    print('tokenizing...',end='\r')
    for line in data.body.to_list():
        pre = preprocess_string(line.lower(), CUSTOM_FILTERS)
        tokens.extend(remove_stopword_tokens(pre, my_stopwords))
    print(' ')
    return [t for t in tokens if len(t)>1]


def get_dict(data, k):
    '''
    creates dictionary with 
    counter of terms for each gender
    '''
    top_tokens = defaultdict()
    print('counting tokens...',end='\r')
    top_tokens['all'] = Counter(get_tokens(data)).most_common(k)
    top_tokens['female'] = Counter(get_tokens(data[data.gender=='female'])).most_common(k)
    top_tokens['male'] = Counter(get_tokens(data[data.gender=='male'])).most_common(k)
    print(' ')
    return top_tokens


def identifiers_dict(data, k):
    '''
    creates a dictionary with counter of 
    gender identifiers for each gender
    '''
    global gender_id_df

    pattern = re.compile('|'.join(gender_id_df.identifier.to_list()))

    words = []
    print('finding identifiers...',end='\r')
    for txt in data.body.to_list():
        words.extend(re.findall(pattern, txt.lower()))

    top_tokens = defaultdict()
    top_tokens['f_terms'] = Counter([w for w in words if w in female_words]).most_common(k)
    top_tokens['m_terms'] = Counter([w for w in words if w in male_words]).most_common(k)

    tokens=[]
    for line in data.body.to_list():
        pre = preprocess_string(line.lower(), CUSTOM_FILTERS)
        tokens.extend(remove_stopword_tokens(pre, my_stopwords))

    top_tokens['both_terms'] = Counter([w for w in tokens if w in gender_id_df.identifier.to_list()]).most_common(k)
    print(' ')
    return top_tokens


def get_top_k_tokens(data, k, gender_id=False):
    '''
    creates df with k most frequent tokens 
    for each gender and for all data
    '''
    if gender_id:
        data_dict = identifiers_dict(data, k)
    else:
        data_dict = get_dict(data, k)

    print('creating the dataframe...',end='\r')
    for i,key in enumerate(data_dict.keys()):
        cat_df = pd.DataFrame(data_dict[key])
        cat_df.insert(0, 'gender', key)
        if i==0:
            df = cat_df.copy()
        else:
            df = df.append(cat_df, ignore_index=True)
    df.columns = ['gender', 'token', 'freq']
    df['rank'] = list(range(k))*3
    print(' ')
    return df


def get_percentage(data):
    '''
    takes df with top_k gender identifiers (for all genders) 
    and calculates percentage of frequency within each gender
    '''
    for i,g in enumerate(['f_terms', 'm_terms', 'both_terms']):
        df = data[data['gender'] == g].copy()
        df['percentage'] = 100 * df.freq/df.freq.sum()
        if i==0:
            perc_id = df.copy()
        else:
            perc_id = perc_id.append(df, ignore_index=True)
        perc_id = perc_id.round(2)
    return perc_id


In [7]:
# def get_tokens(data):
#     '''
#     tokenize all comments and remove stopwords
#     outputs a single list of tokens
#     '''
#     tokens = []
#     print('tokenizing...',end='\r')
#     for line in data.body.to_list():
#         pre = preprocess_string(line.lower(), CUSTOM_FILTERS)
#         tokens.extend(remove_stopword_tokens(pre, my_stopwords))
#     print(' ')
#     return [t for t in tokens if len(t)>1]


# def get_dict(df, k):
#     '''
#     creates dictionary with 
#     counter of terms for each gender
#     '''
#     top_tokens = defaultdict()
#     print('counting tokens...',end='\r')
#     top_tokens['all'] = Counter(get_tokens(df)).most_common(k)
#     top_tokens['female'] = Counter(get_tokens(df[df.gender=='female'])).most_common(k)
#     top_tokens['male'] = Counter(get_tokens(df[df.gender=='male'])).most_common(k)
#     print(' ')
#     return top_tokens


# def identifiers_dict(df, k):
#     '''
#     creates a dictionary with counter of 
#     gender identifiers for each gender
#     '''
#     global gender_id_df

#     pattern = re.compile('|'.join(gender_id_df.identifier.to_list()))

#     words = []
#     print('finding identifiers...',end='\r')
#     for txt in c.body:
#         words.extend(re.findall(pattern, txt.lower()))

#     top_tokens = defaultdict()
#     top_tokens['f_terms'] = Counter([w for w in words if w in female_words]).most_common(k)
#     top_tokens['m_terms'] = Counter([w for w in words if w in male_words]).most_common(k)

#     tokens=[]
#     for line in df.body.to_list():
#         pre = preprocess_string(line.lower(), CUSTOM_FILTERS)
#         tokens.extend(remove_stopword_tokens(pre, my_stopwords))

#     top_tokens['both_terms'] = Counter([w for w in tokens if w in gender_id_df.identifier.to_list()]).most_common(k)
#     print(' ')
#     return top_tokens


# def get_top_k_tokens(data, k, gender_id=False):
#     '''
#     creates df with k most frequent tokens 
#     for each gender and for all data
#     '''
#     if gender_id:
#         data_dict = identifiers_dict(data, k)
#     else:
#         data_dict = get_dict(data, k)

#     print('creating the dataframe...',end='\r')
#     for i,key in enumerate(data_dict.keys()):
#         cat_df = pd.DataFrame(data_dict[key])
#         cat_df.insert(0, 'gender', key)
#         if i==0:
#             df = cat_df.copy()
#         else:
#             df = df.append(cat_df, ignore_index=True)
#     df.columns = ['gender', 'token', 'freq']
#     df['rank'] = list(range(k))*3
#     print(' ')
#     return df


# def get_percentage(data):
#     '''
#     takes df with top_k gender identifiers (for all genders) 
#     and calculates percentage of frequency within each gender
#     '''
#     for i,g in enumerate(['f_terms', 'm_terms', 'both_terms']):
#         df = data[data['gender'] == g].copy()
#         df['percentage'] = 100 * df.freq/df.freq.sum()
#         if i==0:
#             perc_id = df.copy()
#         else:
#             perc_id = perc_id.append(df, ignore_index=True)
#         perc_id = perc_id.round(2)
#     return perc_id


#### Question 3

How does language differ based on gendered speech? 

##### Operationalized question
How do the words associated with Reddit comments differ based on the presence of male or female pronouns?


### Top tokens

In [8]:
c_hate = c[c.davidson_label!=2]
top_hate = get_top_k_tokens(c_hate, 20)
# top_hate.to_csv('../../data/top_tokens_hateful.csv', index=False)

 
 
 
 
 


In [9]:
top_hate.sample(5)

Unnamed: 0,gender,token,freq,rank
54,male,pom,52,14
22,female,fuck,13655,2
12,all,want,8105,12
0,all,cunt,96307,0
6,all,people,12564,6


In [10]:
top_reference = get_top_k_tokens(r, 20)
# top_reference.to_csv('../../data/top_tokens_reference.csv', index=False)

 
 
 
 
 


In [11]:
top_reference.sample(5)

Unnamed: 0,gender,token,freq,rank
5,all,good,6490,5
28,female,amp,498,8
19,all,years,3608,19
11,all,work,4320,11
41,male,people,2725,1


#### Question 6.1

How often do the gender identifiers appear in the corpus? 

##### Operationalized question
For each gender identifier, what is its frequency in the corpus and its proportion of the separate totals female or male identifiers?

### Top gender identifier terms

In [12]:
top_id_hate = get_top_k_tokens(c_hate, 20, gender_id=True)

 
 


In [13]:
top_id_reference = get_top_k_tokens(r, 20, gender_id=True)

 
 


In [14]:
perc_id_hate = get_percentage(top_id_hate)
perc_id_ref = get_percentage(top_id_reference)
# perc_id_hate.to_csv('../../data/top_gender_id_hate.csv', index=False)
# perc_id_ref.to_csv('../../data/top_gender_id_ref.csv', index=False)

In [15]:
perc_id_hate.sample(5)

Unnamed: 0,gender,token,freq,rank,percentage
23,m_terms,men,13803,3,3.6
55,both_terms,cunts,1553,15,0.89
39,m_terms,sir,755,19,0.2
2,f_terms,she,28295,2,10.21
56,both_terms,mother,1471,16,0.85


In [16]:
perc_id_ref.sample(5)

Unnamed: 0,gender,token,freq,rank,percentage
47,both_terms,girl,641,7,4.59
33,m_terms,boy,858,13,0.24
42,both_terms,women,1404,2,10.06
23,m_terms,king,13502,3,3.81
31,m_terms,pop,1347,11,0.38


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6bb356e3-2dd8-4635-8ff0-a5ca506d60d1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>