# Topic Gender Bias in ESUPOL (BTW17)

## Imports

In [62]:
import os

#import pandas as pd
os.environ['MODIN_ENGINE'] = 'dask'
import modin.pandas as pd

from nltk.corpus import stopwords
stopwords = stopwords.words('german')

from HanTa import HanoverTagger as ht
tagger = ht.HanoverTagger('./morphmodel_ger.pgz')

from genderize import Genderize
import pprint
import json



import glob

from functions import preprocessing
pd.options.display.max_rows = 500
pd.options.display.max_columns=500
pd.options.display.width = 5000



## Load data

In [40]:
source_path = 'G:\dis25'
all_files = glob.glob(source_path + '/*.csv')[1:]


### Load first slice of btw17 dataset into a Pandas DataFrame
#### Load only the raw_data columns
*the first slice was handled separately since it's the only one with headers*

In [2]:
chunksize = 10 ** 6
chunks = pd.read_csv(os.path.join(source_path,'suggestions_20210719.csv'), chunksize=chunksize, usecols=['raw_data'])
btw17_df = pd.concat(chunk for chunk in chunks)

NameError: name 'pd' is not defined

### Extract the raw data column and save th DataFrame to a separate file
- *The queryterms were removed from the suggestions*
- *The queryterms only include the firstname now*

In [None]:
btw17_df = pd.DataFrame(btw17_df.raw_data.apply(lambda x: json.loads(x)).to_list(),columns=['queryterm','suggestions',3,4,5])[['queryterm','suggestions']]
btw17_df['suggestions'] = btw17_df.apply(lambda x: [suggestion.replace(x.queryterm.lower(), '') for suggestion in x.suggestions], axis=1)
btw17_df['queryterm'] = btw17_df.queryterm.apply(lambda x: x.split(' ')[0].split('-')[0])

In [5]:
btw17_df = pd.to_csv('./raw_suggestions_20210719.csv', usecols=['queryterm','suggestions'])

### Load the other Dataset slices, proceed as before

In [None]:
chunksize = 10 ** 6
for i in range(42,63):
    chunks = pd.read_csv(all_files[i],
                         header=None,
                         names=['id','queryterm','date','client','lang','url','raw_data'],
                         usecols=['raw_data'],
                         chunksize=chunksize)
    temp_df = pd.concat(chunks)

    temp_df = pd.DataFrame(temp_df.raw_data.apply(lambda x: json.loads(x)).to_list(),columns=['queryterm','suggestions',3,4,5])[['queryterm','suggestions']]
    temp_df['suggestions'] = temp_df.apply(lambda x: [suggestion.replace(x.queryterm.lower(), '').strip() for suggestion in x.suggestions], axis=1)
    temp_df['queryterm'] = temp_df.queryterm.apply(lambda x: x.split(' ')[0].split('-')[0])


    temp_df.to_csv('./raw_'+all_files[i].split('\\')[-1])


### Get a list of all firstnames
- *During the process duplicates were removed at multiple points to reduce computing efforts*

In [None]:
all_raw_df = glob.glob('./raw_suggestions*')
name_list = []
for i in all_raw_df:
    chunks = pd.read_csv(i,
                         usecols=['queryterm','suggestions'],
                         chunksize=chunksize)
    btw17_rawdata_df = btw17_rawdata_df.append(pd.concat(chunks))
    name_list.extend(btw17_rawdata_df.queryterm.drop_duplicates().to_list())
    name_list = list(set(name_list))
print(len(name_list))
name_list[:10]


## Genderize.io API
> This API supports access to a database of hundreds of thousand names in various notations
> It takes a list of name as input, sends a request to the Genderize.io server and delivers a
> list containing a dictionary for each requested name listing a count, the gender (male/female/None) and a probability.
>
> However the number of requests per day is limited to 1000 (per IP address).
> This isn't a problem for us though since we've got 326 unique names in the btw17 dataset
> and XX unique names in the eu dataset.

### Get gender for each name and create a DataFrame
n	2017-02-13 17:13:46	firefox	de	http://clients1.google.de/complete/search	["Jan van Aken",["jan van aken","jan van aken ...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN


In [10]:
gender_list_btw17 = Genderize().get(name_list)

In [11]:
pprint.pprint(gender_list_btw17[:25])

[{'count': 2406, 'gender': 'male', 'name': 'Gunnar', 'probability': 0.99},
 {'count': 755, 'gender': 'male', 'name': 'Ates', 'probability': 0.9},
 {'count': 1, 'gender': 'male', 'name': 'bundeskanzlerin', 'probability': 1.0},
 {'count': 245870, 'gender': 'male', 'name': 'Michael', 'probability': 0.99},
 {'count': 16593, 'gender': 'male', 'name': 'Maurice', 'probability': 0.98},
 {'count': 787, 'gender': 'male', 'name': 'Shen', 'probability': 0.65},
 {'count': 57806, 'gender': 'male', 'name': 'Florian', 'probability': 0.99},
 {'count': 38513, 'gender': 'female', 'name': 'Renata', 'probability': 0.99},
 {'count': 501011, 'gender': 'male', 'name': 'David', 'probability': 0.99},
 {'count': 1, 'gender': 'male', 'name': 'jungeunion', 'probability': 1.0},
 {'count': 11677, 'gender': 'male', 'name': 'Franz', 'probability': 0.97},
 {'count': 1028, 'gender': 'male', 'name': 'von', 'probability': 0.75},
 {'count': 41244, 'gender': 'male', 'name': 'Jimmy', 'probability': 0.98},
 {'count': 12170, '

Store results in a DataFrame and save to disc

In [None]:
btw17_gender_df = pd.DataFrame(gender_list_btw17)
btw17_gender_df = btw17_gender_df.rename(columns = {'name':'f_name','probability':'gender_probability'})
btw17_gender_df = btw17_gender_df[['f_name','gender','gender_probability']]
btw17_gender_df.head(3)

In [13]:
btw17_gender_df.to_csv('btw17_name_gender_df.csv')





## Merge gender data with btw17 data and save to file

In [10]:
btw17_rawdata_df = btw17_rawdata_df.merge(btw17_gender_df, left_on = 'queryterm', right_on = 'f_name')[['gender','suggestions']]
btw17_rawdata_df = btw17_rawdata_df[~btw17_rawdata_df.gender.isnull()]
btw17_rawdata_df.head(3)

Unnamed: 0,gender,suggestions
0,male,"[' privat', 'http://www.jan-van-aken.de/', ' b..."
1,male,"[' privat', ' bundestag', ' biografie', ' twit..."
2,male,"['', ' privat', ' biografie', ' bundestag', ' ..."


In [6]:
btw17_rawdata_df = pd.to_csv('./btw17_rawdata_gender.csv')

In [3]:
chunksize= 10 ** 6
chunks = pd.read_csv('./btw17_rawdata_gender.csv', usecols=['suggestions'], chunksize=chunksize)
btw17_rawdata_df = pd.concat(chunks)

## Get a list of all suggestion terms


suggestion_list[:10]

# Load and Preprocess Dataset
## Preprocess suggestion terms
>*Flatten the nested list to get one big list of suggestion terms per gender*
>
>**Preprocessing Pipeline:**
>- Remove punctuation
>- Remove names from the suggestion terms to get objective terms
>   - This includes person names and location names
>- Only use unigrams
>- Replace umlauts
>- Remove digits
>- Set strings to lowercase
>- Remove urls
>- Remove nltk stopwords
>- Remove terms that consist of only 2 chars or less
>- Strip whitespaces
>- Remove empty strings

In [24]:
with open('../dis25-2021/cities.txt', 'r') as f:
    locations = [term[:-1].lower() for term in f.readlines()]
cities = pd.read_csv('../dis25-2021/Liste-Staedte-in-Deutschland.csv', delimiter=';')
locations.extend([term.lower() for term in cities['Stadt'].to_list()])
locations.extend([term.lower() for term in cities['Bundesland'].to_list()])
locations.extend([term.lower() for term in cities['Landkreis'].to_list()])
countries = pd.read_csv('../dis25-2021/csv-data.csv', delimiter=';', encoding='windows-1252')
locations.extend([term.lower() for term in countries['Kurzform'].to_list()])
with open('../dis25-2021/countries.txt', 'r') as f:
    locations.extend([term[:-1].lower() for term in f.readlines()])


In [19]:
nltk_names = []
with open('../dis25-2021/male.txt') as male_names_nltk:
    nltk_names = [name.lower() for name in male_names_nltk.read().split('\n')]
with open('../dis25-2021/female.txt') as female_names_nltk:
    nltk_names.extend([name.lower() for name in female_names_nltk.read().split('\n')])
print(nltk_names[:20])
print(nltk_names[-20:])

['aamir', 'aaron', 'abbey', 'abbie', 'abbot', 'abbott', 'abby', 'abdel', 'abdul', 'abdulkarim', 'abdullah', 'abe', 'abel', 'abelard', 'abner', 'abraham', 'abram', 'ace', 'adair', 'adam']
['zenia', 'zia', 'zilvia', 'zita', 'zitella', 'zoe', 'zola', 'zonda', 'zondra', 'zonnya', 'zora', 'zorah', 'zorana', 'zorina', 'zorine', 'zsa zsa', 'zsazsa', 'zulema', 'zuzana', '']


In [20]:
#https://www.usna.edu/Users/cs/roche/courses/s15si335/proj1/files.php%3Ff=names.txt.html
usna_names =[]
with open('../dis25-2021/usna_names.txt') as usna_file:
    usna_names = [name.lower() for name in usna_file.read().split('\n')]


In [51]:
names = []
chunksize = 10 ** 6
chunks = pd.read_csv(os.path.join(source_path,'suggestions_20210719.csv'), chunksize=chunksize, usecols=['queryterm'])
btw17_df = pd.concat(chunk for chunk in chunks)
names.extend(btw17_df.queryterm.drop_duplicates().to_list())

chunksize = 10 ** 6
for i in all_files:
    chunks = pd.read_csv(i,
                         header=None,
                         names=['id','queryterm','date','client','lang','url','raw_data'],
                         usecols=['queryterm'],
                         chunksize=chunksize)
    temp_df = pd.concat(chunks)
    names.extend(temp_df.queryterm.drop_duplicates().to_list())

names = list(set(names))


In [52]:
## get a list of names
names = [subname.strip('()').lower() for name in names for subname in name.split() if len(subname) > 2]
names.extend(nltk_names)
names.extend(usna_names)
names.extend(locations)
names = list(set(names))

In [54]:
with open("./all_names_to_delete.txt", "w",encoding="utf-8") as textfile:
    for element in names:
        textfile. write(element + "\n")

## Load german dictionary file


In [28]:
with open('../dis25-2021/wordlist-german.txt', 'r', encoding='utf-8') as f:
    german_terms = [term[:-1].lower() for term in f.readlines()]
print(german_terms[:20])

['aa', 'aaa', 'aachen', 'aachener', 'aachenerin', 'aachenerinnen', 'aachenern', 'aacheners', 'aachens', 'aal', 'aalähnlich', 'aalähnliche', 'aalähnlichem', 'aalähnlichen', 'aalähnlicher', 'aalähnliches', 'aalangelfischerei', 'aalangeln', 'aalangelns', 'aalartig']


## Perform Preprocessing and save results

In [58]:
suggestions = preprocessing.preprocess(preprocessing,suggestions=suggestion_list, german_terms=german_terms)

NameError: name 'delete_names' is not defined

In [36]:
print(suggestions[:10])
len(suggestions)

['transaktionsanalyse', 'wissenschaftsministerium', 'ehrenerklärung', 'zentralabitur', 'jugendreferent', 'elektronik', 'kaufoption', 'trainerschein', 'physiotherapie', 'herzfehler']


5255

In [37]:
textfile = open("./suggestion_terms_NN.txt", "w",encoding="utf-8")
for element in suggestions:
    textfile. write(element + "\n")
textfile.close()