## Libraries

In [6]:
import unicodedata
import pandas as pd
import numpy as np

## Normalizing the names to lowercase ascii to maximize the matchings between names from the name list and server list, in case of differences in spelling and uppercase/lowercase

In [2]:
#from https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

def toLowercaseASCII(inputStr):
    noAccents = remove_accents(inputStr)
    return noAccents.rstrip().lower()

## Preparing servers information dataframe

In [None]:
df = pd.read_csv("csv/servers.csv", sep=';')
print("Loaded original csv")
print("Total servers (raw): " + str(len(df)))
print("Servers without money: " + str(len(df)-len(df.dropna(subset=['money']))))
print("Servers without post: " + str(len(df)-len(df.dropna(subset=['post']))))
print("Servers without class: " + str(len(df)-len(df.dropna(subset=['class']))))
print("Servers without level: " + str(len(df)-len(df.dropna(subset=['level']))))
print("Servers without org: " + str(len(df)-len(df.dropna(subset=['org']))))
noNaNDF = df.dropna(subset=['org', 'class', 'post', 'money'])
print("Servers with money, post, class and org: " + str(len(noNaNDF)))

#converting the names to ASCII lowercase format
for index, row in noNaNDF.iterrows():
    name = row['name']
    newName = toLowercaseASCII(name)
    noNaNDF.loc[index, 'name']=newName

In [None]:
noNaNDF.to_csv("csv/serversTreated.csv")

## Creating sets, with normalized content, for female and male names

In [23]:
femaleNames = set()
maleNames = set()

def isFemale(name):
    return (name in femaleNames)

def isMale(name):
    return (name in maleNames)

def knownName(name):
    return isFemale(name) or isMale(name)

def getGenderFromLastChar(name):
    if name.endswith('a'):
        return 'F'
    elif name.endswith('o'):
        return 'M'
    else:
        return 'X'

def getGender(row):
    name = row['name']
    names = name.split()
    if len(names) <= 0:
        return 'X'
    firstName = names[0]
    #print(firstName)
    if firstName in femaleNames:
        return 'F'
    elif firstName in maleNames:
        return 'M'
    else:
        if(len(names) > 1):
            firstName = names[0] + " " + names[1]
            if firstName in femaleNames:
                return 'F'
            elif firstName in maleNames:
                return 'M'
            else:
                return getGenderFromLastChar(names[0])
        else:
            return getGenderFromLastChar(names[0])

def putLinesInSet(filePath, collection):
    lines = open(filePath)
    for line in lines:
        asciiLine
        collection.add(toLowercaseASCII(line))
        
putLinesInSet("csv/maleNames.csv", maleNames)
putLinesInSet("csv/femaleNames.csv", femaleNames)
print("Male names: " + str(len(maleNames)))
print("Female names: " + str(len(femaleNames)))
print("Names for both genders: " + str(len(femaleNames.intersection(maleNames))))
#print(getGender({"name":"ana caroline alguma coisa"}))
#print(getGender({"name":"pitagoras alves"}))
#print(femaleNames)

Male names: 3786
Female names: 2947
Names for both genders: 0


In [24]:
noGenderDF = pd.read_csv("csv/serversTreated.csv")
noGenderDF['gender'] = np.nan
noGenderDF['gender'] = noGenderDF.apply(getGender, axis=1)
noGenderServers = 0
for index, row in noGenderDF.iterrows():
    if row['gender'] == 'X':
        noGenderServers += 1
print("Servers without gender defined: " + str(noGenderServers))
noGenderDF.head()

Servers without gender defined: 824


Unnamed: 0.1,Unnamed: 0,name,money,post,class,level,org,gender
0,0,adelardo adelino dantas de medeiros,"17.833,66",PROFESSOR DO MAGISTERIO SUPERIOR,7,704.0,PRO-REITORIA DE GRADUACAO,M
1,1,abmael bezerra de oliveira,"10.388,52",PROFESSOR DO MAGISTERIO SUPERIOR,6,604.0,DEPARTAMENTO DE ENGENHARIA ELETRICA,X
2,2,adailton garcia da silva,"7.464,42",TECNICO EM AGROPECUARIA,D,,ESCOLA AGRICOLA DE JUNDIAI - UAECA,M
3,3,ada cristina scudelari,"19.995,43",PROFESSOR DO MAGISTERIO SUPERIOR,8,801.0,DEPARTAMENTO DE ENGENHARIA CIVIL,F
4,7,adamo perrucci,"10.557,63",PROFESSOR MAGISTERIO SUPERIOR -VISITANTE,4,601.0,DEPT DE DIREITO PROCESSUAL PROPEDEUTICA,M
