## Libraries

In [2]:
import unicodedata
import pandas as pd
import numpy as np

## Normalizing the names to lowercase ascii to maximize the matchings between names from the name list and server list, in case of differences in spelling and uppercase/lowercase

In [3]:
#from https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

def toLowercaseASCII(inputStr):
    noAccents = remove_accents(inputStr)
    return noAccents.rstrip().lower()

## Preparing servers information dataframe

In [None]:
df = pd.read_csv("csv/servers.csv", sep=';')
print("Loaded original csv")
print("Total servers (raw): " + str(len(df)))
print("Servers without money: " + str(len(df)-len(df.dropna(subset=['money']))))
print("Servers without post: " + str(len(df)-len(df.dropna(subset=['post']))))
print("Servers without class: " + str(len(df)-len(df.dropna(subset=['class']))))
print("Servers without level: " + str(len(df)-len(df.dropna(subset=['level']))))
print("Servers without org: " + str(len(df)-len(df.dropna(subset=['org']))))
noNaNDF = df.dropna(subset=['org', 'class', 'post', 'money'])
print("Servers with money, post, class and org: " + str(len(noNaNDF)))

#converting the names to ASCII lowercase format
for index, row in noNaNDF.iterrows():
    name = row['name']
    newName = toLowercaseASCII(name)
    noNaNDF.loc[index, 'name']=newName

In [None]:
noNaNDF.to_csv("csv/serversTreated.csv")

## Creating sets, with normalized content, for female and male names

In [13]:
femaleNames = set()
maleNames = set()
femaleLastCharOcurrence = dict()
maleLastCharOcurrence = dict()

def isFemale(name):
    return (name in femaleNames)

def isMale(name):
    return (name in maleNames)

def knownName(name):
    return isFemale(name) or isMale(name)

def getGenderFromLastChar(name):
    if name.endswith('a'):
        return 'F'
    elif name.endswith('o'):
        return 'M'
    else:
        return 'X'

def getGender(row):
    name = row['name']
    names = name.split()
    if len(names) <= 0:
        return 'X'
    firstName = names[0]
    #print(firstName)
    if firstName in femaleNames:
        return 'F'
    elif firstName in maleNames:
        return 'M'
    else:
        if(len(names) > 1):
            firstName = names[0] + " " + names[1]
            if firstName in femaleNames:
                return 'F'
            elif firstName in maleNames:
                return 'M'
            else:
                return getGenderFromLastChar(names[0])
        else:
            return getGenderFromLastChar(names[0])

def putLinesInSet(filePath, collection, lastChars):
    lines = open(filePath)
    for line in lines:
        asciiLine = toLowercaseASCII(line)
        collection.add(asciiLine)
        splitedWords = asciiLine.split()
        firstWord = splitedWords[0]
        lastChar = firstWord[-1]
        if lastChar in lastChars:
            lastChars[lastChar] = lastChars[lastChar] + 1
        else:
            lastChars[lastChar] = 1
        
putLinesInSet("csv/maleNames.csv", maleNames, maleLastCharOcurrence)
putLinesInSet("csv/femaleNames.csv", femaleNames, femaleLastCharOcurrence)
print("Male names: " + str(len(maleNames)))
print("Female names: " + str(len(femaleNames)))
print("Names for both genders: " + str(len(femaleNames.intersection(maleNames))))
print("Male last chars: ")
for key, value in maleLastCharOcurrence.items():
    print(key + ": " + str((value/len(maleNames))*100) + "%")
    if key in femaleLastCharOcurrence:
        print("\t" + key + ": " + str((femaleLastCharOcurrence[key]/len(femaleNames))*100) + "%, in females")

Male names: 3786
Female names: 2947
Names for both genders: 0
Male last chars: 
n: 9.270998415213946%
	n: 2.1038344078724123%, in females
l: 4.91283676703645%
	l: 1.7305734645402102%, in females
o: 35.89540412044374%
	o: 0.6447234475738038%, in females
r: 7.422081352350767%
	r: 0.9161859518154056%, in females
m: 1.9281563655573164%
	m: 0.3732609433322022%, in females
d: 1.8225039619651346%
	d: 0.2714625042416016%, in females
e: 7.686212361331219%
	e: 17.509331523583306%, in females
s: 9.667194928684628%
	s: 2.4092297251442143%, in females
h: 1.1885895404120443%
	h: 1.0858500169664065%, in females
z: 0.7923930269413629%
	z: 0.1357312521208008%, in females
i: 5.018489170628632%
	i: 4.309467254835426%, in females
b: 0.29054410987849977%
	b: 0.10179843909060061%, in females
c: 0.39619651347068147%
f: 0.2377179080824089%
u: 2.7469624933967247%
	u: 0.2714625042416016%, in females
y: 1.9281563655573164%
	y: 3.32541567695962%, in females
a: 7.3956682514527206%
	a: 65.9993213437394%, in females

In [25]:
noGenderDF = pd.read_csv("csv/serversTreated.csv")
noGenderDF['gender'] = np.nan
noGenderDF['gender'] = noGenderDF.apply(getGender, axis=1)
noGenderServers = 0
for index, row in noGenderDF.iterrows():
    if row['gender'] == 'X':
        noGenderServers += 1
print("Servers without gender defined: " + str(noGenderServers))
noGenderDF.head()

noGenderDF.to_csv("csv/serversTreatedWithGender.csv")

Servers without gender defined: 824


In [43]:
import math

def moneyToFloat(x):
    noDots = x.replace('.', '')
    noCurls = noDots.replace(',','.')
    return float(noCurls)
def parseLevel(x):
    f = float(x)
    if math.isnan(f):
        return float('NaN')
    elif(f < 101):
        return float('NaN')
    noDecimal = math.floor(f / 100) * 100
    decimal = (((f/10)-int(f/10))*10)
    decimal = decimal - 1
    if decimal < 0:
        decimal = 0
    decimal = decimal * 25.0
    return noDecimal + decimal

genderDF = pd.read_csv("csv/serversTreatedWithGender.csv")
#genderDF['money'] = genderDF['money'].apply(lambda x: moneyToFloat(x))
genderDF['level'] = genderDF['level'].apply(lambda x: parseLevel(x))
#genderDF.to_csv("csv/serversTreatedWithGender.csv")
genderDF.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,name,money,post,class,level,org,gender
0,0,0,adelardo adelino dantas de medeiros,"17.833,66",PROFESSOR DO MAGISTERIO SUPERIOR,7,775.0,PRO-REITORIA DE GRADUACAO,M
1,1,1,abmael bezerra de oliveira,"10.388,52",PROFESSOR DO MAGISTERIO SUPERIOR,6,675.0,DEPARTAMENTO DE ENGENHARIA ELETRICA,X
2,2,2,adailton garcia da silva,"7.464,42",TECNICO EM AGROPECUARIA,D,,ESCOLA AGRICOLA DE JUNDIAI - UAECA,M
3,3,3,ada cristina scudelari,"19.995,43",PROFESSOR DO MAGISTERIO SUPERIOR,8,800.0,DEPARTAMENTO DE ENGENHARIA CIVIL,F
4,4,7,adamo perrucci,"10.557,63",PROFESSOR MAGISTERIO SUPERIOR -VISITANTE,4,600.0,DEPT DE DIREITO PROCESSUAL PROPEDEUTICA,M
