## Libraries

In [1]:
import unicodedata
import pandas as pd
import numpy as np

## Normalizing the names to lowercase ascii to maximize the matchings between names from the name list and server list, in case of differences in spelling and uppercase/lowercase

In [4]:
#from https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

def toLowercaseASCII(inputStr):
    noAccents = remove_accents(inputStr)
    return noAccents.rstrip().lower()

## Preparing servers information dataframe

In [5]:
df = pd.read_csv("csv/servers.csv", sep=';')
print("Loaded original csv")
print("Total servers (raw): " + str(len(df)))
print("Servers without money: " + str(len(df)-len(df.dropna(subset=['money']))))
print("Servers without post: " + str(len(df)-len(df.dropna(subset=['post']))))
print("Servers without class: " + str(len(df)-len(df.dropna(subset=['class']))))
print("Servers without level: " + str(len(df)-len(df.dropna(subset=['level']))))
print("Servers without org: " + str(len(df)-len(df.dropna(subset=['org']))))
noNaNDF = df.dropna(subset=['org', 'post', 'money'])
print("Servers with money, post, and org: " + str(len(noNaNDF)))
noNaNDF['name'] = noNaNDF['name'].apply(lambda x: toLowercaseASCII(x))
#converting the names to ASCII lowercase format
#for index, row in noNaNDF.iterrows():
#    name = row['name']
#    newName = toLowercaseASCII(name)
#    noNaNDF.loc[index, 'name']=newName

Loaded original csv
Total servers (raw): 6206
Servers without money: 496
Servers without post: 513
Servers without class: 513
Servers without level: 3505
Servers without org: 554
Servers with money, post, and org: 5651


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
noNaNDF.to_csv("csv/serversTreated.csv")

## Creating sets, with normalized content, for female and male names

In [7]:
femaleNames = set()
maleNames = set()
femaleLastCharOcurrence = dict()
maleLastCharOcurrence = dict()

def isFemale(name):
    return (name in femaleNames)

def isMale(name):
    return (name in maleNames)

def knownName(name):
    return isFemale(name) or isMale(name)

def getGenderFromLastChar(name):
    if name.endswith('a'):
        return 'F'
    elif name.endswith('o'):
        return 'M'
    else:
        return 'X'

def getGender(row):
    name = row['name']
    names = name.split()
    if len(names) <= 0:
        return 'X'
    firstName = names[0]
    #print(firstName)
    if firstName in femaleNames:
        return 'F'
    elif firstName in maleNames:
        return 'M'
    else:
        if(len(names) > 1):
            firstName = names[0] + " " + names[1]
            if firstName in femaleNames:
                return 'F'
            elif firstName in maleNames:
                return 'M'
            else:
                return 'X'
        else:
            return 'x'

def putLinesInSet(filePath, collection, lastChars):
    lines = open(filePath)
    for line in lines:
        asciiLine = toLowercaseASCII(line)
        collection.add(asciiLine)
        splitedWords = asciiLine.split()
        if len(splitedWords) >= 1:
            firstWord = splitedWords[0]
            lastChar = firstWord[-1]
            if lastChar in lastChars:
                lastChars[lastChar] = lastChars[lastChar] + 1
            else:
                lastChars[lastChar] = 1

def getGendersFromNameDataFrame(filePath):
    data = pd.read_csv(filePath)
    print(data.head(3))
    maleNamesFile = open("csv/maleNames.csv","a")
    femaleNamesFile = open("csv/femaleNames.csv","a")
    for index, row in data.iterrows():
        gender = row['gender']
        if gender == 'M':
            maleNames.add(row['name'])
            maleNamesFile.write(row['name']+"\n")
        elif gender == 'F':
            femaleNames.add(row['name'])
            femaleNamesFile.write(row['name']+"\n")
    femaleNamesFile.close()
    maleNamesFile.close()

putLinesInSet("csv/maleNames.csv", maleNames, maleLastCharOcurrence)
putLinesInSet("csv/femaleNames.csv", femaleNames, femaleLastCharOcurrence)
getGendersFromNameDataFrame("csv/unknownNames.csv")
print("Male names: " + str(len(maleNames)))
print("Female names: " + str(len(femaleNames)))
print("Names for both genders: " + str(len(femaleNames.intersection(maleNames))))
print("Male last chars: ")
for key, value in maleLastCharOcurrence.items():
    print(key + ": " + str((value/len(maleNames))*100) + "%")
    if key in femaleLastCharOcurrence:
        print("\t" + key + ": " + str((femaleLastCharOcurrence[key]/len(femaleNames))*100) + "%, in females")

      name  count gender
0  zulamar      1      M
1   helano      1      M
2    heric      1      M
Male names: 4298
Female names: 3573
Names for both genders: 4
Male last chars: 
n: 11.09818520241973%
	n: 1.8751749230338652%, in females
l: 4.699860400186133%
	l: 1.4833473271760425%, in females
o: 34.993020009306655%
	o: 0.559753708368318%, in females
r: 7.794322940902745%
	r: 0.8956059333893087%, in females
m: 1.7915309446254073%
	m: 0.33585222502099077%, in females
d: 1.7449976733364354%
	d: 0.2518891687657431%, in females
e: 7.422056770590972%
	e: 20.347047299188358%, in females
s: 9.422987436016752%
	s: 2.18303946263644%, in females
h: 1.093531875290833%
	h: 0.9235936188077246%, in females
z: 0.860865518845975%
	z: 0.16792611251049538%, in females
i: 5.002326663564449%
	i: 4.254128183599216%, in females
b: 0.2791996277338297%
	b: 0.08396305625524769%, in females
c: 0.3955328059562588%
f: 0.23266635644485809%
u: 2.5127966496044674%
	u: 0.2518891687657431%, in females
y: 2.3033969288

In [8]:
noGenderDF = pd.read_csv("csv/serversTreated.csv")
noGenderDF['gender'] = np.nan
noGenderDF['gender'] = noGenderDF.apply(getGender, axis=1)
noGenderServers = 0
for index, row in noGenderDF.iterrows():
    if row['gender'] == 'X':
        noGenderServers += 1
print("Servers without gender defined: " + str(noGenderServers))
noGenderDF.head()

noGenderDF.to_csv("csv/serversTreatedWithGender.csv")

Servers without gender defined: 2


In [9]:
undefinedNames = dict()
for index, row in noGenderDF[noGenderDF.gender == 'X'].iterrows():
    names = row['name'].split()[0]
    #if()
    if names in undefinedNames:
        undefinedNames[names] += 1
    else:
        undefinedNames[names] = 1
print(str(len(undefinedNames)) + " undefined first names.")
series = pd.Series(undefinedNames)
series.sort_values(ascending=False, inplace=True)
series.rename("count", inplace=True)
unknownNamesDF = pd.DataFrame(series)
unknownNamesDF['gender'] = 'X'
print(unknownNamesDF)
unknownNamesDF.to_csv("csv/unknownNames.csv")

2 undefined first names.
         count gender
mardeni      1      X
aruzza       1      X


In [10]:
import math

def moneyToFloat(x):
    if x is float:
        return x
    noDots = x.replace('.', '')
    noCurls = noDots.replace(',','.')
    return float(noCurls)
def parseLevel(x):
    f = float(x)
    if math.isnan(f):
        return float('NaN')
    elif(f < 101):
        return float('NaN')
    noDecimal = math.floor(f / 100) * 100
    decimal = (((f/10)-int(f/10))*10)
    decimal = decimal - 1
    if decimal < 0:
        decimal = 0
    decimal = decimal * 25.0
    return noDecimal + decimal

genderDF = pd.read_csv("csv/serversTreatedWithGender.csv")
genderDF['money'] = genderDF['money'].apply(lambda x: moneyToFloat(x))
genderDF['level'] = genderDF['level'].apply(lambda x: parseLevel(x))
genderDF.to_csv("csv/serversTreatedWithGender.csv")
genderDF.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,name,money,post,class,level,org,gender
0,0,0,adelardo adelino dantas de medeiros,17833.66,PROFESSOR DO MAGISTERIO SUPERIOR,7,775.0,PRO-REITORIA DE GRADUACAO,M
1,1,1,abmael bezerra de oliveira,10388.52,PROFESSOR DO MAGISTERIO SUPERIOR,6,675.0,DEPARTAMENTO DE ENGENHARIA ELETRICA,M
2,2,2,adailton garcia da silva,7464.42,TECNICO EM AGROPECUARIA,D,,ESCOLA AGRICOLA DE JUNDIAI - UAECA,M
3,3,3,ada cristina scudelari,19995.43,PROFESSOR DO MAGISTERIO SUPERIOR,8,800.0,DEPARTAMENTO DE ENGENHARIA CIVIL,F
4,4,7,adamo perrucci,10557.63,PROFESSOR MAGISTERIO SUPERIOR -VISITANTE,4,600.0,DEPT DE DIREITO PROCESSUAL PROPEDEUTICA,M
