## Libraries

In [1]:
import unicodedata
import pandas as pd
import numpy as np

## Normalizing the names to lowercase ascii to maximize the matchings between names from the name list and server list, in case of differences in spelling and uppercase/lowercase

In [2]:
#from https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

def toLowercaseASCII(inputStr):
    noAccents = remove_accents(inputStr)
    return noAccents.rstrip().lower()

## Preparing servers information dataframe

In [3]:
df = pd.read_csv("csv/servers.csv", sep=';')
print("Loaded original csv")
print("Total servers (raw): " + str(len(df)))
print("Servers without money: " + str(len(df)-len(df.dropna(subset=['money']))))
print("Servers without post: " + str(len(df)-len(df.dropna(subset=['post']))))
print("Servers without class: " + str(len(df)-len(df.dropna(subset=['class']))))
print("Servers without level: " + str(len(df)-len(df.dropna(subset=['level']))))
print("Servers without org: " + str(len(df)-len(df.dropna(subset=['org']))))
noNaNDF = df.dropna(subset=['org', 'post', 'money'])
print("Servers with money, post, and org: " + str(len(noNaNDF)))
noNaNDF['name'] = noNaNDF['name'].apply(lambda x: toLowercaseASCII(x))
#converting the names to ASCII lowercase format
#for index, row in noNaNDF.iterrows():
#    name = row['name']
#    newName = toLowercaseASCII(name)
#    noNaNDF.loc[index, 'name']=newName

Loaded original csv
Total servers (raw): 12410
Servers without money: 994
Servers without post: 1028
Servers without class: 1028
Servers without level: 7012
Servers without org: 1110
Servers with money, post, and org: 11298


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [4]:
noNaNDF.to_csv("csv/serversTreated.csv")

## Creating sets, with normalized content, for female and male names

In [9]:
femaleNames = set()
maleNames = set()
femaleLastCharOcurrence = dict()
maleLastCharOcurrence = dict()

def isFemale(name):
    return (name in femaleNames)

def isMale(name):
    return (name in maleNames)

def knownName(name):
    return isFemale(name) or isMale(name)

def getGenderFromLastChar(name):
    if name.endswith('a'):
        return 'F'
    elif name.endswith('o'):
        return 'M'
    else:
        return 'X'

def getGender(row):
    name = row['name']
    names = name.split()
    if len(names) <= 0:
        return 'X'
    firstName = names[0]
    #print(firstName)
    if firstName in femaleNames:
        return 'F'
    elif firstName in maleNames:
        return 'M'
    else:
        if(len(names) > 1):
            firstName = names[0] + " " + names[1]
            if firstName in femaleNames:
                return 'F'
            elif firstName in maleNames:
                return 'M'
            else:
                return 'X'
        else:
            return 'x'

def putLinesInSet(filePath, collection, lastChars):
    lines = open(filePath)
    for line in lines:
        asciiLine = toLowercaseASCII(line)
        collection.add(asciiLine)
        splitedWords = asciiLine.split()
        if len(splitedWords) >= 1:
            firstWord = splitedWords[0]
            lastChar = firstWord[-1]
            if lastChar in lastChars:
                lastChars[lastChar] = lastChars[lastChar] + 1
            else:
                lastChars[lastChar] = 1

def getGendersFromNameDataFrame(filePath):
    data = pd.read_csv(filePath)
    print(data.head(3))
    maleNamesFile = open("csv/maleNames.csv","a")
    femaleNamesFile = open("csv/femaleNames.csv","a")
    for index, row in data.iterrows():
        gender = row['gender']
        if gender == 'M':
            maleNames.add(row['name'])
            maleNamesFile.write(row['name']+"\n")
        elif gender == 'F':
            femaleNames.add(row['name'])
            femaleNamesFile.write(row['name']+"\n")
    femaleNamesFile.close()
    maleNamesFile.close()

putLinesInSet("csv/maleNames.csv", maleNames, maleLastCharOcurrence)
putLinesInSet("csv/femaleNames.csv", femaleNames, femaleLastCharOcurrence)
getGendersFromNameDataFrame("csv/unknownNames.csv")
print("Male names: " + str(len(maleNames)))
print("Female names: " + str(len(femaleNames)))
print("Names for both genders: " + str(len(femaleNames.intersection(maleNames))))
print("Male last chars: ")
for key, value in maleLastCharOcurrence.items():
    print(key + ": " + str((value/len(maleNames))*100) + "%")
    if key in femaleLastCharOcurrence:
        print("\t" + key + ": " + str((femaleLastCharOcurrence[key]/len(femaleNames))*100) + "%, in females")

      name count gender
0  zulamar     1      M
1   helano     1      M
2    heric     1      M
Male names: 4298
Female names: 3571
Names for both genders: 4
Male last chars: 
n: 8.841321544904607%
	n: 1.792215065807897%, in females
l: 4.513727315030247%
	l: 1.4281713805656677%, in females
o: 32.48022335970219%
	o: 0.5320638476617194%, in females
r: 6.747324336900884%
	r: 0.784094091290955%, in females
m: 1.72173103769195%
	m: 0.3080369644357323%, in females
d: 1.6053978594695206%
	d: 0.2520302436292355%, in females
e: 6.863657515123313%
	e: 15.989918790254832%, in females
s: 8.631921824104234%
	s: 1.9882385886306357%, in females
h: 1.0702652396463472%
	h: 0.8961075329039485%, in females
z: 0.7445323406235459%
	z: 0.14001680201624195%, in females
i: 4.5369939506747325%
	i: 3.6404368524222908%, in females
b: 0.2559329920893439%
	b: 0.08401008120974517%, in females
c: 0.37226617031177295%
f: 0.20939972080037225%
u: 2.44299674267101%
	u: 0.22402688322598713%, in females
y: 1.7449976733364

In [10]:
noGenderDF = pd.read_csv("csv/serversTreated.csv")
noGenderDF['gender'] = np.nan
noGenderDF['gender'] = noGenderDF.apply(getGender, axis=1)
noGenderServers = 0
for index, row in noGenderDF.iterrows():
    if row['gender'] == 'X':
        noGenderServers += 1
print("Servers without gender defined: " + str(noGenderServers))
noGenderDF.head()

noGenderDF.to_csv("csv/serversTreatedWithGender.csv")

Servers without gender defined: 8


In [45]:
undefinedNames = dict()
for index, row in noGenderDF[noGenderDF.gender == 'X'].iterrows():
    names = row['name'].split()[0]
    #if()
    if names in undefinedNames:
        undefinedNames[names] += 1
    else:
        undefinedNames[names] = 1
print(str(len(undefinedNames)) + " undefined first names.")
series = pd.Series(undefinedNames)
series.sort_values(ascending=False, inplace=True)
series.rename("count", inplace=True)
unknownNamesDF = pd.DataFrame(series)
unknownNamesDF['gender'] = 'X'
print(unknownNamesDF)
unknownNamesDF.to_csv("csv/unknownNames.csv")

[883 rows x 2 columns]


In [11]:
import math

def moneyToFloat(x):
    if x is float:
        return x
    noDots = x.replace('.', '')
    noCurls = noDots.replace(',','.')
    return float(noCurls)
def parseLevel(x):
    f = float(x)
    if math.isnan(f):
        return float('NaN')
    elif(f < 101):
        return float('NaN')
    noDecimal = math.floor(f / 100) * 100
    decimal = (((f/10)-int(f/10))*10)
    decimal = decimal - 1
    if decimal < 0:
        decimal = 0
    decimal = decimal * 25.0
    return noDecimal + decimal

genderDF = pd.read_csv("csv/serversTreatedWithGender.csv")
genderDF['money'] = genderDF['money'].apply(lambda x: moneyToFloat(x))
genderDF['level'] = genderDF['level'].apply(lambda x: parseLevel(x))
genderDF.to_csv("csv/serversTreatedWithGender.csv")
genderDF.head()