## Dictionary-based tools to match user names and their gender

In [83]:
import re
import random
import pandas as pd

In [84]:
import gender_guesser.detector as gender
d = gender.Detector(case_sensitive=False)

print(d.get_gender(u"GianMaria", u'italy'))
print(d.get_gender(u"gianmaria", u'italy'))
print(d.get_gender(u"Flora", u'italy'))
print(d.get_gender(u"pierpaolo", u'italy'))
print(d.get_gender(u"mariavittoria", u'italy'))
print(d.get_gender(u"maria vittoria", u'italy'))
print(d.get_gender(u"annaviola", u'italy'))
print(d.get_gender(u"Giangiacomo", u'italy'))

# too many unknowns...


male
male
female
male
unknown
unknown
unknown
unknown


In [85]:
# read csv to pandas
df_names = pd.read_csv('../data/gender_classification/gender_firstnames_ITA.csv', sep=',')
df_names['is_male'] = (df_names['male'] > df_names['female']*10)
df_names['is_female'] = (df_names['female'] > df_names['male']*10)
df_male = df_names[df_names['is_male']==True].sort_values(by='tot', ascending=False)
df_female =df_names[df_names['is_female']==True].sort_values(by='tot', ascending=False)

In [86]:

def process_composite_names(df):
    # Create a new DataFrame to store the modified data
    new_df = pd.DataFrame(columns=df.columns)

    # Iterate through the rows of the original DataFrame
    for _ , row in df.iterrows():
        name = row['nome']
        if '.' not in name:
            # Check if the "nome" column contains a space
            if ' ' in name:
                # Retrieve the first name in the firstname
                first_firstname = name.split()[0]
                # Create a row for the first_firstname, copying the numerical columns
                new_row = row.copy()
                new_row['nome'] = first_firstname
                new_df = new_df.append(new_row, ignore_index=True)
                # also create a row with the first and second name combined
                new_row = row.copy()
                new_row['nome'] = name.replace(' ', '')
                new_df = new_df.append(new_row, ignore_index=True)
            else:
                # If it doesn't contain a space, simply copy the row to the new DataFrame
                new_df = new_df.append(row, ignore_index=True)

    # Combine rows with the same "nome" in the new DataFrame by grouping and summing
    new_df = new_df.groupby('nome').agg({
        'tot': 'sum',
        'male': 'sum',
        'female': 'sum',
        'is_male': 'max',
        'is_female': 'max'
    }).reset_index()

    return new_df

df_male_new = process_composite_names(df_male)
df_female_new = process_composite_names(df_female)


  new_df = new_df.append(row, ignore_index=True)
  new_df = new_df.append(row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(row, ignore_index=True)
  new_df = new_df.append(row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)


In [87]:
# remove names with less than 100 occurrences
print('male-------------------')
print(len(df_male))
print(len(df_male_new))
df_male_new = df_male_new[df_male_new['tot']>30]
print(len(df_male_new))
print('female-----------------')
print(len(df_female))
print(len(df_female_new))
df_female_new = df_female_new[df_female_new['tot']>30]
print(len(df_female_new))

male-------------------
23844
23595
3287
female-----------------
8833
8820
1122


In [90]:
# save to csv
df_names_new = pd.concat((df_male_new, df_female_new), axis=0).sort_values(by='tot', ascending=False)
df_names_new.to_csv('../data/gender_classification/gender_firstnames_ITA_processed.csv', sep=',', index=False)

In [91]:
# print
pd.set_option('display.max_rows', 100000)
print(df_names_new.sort_values(by='tot', ascending=False)[:100000])

                      nome     tot    male female  is_male  is_female
11579             GIUSEPPE  204446  204399     47     True      False
11021             GIOVANNI  152660  152607     53     True      False
2093               ANTONIO  149463  149458      5     True      False
8719             FRANCESCO  116954  116951      3     True      False
15167                MARIO   93718   93718      0     True      False
14314                LUIGI   93106   93094     12     True      False
19794              ROBERTO   73949   73949      0     True      False
17780                PAOLO   71182   71177      5     True      False
5123                 MARIA   69007      77  68930    False       True
1586                ANGELO   68449   68433     16     True      False
8975                FRANCO   60392   60392      0     True      False
14846                MARCO   59561   59551     10     True      False
5800              DOMENICO   58402   58402      0     True      False
22841             VI

In [111]:
# check how many names are ufually male but less than 20x more frequently then female
df_names_new[(df_names_new['is_male']==True) & (df_names_new['male']<20*df_names_new['female'])]


Unnamed: 0,nome,tot,male,female,is_male,is_female
6146,EDDI,232,217,15,True,False
14966,MARCOMARIA,205,195,10,True,False
16928,NICOLAMARIA,169,159,10,True,False
9621,GENTILE,159,149,10,True,False
13071,IVONE,119,109,10,True,False
672,ALDOGIUSEPPE,103,98,5,True,False
14080,LUCAMARIA,81,76,5,True,False
5455,DAVIDEMARIA,63,58,5,True,False
10971,GIOVACCHINO,63,58,5,True,False
13094,JADER,57,52,5,True,False
