## Dictionary-based tools to match user names and their gender

In [16]:
import re
import random
import pandas as pd
import sqlite3  # Assuming you're using SQLite as the database
from tqdm import tqdm

In [6]:
import gender_guesser.detector as gender
d = gender.Detector(case_sensitive=False)

print(d.get_gender(u"GianMaria", u'italy'))
print(d.get_gender(u"gianmaria", u'italy'))
print(d.get_gender(u"Flora", u'italy'))
print(d.get_gender(u"pierpaolo", u'italy'))
print(d.get_gender(u"mariavittoria", u'italy'))
print(d.get_gender(u"maria vittoria", u'italy'))
print(d.get_gender(u"annaviola", u'italy'))
print(d.get_gender(u"Giangiacomo", u'italy'))

# too many unknowns...


ModuleNotFoundError: No module named 'gender_guesser'

In [18]:
# read csv to pandas
df_names = pd.read_csv('../data/gender_classification/gender_firstnames_ITA.csv', sep=',')
df_names['is_male'] = (df_names['male'] > df_names['female']*10)
df_names['is_female'] = (df_names['female'] > df_names['male']*10)
df_male = df_names[df_names['is_male']==True].sort_values(by='tot', ascending=False)
df_female =df_names[df_names['is_female']==True].sort_values(by='tot', ascending=False)

In [19]:

def process_composite_names(df):
    # Create a new DataFrame to store the modified data
    new_df = pd.DataFrame(columns=df.columns)

    # Iterate through the rows of the original DataFrame
    for _ , row in df.iterrows():
        name = row['nome']
        if '.' not in name:
            # Check if the "nome" column contains a space
            if ' ' in name:
                # Retrieve the first name in the firstname
                first_firstname = name.split()[0]
                # Create a row for the first_firstname, copying the numerical columns
                new_row = row.copy()
                new_row['nome'] = first_firstname
                new_df = new_df.append(new_row, ignore_index=True)
                # also create a row with the first and second name combined
                new_row = row.copy()
                new_row['nome'] = name.replace(' ', '')
                new_df = new_df.append(new_row, ignore_index=True)
            else:
                # If it doesn't contain a space, simply copy the row to the new DataFrame
                new_df = new_df.append(row, ignore_index=True)

    # Combine rows with the same "nome" in the new DataFrame by grouping and summing
    new_df = new_df.groupby('nome').agg({
        'tot': 'sum',
        'male': 'sum',
        'female': 'sum',
        'is_male': 'max',
        'is_female': 'max'
    }).reset_index()

    return new_df

df_male_new = process_composite_names(df_male)
df_female_new = process_composite_names(df_female)


  new_df = new_df.append(row, ignore_index=True)
  new_df = new_df.append(row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(row, ignore_index=True)
  new_df = new_df.append(row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)
  new_df = new_df.append(new_row, ignore_index=True)


In [20]:
# remove names with less than 100 occurrences
print('male-------------------')
print(len(df_male))
print(len(df_male_new))
df_male_new = df_male_new[df_male_new['tot']>30]
print(len(df_male_new))
print('female-----------------')
print(len(df_female))
print(len(df_female_new))
df_female_new = df_female_new[df_female_new['tot']>30]
print(len(df_female_new))

male-------------------
23844
23595
3287
female-----------------
8833
8820
1122


In [22]:
# append resulting dfs with most common male and female names
df_names_new = pd.concat((df_male_new, df_female_new), axis=0).sort_values(by='tot', ascending=False)
# rename column 'nome' to 'name'
df_names_new.rename(columns={'nome': 'name'}, inplace=True)
# lowercase all names
df_names_new['name'] = df_names_new['name'].str.lower()
# save to csv
df_names_new.to_csv('../data/gender_classification/gender_firstnames_ITA_processed.csv', sep=',', index=False)

In [23]:
# print
pd.set_option('display.max_rows', 100000)
print(df_names_new.sort_values(by='tot', ascending=False)[:100000])

                      name     tot    male female  is_male  is_female
11579             giuseppe  204446  204399     47     True      False
11021             giovanni  152660  152607     53     True      False
2093               antonio  149463  149458      5     True      False
8719             francesco  116954  116951      3     True      False
15167                mario   93718   93718      0     True      False
14314                luigi   93106   93094     12     True      False
19794              roberto   73949   73949      0     True      False
17780                paolo   71182   71177      5     True      False
5123                 maria   69007      77  68930    False       True
1586                angelo   68449   68433     16     True      False
8975                franco   60392   60392      0     True      False
14846                marco   59561   59551     10     True      False
5800              domenico   58402   58402      0     True      False
22841             vi

In [24]:
# check how many names are ufually male but less than 20x more frequently then female
df_names_new[(df_names_new['is_male']==True) & (df_names_new['male']<20*df_names_new['female'])]


Unnamed: 0,name,tot,male,female,is_male,is_female
6146,eddi,232,217,15,True,False
14966,marcomaria,205,195,10,True,False
16928,nicolamaria,169,159,10,True,False
9621,gentile,159,149,10,True,False
13071,ivone,119,109,10,True,False
672,aldogiuseppe,103,98,5,True,False
14080,lucamaria,81,76,5,True,False
5455,davidemaria,63,58,5,True,False
10971,giovacchino,63,58,5,True,False
13094,jader,57,52,5,True,False


## Build a function that, given a user full name, returns their gender

In [66]:
s = "marina"
r = df_names_new.query(f"name=='{s}'")["is_male"]
r.values[0] if len(r) > 0 else None


False

In [62]:
r.values[0] if len(r) > 0 else None

True

In [7]:
sample_full_names = [
    "tito costa",
    "monica micu",
    "vintage",
    "guan",
    "simone brunozzi",
    "pas",
    "justme",
    "roberto garofalo",
    "sandira",
    "alessandro longo",
    "cristian bracci",
    "matteo fogli",
    "francesco a frigenti",
    "mafe",
    "vanz",
    "federico giacanelli",
    "helene maquet",
    "roberto bonanzinga inreach ventures",
    "emilio",
    "marco servetto",
    "alessandro sanvitale",
    "fulvio spada",
    "benoit mouren",
    "marco",
    "superlorenz",
    "ramentaoist",
    "simon",
    "gianluca pezzi",
]

def is_name_male(row, df_names):
    name = row['name'].split(' ')[0].lower()
    is_male = df_names.query(f"name=='{name}'")["is_male"]
    return is_male.values[0] if len(is_male) > 0 else None

# read table of firstnames and associated gender
fpath = '../data/gender_classification/gender_firstnames_ITA_processed.csv'
df_names = pd.read_csv(fpath, sep=',')

# apply function to each name
df_sample = pd.DataFrame(sample_full_names, columns=['name'])
df_sample['is_male'] = df_sample.apply(is_name_male, axis=1, args=(df_names,))
df_sample


Unnamed: 0,name,is_male
0,tito costa,True
1,monica micu,False
2,vintage,
3,guan,
4,simone brunozzi,True
5,pas,
6,justme,
7,roberto garofalo,True
8,sandira,
9,alessandro longo,True


In [34]:
column_names = [
    "user_id",
    "username",
    "full_name",
    "location",
    "join_year",
    "join_month",
    "join_day",
    "bio",
    "tweets",
    "following",
    "followers",
    "likes",
    "male_name",
    "female_name",
    "loc_count",
    "location_clean",
    "foreign_country",
    "all_regions",
    "region_pos",
    "region",
    "term_for_italy",
    "name_city_engl",
    "condition",
    "city_id",
    "all_cities",
    "city_pos",
    "region_code",
]

def is_name_male(row, df_names):
    name = row['full_name'].split(' ')[0].lower()
    is_male = df_names.query(f"name=='{name}'")["is_male"]
    return is_male.values[0] if len(is_male) > 0 else None

# read table of firstnames and associated gender
fpath = '../data/gender_classification/gender_firstnames_ITA_processed.csv'
df_names = pd.read_csv(fpath, sep=',')

db_file = '/g100_work/IscrC_mental/data/database/MENTALISM_update.db'
table_name = 'user_geocoded'
chunk_size = 50000

# Create a database connection
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

# Get the total number of rows
cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
total_rows = cursor.fetchone()[0]

# Initialize an empty DataFrame
result_df = pd.DataFrame()

# Initialize a tqdm progress bar
progress_bar = tqdm(total=total_rows, unit="row", desc="Processing")

# Loop through the data in chunks
for offset in range(0, total_rows, chunk_size):
    # Query the database for a chunk of rows
    cursor.execute(f"SELECT * FROM {table_name} LIMIT {chunk_size} OFFSET {offset}")
    rows = cursor.fetchall()

    # Create a DataFrame from the fetched rows
    chunk_df = pd.DataFrame(rows, columns=column_names)

    # Apply the label function and update the "is_male" column
    chunk_df["is_male"] = chunk_df.apply(is_name_male, axis=1, args=(df_names,))

    # Filter out rows with None labels and append to the result DataFrame
    filtered_chunk_df = chunk_df.dropna(subset=["is_male"])
    result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
    
    # Update the progress bar
    progress_bar.update(len(rows))

# Close the tqdm progress bar
progress_bar.close()

# Close the database connection
conn.close()



Processing:   0%|          | 0/2148915 [01:06<?, ?row/s][A
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, filtered_chunk_df], ignore_index=True)
  result_df = pd.concat([result_df, fil

In [83]:
outf = '../data/gender_classification/user_gender.csv'
result_df = result_df.drop(['male_name', 'female_name'], axis=1) if 'male_name' in result_df.columns else result_df
result_df.to_csv(outf, sep=';', index=False)

In [84]:
result_df.sample(10)[['full_name','is_male', 'region_pos']]

Unnamed: 0,full_name,is_male,region_pos
840166,viola,False,0.0
298242,elisabetta fusconi,False,
61605,daris amadio,True,
737509,giuseppe,True,
1173883,filippo gariboldi,True,
752675,roberto call,True,
328844,bruno ramogida,True,
1226034,giuseppe tessitore,True,
480414,stefano banfi,True,8.0
718105,valeria,False,5.0


In [85]:
result_df_with_region = result_df.dropna(subset=["region_code"])
print(len(result_df))
print(len(result_df_with_region))

1268704
297387


In [80]:
columns_to_drop = [
    "is_female",
]

result_df[['region_code','bio']]


In [81]:
result_df.columns

Index(['user_id', 'username', 'full_name', 'location', 'join_year',
       'join_month', 'join_day', 'bio', 'tweets', 'following', 'followers',
       'likes', 'loc_count', 'location_clean', 'foreign_country',
       'all_regions', 'region_pos', 'region', 'term_for_italy',
       'name_city_engl', 'condition', 'city_id', 'all_cities', 'city_pos',
       'region_code', 'is_male'],
      dtype='object')