# Required Libraries

In [305]:
!pip install Faker
!pip install names-dataset



Using faker we can generate fake names in different languages:

In [306]:
from faker import Faker

In [307]:
en_fake = Faker()
de_fake = Faker('de_DE')

### By using names dataset you can get gender and country info from a name:

In [308]:
from names_dataset import NameDataset

# This line takes time some time as the database is massive.
nd = NameDataset()

In [343]:
nd.search('Alexandra')

{'first_name': {'country': {'Colombia': 0.195,
   'Germany': 0.087,
   'Spain': 0.045,
   'France': 0.211,
   'United Kingdom': 0.046,
   'Italy': 0.091,
   'Peru': 0.064,
   'Portugal': 0.052,
   'Russian Federation': 0.068,
   'United States': 0.14},
  'gender': {'Female': 0.99, 'Male': 0.01},
  'rank': {'Colombia': 113,
   'Germany': 94,
   'Spain': 287,
   'France': 122,
   'United Kingdom': 316,
   'Italy': 401,
   'Peru': 149,
   'Portugal': 60,
   'Russian Federation': 170,
   'United States': 312}},
 'last_name': {'country': {'Canada': 0.03,
   'Chile': 0.138,
   'Colombia': 0.117,
   'Spain': 0.042,
   'France': 0.082,
   'United Kingdom': 0.07,
   'Italy': 0.177,
   'Peru': 0.058,
   'Portugal': 0.172,
   'United States': 0.113},
  'gender': {},
  'rank': {'Canada': 753,
   'Chile': 441,
   'Colombia': 1022,
   'Spain': 1360,
   'France': 1608,
   'United Kingdom': 1604,
   'Italy': 2077,
   'Peru': 805,
   'Portugal': 140,
   'United States': 2352}}}

In [344]:
# Alexandra is 99% likely to be Female
nd.search('Alexandra')['first_name']['gender']

{'Female': 0.99, 'Male': 0.01}

# Generate Fake Data

Let's first define a function that randomly genrates English or German names.

In [311]:
import pandas as pd
import numpy as np

In [312]:
def make_name():
    """
    This function generate a German name in 50% of the times and
    an English name in the other 50% of the times.
    """
    if np.random.rand() > 0.5:
        return en_fake.name()
    
    return de_fake.name()

Now it is so easy to generate random German and English names.

In [313]:
[make_name() for _ in range(10)]

['Raimund Drub',
 'Veronique Rohleder',
 'Lisa Joseph',
 'Alex Brewer',
 'Monika Hornig',
 'Jessica Price',
 'Diedrich Striebitz-Hermighausen',
 'Tim Rodriguez',
 'Magdalena Hauffer',
 'Randy Johnson']

#### let's build our dataset now:

In [314]:
df = pd.DataFrame({
    'Name': [make_name() for _ in range(100)]
})

In [315]:
df

Unnamed: 0,Name
0,Misty Bailey
1,Dr. Lilly Kitzmann B.A.
2,Georgios Siering-Ackermann
3,Bernhardine Steckel
4,Michelle Flores
...,...
95,Shirley Moreno
96,Alla Klemm B.Eng.
97,Sean Stewart
98,Donald Powell


In [316]:
full_name = de_fake.name()
full_name

'Giovanna Warmer'

In [317]:
names = len(full_name.split())

In [318]:
names

2

## Remove Titiles From name

In [319]:
def remove_titles(name):
    titles = ["Prof.", "Dr.", "Mr.", "Ms.", "Mrs.", "Frau", "Herr"]  # Add more titles if needed
    for title in titles:
        if name.startswith(title):
            name = name[len(title):].strip()  # Remove title and leading spaces
            break
    return name

## Extract First name Mid and Last name

To predict gender and country using a name, we need the first name and last names separate. We can simply assume that the first part of a name is the first name and the last par is last name. For example:

In [320]:
def extract_name(first_name, mid_name, last_name):
    if names > 2:
        first_name, mid_name,last_name = full_name.split()
        result = first_name, mid_name,last_name
    else:
        first_name,last_name = full_name.split()
        result = first_name,last_name
result

('Susann', 'Oderwald', 'B.Eng.')

In [321]:
first_name

'Susann'

In [327]:
df['Name'] = df['Name'].apply(remove_titles)

In [329]:
df['First Name'] = df['Name'].apply(lambda full_name: full_name.split()[0])

In [330]:
df['Mid Name'] = df['Name'].apply(lambda full_name: full_name.split()[1] if len(full_name.split()) > 2 else None)

In [331]:
df['Last Name'] = df['Name'].apply(lambda full_name: full_name.split()[-1])

In [332]:
df

Unnamed: 0,Name,First Name,Mid Name,Last Name
0,Misty Bailey,Misty,,Bailey
1,Lilly Kitzmann B.A.,Lilly,Kitzmann,B.A.
2,Georgios Siering-Ackermann,Georgios,,Siering-Ackermann
3,Bernhardine Steckel,Bernhardine,,Steckel
4,Michelle Flores,Michelle,,Flores
...,...,...,...,...
95,Shirley Moreno,Shirley,,Moreno
96,Alla Klemm B.Eng.,Alla,Klemm,B.Eng.
97,Sean Stewart,Sean,,Stewart
98,Donald Powell,Donald,,Powell


# Predict Gender

Now let's generate gender and country info from a name. Let's write a function that given a first name, returns gender and country.



In [333]:
def name_to_gender(first_name):
    info = nd.search(first_name)['first_name']
    if info is None:
        return None
    
    return max(info['gender'], key=info['gender'].get)

In [337]:
name_to_gender('Kevin')

'Male'

In [338]:
# For unknown names, it returns None
print(name_to_gender('abc'))

None


Let's apply this function on our dataframe and extract gender:

In [339]:
df['Gender'] = df['First Name'].apply(lambda fn: name_to_gender(fn))

In [340]:
df

Unnamed: 0,Name,First Name,Mid Name,Last Name,Gender
0,Misty Bailey,Misty,,Bailey,Female
1,Lilly Kitzmann B.A.,Lilly,Kitzmann,B.A.,Female
2,Georgios Siering-Ackermann,Georgios,,Siering-Ackermann,Male
3,Bernhardine Steckel,Bernhardine,,Steckel,Female
4,Michelle Flores,Michelle,,Flores,Female
...,...,...,...,...,...
95,Shirley Moreno,Shirley,,Moreno,Female
96,Alla Klemm B.Eng.,Alla,Klemm,B.Eng.,Female
97,Sean Stewart,Sean,,Stewart,Male
98,Donald Powell,Donald,,Powell,Male
