# Building a Dataset
Quick tutorial on building your very own dataset (and how to save it)

In [2]:
# Libraries
import pandas as pd

## How Many?

In [7]:
# How many synthetic users (i.e. rows) the dataset will contain
num_users = 10_000

## Feature Names

In [8]:
# Column names for the dataset: ten features in total
features = [
    "id", "gender", "subscriber", "email", "name",
    "last_login", "education", "dob", "bio", "rating",
]

# Empty frame with one column per feature; the columns are filled in afterwards
df = pd.DataFrame(columns=features)

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,favorite_color


## IDs

In [9]:
import uuid

# One random UUID per user, stored as its hex string
df['id'] = [uuid.uuid4().hex for _ in range(num_users)]

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,favorite_color
0,9a50fc9169174379af3e971ce5394cc9,,,,,,,,,
1,1ec47bf78af14b2690074bbe8d59af9d,,,,,,,,,
2,1a80b13d65d445f98cd005b4bf448aef,,,,,,,,,
3,53af9306fc32451ba63f44cf3c9b1057,,,,,,,,,
4,9c65dda5dad34fb5a96c4f8eb9a4f044,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,2a70d654d4ca4a7a9791982e572a7f24,,,,,,,,,
9996,224da323b9c84b2cb35d5c65e09f6cd7,,,,,,,,,
9997,e205b088c91142a78087f247a6a7d20d,,,,,,,,,
9998,18b026642d924d6da327cc36cb2ae0b4,,,,,,,,,


## Gender

In [12]:
import random

# Possible values for the gender column
genders = ["male", "female", "non-binary"]

# Weights are relative and line up positionally with `genders`
df['gender'] = random.choices(genders, weights=(49,49,1), k=num_users)

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,favorite_color
0,9a50fc9169174379af3e971ce5394cc9,female,,,,,,,,
1,1ec47bf78af14b2690074bbe8d59af9d,male,,,,,,,,
2,1a80b13d65d445f98cd005b4bf448aef,female,,,,,,,,
3,53af9306fc32451ba63f44cf3c9b1057,male,,,,,,,,
4,9c65dda5dad34fb5a96c4f8eb9a4f044,male,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,2a70d654d4ca4a7a9791982e572a7f24,male,,,,,,,,
9996,224da323b9c84b2cb35d5c65e09f6cd7,female,,,,,,,,
9997,e205b088c91142a78087f247a6a7d20d,male,,,,,,,,
9998,18b026642d924d6da327cc36cb2ae0b4,male,,,,,,,,


## Subscriber

In [13]:
# The two subscription states; no weights are given, so both are equally likely
choice = [True, False]

df['subscriber'] = random.choices(choice, k=num_users)

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,favorite_color
0,9a50fc9169174379af3e971ce5394cc9,female,True,,,,,,,
1,1ec47bf78af14b2690074bbe8d59af9d,male,True,,,,,,,
2,1a80b13d65d445f98cd005b4bf448aef,female,True,,,,,,,
3,53af9306fc32451ba63f44cf3c9b1057,male,True,,,,,,,
4,9c65dda5dad34fb5a96c4f8eb9a4f044,male,False,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,2a70d654d4ca4a7a9791982e572a7f24,male,True,,,,,,,
9996,224da323b9c84b2cb35d5c65e09f6cd7,female,True,,,,,,,
9997,e205b088c91142a78087f247a6a7d20d,male,True,,,,,,,
9998,18b026642d924d6da327cc36cb2ae0b4,male,True,,,,,,,


## Name

In [15]:
# Installing the needed library
!pip install faker

Collecting faker
  Downloading Faker-8.12.1-py3-none-any.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.2 MB/s eta 0:00:01
Collecting text-unidecode==1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 12.9 MB/s eta 0:00:01
Installing collected packages: text-unidecode, faker
Successfully installed faker-8.12.1 text-unidecode-1.3


In [16]:
from faker import Faker

# Single Faker instance that name_gen draws from
faker = Faker()

def name_gen(gender):
    """
    Returns a fake full name appropriate for the given gender.

    'male' and 'female' use the gender-specific generators; any other
    value falls back to the generic name generator.
    """
    if gender=='male':
        return faker.name_male()

    if gender=='female':
        return faker.name_female()

    # Anything else (e.g. 'non-binary') gets a name with no gender constraint
    return faker.name()

# One generated name per row, driven by that row's gender
df['name'] = list(map(name_gen, df['gender']))

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,favorite_color
0,9a50fc9169174379af3e971ce5394cc9,female,True,,Michelle Miller,,,,,
1,1ec47bf78af14b2690074bbe8d59af9d,male,True,,Colin Gay,,,,,
2,1a80b13d65d445f98cd005b4bf448aef,female,True,,Tammy Stanley,,,,,
3,53af9306fc32451ba63f44cf3c9b1057,male,True,,Mark Vargas,,,,,
4,9c65dda5dad34fb5a96c4f8eb9a4f044,male,False,,Evan Cox,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,2a70d654d4ca4a7a9791982e572a7f24,male,True,,Timothy Fry,,,,,
9996,224da323b9c84b2cb35d5c65e09f6cd7,female,True,,Shawna Forbes,,,,,
9997,e205b088c91142a78087f247a6a7d20d,male,True,,Evan Graham,,,,,
9998,18b026642d924d6da327cc36cb2ae0b4,male,True,,Paul Brown,,,,,


## Email

In [26]:
def emailGen(name, duplicateFound=False):
    """
    Generates a random email address based on the given name.

    The name is lowercased and split on whitespace. The first two words are
    joined with a randomly chosen "." or "_"; a name made of a single word is
    used as-is. When `duplicateFound` is True, a random number between 0 and
    100 (inclusive) is appended to the local part.

    Parameters
    ----------
    name : str
        Full name to derive the address from.
    duplicateFound : bool, optional
        Set to True when a previously generated address collided, so that a
        numeric suffix is added.

    Returns
    -------
    str
        An address of the form "<local part>@fakemail.com".

    Raises
    ------
    ValueError
        If `name` contains no words (empty or whitespace only).
    """

    # split() without an argument drops leading, trailing and repeated
    # whitespace, so stray spaces never yield empty name parts
    parts = name.lower().split()

    if not parts:
        raise ValueError("name must contain at least one word")

    if len(parts) == 1:
        # Nothing to join: use the single word on its own
        new_name = parts[0]
    else:
        # Random character to insert in the name
        chars = [".", "_"]

        # NOTE(review): only the first two words are used, so a leading title
        # (e.g. "Dr.") would take the place of the first name - confirm
        # whether the name source can produce such values
        new_name = parts[0] + random.choice(chars) + parts[1]

    if duplicateFound:
        # Random number appended at the end to tell duplicates apart
        new_name += str(random.randint(0, 100))

    return new_name + "@fakemail.com"
    

# Generated addresses, in row order
emails = []

# Same addresses as a set: membership checks stay O(1) instead of scanning
# the whole list for every candidate
seen_emails = set()

for name in df['name']:

    # Generating the email
    email = emailGen(name)

    # Looping until a unique email is generated
    while email in seen_emails:

        email = emailGen(name, duplicateFound=True)

    # Recording the new email
    seen_emails.add(email)
    emails.append(email)

df['email'] = emails

# Checking if the emails are all unique
print(df['email'].nunique()==num_users)

df

True


Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,favorite_color
0,9a50fc9169174379af3e971ce5394cc9,female,True,michelle_miller@fakemail.com,Michelle Miller,,,,,
1,1ec47bf78af14b2690074bbe8d59af9d,male,True,colin.gay@fakemail.com,Colin Gay,,,,,
2,1a80b13d65d445f98cd005b4bf448aef,female,True,tammy_stanley@fakemail.com,Tammy Stanley,,,,,
3,53af9306fc32451ba63f44cf3c9b1057,male,True,mark.vargas@fakemail.com,Mark Vargas,,,,,
4,9c65dda5dad34fb5a96c4f8eb9a4f044,male,False,evan_cox@fakemail.com,Evan Cox,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,2a70d654d4ca4a7a9791982e572a7f24,male,True,timothy.fry@fakemail.com,Timothy Fry,,,,,
9996,224da323b9c84b2cb35d5c65e09f6cd7,female,True,shawna.forbes@fakemail.com,Shawna Forbes,,,,,
9997,e205b088c91142a78087f247a6a7d20d,male,True,evan.graham@fakemail.com,Evan Graham,,,,,
9998,18b026642d924d6da327cc36cb2ae0b4,male,True,paul_brown@fakemail.com,Paul Brown,,,,,
