# Building a Dataset
A quick tutorial on building your own synthetic dataset, one column at a time, and saving it to a CSV file.

In [16]:
# Libraries
import pandas as pd
import uuid
import random
from faker import Faker
import datetime

## Dataset Size

In [17]:
# Dataset size: one row is generated per user
num_users = 100_000

## Feature Names

In [18]:
# The ten columns of the dataset, in the order they will appear
features = [
    "id", "gender", "subscriber", "name", "email",
    "last_login", "dob", "education", "bio", "rating",
]

# Start from an empty frame that already carries those column names
df = pd.DataFrame(columns=features)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating


## IDs

In [19]:
# One random UUID, rendered as a hex string, per user
df['id'] = [uuid.uuid4().hex for _ in range(num_users)]

# Sanity check: the number of distinct IDs should equal the number of users
distinct_ids = df['id'].nunique()
print(distinct_ids == num_users)

df

True


Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,,,,,,,,,
1,82d5659be16846fbbb8d490c34acee3d,,,,,,,,,
2,40faa706127d46079371142a38d9d9e6,,,,,,,,,
3,9c5db531c40b4d658e7159a92b4c9c56,,,,,,,,,
4,341c6644d2f748458d28bb41e6bd6dc6,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,,,,,,,,,
99996,7ff1071321eb4aecb9c840c312cb1444,,,,,,,,,
99997,e653d886b87b4b3d9b604a659d0ad9db,,,,,,,,,
99998,eb58694d6e044e3d991ccd16ae9ad412,,,,,,,,,


## Gender

In [20]:
# Possible values and their relative frequencies (47% / 47% / 6%)
genders = ["male", "female", "na"]
gender_weights = (47, 47, 6)

# Weighted random draw, one value per user
df['gender'] = random.choices(genders, weights=gender_weights, k=num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,,,,,,,,
1,82d5659be16846fbbb8d490c34acee3d,female,,,,,,,,
2,40faa706127d46079371142a38d9d9e6,female,,,,,,,,
3,9c5db531c40b4d658e7159a92b4c9c56,female,,,,,,,,
4,341c6644d2f748458d28bb41e6bd6dc6,male,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,,,,,,,,
99996,7ff1071321eb4aecb9c840c312cb1444,female,,,,,,,,
99997,e653d886b87b4b3d9b604a659d0ad9db,male,,,,,,,,
99998,eb58694d6e044e3d991ccd16ae9ad412,female,,,,,,,,


## Subscriber

In [21]:
# A user either is a subscriber or is not; both outcomes are equally likely
choice = [True, False]

# Unweighted random draw, one value per user
df['subscriber'] = random.choices(choice, k=num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,True,,,,,,,
1,82d5659be16846fbbb8d490c34acee3d,female,True,,,,,,,
2,40faa706127d46079371142a38d9d9e6,female,False,,,,,,,
3,9c5db531c40b4d658e7159a92b4c9c56,female,False,,,,,,,
4,341c6644d2f748458d28bb41e6bd6dc6,male,False,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,False,,,,,,,
99996,7ff1071321eb4aecb9c840c312cb1444,female,False,,,,,,,
99997,e653d886b87b4b3d9b604a659d0ad9db,male,False,,,,,,,
99998,eb58694d6e044e3d991ccd16ae9ad412,female,True,,,,,,,


## Name

In [22]:
# Instantiating faker: a single shared Faker instance that the generator
# functions in this notebook (name_gen, makeBio) draw their fake values from
faker = Faker()

def name_gen(gender):
    """
    Returns a fake full name that matches the given gender.

    'male' and 'female' use the gender-specific Faker generators; any other
    value falls back to the generic name generator.
    """
    # Pick the generator first, then call it exactly once
    if gender == 'male':
        make_name = faker.name_male
    elif gender == 'female':
        make_name = faker.name_female
    else:
        make_name = faker.name

    return make_name()

# Build the name column by running every user's gender through name_gen
df['name'] = list(map(name_gen, df['gender']))

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,True,Stephen Rowe,,,,,,
1,82d5659be16846fbbb8d490c34acee3d,female,True,Ann Duke,,,,,,
2,40faa706127d46079371142a38d9d9e6,female,False,Jean Newman,,,,,,
3,9c5db531c40b4d658e7159a92b4c9c56,female,False,Colleen Galvan,,,,,,
4,341c6644d2f748458d28bb41e6bd6dc6,male,False,Kyle Price,,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,False,Walter Johnson,,,,,,
99996,7ff1071321eb4aecb9c840c312cb1444,female,False,Nancy Ho,,,,,,
99997,e653d886b87b4b3d9b604a659d0ad9db,male,False,Jason Ayala,,,,,,
99998,eb58694d6e044e3d991ccd16ae9ad412,female,True,Maria Morris,,,,,,


## Email

In [23]:
def emailGen(name, duplicateFound=False):
    """
    Generates a random email address based on the given name.
    Adds a number at the end if a duplicate address was found.

    The local part is built from the first two whitespace-separated words of
    the lowercased name, joined by a randomly chosen "." or "_". A name made
    of a single word is used on its own, without a separator.

    Parameters
    ----------
    name : str
        Name to derive the address from. Must contain at least one word.
    duplicateFound : bool, optional
        When True, a random integer between 0 and 100 (inclusive) is appended
        to the local part so a colliding address can be told apart.

    Returns
    -------
    str
        The generated address, ending in "@fakemail.com".

    Raises
    ------
    ValueError
        If ``name`` is empty or contains only whitespace.
    """
    # Fake domain name to use
    dom = "@fakemail.com"

    # Lowercasing and splitting on any run of whitespace, so that repeated
    # spaces cannot produce empty name parts
    parts = name.lower().split()

    if not parts:
        raise ValueError("name must contain at least one word")

    if len(parts) == 1:
        # Only one word available: there is nothing to join it with
        new_name = parts[0]
    else:
        # Random character to insert between the first two words
        chars = [".", "_"]
        new_name = parts[0] + random.choice(chars) + parts[1]

    # Further distinguishing the email if a duplicate was found
    if duplicateFound:
        new_name += str(random.randint(0, 100))

    # Returning the email address with the domain name attached
    return new_name + dom
    

emails = []

# Addresses handed out so far. A set keeps every membership test constant-time;
# scanning the growing list instead made the whole loop quadratic in the
# number of users.
seen_emails = set()

for name in df['name']:

    # Generating the email
    email = emailGen(name)

    # Looping until a unique email is generated
    while email in seen_emails:

        # Creating an email with a random number
        email = emailGen(name, duplicateFound=True)

    # Recording the new email: in the set for lookups, in the list for order
    seen_emails.add(email)
    emails.append(email)

df['email'] = emails

# Checking if the emails are all unique
print(df['email'].nunique()==num_users)

df

True


Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,True,Stephen Rowe,stephen_rowe@fakemail.com,,,,,
1,82d5659be16846fbbb8d490c34acee3d,female,True,Ann Duke,ann.duke@fakemail.com,,,,,
2,40faa706127d46079371142a38d9d9e6,female,False,Jean Newman,jean.newman@fakemail.com,,,,,
3,9c5db531c40b4d658e7159a92b4c9c56,female,False,Colleen Galvan,colleen.galvan@fakemail.com,,,,,
4,341c6644d2f748458d28bb41e6bd6dc6,male,False,Kyle Price,kyle.price@fakemail.com,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,False,Walter Johnson,walter_johnson1@fakemail.com,,,,,
99996,7ff1071321eb4aecb9c840c312cb1444,female,False,Nancy Ho,nancy_ho@fakemail.com,,,,,
99997,e653d886b87b4b3d9b604a659d0ad9db,male,False,Jason Ayala,jason_ayala59@fakemail.com,,,,,
99998,eb58694d6e044e3d991ccd16ae9ad412,female,True,Maria Morris,maria_morris@fakemail.com,,,,,


## Last Login

In [24]:
def randomtimes(start, end, n, frmt="%Y-%m-%d %H:%M:%S"):
    """
    Generates random time stamps based on a given amount between two time periods.

    Parameters
    ----------
    start : str
        One bound of the period, written in the ``frmt`` layout.
    end : str
        The other bound of the period, written in the ``frmt`` layout.
    n : int
        Number of time stamps to generate.
    frmt : str, optional
        ``strptime``/``strftime`` layout used both to parse ``start`` and
        ``end`` and to render the results. Defaults to the
        "%Y-%m-%d %H:%M:%S" layout.

    Returns
    -------
    list of str
        ``n`` time stamps rendered with ``frmt``, each drawn independently
        and uniformly from the span between ``start`` and ``end``.

    Raises
    ------
    ValueError
        If ``start`` or ``end`` does not match ``frmt``.
    """
    # Formatting the two time periods
    stime = datetime.datetime.strptime(start, frmt)
    etime = datetime.datetime.strptime(end, frmt)

    # Creating the pool for random times
    td = etime - stime

    # Each draw scales the span by a random fraction in [0, 1) and offsets
    # it from the start
    return [(random.random() * td + stime).strftime(frmt) for _ in range(n)]

# Login window: every last-login stamp is drawn from between these two moments
start, end = "2021-08-01 00:00:00", "2021-08-24 00:00:00"

# One random time stamp per user
df['last_login'] = randomtimes(start, end, n=num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,True,Stephen Rowe,stephen_rowe@fakemail.com,2021-08-22 18:15:01,,,,
1,82d5659be16846fbbb8d490c34acee3d,female,True,Ann Duke,ann.duke@fakemail.com,2021-08-01 08:12:33,,,,
2,40faa706127d46079371142a38d9d9e6,female,False,Jean Newman,jean.newman@fakemail.com,2021-08-19 21:25:37,,,,
3,9c5db531c40b4d658e7159a92b4c9c56,female,False,Colleen Galvan,colleen.galvan@fakemail.com,2021-08-19 12:44:40,,,,
4,341c6644d2f748458d28bb41e6bd6dc6,male,False,Kyle Price,kyle.price@fakemail.com,2021-08-18 03:18:22,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,False,Walter Johnson,walter_johnson1@fakemail.com,2021-08-09 18:31:07,,,,
99996,7ff1071321eb4aecb9c840c312cb1444,female,False,Nancy Ho,nancy_ho@fakemail.com,2021-08-02 20:45:07,,,,
99997,e653d886b87b4b3d9b604a659d0ad9db,male,False,Jason Ayala,jason_ayala59@fakemail.com,2021-08-04 20:09:25,,,,
99998,eb58694d6e044e3d991ccd16ae9ad412,female,True,Maria Morris,maria_morris@fakemail.com,2021-08-16 17:46:57,,,,


## Date of Birth

In [25]:
def random_dob(start, end, n):
    """
    Produces n random dates, as "%Y-%m-%d" strings, lying between start and end.

    Both bounds are given as "%Y-%m-%d" strings as well.
    """
    date_format = "%Y-%m-%d"

    # Parse the bounds and measure the span that separates them
    earliest = datetime.datetime.strptime(start, date_format)
    span = datetime.datetime.strptime(end, date_format) - earliest

    # Each date is the lower bound plus a random fraction of the span
    dobs = []
    for _ in range(n):
        moment = random.random() * span + earliest
        dobs.append(moment.strftime(date_format))

    return dobs

# Birth dates are drawn between the start of 1980 and the start of 2006
dob_start, dob_end = "1980-01-01", "2006-01-01"
df['dob'] = random_dob(dob_start, dob_end, num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,True,Stephen Rowe,stephen_rowe@fakemail.com,2021-08-22 18:15:01,1980-04-06,,,
1,82d5659be16846fbbb8d490c34acee3d,female,True,Ann Duke,ann.duke@fakemail.com,2021-08-01 08:12:33,1998-10-12,,,
2,40faa706127d46079371142a38d9d9e6,female,False,Jean Newman,jean.newman@fakemail.com,2021-08-19 21:25:37,1992-12-11,,,
3,9c5db531c40b4d658e7159a92b4c9c56,female,False,Colleen Galvan,colleen.galvan@fakemail.com,2021-08-19 12:44:40,1989-09-22,,,
4,341c6644d2f748458d28bb41e6bd6dc6,male,False,Kyle Price,kyle.price@fakemail.com,2021-08-18 03:18:22,1992-09-06,,,
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,False,Walter Johnson,walter_johnson1@fakemail.com,2021-08-09 18:31:07,1996-01-18,,,
99996,7ff1071321eb4aecb9c840c312cb1444,female,False,Nancy Ho,nancy_ho@fakemail.com,2021-08-02 20:45:07,1990-12-16,,,
99997,e653d886b87b4b3d9b604a659d0ad9db,male,False,Jason Ayala,jason_ayala59@fakemail.com,2021-08-04 20:09:25,1996-02-20,,,
99998,eb58694d6e044e3d991ccd16ae9ad412,female,True,Maria Morris,maria_morris@fakemail.com,2021-08-16 17:46:57,1989-06-28,,,


## Current Education

In [26]:
def getEducation(dob, now=None):
    """
    Assigns an education level based on the given date of birth.

    The age is counted in completed calendar years, so it increases exactly
    on the birthday.

    Parameters
    ----------
    dob : str
        Date of birth in "%Y-%m-%d" form.
    now : datetime.datetime or datetime.date, optional
        Reference date the age is measured against. Defaults to the current
        local time; pass a fixed value to get reproducible results.

    Returns
    -------
    str
        'high school' for ages up to 18, 'undergrad' up to 22, 'grad' up to
        25, and 'employed' for anyone older.

    Raises
    ------
    ValueError
        If ``dob`` does not match the "%Y-%m-%d" layout.
    """
    # Current date, unless the caller pinned one
    if now is None:
        now = datetime.datetime.now()

    # Date of birth
    born = datetime.datetime.strptime(dob, "%Y-%m-%d")

    # Whole years elapsed, minus one if this year's birthday is still ahead.
    # Comparing (month, day) pairs avoids the drift of dividing a day count by
    # 365.25, which can misplace the age by a year right around a birthday.
    age = now.year - born.year
    if (now.month, now.day) < (born.month, born.day):
        age -= 1

    # Returning education level based on age
    if age <= 18:
        return 'high school'
    elif age <= 22:
        return 'undergrad'
    elif age <= 25:
        return 'grad'
    else:
        return 'employed'

# Derive each user's education level from their date of birth
df['education'] = list(map(getEducation, df['dob']))

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,True,Stephen Rowe,stephen_rowe@fakemail.com,2021-08-22 18:15:01,1980-04-06,employed,,
1,82d5659be16846fbbb8d490c34acee3d,female,True,Ann Duke,ann.duke@fakemail.com,2021-08-01 08:12:33,1998-10-12,undergrad,,
2,40faa706127d46079371142a38d9d9e6,female,False,Jean Newman,jean.newman@fakemail.com,2021-08-19 21:25:37,1992-12-11,employed,,
3,9c5db531c40b4d658e7159a92b4c9c56,female,False,Colleen Galvan,colleen.galvan@fakemail.com,2021-08-19 12:44:40,1989-09-22,employed,,
4,341c6644d2f748458d28bb41e6bd6dc6,male,False,Kyle Price,kyle.price@fakemail.com,2021-08-18 03:18:22,1992-09-06,employed,,
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,False,Walter Johnson,walter_johnson1@fakemail.com,2021-08-09 18:31:07,1996-01-18,grad,,
99996,7ff1071321eb4aecb9c840c312cb1444,female,False,Nancy Ho,nancy_ho@fakemail.com,2021-08-02 20:45:07,1990-12-16,employed,,
99997,e653d886b87b4b3d9b604a659d0ad9db,male,False,Jason Ayala,jason_ayala59@fakemail.com,2021-08-04 20:09:25,1996-02-20,grad,,
99998,eb58694d6e044e3d991ccd16ae9ad412,female,True,Maria Morris,maria_morris@fakemail.com,2021-08-16 17:46:57,1989-06-28,employed,,


## Bio

In [27]:
def makeBio(subscriber):
    """
    Making a short or long bio depending on their subscription status.

    Parameters
    ----------
    subscriber : bool
        Subscription status. Any truthy value is treated as a subscriber and
        any falsy value as a non-subscriber, so a value that is neither True
        nor False no longer leaves the bio length undefined.

    Returns
    -------
    str
        The sentence produced by ``faker.sentence`` for the chosen length.
    """
    if subscriber:
        # Randomizing bio length but skewed towards longer bios
        lengths, weights = [10, 20], (10, 90)
    else:
        # Randomizing bio length but skewed towards shorter bios
        lengths, weights = [1, 3], (10, 90)

    # Single weighted draw; choices() returns a list, hence the [0]
    bio_len = random.choices(lengths, weights=weights, k=1)[0]

    return faker.sentence(bio_len)
    

# Write a bio for every user, its length driven by their subscription status
df['bio'] = list(map(makeBio, df['subscriber']))

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,True,Stephen Rowe,stephen_rowe@fakemail.com,2021-08-22 18:15:01,1980-04-06,employed,Ten loss reduce hair place weight boy account ...,
1,82d5659be16846fbbb8d490c34acee3d,female,True,Ann Duke,ann.duke@fakemail.com,2021-08-01 08:12:33,1998-10-12,undergrad,Sing model ball carry let daughter factor wind...,
2,40faa706127d46079371142a38d9d9e6,female,False,Jean Newman,jean.newman@fakemail.com,2021-08-19 21:25:37,1992-12-11,employed,Take military join.,
3,9c5db531c40b4d658e7159a92b4c9c56,female,False,Colleen Galvan,colleen.galvan@fakemail.com,2021-08-19 12:44:40,1989-09-22,employed,Light PM page government.,
4,341c6644d2f748458d28bb41e6bd6dc6,male,False,Kyle Price,kyle.price@fakemail.com,2021-08-18 03:18:22,1992-09-06,employed,Back all respond.,
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,False,Walter Johnson,walter_johnson1@fakemail.com,2021-08-09 18:31:07,1996-01-18,grad,Take group whole.,
99996,7ff1071321eb4aecb9c840c312cb1444,female,False,Nancy Ho,nancy_ho@fakemail.com,2021-08-02 20:45:07,1990-12-16,employed,Crime.,
99997,e653d886b87b4b3d9b604a659d0ad9db,male,False,Jason Ayala,jason_ayala59@fakemail.com,2021-08-04 20:09:25,1996-02-20,grad,Investment seat when practice.,
99998,eb58694d6e044e3d991ccd16ae9ad412,female,True,Maria Morris,maria_morris@fakemail.com,2021-08-16 17:46:57,1989-06-28,employed,Doctor not exist your letter focus successful ...,


## Rating

In [28]:
# The rating values a user can have
ratings = [1, 2, 3, 4, 5]

# Relative frequencies: 1 and 5 are each three times as likely as a middle rating
rating_weights = (30, 10, 10, 10, 30)

# Weighted random draw, one value per user
df['rating'] = random.choices(ratings, weights=rating_weights, k=num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,True,Stephen Rowe,stephen_rowe@fakemail.com,2021-08-22 18:15:01,1980-04-06,employed,Ten loss reduce hair place weight boy account ...,5
1,82d5659be16846fbbb8d490c34acee3d,female,True,Ann Duke,ann.duke@fakemail.com,2021-08-01 08:12:33,1998-10-12,undergrad,Sing model ball carry let daughter factor wind...,5
2,40faa706127d46079371142a38d9d9e6,female,False,Jean Newman,jean.newman@fakemail.com,2021-08-19 21:25:37,1992-12-11,employed,Take military join.,5
3,9c5db531c40b4d658e7159a92b4c9c56,female,False,Colleen Galvan,colleen.galvan@fakemail.com,2021-08-19 12:44:40,1989-09-22,employed,Light PM page government.,5
4,341c6644d2f748458d28bb41e6bd6dc6,male,False,Kyle Price,kyle.price@fakemail.com,2021-08-18 03:18:22,1992-09-06,employed,Back all respond.,5
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,False,Walter Johnson,walter_johnson1@fakemail.com,2021-08-09 18:31:07,1996-01-18,grad,Take group whole.,5
99996,7ff1071321eb4aecb9c840c312cb1444,female,False,Nancy Ho,nancy_ho@fakemail.com,2021-08-02 20:45:07,1990-12-16,employed,Crime.,5
99997,e653d886b87b4b3d9b604a659d0ad9db,male,False,Jason Ayala,jason_ayala59@fakemail.com,2021-08-04 20:09:25,1996-02-20,grad,Investment seat when practice.,4
99998,eb58694d6e044e3d991ccd16ae9ad412,female,True,Maria Morris,maria_morris@fakemail.com,2021-08-16 17:46:57,1989-06-28,employed,Doctor not exist your letter focus successful ...,5


## Saving as a CSV file

In [29]:
# Write the finished dataset to 'dataset.csv' (a relative path)
df.to_csv('dataset.csv')

In [30]:
# Viewing the saved csv file: read it back, using the file's first column
# as the row index
csv_df = pd.read_csv('dataset.csv', index_col=0)

csv_df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,69c0ba9e7f804c36968560d59e3ddbb9,male,True,Stephen Rowe,stephen_rowe@fakemail.com,2021-08-22 18:15:01,1980-04-06,employed,Ten loss reduce hair place weight boy account ...,5
1,82d5659be16846fbbb8d490c34acee3d,female,True,Ann Duke,ann.duke@fakemail.com,2021-08-01 08:12:33,1998-10-12,undergrad,Sing model ball carry let daughter factor wind...,5
2,40faa706127d46079371142a38d9d9e6,female,False,Jean Newman,jean.newman@fakemail.com,2021-08-19 21:25:37,1992-12-11,employed,Take military join.,5
3,9c5db531c40b4d658e7159a92b4c9c56,female,False,Colleen Galvan,colleen.galvan@fakemail.com,2021-08-19 12:44:40,1989-09-22,employed,Light PM page government.,5
4,341c6644d2f748458d28bb41e6bd6dc6,male,False,Kyle Price,kyle.price@fakemail.com,2021-08-18 03:18:22,1992-09-06,employed,Back all respond.,5
...,...,...,...,...,...,...,...,...,...,...
99995,c43a31ef9e9c47fdb8dbfe544602d92c,male,False,Walter Johnson,walter_johnson1@fakemail.com,2021-08-09 18:31:07,1996-01-18,grad,Take group whole.,5
99996,7ff1071321eb4aecb9c840c312cb1444,female,False,Nancy Ho,nancy_ho@fakemail.com,2021-08-02 20:45:07,1990-12-16,employed,Crime.,5
99997,e653d886b87b4b3d9b604a659d0ad9db,male,False,Jason Ayala,jason_ayala59@fakemail.com,2021-08-04 20:09:25,1996-02-20,grad,Investment seat when practice.,4
99998,eb58694d6e044e3d991ccd16ae9ad412,female,True,Maria Morris,maria_morris@fakemail.com,2021-08-16 17:46:57,1989-06-28,employed,Doctor not exist your letter focus successful ...,5
