# Building a Dataset
Quick tutorial on building your very own dataset (and how to save it)

In [1]:
# Libraries
import pandas as pd

## How Many?

In [2]:
# Number of rows or users to create
num_users = 10000

## Feature Names

In [3]:
#  A list of 10 features
features = [
    "id",
    "gender",
    "subscriber",
    "name",
    "email",
    "last_login",
    "dob",
    "education",
    "bio",
    "rating"
]

# Creating a DF for these features
df = pd.DataFrame(columns=features)

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating


## IDs

In [4]:
import uuid

df['id'] = [uuid.uuid4().hex for i in range(num_users)]

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,,,,,,,,,
1,8889fb15914142a7bd0cfb078c4d2283,,,,,,,,,
2,92d5f3e001a34dc98f8a8eb605f8f02c,,,,,,,,,
3,deb47292592e433482816ef1e117d6bd,,,,,,,,,
4,1fa5764cb3764fb9a1f63472887f4dc5,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,,,,,,,,,
9996,2073bab29db546ed83996aa1edaccce3,,,,,,,,,
9997,815c826922c6419fb4181b6f07b41871,,,,,,,,,
9998,282d88eaa3ff4ca1b9db684711a37bf7,,,,,,,,,


## Gender

In [5]:
import random

genders = ["male", "female", "non-binary"]


df['gender'] = random.choices(
    genders, 
    weights=(49,49,2), 
    k=num_users
)

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,,,,,,,,
1,8889fb15914142a7bd0cfb078c4d2283,female,,,,,,,,
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,,,,,,,,
3,deb47292592e433482816ef1e117d6bd,male,,,,,,,,
4,1fa5764cb3764fb9a1f63472887f4dc5,female,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,,,,,,,,
9996,2073bab29db546ed83996aa1edaccce3,female,,,,,,,,
9997,815c826922c6419fb4181b6f07b41871,female,,,,,,,,
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,,,,,,,,


## Subscriber

In [6]:
# Choices
choice = [True, False]

df['subscriber'] = random.choices(
    choice, 
    k=num_users
)

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,True,,,,,,,
1,8889fb15914142a7bd0cfb078c4d2283,female,False,,,,,,,
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,True,,,,,,,
3,deb47292592e433482816ef1e117d6bd,male,True,,,,,,,
4,1fa5764cb3764fb9a1f63472887f4dc5,female,True,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,False,,,,,,,
9996,2073bab29db546ed83996aa1edaccce3,female,False,,,,,,,
9997,815c826922c6419fb4181b6f07b41871,female,True,,,,,,,
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,True,,,,,,,


## Name

In [7]:
# Installing the needed library
!pip install faker



In [8]:
from faker import Faker

faker = Faker()

def name_gen(gender):
    """
    Quickly generates a name based on gender
    """
    if gender=='male':
        return faker.name_male()
    elif gender=='female':
        return faker.name_female()
    
    return faker.name()

df['name'] = [name_gen(i) for i in df['gender']]

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,True,,Thomas Kim,,,,,
1,8889fb15914142a7bd0cfb078c4d2283,female,False,,Alisha Burton,,,,,
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,True,,Sharon Gutierrez,,,,,
3,deb47292592e433482816ef1e117d6bd,male,True,,Joshua Williams,,,,,
4,1fa5764cb3764fb9a1f63472887f4dc5,female,True,,Michelle Tucker,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,False,,Mr. Jeremy Campbell DDS,,,,,
9996,2073bab29db546ed83996aa1edaccce3,female,False,,Shelley Allen,,,,,
9997,815c826922c6419fb4181b6f07b41871,female,True,,Courtney Walton,,,,,
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,True,,Courtney Thompson,,,,,


## Email

In [9]:
def emailGen(name, duplicateFound=False):
    """
    Generates a random email address based on the given name. 
    Adds a number at the end if a duplicate address was found.
    """
    # Fake domain name
    dom = "@fakemail.com"
    
    # Lowercasing and splitting
    name = name.lower().split(" ")
    
    # Random character to insert in the name
    chars = [".", "_"]
    
    new_name = name[0] + random.choice(chars) + name[1] 
    
    if duplicateFound:
        
        # Random number to insert at the end
        num = random.randint(0,100)
        
        # Inserting at the end
        new_name = new_name + str(num)
        
        return new_name + dom
    
    return new_name + dom
    

emails = []

for name in df['name']:
    
    # Generating the email
    email = emailGen(name)
    
    # Looping until a unique email is generated
    while email in emails:
        
        email = emailGen(name, duplicateFound=True)
    
    # Attaching the new email to the list
    emails.append(email)
    
df['email'] = emails
    
# Checking if the emails are all unique
print(df['email'].nunique()==num_users)

df

True


Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,True,thomas_kim@fakemail.com,Thomas Kim,,,,,
1,8889fb15914142a7bd0cfb078c4d2283,female,False,alisha.burton@fakemail.com,Alisha Burton,,,,,
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,True,sharon_gutierrez@fakemail.com,Sharon Gutierrez,,,,,
3,deb47292592e433482816ef1e117d6bd,male,True,joshua_williams@fakemail.com,Joshua Williams,,,,,
4,1fa5764cb3764fb9a1f63472887f4dc5,female,True,michelle.tucker@fakemail.com,Michelle Tucker,,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,False,mr..jeremy@fakemail.com,Mr. Jeremy Campbell DDS,,,,,
9996,2073bab29db546ed83996aa1edaccce3,female,False,shelley.allen@fakemail.com,Shelley Allen,,,,,
9997,815c826922c6419fb4181b6f07b41871,female,True,courtney_walton@fakemail.com,Courtney Walton,,,,,
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,True,courtney_thompson@fakemail.com,Courtney Thompson,,,,,


## Last Login

In [10]:
import datetime

def randomtimes(start, end, n):
    """
    Generates random time stamps based on a given amount between two time periods.
    """
    # The timestamp format
    frmt = "%Y-%m-%d %H:%M:%S"
    
    # Formatting the two time periods
    stime = datetime.datetime.strptime(start, frmt)
    etime = datetime.datetime.strptime(end, frmt)
    
    # Creating the pool for random times
    td = etime - stime
    
    # Generating a list with the random times
    times = [(random.random() * td + stime).strftime(frmt) for _ in range(n)]
    
    return times

# Setting the start and end times
start = "2021-08-01 00:00:00"

end = "2021-08-24 00:00:00"

df['last_login'] = randomtimes(start, end, num_users)

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,True,thomas_kim@fakemail.com,Thomas Kim,2021-08-18 17:13:02,,,,
1,8889fb15914142a7bd0cfb078c4d2283,female,False,alisha.burton@fakemail.com,Alisha Burton,2021-08-08 16:06:47,,,,
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,True,sharon_gutierrez@fakemail.com,Sharon Gutierrez,2021-08-04 14:54:09,,,,
3,deb47292592e433482816ef1e117d6bd,male,True,joshua_williams@fakemail.com,Joshua Williams,2021-08-08 12:27:49,,,,
4,1fa5764cb3764fb9a1f63472887f4dc5,female,True,michelle.tucker@fakemail.com,Michelle Tucker,2021-08-20 05:00:11,,,,
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,False,mr..jeremy@fakemail.com,Mr. Jeremy Campbell DDS,2021-08-14 12:27:11,,,,
9996,2073bab29db546ed83996aa1edaccce3,female,False,shelley.allen@fakemail.com,Shelley Allen,2021-08-20 01:36:31,,,,
9997,815c826922c6419fb4181b6f07b41871,female,True,courtney_walton@fakemail.com,Courtney Walton,2021-08-08 14:12:25,,,,
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,True,courtney_thompson@fakemail.com,Courtney Thompson,2021-08-23 10:10:42,,,,


## Date of Birth

In [11]:
def random_dob(start, end, n):
    """
    Generating a list of a set number of timestamps
    """
    
    # The timestamp format
    frmt = "%Y-%m-%d"
    
    # Formatting the two time periods
    stime = datetime.datetime.strptime(start, frmt)
    etime = datetime.datetime.strptime(end, frmt)
    
    # Creating the pool for random times
    td = etime - stime
    
    # Generating a list with the random times
    times = [(random.random() * td + stime).strftime(frmt) for _ in range(n)]
    
    return times

df['dob'] = random_dob("1980-01-01", "2006-01-01", num_users)

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,True,thomas_kim@fakemail.com,Thomas Kim,2021-08-18 17:13:02,,1994-03-14,,
1,8889fb15914142a7bd0cfb078c4d2283,female,False,alisha.burton@fakemail.com,Alisha Burton,2021-08-08 16:06:47,,1982-10-24,,
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,True,sharon_gutierrez@fakemail.com,Sharon Gutierrez,2021-08-04 14:54:09,,2003-04-08,,
3,deb47292592e433482816ef1e117d6bd,male,True,joshua_williams@fakemail.com,Joshua Williams,2021-08-08 12:27:49,,1982-05-03,,
4,1fa5764cb3764fb9a1f63472887f4dc5,female,True,michelle.tucker@fakemail.com,Michelle Tucker,2021-08-20 05:00:11,,2005-01-04,,
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,False,mr..jeremy@fakemail.com,Mr. Jeremy Campbell DDS,2021-08-14 12:27:11,,1985-03-01,,
9996,2073bab29db546ed83996aa1edaccce3,female,False,shelley.allen@fakemail.com,Shelley Allen,2021-08-20 01:36:31,,1986-03-31,,
9997,815c826922c6419fb4181b6f07b41871,female,True,courtney_walton@fakemail.com,Courtney Walton,2021-08-08 14:12:25,,2000-11-25,,
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,True,courtney_thompson@fakemail.com,Courtney Thompson,2021-08-23 10:10:42,,1993-12-15,,


## Current Education

In [12]:
def getEducation(dob):
    """
    Assigns an education level based on the given date of birth
    """
    # Current date
    now = datetime.datetime.now()
    
    # Date of birth
    dob = datetime.datetime.strptime(dob, "%Y-%m-%d")
    
    # Subtracting the times to get an age
    age = int((now - dob).days/365.25)
    
    # Returning education level based on age
    if age <= 18:
        return 'high school'
    elif age <= 22:
        return 'undergrad'
    elif age <= 25:
        return 'grad'
    else:
        return 'worker'

df['education'] = [getEducation(i) for i in df['dob']]

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,True,thomas_kim@fakemail.com,Thomas Kim,2021-08-18 17:13:02,worker,1994-03-14,,
1,8889fb15914142a7bd0cfb078c4d2283,female,False,alisha.burton@fakemail.com,Alisha Burton,2021-08-08 16:06:47,worker,1982-10-24,,
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,True,sharon_gutierrez@fakemail.com,Sharon Gutierrez,2021-08-04 14:54:09,high school,2003-04-08,,
3,deb47292592e433482816ef1e117d6bd,male,True,joshua_williams@fakemail.com,Joshua Williams,2021-08-08 12:27:49,worker,1982-05-03,,
4,1fa5764cb3764fb9a1f63472887f4dc5,female,True,michelle.tucker@fakemail.com,Michelle Tucker,2021-08-20 05:00:11,high school,2005-01-04,,
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,False,mr..jeremy@fakemail.com,Mr. Jeremy Campbell DDS,2021-08-14 12:27:11,worker,1985-03-01,,
9996,2073bab29db546ed83996aa1edaccce3,female,False,shelley.allen@fakemail.com,Shelley Allen,2021-08-20 01:36:31,worker,1986-03-31,,
9997,815c826922c6419fb4181b6f07b41871,female,True,courtney_walton@fakemail.com,Courtney Walton,2021-08-08 14:12:25,undergrad,2000-11-25,,
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,True,courtney_thompson@fakemail.com,Courtney Thompson,2021-08-23 10:10:42,worker,1993-12-15,,


## Bio

In [13]:
df['bio'] = [faker.sentence(10) for _ in range(num_users)]

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,True,thomas_kim@fakemail.com,Thomas Kim,2021-08-18 17:13:02,worker,1994-03-14,Wide agent available fill poor mother.,
1,8889fb15914142a7bd0cfb078c4d2283,female,False,alisha.burton@fakemail.com,Alisha Burton,2021-08-08 16:06:47,worker,1982-10-24,Amount these identify main wall direction pare...,
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,True,sharon_gutierrez@fakemail.com,Sharon Gutierrez,2021-08-04 14:54:09,high school,2003-04-08,Somebody east authority turn if establish comm...,
3,deb47292592e433482816ef1e117d6bd,male,True,joshua_williams@fakemail.com,Joshua Williams,2021-08-08 12:27:49,worker,1982-05-03,His know particularly within across few likely.,
4,1fa5764cb3764fb9a1f63472887f4dc5,female,True,michelle.tucker@fakemail.com,Michelle Tucker,2021-08-20 05:00:11,high school,2005-01-04,Tv available employee purpose yourself section...,
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,False,mr..jeremy@fakemail.com,Mr. Jeremy Campbell DDS,2021-08-14 12:27:11,worker,1985-03-01,Happy little company share life hear scientist...,
9996,2073bab29db546ed83996aa1edaccce3,female,False,shelley.allen@fakemail.com,Shelley Allen,2021-08-20 01:36:31,worker,1986-03-31,Purpose same policy series husband do.,
9997,815c826922c6419fb4181b6f07b41871,female,True,courtney_walton@fakemail.com,Courtney Walton,2021-08-08 14:12:25,undergrad,2000-11-25,Which accept more civil southern think talk fi...,
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,True,courtney_thompson@fakemail.com,Courtney Thompson,2021-08-23 10:10:42,worker,1993-12-15,Heart art size effort laugh state my tradition...,


## Rating

In [14]:
# The different ratings available
ratings = [1,2,3,4,5]

# Weighted ratings with a skew towards the ends
df['rating'] = random.choices(
    ratings, 
    weights=(30,10,10,10,30), 
    k=num_users
)

df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,True,thomas_kim@fakemail.com,Thomas Kim,2021-08-18 17:13:02,worker,1994-03-14,Wide agent available fill poor mother.,5
1,8889fb15914142a7bd0cfb078c4d2283,female,False,alisha.burton@fakemail.com,Alisha Burton,2021-08-08 16:06:47,worker,1982-10-24,Amount these identify main wall direction pare...,1
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,True,sharon_gutierrez@fakemail.com,Sharon Gutierrez,2021-08-04 14:54:09,high school,2003-04-08,Somebody east authority turn if establish comm...,3
3,deb47292592e433482816ef1e117d6bd,male,True,joshua_williams@fakemail.com,Joshua Williams,2021-08-08 12:27:49,worker,1982-05-03,His know particularly within across few likely.,1
4,1fa5764cb3764fb9a1f63472887f4dc5,female,True,michelle.tucker@fakemail.com,Michelle Tucker,2021-08-20 05:00:11,high school,2005-01-04,Tv available employee purpose yourself section...,1
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,False,mr..jeremy@fakemail.com,Mr. Jeremy Campbell DDS,2021-08-14 12:27:11,worker,1985-03-01,Happy little company share life hear scientist...,3
9996,2073bab29db546ed83996aa1edaccce3,female,False,shelley.allen@fakemail.com,Shelley Allen,2021-08-20 01:36:31,worker,1986-03-31,Purpose same policy series husband do.,1
9997,815c826922c6419fb4181b6f07b41871,female,True,courtney_walton@fakemail.com,Courtney Walton,2021-08-08 14:12:25,undergrad,2000-11-25,Which accept more civil southern think talk fi...,4
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,True,courtney_thompson@fakemail.com,Courtney Thompson,2021-08-23 10:10:42,worker,1993-12-15,Heart art size effort laugh state my tradition...,1


## Saving as a CSV file

In [15]:
df.to_csv('dataset.csv')

In [19]:
# Viewing the saved csv file
csv_df = pd.read_csv('dataset.csv', index_col=0)

csv_df

Unnamed: 0,id,gender,subscriber,email,name,last_login,education,dob,bio,rating
0,ac8074c120f245c5b354bf55fa6045e0,male,True,thomas_kim@fakemail.com,Thomas Kim,2021-08-18 17:13:02,worker,1994-03-14,Wide agent available fill poor mother.,5
1,8889fb15914142a7bd0cfb078c4d2283,female,False,alisha.burton@fakemail.com,Alisha Burton,2021-08-08 16:06:47,worker,1982-10-24,Amount these identify main wall direction pare...,1
2,92d5f3e001a34dc98f8a8eb605f8f02c,female,True,sharon_gutierrez@fakemail.com,Sharon Gutierrez,2021-08-04 14:54:09,high school,2003-04-08,Somebody east authority turn if establish comm...,3
3,deb47292592e433482816ef1e117d6bd,male,True,joshua_williams@fakemail.com,Joshua Williams,2021-08-08 12:27:49,worker,1982-05-03,His know particularly within across few likely.,1
4,1fa5764cb3764fb9a1f63472887f4dc5,female,True,michelle.tucker@fakemail.com,Michelle Tucker,2021-08-20 05:00:11,high school,2005-01-04,Tv available employee purpose yourself section...,1
...,...,...,...,...,...,...,...,...,...,...
9995,c1323ab860be4d3c8870ce603ab353ea,male,False,mr..jeremy@fakemail.com,Mr. Jeremy Campbell DDS,2021-08-14 12:27:11,worker,1985-03-01,Happy little company share life hear scientist...,3
9996,2073bab29db546ed83996aa1edaccce3,female,False,shelley.allen@fakemail.com,Shelley Allen,2021-08-20 01:36:31,worker,1986-03-31,Purpose same policy series husband do.,1
9997,815c826922c6419fb4181b6f07b41871,female,True,courtney_walton@fakemail.com,Courtney Walton,2021-08-08 14:12:25,undergrad,2000-11-25,Which accept more civil southern think talk fi...,4
9998,282d88eaa3ff4ca1b9db684711a37bf7,female,True,courtney_thompson@fakemail.com,Courtney Thompson,2021-08-23 10:10:42,worker,1993-12-15,Heart art size effort laugh state my tradition...,1
