# Building a Dataset
Quick tutorial on building your very own dataset (and how to save it)

In [1]:
# Libraries
import pandas as pd

## Dataset Size

In [2]:
# How many rows (one per synthetic user) the dataset will contain
num_users = 100_000

## Feature Names

In [3]:
# Column names for the ten features every user will have
features = [
    "id", "gender", "subscriber", "name", "email",
    "last_login", "dob", "education", "bio", "rating",
]

# Empty frame with one column per feature; the cells below fill them in
df = pd.DataFrame(data=None, columns=features)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating


## IDs

In [4]:
import uuid

# One random UUID per user, stored as its hex string
df['id'] = [uuid.uuid4().hex for _ in range(num_users)]

# Sanity check: prints True when no ID was generated twice
print(df['id'].nunique()==num_users)

df

True


Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,,,,,,,,,
1,65b856edf7a04b1c95274ed0615614b5,,,,,,,,,
2,4152d737ee174517bd4691bc726ea71d,,,,,,,,,
3,1e8864c43ef644c293ef8a743e61b52a,,,,,,,,,
4,804e09681bb5438fa42223e7ae8d8843,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,,,,,,,,,
99996,937a85742fd649758e63df29fd08f132,,,,,,,,,
99997,a89ec3b5acb64a2089bcf18569485e64,,,,,,,,,
99998,a81d4643cad649b5897a39654363b6f1,,,,,,,,,


## Gender

In [5]:
import random

# The values a user's gender can take
genders = ["male", "female", "non-binary"]

# Draw one gender per user; weights are relative (49 : 49 : 2)
df['gender'] = random.choices(genders, weights=[49, 49, 2], k=num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,,,,,,,,
1,65b856edf7a04b1c95274ed0615614b5,male,,,,,,,,
2,4152d737ee174517bd4691bc726ea71d,male,,,,,,,,
3,1e8864c43ef644c293ef8a743e61b52a,female,,,,,,,,
4,804e09681bb5438fa42223e7ae8d8843,male,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,,,,,,,,
99996,937a85742fd649758e63df29fd08f132,male,,,,,,,,
99997,a89ec3b5acb64a2089bcf18569485e64,male,,,,,,,,
99998,a81d4643cad649b5897a39654363b6f1,female,,,,,,,,


## Subscriber

In [6]:
# A user either is a subscriber or is not
choice = [True, False]

# No weights given, so both outcomes are equally likely
df['subscriber'] = random.choices(choice, k=num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,False,,,,,,,
1,65b856edf7a04b1c95274ed0615614b5,male,False,,,,,,,
2,4152d737ee174517bd4691bc726ea71d,male,False,,,,,,,
3,1e8864c43ef644c293ef8a743e61b52a,female,True,,,,,,,
4,804e09681bb5438fa42223e7ae8d8843,male,True,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,True,,,,,,,
99996,937a85742fd649758e63df29fd08f132,male,True,,,,,,,
99997,a89ec3b5acb64a2089bcf18569485e64,male,True,,,,,,,
99998,a81d4643cad649b5897a39654363b6f1,female,True,,,,,,,


## Name

In [7]:
# Installing the needed library
!pip install faker



In [8]:
from faker import Faker

# Shared Faker instance; name_gen and makeBio call its generator methods
faker = Faker()

def name_gen(gender):
    """
    Returns a fake full name that fits the given gender.

    'female' and 'male' use the matching gendered generator of the shared
    `faker` instance; any other value falls back to the generic generator.
    """
    if gender=='female':
        return faker.name_female()

    if gender=='male':
        return faker.name_male()

    # Neither of the two gendered generators applies
    return faker.name()

# Generate each user's name from the gender already assigned to that row
df['name'] = list(map(name_gen, df['gender']))

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,False,Patricia Mcdowell,,,,,,
1,65b856edf7a04b1c95274ed0615614b5,male,False,James Cortez,,,,,,
2,4152d737ee174517bd4691bc726ea71d,male,False,Michael Whitney,,,,,,
3,1e8864c43ef644c293ef8a743e61b52a,female,True,Lori Marshall,,,,,,
4,804e09681bb5438fa42223e7ae8d8843,male,True,Jeffrey Perry,,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,True,Debra Singleton,,,,,,
99996,937a85742fd649758e63df29fd08f132,male,True,James Romero,,,,,,
99997,a89ec3b5acb64a2089bcf18569485e64,male,True,Joshua Morales,,,,,,
99998,a81d4643cad649b5897a39654363b6f1,female,True,Lisa Donovan,,,,,,


## Email

In [9]:
def emailGen(name, duplicateFound=False):
    """
    Builds a fake email address from a person's name.

    The name is lowercased and split on whitespace. The first two words are
    joined by a randomly chosen "." or "_" and "@fakemail.com" is appended.
    A name consisting of a single word is used as the local part unchanged.

    Args:
        name: The person's name, e.g. "Jane Doe". Words after the second
            are ignored.
        duplicateFound: When True, a random integer from 0 to 100
            (inclusive) is appended to the local part so that a retry is
            likely to produce a different address.

    Returns:
        The generated email address as a string.

    Raises:
        ValueError: If name contains no words at all.
    """
    # Fake domain name
    dom = "@fakemail.com"

    # split() without an argument collapses repeated, leading and trailing
    # whitespace, so no empty token can end up in the address
    parts = name.lower().split()

    if not parts:
        raise ValueError("name must contain at least one word")

    if len(parts) == 1:
        # Single-word name: there is nothing to put a separator between
        new_name = parts[0]
    else:
        # Random character to insert between the first two words
        new_name = parts[0] + random.choice([".", "_"]) + parts[1]

    if duplicateFound:
        # Random number appended at the end to dodge a collision
        new_name += str(random.randint(0, 100))

    return new_name + dom
    

emails = []

# Mirror of `emails` as a set: membership tests on a set are hash lookups,
# whereas `in` on the list rescans every address generated so far and makes
# the whole loop quadratic in the number of users.
seen_emails = set()

for name in df['name']:

    # Generating the email
    email = emailGen(name)

    # Looping until a unique email is generated.
    # NOTE: emailGen only appends a number from a small fixed range, so this
    # assumes no single name repeats more often than there are distinct
    # suffixed addresses for it; otherwise the loop cannot terminate.
    while email in seen_emails:

        email = emailGen(name, duplicateFound=True)

    # Recording the new email (list keeps row order, set keeps lookups fast)
    seen_emails.add(email)
    emails.append(email)

df['email'] = emails

# Checking if the emails are all unique
print(df['email'].nunique()==num_users)

df

True


Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,False,Patricia Mcdowell,patricia_mcdowell@fakemail.com,,,,,
1,65b856edf7a04b1c95274ed0615614b5,male,False,James Cortez,james_cortez@fakemail.com,,,,,
2,4152d737ee174517bd4691bc726ea71d,male,False,Michael Whitney,michael_whitney@fakemail.com,,,,,
3,1e8864c43ef644c293ef8a743e61b52a,female,True,Lori Marshall,lori.marshall@fakemail.com,,,,,
4,804e09681bb5438fa42223e7ae8d8843,male,True,Jeffrey Perry,jeffrey.perry@fakemail.com,,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,True,Debra Singleton,debra.singleton@fakemail.com,,,,,
99996,937a85742fd649758e63df29fd08f132,male,True,James Romero,james_romero@fakemail.com,,,,,
99997,a89ec3b5acb64a2089bcf18569485e64,male,True,Joshua Morales,joshua_morales@fakemail.com,,,,,
99998,a81d4643cad649b5897a39654363b6f1,female,True,Lisa Donovan,lisa.donovan@fakemail.com,,,,,


## Last Login

In [10]:
import datetime

def randomtimes(start, end, n, frmt="%Y-%m-%d %H:%M:%S"):
    """
    Generates n random timestamps that fall between two points in time.

    Args:
        start: Beginning of the window, a string written in `frmt`.
        end: End of the window, a string written in `frmt`.
        n: How many timestamps to generate.
        frmt: The strptime/strftime pattern used both to parse `start` and
            `end` and to format the results. Defaults to
            "%Y-%m-%d %H:%M:%S", the format this function always used.

    Returns:
        A list of n strings, each formatted with `frmt`.

    Raises:
        ValueError: If `start` or `end` does not match `frmt` (raised by
            datetime.strptime).
    """
    # Parsing the two ends of the window
    stime = datetime.datetime.strptime(start, frmt)
    etime = datetime.datetime.strptime(end, frmt)

    # Length of the window to sample from
    td = etime - stime

    # random.random() scales the window length to a random offset, which is
    # then added to the start of the window
    return [(random.random() * td + stime).strftime(frmt) for _ in range(n)]

# Window the login timestamps are drawn from
start = "2021-08-01 00:00:00"
end = "2021-08-24 00:00:00"

# One random login time per user
df['last_login'] = randomtimes(start=start, end=end, n=num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,False,Patricia Mcdowell,patricia_mcdowell@fakemail.com,2021-08-04 22:36:51,,,,
1,65b856edf7a04b1c95274ed0615614b5,male,False,James Cortez,james_cortez@fakemail.com,2021-08-09 10:55:46,,,,
2,4152d737ee174517bd4691bc726ea71d,male,False,Michael Whitney,michael_whitney@fakemail.com,2021-08-18 18:14:46,,,,
3,1e8864c43ef644c293ef8a743e61b52a,female,True,Lori Marshall,lori.marshall@fakemail.com,2021-08-05 10:59:47,,,,
4,804e09681bb5438fa42223e7ae8d8843,male,True,Jeffrey Perry,jeffrey.perry@fakemail.com,2021-08-06 01:29:41,,,,
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,True,Debra Singleton,debra.singleton@fakemail.com,2021-08-21 08:55:28,,,,
99996,937a85742fd649758e63df29fd08f132,male,True,James Romero,james_romero@fakemail.com,2021-08-06 16:10:18,,,,
99997,a89ec3b5acb64a2089bcf18569485e64,male,True,Joshua Morales,joshua_morales@fakemail.com,2021-08-17 22:17:29,,,,
99998,a81d4643cad649b5897a39654363b6f1,female,True,Lisa Donovan,lisa.donovan@fakemail.com,2021-08-21 19:46:16,,,,


## Date of Birth

In [11]:
def random_dob(start, end, n):
    """
    Returns n random dates lying between start and end.

    start and end are strings in "%Y-%m-%d" form; the dates returned are
    strings in the same form.
    """
    date_format = "%Y-%m-%d"

    # Parse the bounds and measure the span between them
    earliest = datetime.datetime.strptime(start, date_format)
    latest = datetime.datetime.strptime(end, date_format)
    span = latest - earliest

    dates = []
    for _ in range(n):
        # A random fraction of the span, offset from the earliest date
        moment = random.random() * span + earliest
        dates.append(moment.strftime(date_format))

    return dates

# One random birth date per user, between the start of 1980 and of 2006
df['dob'] = random_dob(start="1980-01-01", end="2006-01-01", n=num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,False,Patricia Mcdowell,patricia_mcdowell@fakemail.com,2021-08-04 22:36:51,1980-02-01,,,
1,65b856edf7a04b1c95274ed0615614b5,male,False,James Cortez,james_cortez@fakemail.com,2021-08-09 10:55:46,1994-12-22,,,
2,4152d737ee174517bd4691bc726ea71d,male,False,Michael Whitney,michael_whitney@fakemail.com,2021-08-18 18:14:46,1995-12-24,,,
3,1e8864c43ef644c293ef8a743e61b52a,female,True,Lori Marshall,lori.marshall@fakemail.com,2021-08-05 10:59:47,1995-10-05,,,
4,804e09681bb5438fa42223e7ae8d8843,male,True,Jeffrey Perry,jeffrey.perry@fakemail.com,2021-08-06 01:29:41,1987-10-11,,,
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,True,Debra Singleton,debra.singleton@fakemail.com,2021-08-21 08:55:28,1986-04-03,,,
99996,937a85742fd649758e63df29fd08f132,male,True,James Romero,james_romero@fakemail.com,2021-08-06 16:10:18,1993-12-02,,,
99997,a89ec3b5acb64a2089bcf18569485e64,male,True,Joshua Morales,joshua_morales@fakemail.com,2021-08-17 22:17:29,2003-01-04,,,
99998,a81d4643cad649b5897a39654363b6f1,female,True,Lisa Donovan,lisa.donovan@fakemail.com,2021-08-21 19:46:16,1994-06-26,,,


## Current Education

In [12]:
def getEducation(dob, now=None):
    """
    Assigns an education level based on the given date of birth.

    Args:
        dob: Date of birth as a "%Y-%m-%d" string.
        now: The date (or datetime) the age is measured at. Defaults to the
            current local time; pass a fixed value to get results that do
            not change with the day the notebook is run.

    Returns:
        'high school' for ages up to 18, 'undergrad' up to 22, 'grad' up to
        25, and 'employed' for anyone older.

    Raises:
        ValueError: If dob does not match "%Y-%m-%d" (raised by strptime).
    """
    # Reference date
    if now is None:
        now = datetime.datetime.now()

    # Date of birth
    born = datetime.datetime.strptime(dob, "%Y-%m-%d")

    # Age in completed years. Comparing (month, day) is exact, whereas
    # dividing a day count by 365.25 can be off by one on or near a birthday.
    had_birthday = (now.month, now.day) >= (born.month, born.day)
    age = now.year - born.year - (0 if had_birthday else 1)

    # Returning education level based on age
    if age <= 18:
        return 'high school'
    elif age <= 22:
        return 'undergrad'
    elif age <= 25:
        return 'grad'
    else:
        return 'employed'

# Derive each user's education level from their date of birth
df['education'] = list(map(getEducation, df['dob']))

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,False,Patricia Mcdowell,patricia_mcdowell@fakemail.com,2021-08-04 22:36:51,1980-02-01,employed,,
1,65b856edf7a04b1c95274ed0615614b5,male,False,James Cortez,james_cortez@fakemail.com,2021-08-09 10:55:46,1994-12-22,employed,,
2,4152d737ee174517bd4691bc726ea71d,male,False,Michael Whitney,michael_whitney@fakemail.com,2021-08-18 18:14:46,1995-12-24,grad,,
3,1e8864c43ef644c293ef8a743e61b52a,female,True,Lori Marshall,lori.marshall@fakemail.com,2021-08-05 10:59:47,1995-10-05,grad,,
4,804e09681bb5438fa42223e7ae8d8843,male,True,Jeffrey Perry,jeffrey.perry@fakemail.com,2021-08-06 01:29:41,1987-10-11,employed,,
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,True,Debra Singleton,debra.singleton@fakemail.com,2021-08-21 08:55:28,1986-04-03,employed,,
99996,937a85742fd649758e63df29fd08f132,male,True,James Romero,james_romero@fakemail.com,2021-08-06 16:10:18,1993-12-02,employed,,
99997,a89ec3b5acb64a2089bcf18569485e64,male,True,Joshua Morales,joshua_morales@fakemail.com,2021-08-17 22:17:29,2003-01-04,high school,,
99998,a81d4643cad649b5897a39654363b6f1,female,True,Lisa Donovan,lisa.donovan@fakemail.com,2021-08-21 19:46:16,1994-06-26,employed,,


## Bio

In [13]:
def makeBio(subscriber):
    """
    Makes a short or long bio depending on the user's subscription status.

    Args:
        subscriber: True for a subscriber. Any other value (False, or a
            missing value such as None) is treated as a non-subscriber.

    Returns:
        The sentence produced by the shared `faker` instance for the chosen
        length.
    """
    # `==` rather than `is`, so that boolean values which are not the
    # built-in True object itself (e.g. read from a DataFrame) still match
    if subscriber==True:

        # Randomizing bio length but skewed towards longer bios
        bio_len = random.choices([10,20], weights=(10,90), k=1)[0]

    else:

        # Randomizing bio length but skewed towards shorter bios.
        # Being the catch-all branch, this guarantees bio_len is always set.
        bio_len = random.choices([1,3], weights=(10,90), k=1)[0]

    # bio_len is passed as the first argument of faker.sentence
    # (presumably the target word count; confirm against the Faker docs)
    return faker.sentence(bio_len)
    

# Write one bio per user, based on whether that user is a subscriber
df['bio'] = list(map(makeBio, df['subscriber']))

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,False,Patricia Mcdowell,patricia_mcdowell@fakemail.com,2021-08-04 22:36:51,1980-02-01,employed,Mean radio whose.,
1,65b856edf7a04b1c95274ed0615614b5,male,False,James Cortez,james_cortez@fakemail.com,2021-08-09 10:55:46,1994-12-22,employed,To director.,
2,4152d737ee174517bd4691bc726ea71d,male,False,Michael Whitney,michael_whitney@fakemail.com,2021-08-18 18:14:46,1995-12-24,grad,Create chair.,
3,1e8864c43ef644c293ef8a743e61b52a,female,True,Lori Marshall,lori.marshall@fakemail.com,2021-08-05 10:59:47,1995-10-05,grad,Bar why kitchen page manager story conference ...,
4,804e09681bb5438fa42223e7ae8d8843,male,True,Jeffrey Perry,jeffrey.perry@fakemail.com,2021-08-06 01:29:41,1987-10-11,employed,Create against none well light method close ci...,
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,True,Debra Singleton,debra.singleton@fakemail.com,2021-08-21 08:55:28,1986-04-03,employed,Official sure clearly travel take approach lon...,
99996,937a85742fd649758e63df29fd08f132,male,True,James Romero,james_romero@fakemail.com,2021-08-06 16:10:18,1993-12-02,employed,Election case measure less national first more...,
99997,a89ec3b5acb64a2089bcf18569485e64,male,True,Joshua Morales,joshua_morales@fakemail.com,2021-08-17 22:17:29,2003-01-04,high school,How take husband receive enter he process whic...,
99998,a81d4643cad649b5897a39654363b6f1,female,True,Lisa Donovan,lisa.donovan@fakemail.com,2021-08-21 19:46:16,1994-06-26,employed,Figure political network pressure hold success...,


## Rating

In [14]:
# Ratings run from 1 to 5
ratings = list(range(1, 6))

# 1 and 5 are each three times as likely as any of the middle ratings
df['rating'] = random.choices(ratings, weights=[30, 10, 10, 10, 30], k=num_users)

df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,False,Patricia Mcdowell,patricia_mcdowell@fakemail.com,2021-08-04 22:36:51,1980-02-01,employed,Mean radio whose.,1
1,65b856edf7a04b1c95274ed0615614b5,male,False,James Cortez,james_cortez@fakemail.com,2021-08-09 10:55:46,1994-12-22,employed,To director.,5
2,4152d737ee174517bd4691bc726ea71d,male,False,Michael Whitney,michael_whitney@fakemail.com,2021-08-18 18:14:46,1995-12-24,grad,Create chair.,1
3,1e8864c43ef644c293ef8a743e61b52a,female,True,Lori Marshall,lori.marshall@fakemail.com,2021-08-05 10:59:47,1995-10-05,grad,Bar why kitchen page manager story conference ...,5
4,804e09681bb5438fa42223e7ae8d8843,male,True,Jeffrey Perry,jeffrey.perry@fakemail.com,2021-08-06 01:29:41,1987-10-11,employed,Create against none well light method close ci...,3
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,True,Debra Singleton,debra.singleton@fakemail.com,2021-08-21 08:55:28,1986-04-03,employed,Official sure clearly travel take approach lon...,2
99996,937a85742fd649758e63df29fd08f132,male,True,James Romero,james_romero@fakemail.com,2021-08-06 16:10:18,1993-12-02,employed,Election case measure less national first more...,4
99997,a89ec3b5acb64a2089bcf18569485e64,male,True,Joshua Morales,joshua_morales@fakemail.com,2021-08-17 22:17:29,2003-01-04,high school,How take husband receive enter he process whic...,5
99998,a81d4643cad649b5897a39654363b6f1,female,True,Lisa Donovan,lisa.donovan@fakemail.com,2021-08-21 19:46:16,1994-06-26,employed,Figure political network pressure hold success...,3


## Saving as a CSV file

In [15]:
# Write the finished dataset to disk, keeping the row index as a column
df.to_csv('dataset.csv', index=True)

In [16]:
# Read the file back, using its first column as the index, to check the save
csv_df = pd.read_csv(filepath_or_buffer='dataset.csv', index_col=0)

csv_df

Unnamed: 0,id,gender,subscriber,name,email,last_login,dob,education,bio,rating
0,cb6d3a52b3df4e6985b992c4d730e36e,female,False,Patricia Mcdowell,patricia_mcdowell@fakemail.com,2021-08-04 22:36:51,1980-02-01,employed,Mean radio whose.,1
1,65b856edf7a04b1c95274ed0615614b5,male,False,James Cortez,james_cortez@fakemail.com,2021-08-09 10:55:46,1994-12-22,employed,To director.,5
2,4152d737ee174517bd4691bc726ea71d,male,False,Michael Whitney,michael_whitney@fakemail.com,2021-08-18 18:14:46,1995-12-24,grad,Create chair.,1
3,1e8864c43ef644c293ef8a743e61b52a,female,True,Lori Marshall,lori.marshall@fakemail.com,2021-08-05 10:59:47,1995-10-05,grad,Bar why kitchen page manager story conference ...,5
4,804e09681bb5438fa42223e7ae8d8843,male,True,Jeffrey Perry,jeffrey.perry@fakemail.com,2021-08-06 01:29:41,1987-10-11,employed,Create against none well light method close ci...,3
...,...,...,...,...,...,...,...,...,...,...
99995,902ba973682a4a26b4e96ee95728a657,female,True,Debra Singleton,debra.singleton@fakemail.com,2021-08-21 08:55:28,1986-04-03,employed,Official sure clearly travel take approach lon...,2
99996,937a85742fd649758e63df29fd08f132,male,True,James Romero,james_romero@fakemail.com,2021-08-06 16:10:18,1993-12-02,employed,Election case measure less national first more...,4
99997,a89ec3b5acb64a2089bcf18569485e64,male,True,Joshua Morales,joshua_morales@fakemail.com,2021-08-17 22:17:29,2003-01-04,high school,How take husband receive enter he process whic...,5
99998,a81d4643cad649b5897a39654363b6f1,female,True,Lisa Donovan,lisa.donovan@fakemail.com,2021-08-21 19:46:16,1994-06-26,employed,Figure political network pressure hold success...,3
