In [None]:
### In this notebook, we create a synthetic dataset of 1000 random people with different characteristics (such as age, income, and risk tolerance) for our project

import pandas as pd 
import numpy as np

In [None]:
# Seed NumPy's global random generator so that every run of this notebook
# produces the same dataset. The seed must be set by *calling* np.random.seed;
# assigning to it (`np.random.seed = 42`) only replaces the function with an
# integer and leaves the generator unseeded.
# If an earlier version of this cell already ran the assignment in this kernel,
# restart the kernel first so that np.random.seed is a function again.
np.random.seed(42)

n_users = 1000

# Generate a random age for each person: integers from 20 (inclusive) to 70 (exclusive)
age = np.random.randint(20, 70, n_users)

# Generate a random income for each person.
# A lognormal distribution gives strictly positive, right-skewed values
# (many low-to-moderate incomes, few high incomes). The draws are truncated to integers.
income = np.random.lognormal(mean=11, sigma=0.5, size=n_users).astype(int)

# Generate the investment goal of each persona (50% retirement, 20% home, 30% wealth growth)
goals = np.random.choice(['retirement', 'home', 'wealth_growth'], size=n_users, p=[0.5, 0.2, 0.3])

# Define the function risk_score to calculate the risk tolerance index for each persona
def risk_score(age, income):
    """Return a noisy risk-tolerance score limited to the range [0, 10].

    The score is the sum of three parts:
      * an age part that starts at 10 for a 20-year-old and drops by one
        point for every 6 additional years of age;
      * an income part, ``log(income) - 11``, so the effect of income grows
        logarithmically and is positive only when ``log(income)`` exceeds 11;
      * one draw of standard-normal noise from NumPy's global random state.

    Parameters
    ----------
    age : number
        Age in years.
    income : number
        Income; passed to ``np.log``.

    Returns
    -------
    The summed score after clipping to [0, 10] with ``np.clip``.
    """
    age_component = 10 - (age - 20) / 6
    income_component = np.log(income) - 11
    noise = np.random.normal(0, 1)
    raw_score = age_component + income_component + noise
    return np.clip(raw_score, 0, 10)

# Define the time horizon for each investor
def time_horizon(age, goal):
    """Return the investment time horizon (in years) for one investor.

    Parameters
    ----------
    age : number
        Age in years; only used for the 'retirement' goal.
    goal : str
        One of 'retirement', 'wealth_growth' or 'home'.

    Returns
    -------
    * 'retirement': years left until age 65, but never less than 5.
    * 'wealth_growth': a random integer from 10 to 34 (inclusive).
    * 'home': a random integer from 0 to 9 (inclusive).
    * any other goal: ``None``.

    The random horizons are drawn from NumPy's global random state.
    """
    years = None  # stays None when the goal is not recognised
    if goal == 'retirement':
        # assume retirement at 65, with a minimum horizon of 5 years
        years = max(65 - age, 5)
    elif goal == 'wealth_growth':
        years = np.random.randint(10, 35)
    elif goal == 'home':
        years = np.random.randint(0, 10)
    return years

# Assemble the base table: one row per simulated person.
df = pd.DataFrame({'age': age, 'income': income, 'goal': goals})

# Derive the remaining columns one row at a time. The risk scores are computed
# for every row before any time horizon is drawn; both helpers consume NumPy's
# global random state, so this order determines the generated values.
risk_scores = df.apply(
    lambda person: risk_score(person['age'], person['income']),
    axis=1,
)
df['risk_score'] = risk_scores

horizons = df.apply(
    lambda person: time_horizon(person['age'], person['goal']),
    axis=1,
)
df['time_horizon'] = horizons

# Preview the first rows of the finished table.
df.head()

Unnamed: 0,age,income,goal,risk_score,time_horizon
0,64,19073,wealth_growth,1.483095,27
1,47,47154,retirement,5.296325,18
2,24,108610,wealth_growth,10.0,25
3,33,96931,retirement,7.784904,32
4,46,39184,retirement,6.011811,19


In [8]:
# Save the DataFrame as the project's dataset.
# NOTE(review): the destination is an absolute path on one specific machine;
# the notebook will fail here on any other machine unless the path is adjusted.
# index_label=False leaves the index column out of the CSV header row; the
# index values themselves are still written because `index` keeps its default.
investors_csv_path = '/Users/anhnguyendo/Documents/Python machine learning/robo_advisor/data/investors.csv'
df.to_csv(investors_csv_path, index_label=False)