In [9]:
# 1
import json
import random
from datetime import date, timedelta
import faker


In [10]:
# 2
# Seed both the stdlib generator and Faker's shared generator so that
# repeated runs draw the same random sequences. Values derived from
# date.today(), and the iteration order of sets of strings, can still
# differ from one run to the next.
SEED = 42
random.seed(SEED)
faker.Faker.seed(SEED)
fake = faker.Faker()

In [11]:
# 3
# how many distinct usernames we want to end up with
usernames_no = 1000
usernames = set()

# a set silently drops duplicates, so keep drawing candidates until
# it holds the requested number of distinct usernames
while len(usernames) < usernames_no:
    candidate = fake.user_name()
    usernames.add(candidate)

In [16]:
# 4
def get_random_name_and_gender():
    """Return a ``(name, gender)`` tuple for one fake user.

    A single ``random.random()`` draw picks the gender: a value above
    the 0.6 threshold yields a male name tagged ``'M'``, anything else
    yields a female name tagged ``'F'``.
    """
    female_share = .6 # 60% of users will be female
    if random.random() > female_share:
        return fake.name_male(), 'M'
    return fake.name_female(), 'F'

def get_users(usernames):
    """Build one JSON-encoded user record per username.

    Args:
        usernames: iterable of username strings.

    Returns:
        A list of JSON strings, one per username, each holding the keys
        username, name, gender, email, age and address.
    """
    def _as_json(username):
        # one record: name/gender are drawn first, then the faker fields
        name, gender = get_random_name_and_gender()
        return json.dumps({
            'username': username,
            'name': name,
            'gender': gender,
            'email': fake.email(),
            'age': fake.random_int(min = 18, max = 90),
            'address': fake.address()
        })

    return [_as_json(username) for username in usernames]

# Build the JSON-encoded user records, one per generated username.
users = get_users(usernames)
# Show only the first three records instead of dumping the whole list.
users[:3]

['{"username": "brandon96", "name": "Jordan Williams", "gender": "M", "email": "randallowens@example.org", "age": 71, "address": "7271 Sawyer Pass Apt. 285\\nSouth Robertfort, WI 04736"}',
 '{"username": "mcclurejames", "name": "Amanda Martin", "gender": "F", "email": "shannonjenkins@example.net", "age": 56, "address": "1048 Peter Junctions Suite 308\\nNorth Michael, MA 33353"}',
 '{"username": "eleonard", "name": "Cassandra Moore", "gender": "F", "email": "laurahayes@example.com", "age": 36, "address": "3240 Andrew Row\\nNew Joyceton, PA 98381"}']

In [28]:
# 5
# campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency
def get_type():
    """Return one randomly chosen internal campaign type code."""
    # just some gibberish internal codes
    return random.choice(('AKX', 'BYU', 'GRZ', 'KTR'))

def get_start_end_dates():
    """Return random ``(start, end)`` campaign dates as ``YYYYMMDD`` strings.

    The campaign lasts between 1 and 730 days; its start lies up to 365
    days before or after today's date.
    """
    length_days = random.randint(1, 2 * 365)
    shift_days = random.randint(-365, 365)
    first_day = date.today() - timedelta(days = shift_days)
    last_day = first_day + timedelta(days = length_days)
    # both dates share the same compact year-month-day format
    return tuple(day.strftime('%Y%m%d') for day in (first_day, last_day))

def get_age():
    """Return a random target age range formatted as ``'<low>-<high>'``.

    The lower bound is a multiple of 5 from 20 to 45, and the range is
    5 to 25 years wide, also in steps of 5.
    """
    lower = random.randrange(20, 46, 5)
    width = random.randrange(5, 26, 5)
    upper = lower + width
    return f'{lower}-{upper}'

def get_gender():
    """Return a random target gender code: 'M', 'F' or 'B'."""
    gender_codes = ('M', 'F', 'B')
    return random.choice(gender_codes)

def get_currency():
    """Return a random currency code: 'GBP', 'EUR' or 'USD'."""
    currency_codes = ('GBP', 'EUR', 'USD')
    return random.choice(currency_codes)

def get_campaign_name():
    """Assemble a random campaign name.

    The fields are joined with underscores in this order: internal type,
    start date, end date, target age, target gender, currency.
    """
    type_ = get_type()
    start, end = get_start_end_dates()
    # the remaining fields are drawn in the same order as they appear
    # in the final name
    fields = [type_, start, end, get_age(), get_gender(), get_currency()]
    return '_'.join(fields)

In [29]:
# 6
# campaign data:
# name, budget, spent, clicks, impressions
def get_campaign_data():
    """Return a dict describing one random campaign.

    Keys: cmp_name, cmp_bgt (budget), cmp_spent, cmp_clicks and
    cmp_impr (impressions). The amount spent never exceeds the budget.
    """
    campaign = {'cmp_name': get_campaign_name()}
    campaign['cmp_bgt'] = random.randint(10 ** 3, 10 ** 6)
    # the budget is the upper bound of the draw for the amount spent
    campaign['cmp_spent'] = random.randint(10 ** 2, campaign['cmp_bgt'])
    # triangular draw between 10 ** 2 and 10 ** 5, peaking at 0.2 * 10 ** 5
    campaign['cmp_clicks'] = int(
        random.triangular(10 ** 2, 10 ** 5, 0.2 * 10 ** 5)
    )
    # gaussian draw: mean 0.5 * 10 ** 6, standard deviation 2
    campaign['cmp_impr'] = int(random.gauss(0.5 * 10 ** 6, 2))
    return campaign

In [30]:
# 7
def get_data(users):
    """Pair every user with a random batch of campaigns.

    Args:
        users: iterable of user records (passed through unchanged).

    Returns:
        A list of ``{'user': ..., 'campaigns': [...]}`` dicts, one per
        user, each holding between 2 and 8 campaign dicts.
    """
    def _random_campaigns():
        # the batch size is drawn before any campaign is generated
        how_many = random.randint(2, 8)
        return [get_campaign_data() for _ in range(how_many)]

    return [
        {'user': user, 'campaigns': _random_campaigns()}
        for user in users
    ]

In [31]:
# 8
# Attach a random batch of campaigns to every user record.
rough_data = get_data(users)
rough_data[:2] # let's take a peek

[{'user': '{"username": "brandon96", "name": "Jordan Williams", "gender": "M", "email": "randallowens@example.org", "age": 71, "address": "7271 Sawyer Pass Apt. 285\\nSouth Robertfort, WI 04736"}',
  'campaigns': [{'cmp_name': 'GRZ_20220910_20230425_35-55_B_EUR',
    'cmp_bgt': 695129,
    'cmp_spent': 206593,
    'cmp_clicks': 14715,
    'cmp_impr': 499998},
   {'cmp_name': 'KTR_20230613_20240109_20-40_B_USD',
    'cmp_bgt': 607601,
    'cmp_spent': 584645,
    'cmp_clicks': 53790,
    'cmp_impr': 499999},
   {'cmp_name': 'BYU_20230818_20230909_40-65_F_GBP',
    'cmp_bgt': 644070,
    'cmp_spent': 621204,
    'cmp_clicks': 42683,
    'cmp_impr': 500000},
   {'cmp_name': 'AKX_20230727_20240927_30-45_M_USD',
    'cmp_bgt': 954145,
    'cmp_spent': 72526,
    'cmp_clicks': 10006,
    'cmp_impr': 499997},
   {'cmp_name': 'BYU_20240316_20240922_35-40_B_EUR',
    'cmp_bgt': 175435,
    'cmp_spent': 79039,
    'cmp_clicks': 13667,
    'cmp_impr': 500001}]},
 {'user': '{"username": "mcclureja

In [32]:
# 9
# Flatten to one record per campaign, each carrying its owner's user
# record under the 'user' key. Fresh dicts are built rather than calling
# campaign.update(...), so the campaign dicts held inside rough_data are
# left untouched and rough_data still matches what was displayed earlier.
data = [
    {**campaign, 'user': datum['user']}
    for datum in rough_data
    for campaign in datum['campaigns']
]
data[:2] # let's take another peek

[{'cmp_name': 'GRZ_20220910_20230425_35-55_B_EUR',
  'cmp_bgt': 695129,
  'cmp_spent': 206593,
  'cmp_clicks': 14715,
  'cmp_impr': 499998,
  'user': '{"username": "brandon96", "name": "Jordan Williams", "gender": "M", "email": "randallowens@example.org", "age": 71, "address": "7271 Sawyer Pass Apt. 285\\nSouth Robertfort, WI 04736"}'},
 {'cmp_name': 'KTR_20230613_20240109_20-40_B_USD',
  'cmp_bgt': 607601,
  'cmp_spent': 584645,
  'cmp_clicks': 53790,
  'cmp_impr': 499999,
  'user': '{"username": "brandon96", "name": "Jordan Williams", "gender": "M", "email": "randallowens@example.org", "age": 71, "address": "7271 Sawyer Pass Apt. 285\\nSouth Robertfort, WI 04736"}'}]

In [None]:
# 10
# Serialise the flattened records to data.json. json.dump writes straight
# to the file handle, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open('data.json', 'w', encoding='utf-8') as stream:
    json.dump(data, stream)