In [16]:
import os,random,math
from datetime import date, datetime, timedelta
import pandas as pd
import numpy as np

pd.options.display.float_format = '{:.2f}'.format

# Constants/configs
DAYS = 30                                           # Number of days of data to generate 
COUNTRIES = 20                                      # Number of countries to generate data for
BASELINE_USERS_DAILY = 1200000                      # Average total users across all dimensions
DAILY_USER_CHANGE = 0.05                            # Determines how much user count can vary day to day
DAILY_VARIANCE_LIMIT = DAILY_USER_CHANGE * 5
DAILY_VARIANCE_LAG = 3                              # How many preceding days to consider when calculating day over day growth
MEAN_VIDEO_ENGAGEMENT = 2.2                         # Mean number of video plays per user across all shows
VAR_VIDEO_ENGAGEMENT = 1                            # Used as one std dev when assiging engagement rates to shows based on a normal distribution
DAILY_ENGAGEMENT_CHANGE = .05                       # Soft limit on how much video engagement can change by day to day for each simulated 
MOBILE_SHARE = .55                                  # Baseline percentage of mobile traffic
VAR_MOBILE_SHARE = .5                               # Used as one std dev when assiging device type values to records based on a normal distribution
DOW_INDEX = [1.0, 0.95, 0.9, 1.0, 1.0, 1.1, 1.2]    # Simulate weekly cyclicality, DOW_INDEX[0] is weight for Monday, DOW_INDEX[6] is weight for Sunday
VAR_DOW_INDEX = .1                                  # Used as one std dev when weighting traffic by day of the week based on a normal distribution


LATEST_DATE = (datetime.now() + pd.DateOffset(days=-1)).date()

# Country populations, taken from UN dataset
df = pd.read_csv('2023_world_population.csv')
df['population'] = pd.to_numeric(df['population'].str.replace(' ',''))
df['population_share'] = df['population'].apply(lambda x: float(x / df['population'].sum()) )

# List of content metadata to create data for
df_content_metadata = pd.read_csv('shows.csv')
df_content_metadata['base_users'] = df_content_metadata['ratings_index'] * BASELINE_USERS_DAILY
df_content_metadata['base_users_share'] = df_content_metadata['base_users'] / df_content_metadata['base_users'].sum()
df_content_metadata = df_content_metadata.sort_values(by='base_users',ascending=False)
#print(df_content_metadata)

raw = {
    'date': [],
    'country': [],
    'device_type': [],
    'video_category': [],
    'video_title': [],
    'users': [],
    'users_dod': [],
    'video_plays': [],
    
}

TOTAL_RECORDS = 0
for country in df['country'].unique()[:COUNTRIES]:
    country_init_time = datetime.now()
    df_c = df[df['country'] == country]
    for i in range(len(df_content_metadata)):
        show_init_time = datetime.now()
        category = df_content_metadata['category'].iloc[i]
        title = df_content_metadata['title'].iloc[i]
        norm = np.random.normal(MEAN_VIDEO_ENGAGEMENT, VAR_VIDEO_ENGAGEMENT, 100) # Generate a normal distribution of possible baseline engagement rates for each show
        video_engagement_rate = max(1,norm[random.randrange(0,100)]) # A ratio below 1:1 would not make sense here
        norm = np.random.normal(MOBILE_SHARE, VAR_MOBILE_SHARE, 100)
        mobile_share = norm[random.randrange(0,100)]
        device_types = {
            'mobile':mobile_share,
            'desktop':1-mobile_share
        }
        l = random.randrange(80,120) / 100 # add some variation in weekly cyclicality per show
        dow_mult = VAR_DOW_INDEX * l
        for device_type, device_v in device_types.items():
            for ii in range(DAYS):
                # We don't need the entire raw table to calculate values, limiting it prevents the script from becoming slower
                # as the number of records grows.
                mini_raw = {}
                for k,v in raw.items():
                    limit = -min(2000,len(raw[k]))
                    mini_raw[k] = v[limit:]
                _df = pd.DataFrame.from_dict(mini_raw)
                _df = _df[_df['country'] == country]
                _df = _df[_df['video_category'] == category]
                _df = _df[_df['video_title'] == title]
                _df = _df[_df['device_type'] == device_type]
                date = (LATEST_DATE - pd.DateOffset(DAYS) + pd.DateOffset(ii)).date()
                dow_modifier = dow_mult * DOW_INDEX[date.weekday()]
                user_count = None
                previous_user_count = None
                dod = None
                plays = None
                modifier = None
                user_count = BASELINE_USERS_DAILY * df_c['population_share'].iloc[0] * device_v * df_content_metadata['base_users_share'].iloc[i] * dow_modifier
                init_user_count = BASELINE_USERS_DAILY * df_c['population_share'].iloc[0] * device_v * df_content_metadata['base_users_share'].iloc[i] * dow_modifier
                if _df.empty == True:
                    dod = 1
                    plays = user_count * video_engagement_rate
                else:
                    #content_df = _df[_df['video_title'] == title]
                    prv = max([ii-1,0])
                    mn = max([ii-2, 0])
                    mx = ii+1
                    avg_dod = _df['users_dod'][mn:mx].mean()
                    avg_dist = _df['users'][mn:mx].mean() / init_user_count
                    if pd.isna(avg_dod) == True or avg_dod == 0:
                        avg_dod = 1
                    daily_user_var = (random.randrange(math.floor((1 - DAILY_USER_CHANGE)*100), math.floor((1 + DAILY_USER_CHANGE)*100) )/100)
                    daily_engagement_var = (random.randrange(math.floor((1 - DAILY_ENGAGEMENT_CHANGE)*100), math.floor((1 + DAILY_ENGAGEMENT_CHANGE)*100) )/100)
                    modifier = avg_dod * daily_user_var
                    limiter = 0
                    previous_user_count = init_user_count

                    # This section adds a lagging counterbalancing force that prevents simulated growth/decline from snowballing too far from the mean
                    # This also results in pretty, wavy lines to show on charts
                    limiter = 1-(np.square(avg_dist))      
                    modifier = modifier + (limiter / DAILY_VARIANCE_LAG)
                    previous_user_count = _df['users'].iloc[prv]
                    user_count = (previous_user_count * modifier ) 
                    dod = user_count / _df['users'].iloc[prv]
                    plays = math.floor(user_count * (video_engagement_rate * daily_engagement_var) )
                #print(f'>>> {date} | {country} | {device_type} | {category} | {title} | users = {previous_user_count} * {modifier} = {user_count:.0f} | plays: {plays:.0f}')
                raw['date'].append(date)
                raw['country'].append(country)
                raw['device_type'].append(device_type)
                raw['video_category'].append(category)
                raw['video_title'].append(title)
                raw['users'].append(max(0,math.floor(user_count))) # Negative numbers would not make sense!
                raw['users_dod'].append(dod)
                raw['video_plays'].append(max(0,math.floor(plays)))
                TOTAL_RECORDS = TOTAL_RECORDS + 1
        #print(f'----{title}: {datetime.now() - show_init_time}')
    print(f'{country}: {TOTAL_RECORDS} | {datetime.now() - country_init_time}')
           
df = pd.DataFrame.from_dict(raw)
df.to_csv('traffic_daily.csv',index=False)
df

  dod = user_count / _df['users'].iloc[prv]


India: 1920 | 0:00:08.790725
China: 3840 | 0:00:14.521455
United States of America: 5760 | 0:00:14.461044
Indonesia: 7680 | 0:00:14.522128
Pakistan: 9600 | 0:00:14.118171
Nigeria: 11520 | 0:00:14.367163
Brazil: 13440 | 0:00:14.434936
Bangladesh: 15360 | 0:00:14.417237
Russian Federation: 17280 | 0:00:14.265111
Mexico: 19200 | 0:00:14.362538
Ethiopia: 21120 | 0:00:14.462431
Japan: 23040 | 0:00:14.431410
Philippines: 24960 | 0:00:14.573888
Egypt: 26880 | 0:00:14.287534
Democratic Republic of the Congo: 28800 | 0:00:14.493972
Viet Nam: 30720 | 0:00:14.353437
Iran (Islamic Republic of): 32640 | 0:00:14.146376
Türkiye: 34560 | 0:00:14.232099
Germany: 36480 | 0:00:14.419598
Thailand: 38400 | 0:00:14.339544


Unnamed: 0,date,country,device_type,video_category,video_title,users,users_dod,video_plays
0,2024-07-27,India,mobile,education,Stanley Spadowski's Clubhouse,1241,1.00,2133
1,2024-07-28,India,mobile,education,Stanley Spadowski's Clubhouse,1245,1.00,2204
2,2024-07-29,India,mobile,education,Stanley Spadowski's Clubhouse,1158,0.93,1931
3,2024-07-30,India,mobile,education,Stanley Spadowski's Clubhouse,1021,0.88,1737
4,2024-07-31,India,mobile,education,Stanley Spadowski's Clubhouse,837,0.82,1481
...,...,...,...,...,...,...,...,...
38395,2024-08-21,Thailand,desktop,lifestyle,Volcano Worshippers Hour,0,,0
38396,2024-08-22,Thailand,desktop,lifestyle,Volcano Worshippers Hour,0,,0
38397,2024-08-23,Thailand,desktop,lifestyle,Volcano Worshippers Hour,0,,0
38398,2024-08-24,Thailand,desktop,lifestyle,Volcano Worshippers Hour,0,,0
