# In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Fix the NumPy RNG seed so every run of the script draws the same numbers
np.random.seed(42)

# Daily timestamps from 2010-01-01 through 2024-09-15 (inclusive)
start_date = datetime(2010, 1, 1)
end_date = datetime(2024, 9, 15)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
n_days = len(date_range)

# Metric columns, in the order they appear in the frame
metric_columns = [
    'SUBSCRIBERS_GAINED',
    'SUBSCRIBERS_LOST',
    'VIEWS',
    'WATCH_HOURS',
    'LIKES',
    'SHARES',
    'COMMENTS',
]

# DATE first, then one independent zero-filled integer array per metric
data = {'DATE': date_range}
data.update({column: np.zeros(n_days, dtype=int) for column in metric_columns})

# Placeholder frame: one row per day, all metrics zero.
# NOTE(review): a frame with the same columns is built again after the growth
# curves are computed, so this placeholder may be redundant - confirm.
df = pd.DataFrame(data)

# Function to generate growth
def simulate_beauty_blogger_curve(n_days, peak_start, peak_end, decline_start):
    """Return a piecewise-linear popularity curve sampled at ``n_days`` points.

    The timeline is normalised to ``x`` in ``[0, 1]`` (``np.linspace``) and
    split into four phases, checked in this order:

    1. ``x < peak_start``: slow rise with slope 2.
    2. ``peak_start <= x <= peak_end``: steep rise with slope 6, continuing
       from the value reached at ``peak_start``.
    3. ``x > decline_start``: decline with slope -5 from the plateau level.
    4. otherwise: flat plateau at the level reached at ``peak_end``.

    Args:
        n_days: Number of samples (one per day) to generate.
        peak_start: Fraction of the timeline where the steep rise begins.
        peak_end: Fraction of the timeline where the steep rise ends.
        decline_start: Fraction of the timeline after which the decline begins.

    Returns:
        A float ``numpy.ndarray`` of length ``n_days``; negative values are
        clipped to 0.
    """
    x = np.linspace(0, 1, n_days)

    # Level reached at the end of the steep rise; also the plateau height.
    plateau = 2 * peak_start + (peak_end - peak_start) * 6

    # np.select picks the first matching condition per element, which mirrors
    # the original if/elif ordering (the steep-rise phase wins over the
    # decline phase if the two ranges ever overlap).
    curve = np.select(
        [
            x < peak_start,
            (peak_start <= x) & (x <= peak_end),
            x > decline_start,
        ],
        [
            2 * x,
            2 * peak_start + (x - peak_start) * 6,
            plateau - (x - decline_start) * 5,
        ],
        default=plateau,
    )

    # A long enough decline would go below zero; floor it at 0.
    return np.clip(curve, 0, None)

# Generate growth patterns: every metric is a fixed fraction of daily views
views = simulate_beauty_blogger_curve(n_days, 0.4, 0.6, 0.75) * 5000
subscribers_gained = views * 0.02
subscribers_lost = views * 0.005
watch_hours = views * 0.1
likes = views * 0.05
shares = views * 0.01
comments = views * 0.005

# One shared multiplicative noise factor per day (mean 1, std 0.1), so all
# metrics move together on a given day.
variation = np.random.normal(1, 0.1, n_days)

# Baseline series per column, listed in the column order of the output file
baselines = {
    'SUBSCRIBERS_GAINED': subscribers_gained,
    'SUBSCRIBERS_LOST': subscribers_lost,
    'VIEWS': views,
    'WATCH_HOURS': watch_hours,
    'LIKES': likes,
    'SHARES': shares,
    'COMMENTS': comments,
}

# Build the frame directly from the noisy metrics: truncate to integers and
# floor at zero (the noise factor could in principle be negative).
df = pd.DataFrame({
    'DATE': date_range,
    **{
        name: np.maximum(0, (baseline * variation).astype(int))
        for name, baseline in baselines.items()
    },
})

# Weekend boost (Saturday=5, Sunday=6).
# The boosted values are fractional, so switch these columns to float first;
# writing floats into integer columns through .loc relies on a silent upcast
# that pandas has deprecated. Everything is cast back to int further down.
weekend_mask = (df['DATE'].dt.dayofweek >= 5)
weekend_cols = ['VIEWS', 'WATCH_HOURS', 'LIKES']
df = df.astype({col: float for col in weekend_cols})
df.loc[weekend_mask, weekend_cols] = df.loc[weekend_mask, weekend_cols] * 1.5

# Seasonal variation: +/-20% on views following one sine period per year,
# looked up by day-of-year (366 entries so leap years stay in range).
# NOTE(review): sin over [0, 2*pi] peaks a quarter of the way through the year
# (around day 92, early April) and bottoms out around early October, so this
# is not centred on summer - confirm whether a phase shift was intended.
days_in_year = 366
summer_boost = np.sin(np.linspace(0, 2*np.pi, days_in_year))
df['VIEWS'] = df['VIEWS'] * (1 + 0.2 * summer_boost[df['DATE'].dt.dayofyear - 1])

# Simulate occasional viral days: 20 distinct days (never in the first 60)
# get 4x engagement. The frame has a default integer index, so the sampled
# positions are valid .loc labels.
viral_days = np.random.choice(range(60, n_days), size=20, replace=False)
viral_cols = ['VIEWS', 'LIKES', 'SHARES', 'COMMENTS']
df.loc[viral_days, viral_cols] = df.loc[viral_days, viral_cols] * 4

# Final integer conversion (truncates the fractional boosts) for every metric
df = df.astype({col: int for col in df.columns if col != 'DATE'})

# Cumulative total subscribers (running sum of net daily change)
df['TOTAL_SUBSCRIBERS'] = (df['SUBSCRIBERS_GAINED'] - df['SUBSCRIBERS_LOST']).cumsum()

# Clip all numeric values to be non-negative (includes TOTAL_SUBSCRIBERS)
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].clip(lower=0)

# Save to CSV
df.to_csv('beauty_blogger_youtube_data.csv', index=False)

print("✅ Dataset generated and saved as 'beauty_blogger_youtube_data.csv'")

# Output:
# ✅ Dataset generated and saved as 'beauty_blogger_youtube_data.csv'


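# Captured notebook output (appears to be the source-line echo of three
# warnings from the weekend-boost assignment; the warning headers were not
# preserved in this export):
#   df.loc[weekend_mask, ['VIEWS', 'WATCH_HOURS', 'LIKES']] = df.loc[weekend_mask, ['VIEWS', 'WATCH_HOURS', 'LIKES']] * 1.5
#   df.loc[weekend_mask, ['VIEWS', 'WATCH_HOURS', 'LIKES']] = df.loc[weekend_mask, ['VIEWS', 'WATCH_HOURS', 'LIKES']] * 1.5
#   df.loc[weekend_mask, ['VIEWS', 'WATCH_HOURS', 'LIKES']] = df.loc[weekend_mask, ['VIEWS', 'WATCH_HOURS', 'LIKES']] * 1.5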
