In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# --- 1. Data Modeling: Simulate Datasets ---

def generate_uuid():
    """Generates a simple UUID-like string."""
    return str(np.random.randint(100000000, 999999999))

def simulate_data():
    """
    Simulates the influencer campaign datasets.
    Returns:
        tuple: DataFrames for influencers, posts, tracking_data, and payouts.
    """
    print("Generating simulated data...")

    # Influencers Data
    influencer_data = [
        {'id': 'inf1', 'name': 'Cristiano Ronaldo', 'category': 'Sports with a ball', 'gender': 'Male', 'follower_count': 500000000, 'platform': 'Instagram'},
        {'id': 'inf2', 'name': 'Kylie Jenner', 'category': 'Fashion', 'gender': 'Female', 'follower_count': 380000000, 'platform': 'Instagram'},
        {'id': 'inf3', 'name': 'Charli D’Amelio', 'category': 'Dance', 'gender': 'Female', 'follower_count': 150000000, 'platform': 'TikTok'},
        {'id': 'inf4', 'name': 'MrBeast', 'category': 'Video games', 'gender': 'Male', 'follower_count': 120000000, 'platform': 'YouTube'},
        {'id': 'inf5', 'name': 'Leo Messi', 'category': 'Sports with a ball', 'gender': 'Male', 'follower_count': 450000000, 'platform': 'Instagram'},
        {'id': 'inf6', 'name': 'Addison Rae', 'category': 'Dance', 'gender': 'Female', 'follower_count': 80000000, 'platform': 'TikTok'},
        {'id': 'inf7', 'name': 'PewDiePie', 'category': 'Animation', 'gender': 'Male', 'follower_count': 110000000, 'platform': 'YouTube'},
        {'id': 'inf8', 'name': 'Selena Gomez', 'category': 'Music', 'gender': 'Female', 'follower_count': 400000000, 'platform': 'Instagram'},
        {'id': 'inf9', 'name': 'Khaby Lame', 'category': 'Comedy', 'gender': 'Male', 'follower_count': 160000000, 'platform': 'TikTok'},
        {'id': 'inf10', 'name': 'T-Series', 'category': 'Music & Dance', 'gender': 'N/A', 'follower_count': 250000000, 'platform': 'YouTube'},
    ]
    influencers_df = pd.DataFrame(influencer_data)

    # Products and Campaigns for Tracking Data
    products = ['Protein Powder', 'Vitamins', 'Supplements', 'Weight Loss', 'Fitness Gear']
    campaign_names = ['Summer Campaign', 'New Product Launch', 'Health & Wellness', 'Mega Sale']

    posts_list = []
    tracking_data_list = []
    payouts_list = []

    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 12, 31)

    for _, influencer in influencers_df.iterrows():
        num_posts = np.random.randint(2, 7) # 2-6 posts per influencer
        for i in range(num_posts):
            post_id = generate_uuid()
            post_date = start_date + timedelta(days=np.random.randint(0, (end_date - start_date).days))

            # Posts Data
            reach = int(influencer['follower_count'] * (0.05 + np.random.rand() * 0.1)) # 5-15% of followers
            likes = int(reach * (0.01 + np.random.rand() * 0.05)) # 1-6% of reach
            comments = int(likes * (0.01 + np.random.rand() * 0.05)) # 1-6% of likes

            posts_list.append({
                'influencer_id': influencer['id'],
                'platform': influencer['platform'],
                'date': post_date.strftime('%Y-%m-%d'),
                'URL': f"https://example.com/{influencer['platform']}/{influencer['id']}/{post_id}",
                'caption': f"Check out this amazing {np.random.choice(products)} from HealthKart! #HealthKart #{influencer['category'].replace(' ', '')}",
                'reach': reach,
                'likes': likes,
                'comments': comments,
            })

            # Tracking Data
            num_orders = np.random.randint(50, 551) # 50-550 orders per post
            avg_product_price = 20 + np.random.rand() * 80 # $20-$100
            revenue = num_orders * avg_product_price

            tracking_data_list.append({
                'source': 'influencer_campaign',
                'campaign': np.random.choice(campaign_names),
                'influencer_id': influencer['id'],
                'user_id': generate_uuid(),
                'product': np.random.choice(products),
                'date': post_date.strftime('%Y-%m-%d'),
                'orders': num_orders,
                'revenue': round(revenue, 2),
            })

            # Payouts Data
            basis = np.random.choice(['post', 'order'])
            rate = 0
            total_payout = 0

            if basis == 'post':
                rate = round(500 + np.random.rand() * 1500, 2) # Flat fee $500-$2000
                total_payout = rate
            else: # basis == 'order'
                rate = round(0.5 + np.random.rand() * 2, 2) # Per order commission $0.5-$2.5
                total_payout = round(rate * num_orders, 2)

            payouts_list.append({
                'influencer_id': influencer['id'],
                'basis': basis,
                'rate': rate,
                'orders': num_orders, # Orders from tracking data for this post
                'total_payout': total_payout,
                'date': post_date.strftime('%Y-%m-%d'),
            })

    posts_df = pd.DataFrame(posts_list)
    tracking_data_df = pd.DataFrame(tracking_data_list)
    payouts_df = pd.DataFrame(payouts_list)

    print("Simulated data generation complete.")
    return influencers_df, posts_df, tracking_data_df, payouts_df

# --- 2. Features to Build ---

def calculate_kpis(df_tracking, df_payouts):
    """Calculates Total Revenue, Total Payout, ROI, and ROAS."""
    total_revenue = df_tracking['revenue'].sum()
    total_payout = df_payouts['total_payout'].sum()

    roi = ((total_revenue - total_payout) / total_payout) * 100 if total_payout > 0 else 0
    roas = total_revenue / total_payout if total_payout > 0 else 0

    return total_revenue, total_payout, roi, roas

def filter_data(influencers_df, posts_df, tracking_data_df, payouts_df, platform='All', category='All'):
    """
    Filters the datasets based on platform and category.
    Returns:
        tuple: Filtered DataFrames.
    """
    filtered_influencers = influencers_df.copy()

    if platform != 'All':
        filtered_influencers = filtered_influencers[filtered_influencers['platform'] == platform]
    if category != 'All':
        filtered_influencers = filtered_influencers[filtered_influencers['category'] == category]

    filtered_influencer_ids = filtered_influencers['id'].tolist()

    filtered_posts = posts_df[posts_df['influencer_id'].isin(filtered_influencer_ids)].copy()
    filtered_tracking_data = tracking_data_df[tracking_data_df['influencer_id'].isin(filtered_influencer_ids)].copy()
    filtered_payouts = payouts_df[payouts_df['influencer_id'].isin(filtered_influencer_ids)].copy()

    return filtered_influencers, filtered_posts, filtered_tracking_data, filtered_payouts

def get_top_influencers(influencers_df, tracking_data_df, payouts_df, posts_df, metric='revenue', count=5):
    """
    Identifies top influencers based on a given metric.
    Metrics can be 'revenue', 'roi', 'likes', 'comments', 'reach'.
    """
    # Merge tracking data with influencers to get names/platforms
    merged_tracking = pd.merge(tracking_data_df, influencers_df[['id', 'name', 'platform', 'category']],
                               left_on='influencer_id', right_on='id', how='left')

    # Calculate total revenue per influencer
    influencer_revenue = merged_tracking.groupby('influencer_id')['revenue'].sum().reset_index()
    influencer_revenue.rename(columns={'revenue': 'total_revenue'}, inplace=True)

    # Calculate total payout per influencer
    influencer_payout = payouts_df.groupby('influencer_id')['total_payout'].sum().reset_index()
    influencer_payout.rename(columns={'total_payout': 'total_payout'}, inplace=True)

    # Calculate total engagement (likes, comments, reach) per influencer
    influencer_engagement = posts_df.groupby('influencer_id')[['likes', 'comments', 'reach']].sum().reset_index()
    influencer_engagement.rename(columns={'likes': 'total_likes', 'comments': 'total_comments', 'reach': 'total_reach'}, inplace=True)

    # Merge all metrics
    influencer_metrics = pd.merge(influencer_revenue, influencer_payout, on='influencer_id', how='outer').fillna(0)
    influencer_metrics = pd.merge(influencer_metrics, influencer_engagement, on='influencer_id', how='outer').fillna(0)

    # Calculate ROI and ROAS for each influencer
    influencer_metrics['roi'] = influencer_metrics.apply(
        lambda row: ((row['total_revenue'] - row['total_payout']) / row['total_payout']) * 100 if row['total_payout'] > 0 else 0, axis=1
    )
    influencer_metrics['roas'] = influencer_metrics.apply(
        lambda row: row['total_revenue'] / row['total_payout'] if row['total_payout'] > 0 else 0, axis=1
    )

    # Add influencer details (name, platform, category)
    influencer_metrics = pd.merge(influencer_metrics, influencers_df[['id', 'name', 'platform', 'category']],
                                  left_on='influencer_id', right_on='id', how='left').drop(columns=['id'])

    # Sort and return top influencers
    if metric in influencer_metrics.columns:
        top_influencers = influencer_metrics.sort_values(by=metric, ascending=False).head(count)
        return top_influencers[['name', 'platform', 'category', metric, 'total_revenue', 'total_payout', 'roi', 'roas', 'total_likes', 'total_comments', 'total_reach']]
    else:
        print(f"Warning: Metric '{metric}' not found. Returning top influencers by revenue instead.")
        return influencer_metrics.sort_values(by='total_revenue', ascending=False).head(count)[['name', 'platform', 'category', 'total_revenue', 'total_payout', 'roi', 'roas', 'total_likes', 'total_comments', 'total_reach']]

def aggregate_data_by_date(tracking_data_df, payouts_df):
    """Aggregates revenue, orders, and payout by date."""
    # Convert date columns to datetime objects
    tracking_data_df['date'] = pd.to_datetime(tracking_data_df['date'])
    payouts_df['date'] = pd.to_datetime(payouts_df['date'])

    # Aggregate tracking data
    daily_tracking = tracking_data_df.groupby('date').agg(
        revenue=('revenue', 'sum'),
        orders=('orders', 'sum')
    ).reset_index()

    # Aggregate payouts data
    daily_payouts = payouts_df.groupby('date').agg(
        payout=('total_payout', 'sum')
    ).reset_index()

    # Merge daily data
    daily_summary = pd.merge(daily_tracking, daily_payouts, on='date', how='outer').fillna(0)
    daily_summary = daily_summary.sort_values(by='date').reset_index(drop=True)
    daily_summary['date'] = daily_summary['date'].dt.strftime('%Y-%m-%d') # Convert back to string for display

    return daily_summary

# --- Main Dashboard Logic ---

def run_dashboard():
    influencers_df, posts_df, tracking_data_df, payouts_df = simulate_data()

    print("\n--- Overall Campaign Performance ---")
    total_revenue, total_payout, roi, roas = calculate_kpis(tracking_data_df, payouts_df)
    print(f"Total Revenue: ${total_revenue:,.2f}")
    print(f"Total Payout: ${total_payout:,.2f}")
    print(f"ROI: {roi:,.2f}%")
    print(f"ROAS: {roas:,.2f}x")

    print("\n--- Data Over Time ---")
    daily_data = aggregate_data_by_date(tracking_data_df, payouts_df)
    print(daily_data.head()) # Show first few rows of daily data

    print("\n--- Insights: Top Influencers by Revenue (Overall) ---")
    top_revenue_influencers = get_top_influencers(influencers_df, tracking_data_df, payouts_df, posts_df, metric='total_revenue', count=5)
    print(top_revenue_influencers.to_string())

    print("\n--- Insights: Top Influencers by ROI (Overall) ---")
    top_roi_influencers = get_top_influencers(influencers_df, tracking_data_df, payouts_df, posts_df, metric='roi', count=5)
    print(top_roi_influencers.to_string())

    print("\n--- Insights: Top Influencers by Engagement (Likes) (Overall) ---")
    top_engagement_influencers = get_top_influencers(influencers_df, tracking_data_df, payouts_df, posts_df, metric='total_likes', count=5)
    print(top_engagement_influencers.to_string())

    # --- Filtering Example ---
    print("\n--- Filtering Example: Instagram, Sports with a ball ---")
    filtered_inf, filtered_posts, filtered_tracking, filtered_payouts = filter_data(
        influencers_df, posts_df, tracking_data_df, payouts_df, platform='Instagram', category='Sports with a ball'
    )
    filtered_total_revenue, filtered_total_payout, filtered_roi, filtered_roas = calculate_kpis(filtered_tracking, filtered_payouts)

    print(f"Filtered Total Revenue: ${filtered_total_revenue:,.2f}")
    print(f"Filtered Total Payout: ${filtered_total_payout:,.2f}")
    print(f"Filtered ROI: {filtered_roi:,.2f}%")
    print(f"Filtered ROAS: {filtered_roas:,.2f}x")

    print("\n--- Top Influencers by Revenue (Filtered: Instagram, Sports with a ball) ---")
    filtered_top_revenue_influencers = get_top_influencers(filtered_inf, filtered_tracking, filtered_payouts, filtered_posts, metric='total_revenue', count=3)
    print(filtered_top_revenue_influencers.to_string())

# Run the dashboard
if __name__ == "__main__":
    run_dashboard()

Generating simulated data...
Simulated data generation complete.

--- Overall Campaign Performance ---
Total Revenue: $574,610.42
Total Payout: $32,846.20
ROI: 1,649.40%
ROAS: 17.49x

--- Data Over Time ---
         date   revenue  orders   payout
0  2023-01-25  39535.61     417  1453.37
1  2023-02-09  20464.00     585   520.32
2  2023-02-13   5148.08     234   542.88
3  2023-02-18   9780.30     252   525.08
4  2023-02-23  32823.42     370  1328.83

--- Insights: Top Influencers by Revenue (Overall) ---
                name   platform            category  total_revenue  total_revenue  total_payout          roi       roas  total_likes  total_comments  total_reach
7          PewDiePie    YouTube           Animation      108188.93      108188.93       5208.06  1977.336475  20.773365      2211403           67611     75878830
0  Cristiano Ronaldo  Instagram  Sports with a ball       91926.45       91926.45       4643.23  1879.795315  19.797953      7034321          203985    232074802
4    