In [25]:
import math
import re
from io import StringIO

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Module-level accumulator: run_pca() appends one row per processed file.
# Starts empty, with one column per reported weight / average.
pca_evaluation = pd.DataFrame({
    column_name: []
    for column_name in (
        'Time Frame',
        'View Weight',
        'Ad Weight',
        'Session Weight',
        'Average View Totals',
        'Average Ad Totals',
        'Average Session Totals',
    )
})

def run_pca(csv_file_pathway):
    """Fit a one-component PCA on one export file and record its weights.

    The file is read as UTF-8 text. Its first line is kept as a title, the
    next two lines are skipped, and everything from the fourth line on is
    parsed as a tab-delimited table. The three "--All Users" metric columns
    are stripped of commas/quotes, converted to numbers, divided by their
    column mean, min-max scaled, and fed to a single-component PCA. The
    component loadings (normalised so they sum to 1) and the per-column
    means are appended as one row to the module-level ``pca_evaluation``
    DataFrame.

    Args:
        csv_file_pathway: Path of the export file to process.

    Returns:
        None. The result row is appended to the global ``pca_evaluation``.

    Raises:
        ValueError: If the file is empty, a required column is missing, no
            usable rows remain after cleaning or scoring, or a metric value
            is negative.
    """
    # Number of leading title characters dropped to obtain the 'Time Frame'
    # label. NOTE(review): presumably the length of a fixed title prefix
    # such as "Raw Content Score A3 Variables " -- confirm against the files.
    title_prefix_length = 31

    # Get file name (the title stored on the first line of the file)
    with open(csv_file_pathway, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if not lines:
        raise ValueError(f"File is empty: {csv_file_pathway}")
    file_name = lines[0].strip().strip('"')

    # Load the data, skipping the first three rows. readlines() keeps each
    # line terminator, so join with '' -- joining with '\n' would insert a
    # blank line after every row.
    csv_content = ''.join(lines[3:])
    df = pd.read_csv(StringIO(csv_content), delimiter='\t')

    # Clean up columns: trim whitespace, drop quotes, drop trailing
    # commas/whitespace
    df.columns = (
        df.columns
        .str.strip()
        .str.replace(r'["\']', '', regex=True)
        .str.replace(r'[,\s]+$', '', regex=True)
    )

    # Define columns
    required_cols = [
        "CMS Url",
        "View Quantity--All Users",
        "Watched Ads--All Users",
        "Session Quantity--All Users"
    ]

    numeric_cols = [
        "View Quantity--All Users",
        "Watched Ads--All Users",
        "Session Quantity--All Users"
    ]

    # Score columns, in the same order as numeric_cols
    score_cols = ['view_score', 'ad_score', 'session_score']

    # Check for missing columns
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in data: {missing_cols}")

    # Filter rows with required columns not null. Copy so the column
    # assignments below write to an independent frame, not a slice of df.
    df_clean = df.dropna(subset=required_cols).copy()
    if df_clean.empty:
        raise ValueError("No rows left after dropping rows with missing required values.")

    # Convert numeric columns to numbers
    for col in numeric_cols:
        df_clean[col] = (
            df_clean[col]
            .astype(str)
            .str.replace(r'[,"\']', '', regex=True)  # Remove commas, quotes
            .str.strip()
        )

        # Anything that still fails to parse becomes NaN and is dropped below
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    # Drop rows with any NaNs in numeric columns
    df_clean = df_clean.dropna(subset=numeric_cols).copy()
    if df_clean.empty:
        raise ValueError("No rows left after converting numeric columns and dropping NaNs.")

    # Reject negative values. Raised explicitly rather than asserted so the
    # check still runs when Python is started with -O.
    if not (df_clean[numeric_cols] >= 0).all().all():
        raise ValueError("All scores should be positive")

    # Calculate scores: each value divided by its column mean. A column
    # that sums to zero yields 0/0 = NaN for every row.
    for score_col, source_col in zip(score_cols, numeric_cols):
        df_clean[score_col] = df_clean[source_col] / (
            df_clean[source_col].sum() / len(df_clean)
        )

    # Drop rows with any NaNs in score columns
    df_clean = df_clean.dropna(subset=score_cols).copy()
    if df_clean.empty:
        raise ValueError("No rows left after computing scores and dropping NaNs.")
    if not (df_clean[score_cols] >= 0).all().all():
        raise ValueError("All scores should be positive")

    # Min-max scale each score column before PCA
    scaler = MinMaxScaler()
    X = scaler.fit_transform(df_clean[score_cols])

    # Apply PCA; fit_transform returns an (n_rows, 1) array, so take its
    # only column
    pca = PCA(n_components=1)
    df_clean['pca_score_raw'] = pca.fit_transform(X)[:, 0]

    # Rescale the PCA output so its mean absolute value is 1.
    # NOTE(review): content_score is computed on the local frame only; this
    # function neither returns nor saves it.
    mean_score = abs(df_clean['pca_score_raw']).mean()
    df_clean['content_score'] = df_clean['pca_score_raw'] / mean_score

    # Extract component weights, normalised to sum to 1.
    # NOTE(review): if the loadings have mixed signs their sum can be close
    # to zero, which would make these ratios unstable.
    loadings = pca.components_[0]
    weights_sum = loadings.sum()
    view_weight = loadings[0] / weights_sum
    ad_weight = loadings[1] / weights_sum
    session_weight = loadings[2] / weights_sum

    weights_dict = {
        'Time Frame': file_name[title_prefix_length:],
        'View Weight': view_weight,
        'Ad Weight': ad_weight,
        'Session Weight': session_weight,
        'Average View Totals': df_clean["View Quantity--All Users"].sum() / len(df_clean),
        'Average Ad Totals': df_clean["Watched Ads--All Users"].sum() / len(df_clean),
        'Average Session Totals': df_clean["Session Quantity--All Users"].sum() / len(df_clean)
    }

    # Append the row to the DataFrame
    global pca_evaluation
    pca_evaluation = pd.concat([pca_evaluation, pd.DataFrame([weights_dict])], ignore_index=True)
    
# Time ranges to process, as (year, first month, last month), both months
# inclusive.
reporting_periods = (
    (2023, 12, 12),  # 2023-12 to 2023-12
    (2024, 1, 6),    # 2024-01 to 2024-06 (2024-07 to 2024-12 unavailable due to broken 'ad-started' event)
    (2025, 1, 5),    # 2025-01 to 2025-05
)

# Expand every period into its individual (year, month) pairs, in order.
year_month_pairs = [
    (year, month)
    for year, first_month, last_month in reporting_periods
    for month in range(first_month, last_month + 1)
]

# Run the PCA once per monthly export; each call appends a row to
# pca_evaluation.
for year, month in year_month_pairs:
    month_str = f"{month:02d}"  # zero-padded month, e.g. 1 -> "01"
    run_pca(f'/Users/parker.pape/Projects/Content Score A3/Data Table - Raw Content Score A3 Variables {year}-{month_str}.csv')

# Save the accumulated weights table, then show it.
pca_evaluation.to_csv("/Users/parker.pape/Projects/Content Score A3/Article PCA A3 Output.csv", index=False)

print(pca_evaluation)

   Time Frame  View Weight  Ad Weight  Session Weight  Average View Totals  \
0     2023-12     0.347424   0.299590        0.352987         27737.437870   
1     2024-01     0.323329   0.351574        0.325098         56731.661290   
2     2024-02     0.331374   0.333300        0.335326         33463.174825   
3     2024-03     0.355507   0.297301        0.347192         23479.093525   
4     2024-04     0.350726   0.294448        0.354826         42254.903846   
5     2024-05     0.333625   0.328608        0.337767         31663.558282   
6     2024-06     0.343576   0.312091        0.344334         43897.682432   
7     2025-01     0.339495   0.317895        0.342610         64024.989071   
8     2025-02     0.356019   0.303241        0.340740         41244.128834   
9     2025-03     0.337023   0.324257        0.338720         42813.913043   
10    2025-04     0.351962   0.289687        0.358350         39549.410448   
11    2025-05     0.353158   0.286192        0.360650         41