In [6]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import math
from io import StringIO
import re

# Module-level accumulator: run_pca() appends one row per processed file,
# holding the time frame, the three PCA weights and the three column averages.
pca_evaluation = pd.DataFrame({
    column_name: []
    for column_name in (
        'Time Frame',
        'View Weight',
        'Time Weight',
        'Session Weight',
        'Average View Totals',
        'Average Time Per View',
        'Average Session Totals',
    )
})

def run_pca(csv_file_pathway) -> None:
    """Fit a one-component PCA on one export file and record its weights.

    The file is read as UTF-8 text. Its first line (stripped of surrounding
    whitespace and double quotes) is treated as a title; the first three lines
    are skipped and the remainder is parsed as a tab-separated table.

    Rows with a missing required value or a non-numeric metric are dropped.
    Each metric is divided by its column average, the three resulting scores
    are standardized, and a single-component PCA is fitted on them. The
    component loadings, normalized to sum to 1, are appended together with the
    column averages as one new row of the module-level ``pca_evaluation``
    DataFrame.

    Args:
        csv_file_pathway: Path of the export file to process.

    Returns:
        None. The result is appended to the global ``pca_evaluation``.

    Raises:
        ValueError: If the file is empty, has no parsable table (pandas'
            ``EmptyDataError`` is a ``ValueError``), lacks a required column,
            has no usable rows left after cleaning, or contains a negative
            metric value.
    """
    header_rows = 3        # title line plus two more lines precede the table
    title_prefix_len = 31  # title characters before the part stored as 'Time Frame'

    # Get file name
    with open(csv_file_pathway, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if not lines:
        raise ValueError(f"File is empty: {csv_file_pathway}")
    file_name = lines[0].strip().strip('"')

    # Load the data. readlines() keeps each line's terminator, so the lines
    # are joined with '' (joining with '\n' would double every line break).
    csv_content = ''.join(lines[header_rows:])
    df = pd.read_csv(StringIO(csv_content), delimiter='\t')

    # Clean up columns: trim whitespace, drop quotes, drop trailing commas/space
    df.columns = (
        df.columns
        .str.strip()
        .str.replace(r'["\']', '', regex=True)
        .str.replace(r'[,\s]+$', '', regex=True)
    )

    # Define columns
    views_col = "[Custom] Article Views--All Users"
    time_col = "Time Spent per Article--All Users"
    sessions_col = "[Amplitude] Session Totals--All Users"
    numeric_cols = [views_col, time_col, sessions_col]
    required_cols = ["CMS Url"] + numeric_cols
    score_sources = [
        ("view_score", views_col),
        ("time_score", time_col),
        ("session_score", sessions_col),
    ]
    score_cols = [score_col for score_col, _ in score_sources]

    # Check for missing columns
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in data: {missing_cols}")

    # Filter rows with required columns not null. The explicit copy makes the
    # column assignments below operate on an independent frame.
    df_clean = df.dropna(subset=required_cols).copy()
    if df_clean.empty:
        raise ValueError("No rows left after dropping rows with missing required values.")

    # Convert numeric columns to numbers
    for col in numeric_cols:
        as_text = (
            df_clean[col]
            .astype(str)
            .str.replace(r'[,"\']', '', regex=True)  # Remove commas, quotes
            .str.strip()
        )
        # Anything that still is not a number becomes NaN and is dropped below.
        df_clean[col] = pd.to_numeric(as_text, errors='coerce')

    # Drop rows with any NaNs in numeric columns
    df_clean = df_clean.dropna(subset=numeric_cols).copy()
    if df_clean.empty:
        raise ValueError("No rows left after converting numeric columns and dropping NaNs.")

    # Validate with an explicit raise: an assert would vanish under ``-O``.
    if not (df_clean[numeric_cols] >= 0).all().all():
        raise ValueError("All numeric columns should be non-negative")

    # Calculate scores: each metric relative to its column average
    for score_col, source_col in score_sources:
        column_average = df_clean[source_col].sum() / len(df_clean)
        df_clean[score_col] = df_clean[source_col] / column_average

    # Drop rows with any NaNs in score columns (an all-zero column gives 0/0)
    df_clean = df_clean.dropna(subset=score_cols).copy()
    if df_clean.empty:
        raise ValueError("No rows left after computing scores and dropping NaNs.")
    if not (df_clean[score_cols] >= 0).all().all():
        raise ValueError("All scores should be non-negative")

    # Standardize scores
    scaler = StandardScaler()
    X = scaler.fit_transform(df_clean[score_cols])

    # Apply PCA
    pca = PCA(n_components=1)
    df_clean['pca_score_raw'] = pca.fit_transform(X)

    # Scale the raw PCA scores so that their mean absolute value is 1.
    # NOTE: these per-row scores only live on the local frame; they are not
    # written anywhere or returned.
    mean_score = abs(df_clean['pca_score_raw']).mean()
    df_clean['content_score'] = df_clean['pca_score_raw'] / mean_score

    # Extract component weights. Dividing by the sum makes the weights add up
    # to 1 and cancels the arbitrary overall sign of the component.
    # NOTE(review): if the loadings nearly cancel, the sum is close to zero
    # and the weights become unstable.
    loadings = pca.components_[0]
    weights_sum = loadings.sum()
    view_weight = loadings[0] / weights_sum
    time_weight = loadings[1] / weights_sum
    session_weight = loadings[2] / weights_sum

    row_count = len(df_clean)
    weights_dict = {
        'Time Frame': file_name[title_prefix_len:],
        'View Weight': view_weight,
        'Time Weight': time_weight,
        'Session Weight': session_weight,
        'Average View Totals': df_clean[views_col].sum() / row_count,
        'Average Time Per View': df_clean[time_col].sum() / row_count,
        'Average Session Totals': df_clean[sessions_col].sum() / row_count
    }

    # Append the row to the DataFrame
    global pca_evaluation
    pca_evaluation = pd.concat([pca_evaluation, pd.DataFrame([weights_dict])], ignore_index=True)
    
# Process every monthly export in chronological order, then save and show
# the accumulated weights table.
month_ranges = (
    (2023, range(6, 13)),   # 2023-06 to 2023-12
    (2024, range(1, 13)),   # 2024-01 to 2024-12
    (2025, range(1, 6)),    # 2025-01 to 2025-05
)  # Choose time range
for year, months in month_ranges:
    for month in months:
        month_str = f"{month:02d}"  # two-digit month, e.g. 6 -> "06"
        run_pca(f'/Users/parker.pape/Projects/Content Score A2/Data Table - Raw Content Score A2 Variables {year}-{month_str}.csv')

pca_evaluation.to_csv("/Users/parker.pape/Projects/Content Score A2/Article PCA A2 Output.csv", index=False)

print(pca_evaluation)

   Time Frame  View Weight  Time Weight  Session Weight  Average View Totals  \
0     2023-06     0.404477     0.189900        0.405623            103189.11   
1     2023-07     0.382863     0.233729        0.383409             51845.51   
2     2023-08     0.357363     0.286467        0.356170            169843.81   
3     2023-09     0.421533     0.152230        0.426237            125484.64   
4     2023-10     0.560952    -0.119646        0.558693             83373.21   
5     2023-10     0.532608    -0.064147        0.531538             43836.08   
6     2023-12     0.462173     0.075051        0.462777             67229.11   
7     2024-01     0.390331     0.219520        0.390149            115282.82   
8     2024-02     0.387500     0.235875        0.376626             51465.69   
9     2024-03     0.434893     0.123569        0.441538             38179.33   
10    2024-04     0.369034     0.261193        0.369774             71378.45   
11    2024-05     0.372336     0.252221 