In [7]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import math
from io import StringIO
import re

# Accumulator for per-file PCA results: starts as an empty frame that carries
# only the output column names, in the order they are written to the CSV.
pca_evaluation = pd.DataFrame({
    column: []
    for column in (
        'Time Frame',
        'View Weight',
        'Time Weight',
        'Session Weight',
        'Average View Totals',
        'Average Time Per View',
        'Average Session Totals',
    )
})

def run_pca(csv_file_pathway):
    """Fit a one-component PCA on one export file and record its weights.

    The file is read as text: its first line is taken as a title, the first
    three lines are skipped, and the remainder is parsed as a tab-delimited
    table.  The three numeric columns are each divided by their column mean,
    standardized, and reduced to a single principal component.  The absolute
    component loadings, normalized to sum to 1, are appended together with
    the column averages as one new row of the module-level ``pca_evaluation``
    DataFrame.

    Args:
        csv_file_pathway: Path to the tab-delimited export file.

    Returns:
        None.  The result is appended to the global ``pca_evaluation``.

    Raises:
        ValueError: If the file has fewer than four lines, a required column
            is missing, no usable rows remain after cleaning, a numeric
            column contains negative values, or a numeric column sums to
            zero (so relative scores cannot be computed).
    """
    global pca_evaluation

    # Source column -> name of the relative score derived from it.
    score_by_source = {
        "View Quantity--All Users": "view_score",
        "Time Spent per View--All Users": "time_score",
        "Session Quantity--All Users": "session_score",
    }
    numeric_cols = list(score_by_source)
    score_cols = list(score_by_source.values())
    required_cols = ["CMS Url", *numeric_cols]

    with open(csv_file_pathway, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if len(lines) < 4:
        raise ValueError(
            f"Expected a title line, two more preamble lines and a table in "
            f"{csv_file_pathway!r}, but found only {len(lines)} line(s)."
        )
    file_name = lines[0].strip().strip('"')

    # readlines() keeps each line's terminator, so join with '' rather than
    # '\n' to avoid inserting a blank line between every pair of rows.
    df = pd.read_csv(StringIO(''.join(lines[3:])), delimiter='\t')

    # Strip surrounding whitespace, quotes and trailing commas from headers.
    df.columns = (
        df.columns
        .str.strip()
        .str.replace(r'["\']', '', regex=True)
        .str.replace(r'[,\s]+$', '', regex=True)
    )

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in data: {missing_cols}")

    # Work on an explicit copy so the column assignments below never write
    # into a view of ``df`` (avoids SettingWithCopyWarning).
    df_clean = df.dropna(subset=required_cols).copy()
    if df_clean.empty:
        raise ValueError("No rows left after dropping rows with missing required values.")

    # Values may arrive as text such as "1,234"; remove commas and quotes,
    # then coerce anything still unparseable to NaN.
    for col in numeric_cols:
        as_text = (
            df_clean[col]
            .astype(str)
            .str.replace(r'[,"\']', '', regex=True)
            .str.strip()
        )
        df_clean[col] = pd.to_numeric(as_text, errors='coerce')

    df_clean = df_clean.dropna(subset=numeric_cols)
    if df_clean.empty:
        raise ValueError("No rows left after converting numeric columns and dropping NaNs.")

    # Raise instead of assert: asserts are stripped when Python runs with -O.
    if (df_clean[numeric_cols] < 0).any().any():
        raise ValueError("All numeric columns should be non-negative")

    # Relative score = value / column mean, so an average row scores 1.
    row_count = len(df_clean)
    for source_col, score_col in score_by_source.items():
        column_mean = df_clean[source_col].sum() / row_count
        if column_mean == 0:
            # Every value is zero (negatives were rejected above); dividing
            # would turn the whole column into NaN and leave no rows.
            raise ValueError(
                f"Column {source_col!r} sums to zero; cannot compute relative scores."
            )
        df_clean[score_col] = df_clean[source_col] / column_mean

    # NaN scores can still appear for non-finite inputs (e.g. inf / inf).
    df_clean = df_clean.dropna(subset=score_cols).copy()
    if df_clean.empty:
        raise ValueError("No rows left after computing scores and dropping NaNs.")
    if (df_clean[score_cols] < 0).any().any():
        raise ValueError("All scores should be positive")

    # Standardize the scores, then project onto the first principal component.
    X = StandardScaler().fit_transform(df_clean[score_cols])
    pca = PCA(n_components=1)
    df_clean['pca_score_raw'] = pca.fit_transform(X)[:, 0]

    # Rescale so the mean absolute PCA score is 1.
    # NOTE: these per-row scores live only on the local frame; they are not
    # returned or written out by this function.
    mean_score = abs(df_clean['pca_score_raw']).mean()
    df_clean['content_score'] = df_clean['pca_score_raw'] / mean_score

    # Absolute loadings normalized to sum to 1, in score_cols order.
    loadings = abs(pca.components_[0])
    view_weight, time_weight, session_weight = loadings / loadings.sum()

    final_count = len(df_clean)
    weights_dict = {
        # Assumes the title line carries a fixed 31-character prefix before
        # the time frame -- TODO confirm against the export format.
        'Time Frame': file_name[31:],
        'View Weight': view_weight,
        'Time Weight': time_weight,
        'Session Weight': session_weight,
        'Average View Totals': df_clean["View Quantity--All Users"].sum() / final_count,
        'Average Time Per View': df_clean["Time Spent per View--All Users"].sum() / final_count,
        'Average Session Totals': df_clean["Session Quantity--All Users"].sum() / final_count,
    }

    # Leave empty frames out of the concat: pandas has deprecated letting
    # empty entries be silently ignored when inferring the result dtypes.
    new_row = pd.DataFrame([weights_dict])
    frames = [frame for frame in (pca_evaluation, new_row) if not frame.empty]
    pca_evaluation = pd.concat(frames, ignore_index=True)
    
# Expand the inclusive month ranges into a flat list of (year, month) pairs.
month_ranges = (
    (2023, 12, 12),  # 2023-12 to 2023-12
    (2024, 1, 12),   # 2024-01 to 2024-12
    (2025, 1, 5),    # 2025-01 to 2025-05
)  # Choose time range
periods = [
    (year, month)
    for year, first_month, last_month in month_ranges
    for month in range(first_month, last_month + 1)
]

# Process each monthly export; the month is zero-padded to two digits in the file name.
for year, month in periods:
    month_str = f"{month:02d}"
    run_pca(f'/Users/parker.pape/Projects/Content Score A2/Data Table - Raw Content Score A2 Variables {year}-{month_str}.csv')

# Write the accumulated results to disk without the index column, then display them.
pca_evaluation.to_csv("/Users/parker.pape/Projects/Content Score A2/Article PCA A2 Output.csv", index=False)

print(pca_evaluation)

   Time Frame  View Weight  Time Weight  Session Weight  Average View Totals  \
0     2023-12     0.477652     0.044679        0.477669         11926.675039   
1     2024-01     0.484807     0.030384        0.484808         21184.437819   
2     2024-02     0.483709     0.032577        0.483714          9267.572848   
3     2024-03     0.476485     0.047005        0.476510          7802.551661   
4     2024-04     0.485098     0.029782        0.485120         16102.551363   
5     2024-05     0.472554     0.054830        0.472616         14582.269618   
6     2024-06     0.475428     0.049135        0.475436         18323.387168   
7     2024-07     0.484186     0.031612        0.484201         30492.968750   
8     2024-08     0.494932     0.010132        0.494936         29561.986425   
9     2024-09     0.473465     0.053087        0.473448         63357.560197   
10    2024-10     0.498959     0.002083        0.498959         70499.257353   
11    2024-11     0.440126     0.119705 