In [None]:
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import utils

In [None]:
# Load the input table that the fitting cell below groups by (book_id, pool_order).
# NOTE(review): the file name is spelled "spgc" here, while the variable and the
# output file written later use "sgpc" -- confirm which spelling is intended.
coco_results_path = 'results/spgc_coco_results.csv'
sgpc_results = pd.read_csv(coco_results_path)

In [None]:
# Accumulator: one dict of fit results per (book_id, pool_order) group
results = []

# Split the table into one sub-frame per (book_id, pool_order) pair
group_keys = ['book_id', 'pool_order']
grouped = sgpc_results.groupby(group_keys)

# Function to calculate WSSR
def calculate_wssr(y_obs, y_pred, weights=None):
    """
    Weighted Sum of Squared Residuals.

    WSSR = Σ w_i * (y_observed_i - y_predicted_i)^2

    Parameters
    ----------
    y_obs : numpy array
        Observed values.
    y_pred : numpy array
        Model predictions, broadcastable against ``y_obs``.
    weights : array-like, optional
        Per-point weights ``w_i``, broadcastable against the residuals.
        When omitted (the default) every point gets the same unit weight,
        so the result is the plain sum of squared residuals.

    Returns
    -------
    The (weighted) sum of the squared residuals as returned by ``np.sum``.
    """
    squared_residuals = (y_obs - y_pred)**2
    if weights is not None:
        # Scale each squared residual by its weight before summing
        squared_residuals = np.asarray(weights) * squared_residuals
    return np.sum(squared_residuals)

# Per-model fit configuration: (column prefix, model function, parameter names, bounds).
# Each bounds entry is a (lower, upper) pair with one value per model parameter:
#   - power law: first parameter unconstrained, second parameter capped at 0
#   - stretched exponential: first parameter >= 0, second within [0, 1], third unconstrained
_MODEL_SPECS = (
    ('pl', utils.power_law, ('a', 'b'),
     ([-np.inf, -np.inf], [np.inf, 0])),
    ('se', utils.stretched_exponential, ('a', 'b', 'c'),
     ([0, 0, -np.inf], [np.inf, 1, np.inf])),
)


def _fit_model(prefix, model, param_names, bounds, lags, values):
    """
    Fit ``model`` to ``(lags, values)`` with ``curve_fit`` and report the outcome.

    Parameters
    ----------
    prefix : str
        Column prefix for this model (e.g. ``'pl'``).
    model : callable
        Model function called as ``model(lags, *params)``.
    param_names : sequence of str
        One name per fitted parameter, in the order ``curve_fit`` returns them.
    bounds : tuple
        ``(lower, upper)`` parameter bounds forwarded to ``curve_fit``.
    lags, values : numpy arrays
        Independent and dependent data for the fit.

    Returns
    -------
    dict
        Keys ``'<prefix>_<param>'`` for every parameter plus ``'<prefix>_wssr'``.
        All values are NaN when the fit did not succeed, so every row carries the
        same keys and the resulting DataFrame always has the same columns.
    """
    param_columns = [f'{prefix}_{name}' for name in param_names]
    wssr_column = f'{prefix}_wssr'

    # Start from "failed": a NaN WSSR is what the summary cell counts as non-convergence
    fitted = dict.fromkeys(param_columns + [wssr_column], np.nan)

    try:
        popt, _ = curve_fit(model, lags, values, bounds=bounds, maxfev=5000)
    except (RuntimeError, ValueError):
        # RuntimeError: the minimisation failed.
        # ValueError: curve_fit rejected the input (e.g. non-finite data or residuals).
        # Either way the group is recorded as a failed fit instead of aborting the loop.
        return fitted

    fitted.update(zip(param_columns, popt))
    fitted[wssr_column] = calculate_wssr(values, model(lags, *popt))
    return fitted


# Process each group
for (book_id, pool_order), group in grouped:
    lags = group['lag'].values
    # Fits are done on the absolute value of coco_value
    abs_coco = np.abs(group['coco_value'].values)

    result = {'book_id': book_id, 'pool_order': pool_order}
    for prefix, model, param_names, bounds in _MODEL_SPECS:
        result.update(_fit_model(prefix, model, param_names, bounds, lags, abs_coco))

    results.append(result)

# Convert results to DataFrame
fit_results = pd.DataFrame(results)

fit_results.head()

In [None]:
# Persist the per-group fit table (row index omitted) so it can be reloaded later
fit_results_path = 'results/sgpc_fit_results.csv'
fit_results.to_csv(fit_results_path, index=False)

In [None]:
# Reload the saved fit table; the cells below work from this on-disk copy
saved_fit_results_path = 'results/sgpc_fit_results.csv'
fit_results = pd.read_csv(saved_fit_results_path)

In [None]:
# Summarise fit quality per pool_order in a single aggregation pass.
#   total_count        -> number of rows (fits attempted) in the group
#   avg_*_wssr         -> mean WSSR; NaN entries (failed fits) are skipped by 'mean'
#   *_success_count    -> non-null WSSR values, i.e. fits that converged
summary = (
    fit_results
    .groupby('pool_order')
    .agg(
        total_count=('pl_wssr', 'size'),
        avg_pl_wssr=('pl_wssr', 'mean'),
        pl_success_count=('pl_wssr', 'count'),
        avg_se_wssr=('se_wssr', 'mean'),
        se_success_count=('se_wssr', 'count'),
    )
    .reset_index()
    # Failed fits are the rows whose WSSR is missing; express them as a count and a percentage
    .assign(
        pl_failed_count=lambda df: df['total_count'] - df['pl_success_count'],
        se_failed_count=lambda df: df['total_count'] - df['se_success_count'],
        pl_failed_percentage=lambda df: (df['pl_failed_count'] / df['total_count']) * 100,
        se_failed_percentage=lambda df: (df['se_failed_count'] / df['total_count']) * 100,
    )
)

# Reorder columns for better readability
summary = summary[['pool_order', 'total_count',
                   'avg_pl_wssr', 'pl_success_count', 'pl_failed_count', 'pl_failed_percentage',
                   'avg_se_wssr', 'se_success_count', 'se_failed_count', 'se_failed_percentage']]

summary

In [None]:
# Export only the failure rates and mean WSSR per pool order as a LaTeX table
latex_columns = [
    'pool_order',
    'pl_failed_percentage',
    'avg_pl_wssr',
    'se_failed_percentage',
    'avg_se_wssr',
]
summary_latex = summary.loc[:, latex_columns].to_latex()
print(summary_latex)