# Parquet Data Analysis Template
Interactive analysis of Parquet files from the prediction service.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide display defaults.
# pandas: never truncate the column list, and allow wide (1000-char) output.
pd.options.display.max_columns = None
pd.options.display.width = 1000
# matplotlib / seaborn: default figure size of 12x6 and a white grid theme.
plt.rc('figure', figsize=(12, 6))
sns.set_style('whitegrid')

In [None]:
# Load prediction results.
# Reads the parquet file at the relative path below into the `predictions`
# DataFrame, which the later cells in this notebook build on.
predictions = pd.read_parquet('predictions/alpha.parquet')
# Quick sanity check: report the row count, then let info() print the column
# names, dtypes, non-null counts and memory usage.
print(f"Loaded {len(predictions)} predictions")
predictions.info()

In [None]:
# Basic exploration: preview the first ten rows of the predictions frame.
print("First 10 rows:")
# Left as the cell's final expression so the notebook renders the table.
predictions.head(10)

In [None]:
# Summary statistics for the predictions frame (pandas describe(); by default
# this covers the numeric columns when any are present).
# Left as the cell's final expression so the notebook renders the table.
predictions.describe()

In [None]:
# Fund-level analysis: descriptive statistics of the ensemble prediction for
# each class_id (presumably one class_id per fund / share class -- confirm).
summary_stats = ['count', 'mean', 'std', 'min', 'max']
by_class = predictions.groupby('class_id')

# The dict form of agg() yields two-level columns:
# ('ensemble_prediction', <statistic>). Values are rounded to 4 decimals.
fund_summary = by_class.agg({'ensemble_prediction': summary_stats}).round(4)

print("Fund-level statistics:")
fund_summary

In [None]:
# Visualization: distribution of predictions.
# Two side-by-side panels on one figure: an overall histogram (left) and a
# per-class_id box plot (right).
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of every ensemble prediction.
hist_ax.hist(predictions['ensemble_prediction'], bins=30, alpha=0.7, color='skyblue')
hist_ax.set_title('Distribution of Alpha Predictions')
hist_ax.set_xlabel('Alpha Prediction')
hist_ax.set_ylabel('Frequency')

# Right panel: one box per class_id, drawn by pandas onto the second axes.
predictions.boxplot(column='ensemble_prediction', by='class_id', ax=box_ax)
box_ax.set_title('Predictions by Fund')
# pandas adds its own "grouped by" figure title for `by=` plots; blank it out.
fig.suptitle('')
# Tilt the x tick labels of the box plot so they do not overlap.
plt.setp(box_ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Load ETL data for comparison.
# Reads the parquet file at the relative path below into `etl_data`, which the
# merge cell joins onto the predictions.
etl_data = pd.read_parquet('data/pilot_fact_class_month.parquet')
# Quick sanity check: report the row count, then let info() print the column
# names, dtypes, non-null counts and memory usage.
print(f"Loaded {len(etl_data)} ETL records")
etl_data.info()

In [None]:
# Merge predictions with ETL data for deeper analysis.
# Left join: every prediction row is kept, and ETL columns are filled in where
# a row with the same (class_id, month_end) exists. If etl_data holds several
# rows for one key, the matching prediction row is repeated, so the printed
# count can exceed the number of predictions.
join_keys = ['class_id', 'month_end']
merged = predictions.merge(etl_data, how='left', on=join_keys)
print(f"Merged dataset: {len(merged)} records")
merged.head()

In [None]:
# Time series analysis: one line per class_id, drawn only when the data holds
# more than one distinct month_end (a single period has nothing to trend).
if predictions['month_end'].nunique() > 1:
    plt.figure(figsize=(14, 6))

    # A single groupby pass hands back each fund's rows, instead of re-scanning
    # the whole frame with a boolean mask once per fund. sort=False yields the
    # funds in order of first appearance -- the same order unique() gives -- so
    # line colours and legend order are unchanged. Rows with a missing class_id
    # are skipped by groupby rather than adding an empty 'nan' legend entry.
    for fund_id, fund_data in predictions.groupby('class_id', sort=False):
        # Order each fund's points by month_end so the line runs left to right.
        fund_data = fund_data.sort_values('month_end')
        plt.plot(fund_data['month_end'], fund_data['ensemble_prediction'],
                 marker='o', label=fund_id)

    plt.title('Alpha Predictions Over Time')
    plt.xlabel('Date')
    plt.ylabel('Alpha Prediction')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("Single time period - no time series analysis")

In [None]:
# Export summary to Excel for sharing.
# Each entry is (sheet name, frame, whether to write the frame's index).
# The raw predictions drop their index; the fund summary keeps it because the
# index holds the class_id labels.
sheets = (
    ('Raw_Predictions', predictions, False),
    ('Fund_Summary', fund_summary, True),
)

# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter('predictions/prediction_summary.xlsx', engine='openpyxl') as writer:
    for sheet_name, frame, keep_index in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=keep_index)

print("📊 Summary exported to: predictions/prediction_summary.xlsx")