In [7]:
import pandas as pd
import re
from pathlib import Path

# Directory holding the per-episode Reddit exports (relative to the
# notebook's working directory).
data_dir = Path('data')

# Materialize the glob, then order it by path. The ordering is lexicographic
# (reddit_episode_10.csv sorts before reddit_episode_2.csv), so any consumer
# that needs numeric episode order must sort on the parsed episode number.
episode_files = list(data_dir.glob('reddit_episode_*.csv'))
episode_files.sort()

In [8]:
# Build one summary row per episode CSV, then assemble the rows into a table.
#
# The column list is passed to the DataFrame constructor explicitly so that
# the table keeps its schema even when no episode file is found; without it
# an empty result has no 'episode_number' column and sort_values() raises
# KeyError.
stat_columns = [
    'episode_number',
    'num_posts',
    'num_comments',
    'score',
    'avg_comments_per_post',
]
episode_stats = []

for file_path in episode_files:
    # Extract episode number from filename (e.g., reddit_episode_1.csv -> 1)
    match = re.search(r'reddit_episode_(\d+)\.csv', file_path.name)
    if match is None:
        # The glob pattern is looser than this regex (it would also accept
        # e.g. reddit_episode_final.csv); skip files without a numeric id.
        continue

    episode_number = int(match.group(1))
    df = pd.read_csv(file_path)

    # Each CSV row is counted as one post.
    num_posts = len(df)
    # A file lacking a column contributes 0 for that metric instead of failing.
    num_comments = df['num_comments'].sum() if 'num_comments' in df.columns else 0
    score = df['score'].sum() if 'score' in df.columns else 0
    # A file with no rows would otherwise divide by zero; report 0 instead.
    avg_comments_per_post = round(num_comments / num_posts, 2) if num_posts != 0 else 0

    episode_stats.append({
        'episode_number': episode_number,
        'num_posts': num_posts,
        'num_comments': num_comments,
        'score': score,
        'avg_comments_per_post': avg_comments_per_post
    })

episode_analytics = pd.DataFrame(episode_stats, columns=stat_columns)
# The input file list is not in numeric episode order, so order by the parsed
# number here.
episode_analytics = episode_analytics.sort_values('episode_number').reset_index(drop=True)

episode_analytics



Unnamed: 0,episode_number,num_posts,num_comments,score,avg_comments_per_post
0,1,1,282,39,282.0
1,2,1,260,15,260.0
2,3,3,320,248,106.67
3,4,9,511,348,56.78
4,5,2,296,59,148.0
5,6,3,453,50,151.0
6,7,5,583,677,116.6
7,8,3,423,119,141.0
8,9,1,366,21,366.0
9,10,1,418,17,418.0
