In [None]:
# Import required libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter

# Notebook-wide display and plotting defaults
pd.options.display.max_columns = None   # no limit on the number of columns shown
pd.options.display.max_colwidth = 100   # truncate displayed cell text beyond 100 characters
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})  # default size for figures that do not set their own

## 1. Load the Data

In [None]:
# Load JSONL data: one JSON document per line of data.jsonl.
# A record shaped like {"data": {...}} is unwrapped to its payload; any other
# record is kept as-is.
data = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open('data.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) are not valid
            # JSON and would make json.loads raise, so skip them.
            continue
        entry = json.loads(line)
        # Extract the nested 'data' field
        if 'data' in entry:
            data.append(entry['data'])
        else:
            data.append(entry)

df = pd.DataFrame(data)
print(f"Total number of posts: {len(df)}")
print(f"Number of columns: {len(df.columns)}")

## 2. Data Structure Overview

In [None]:
# List every column name found in the loaded frame
print("Available columns:", df.columns.to_list(), sep="\n")

In [None]:
# Check data types and non-null counts for every column.
# info() prints its report itself, so no print()/display() wrapper is needed.
df.info()

In [None]:
# Preview the first ten posts, limited to whichever headline fields are present
key_columns = [
    'title', 'author', 'score', 'num_comments',
    'subreddit', 'created_utc', 'is_self', 'over_18',
]
available_cols = [name for name in key_columns if name in df.columns]
df.loc[:, available_cols].head(10)

## 3. Basic Statistics

In [None]:
# Descriptive statistics for the engagement metrics that exist in this dataset
numeric_cols = ['score', 'num_comments', 'ups', 'downs', 'upvote_ratio']
available_numeric = [name for name in numeric_cols if name in df.columns]
df.loc[:, available_numeric].describe()

In [None]:
# Count posts per subreddit (skipped when the column is absent)
if 'subreddit' in df.columns:
    subreddit_counts = df['subreddit'].value_counts()
    print("Subreddit distribution:", subreddit_counts, sep="\n")

## 4. Engagement Analysis

In [None]:
# Top posts by score.
# The guard only guarantees 'score' and 'title', so the optional columns
# ('num_comments', 'author') are selected only when present; indexing them
# unconditionally would raise KeyError on datasets that lack them.
if 'score' in df.columns and 'title' in df.columns:
    # Print the heading inside the guard so it never appears without its table.
    print("Top 10 posts by score:")
    top_post_cols = [
        col for col in ['title', 'score', 'num_comments', 'author']
        if col in df.columns
    ]
    top_posts = df.nlargest(10, 'score')[top_post_cols]
    display(top_posts)

In [None]:
# Score distribution: histogram and box plot side by side
if 'score' in df.columns:
    # Records that lack a 'score' key become NaN when the frame is built from
    # dicts. matplotlib's boxplot does not filter NaN (its quartiles become NaN
    # and the box is drawn empty), so plot only the observed values.
    scores = df['score'].dropna()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Histogram
    axes[0].hist(scores, bins=30, edgecolor='black', alpha=0.7)
    axes[0].set_xlabel('Score')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Distribution of Post Scores')

    # Box plot
    axes[1].boxplot(scores)
    axes[1].set_ylabel('Score')
    axes[1].set_title('Box Plot of Post Scores')

    plt.tight_layout()
    plt.show()

In [None]:
# Histogram of how many comments each post received
if 'num_comments' in df.columns:
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.hist(
        df['num_comments'],
        bins=30,
        color='coral',
        edgecolor='black',
        alpha=0.7,
    )
    ax.set(
        xlabel='Number of Comments',
        ylabel='Frequency',
        title='Distribution of Comments per Post',
    )
    plt.show()

In [None]:
# Scatter of score against comment count, annotated with their correlation
if {'score', 'num_comments'}.issubset(df.columns):
    score_values = df['score']
    comment_values = df['num_comments']

    # Correlation between the two series, shown in the annotation below
    correlation = score_values.corr(comment_values)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(score_values, comment_values, alpha=0.6)
    ax.set(
        xlabel='Score',
        ylabel='Number of Comments',
        title='Score vs Number of Comments',
    )

    # Place the annotation in axes coordinates so it stays in the top-left
    # corner regardless of the data range.
    annotation_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    ax.text(
        0.05, 0.95, f'Correlation: {correlation:.3f}',
        transform=ax.transAxes,
        fontsize=12,
        verticalalignment='top',
        bbox=annotation_box,
    )
    plt.show()

## 5. Author Analysis

In [None]:
# Rank authors by number of posts and chart the fifteen most prolific
if 'author' in df.columns:
    authors = df['author']
    author_counts = authors.value_counts().head(15)

    fig, ax = plt.subplots(figsize=(12, 6))
    author_counts.plot.barh(ax=ax, color='steelblue')
    ax.set(
        xlabel='Number of Posts',
        ylabel='Author',
        title='Top 15 Most Active Authors',
    )
    # barh draws the first row at the bottom; flip so the busiest author is on top
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

    print(f"\nTotal unique authors: {authors.nunique()}")

## 6. Content Type Analysis

In [None]:
# Self posts vs link posts
if 'is_self' in df.columns:
    self_counts = df['is_self'].value_counts()

    # value_counts() orders its result by frequency, not by value, so the
    # wedge order can be [True, False] or [False, True] (or a single entry).
    # Derive labels and colours from the index so each wedge is always named
    # correctly and the label count always matches the wedge count.
    label_for = {False: 'Link Posts', True: 'Self/Text Posts'}
    color_for = {False: '#66b3ff', True: '#ff9999'}
    labels = [label_for.get(flag, str(flag)) for flag in self_counts.index]
    colors = [color_for.get(flag, '#cccccc') for flag in self_counts.index]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(self_counts.values, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
    ax.set_title('Distribution of Post Types')
    plt.show()

In [None]:
# Domain analysis for link posts
if 'domain' in df.columns:
    # Restrict to link posts when the post type is known, so the ranking
    # matches what this cell claims to analyse.
    # NOTE(review): Reddit self posts typically report a `self.<subreddit>`
    # domain, which would otherwise dominate the chart; confirm against the data.
    if 'is_self' in df.columns:
        # eq(True) yields a plain boolean mask even when is_self has missing
        # values; rows with an unknown type are kept as link posts.
        is_self_post = df['is_self'].eq(True)
        link_domains = df.loc[~is_self_post, 'domain']
    else:
        link_domains = df['domain']

    domain_counts = link_domains.value_counts().head(15)

    fig, ax = plt.subplots(figsize=(12, 6))
    domain_counts.plot(kind='barh', ax=ax, color='teal')
    ax.set_xlabel('Number of Posts')
    ax.set_ylabel('Domain')
    ax.set_title('Top 15 Domains')
    # barh draws the first row at the bottom; flip so the top domain is on top
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

## 7. Temporal Analysis

In [None]:
# Derive calendar fields from the epoch-second 'created_utc' timestamps
if 'created_utc' in df.columns:
    created = pd.to_datetime(df['created_utc'], unit='s')

    # Store the parsed timestamp plus the pieces the later cells group by
    df['created_datetime'] = created
    df['hour'] = created.dt.hour
    df['day_of_week'] = created.dt.day_name()
    df['date'] = created.dt.date

    earliest, latest = created.min(), created.max()
    print(f"Date range: {earliest} to {latest}")

In [None]:
# Hourly posting distribution
if 'hour' in df.columns:
    # Reindex over all 24 hours so an hour with no posts appears as an empty
    # slot instead of vanishing from the axis (which would make the bars look
    # evenly spaced in time when they are not). This also orders the bars 0-23.
    hourly_counts = df['hour'].value_counts().reindex(range(24), fill_value=0)

    fig, ax = plt.subplots(figsize=(12, 5))
    hourly_counts.plot(kind='bar', ax=ax, color='purple', alpha=0.7)
    ax.set_xlabel('Hour of Day (UTC)')
    ax.set_ylabel('Number of Posts')
    ax.set_title('Posts by Hour of Day')
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

In [None]:
# Posts per weekday, shown in calendar order rather than by frequency
if 'day_of_week' in df.columns:
    day_order = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    ]
    weekday_tally = df['day_of_week'].value_counts()
    day_counts = weekday_tally.reindex(day_order)

    fig, ax = plt.subplots(figsize=(10, 5))
    day_counts.plot.bar(ax=ax, color='green', alpha=0.7)
    ax.set(
        xlabel='Day of Week',
        ylabel='Number of Posts',
        title='Posts by Day of Week',
    )
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

## 8. Summary Statistics

In [None]:
# Build and print a one-screen summary of the dataset
def _column_stat(column, compute):
    """Return compute(df[column]), or 'N/A' when the column is absent."""
    return compute(df[column]) if column in df.columns else 'N/A'


summary = {
    'Total Posts': len(df),
    'Unique Authors': _column_stat('author', lambda s: s.nunique()),
    'Average Score': _column_stat('score', lambda s: s.mean()),
    'Median Score': _column_stat('score', lambda s: s.median()),
    'Max Score': _column_stat('score', lambda s: s.max()),
    'Average Comments': _column_stat('num_comments', lambda s: s.mean()),
    'Total Comments': _column_stat('num_comments', lambda s: s.sum()),
    'Self Posts (%)': _column_stat('is_self', lambda s: s.sum() / len(df) * 100),
}

rule = "=" * 50
print(rule)
print("           DATASET SUMMARY")
print(rule)
for key, value in summary.items():
    # Floats are shown with two decimals; everything else is printed as-is
    rendered = f"{value:.2f}" if isinstance(value, float) else f"{value}"
    print(f"{key}: {rendered}")
print(rule)