# Data Exploration - MURA-Finance Project
**Phase 1, Task 1: Data Exploration**

This notebook performs comprehensive data exploration on all CSV files.

## Objectives
1. Load and inspect all CSV files
2. Understand data schema and column meanings
3. Check data quality (missing values, duplicates)
4. Analyze data distributions (sentiment, ticker, temporal)
5. Map the relationship between ground truth and predictions
6. Identify data inconsistencies or anomalies

## 1. Setup and Imports

In [None]:
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed for the rest of the session.
# NOTE(review): this also hides warnings that could be relevant while
# exploring the data (e.g. dtype or deprecation notices) — confirm intended.
warnings.filterwarnings("ignore")

# Put the parent of the current working directory at the front of sys.path so
# that the `src.utils...` imports that follow can be resolved.
# NOTE(review): assumes the kernel's working directory sits directly under the
# project root — confirm against how the notebook is launched.
sys.path.insert(0, str(Path().resolve().parent))

from src.utils.data_loader import (
    load_all_dataframes,
    get_schema_info,
    print_dataframe_summary,
)
from src.utils.analysis import (
    analyze_sentiment_distribution,
    map_ground_truth_to_predictions,
)

# Pandas display configuration, applied in one pass: no cap on the number of
# columns, at most 100 rows, no fixed display width, and cell text limited to
# 100 characters.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 100),
    ("display.width", None),
    ("display.max_colwidth", 100),
):
    pd.set_option(option_name, option_value)

# Plot appearance: matplotlib style sheet first, then the seaborn palette
# (the order is kept because both calls touch the default colour settings).
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

print("Setup complete!")

## 2. Load Data

In [None]:
# Directory one level above the current working directory
base_path = Path().resolve().parent

# Read every data file; the result is iterated below as name -> dataframe
print("Loading dataframes...")
data = load_all_dataframes(base_path)

print(f"\nLoaded {len(data)} data files\n")

# One summary line (row and column counts) per loaded file
for name, df in data.items():
    n_rows = len(df)
    n_cols = len(df.columns)
    print(f"{name}: {n_rows} rows × {n_cols} columns")

## 3. Ground Truth Data Analysis

In [None]:
if "ground_truth" in data:
    df_gt = data["ground_truth"]
    gt_label = "Ground Truth"

    # Printed summary of the ground truth dataframe
    print_dataframe_summary(df_gt, gt_label)

    # Schema information, rendered as rich notebook output
    schema_info = get_schema_info(df_gt, gt_label)
    display(schema_info)

    # Preview: the first three rows
    gt_preview = df_gt.head(3)
    display(gt_preview)

In [None]:
# Sentiment distribution with visualization.
# Prints count and percentage per sentiment label, then draws a bar chart and
# a pie chart of the same counts side by side.
if "ground_truth" in data and "true_sentiment" in data["ground_truth"].columns:
    df_gt = data["ground_truth"]

    sentiment_dist = analyze_sentiment_distribution(df_gt, "true_sentiment")

    # Materialize labels and counts once as plain lists. Both charts share
    # them; matplotlib's pie() needs a real 1-D sequence and cannot convert a
    # dict view (dict.values()) into a numeric array.
    sentiment_labels = list(sentiment_dist["counts"].keys())
    sentiment_counts = list(sentiment_dist["counts"].values())

    print("Sentiment Distribution:")
    for sentiment, count in zip(sentiment_labels, sentiment_counts):
        # A label missing from the percentage table is reported as 0
        pct = sentiment_dist["percentages"].get(sentiment, 0)
        print(f"  {sentiment}: {count} ({pct}%)")

    # Visualization: two panels in one row
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Bar chart of absolute counts
    sentiment_df = pd.DataFrame(
        {
            "Sentiment": sentiment_labels,
            "Count": sentiment_counts,
        }
    )
    sns.barplot(data=sentiment_df, x="Sentiment", y="Count", ax=ax1)
    ax1.set_title("Sentiment Distribution - Counts")
    ax1.set_ylabel("Count")

    # Pie chart of relative shares (percentages computed by matplotlib)
    ax2.pie(
        sentiment_counts,
        labels=sentiment_labels,
        autopct="%1.1f%%",
        startangle=90,
    )
    ax2.set_title("Sentiment Distribution - Percentages")

    plt.tight_layout()
    plt.show()

## 4. Single Article Predictions Analysis

In [None]:
if "single_article" in data:
    df_single = data["single_article"]

    n_rows, n_cols = df_single.shape[0], df_single.shape[1]
    print(f"Shape: {n_rows} rows × {n_cols} columns")

    # Columns whose name contains "sentiment" (case-insensitive)
    sentiment_cols = [
        column_name
        for column_name in df_single.columns
        if "sentiment" in column_name.lower()
    ]
    print(f"\nFound {len(sentiment_cols)} sentiment columns:")
    for column_name in sentiment_cols:
        print(f"  - {column_name}")

    # Schema information, limited to its first 20 rows
    # (presumably one row per column — verify against get_schema_info)
    schema_info = get_schema_info(df_single, "Single Article")
    schema_preview = schema_info.head(20)
    display(schema_preview)

## 5. Ground Truth vs Predictions Mapping

In [None]:
if all(required in data for required in ("ground_truth", "single_article")):
    # Join ground truth onto the single-article predictions; the helper
    # returns the merged result together with a dict of match statistics.
    merged, mapping_stats = map_ground_truth_to_predictions(
        data["ground_truth"], data["single_article"]
    )

    # Report each statistic as "<label>: <value><suffix>"
    print("Mapping Statistics:")
    for stat_label, stat_key, stat_suffix in (
        ("Ground truth records", "total_ground_truth", ""),
        ("Prediction records", "total_predictions", ""),
        ("Matched records", "matched_records", ""),
        ("Match rate", "match_rate", "%"),
    ):
        print(f"  {stat_label}: {mapping_stats[stat_key]}{stat_suffix}")

    has_matches = mapping_stats["matched_records"] > 0
    if has_matches:
        print("\n✓ Ground truth and predictions can be matched!")

## 6. Data Inconsistencies & Anomalies

In [None]:
# Check sentiment encoding mismatch.
# Compares the distinct `true_sentiment` values in the ground truth file with
# those in the single-article predictions file and reports a mismatch only
# when the two sets actually differ.
if "ground_truth" in data and "single_article" in data:
    df_gt = data["ground_truth"]
    df_single = data["single_article"]

    if "true_sentiment" in df_gt.columns and "true_sentiment" in df_single.columns:
        # Missing values are excluded: they are not part of the label
        # encoding, and NaN does not compare reliably inside a set.
        gt_unique = set(df_gt["true_sentiment"].dropna().unique())
        single_unique = set(df_single["true_sentiment"].dropna().unique())

        if gt_unique != single_unique:
            print("⚠ Sentiment Encoding Mismatch:")
            print(f"  Ground Truth: {gt_unique}")
            print(f"  Predictions: {single_unique}")
            # NOTE(review): the mapping below is a fixed hint, not derived
            # from the values above — confirm it matches the printed sets.
            print("\n  Mapping needed:")
            print("    Positive ↔ 1")
            print("    Neutral ↔ 0")
            print("    Negative ↔ -1")
        else:
            print("✓ Sentiment encoding is consistent:")
            print(f"  Shared values: {gt_unique}")