In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append('../scripts') # adjust the path based on actual location
from quantitative_analysis import StockAnalyzer


In [None]:
# Load your CSV

apple_price_data = pd.read_csv('../src/data/yfinance_data/GOOG_historical_data.csv')
# Create analyzer object

appl_analyzer = StockAnalyzer(apple_price_data )
# Prepare data
appl_analyzer.prepare_data()


In [5]:
def fix_date_column(df, col='date'):
    """
    Cleans and normalizes a date column with detailed logging.

    Steps:
    - Parses to datetime
    - Drops invalid or missing dates
    - Normalizes to date (removes time)
    - Removes timezone (if present)
    - Sets column as index

    Args:
        df (pd.DataFrame): DataFrame containing the date column
        col (str): Name of the date column to clean

    Returns:
        pd.DataFrame: Cleaned DataFrame with datetime index
    """
    df = df.copy()
    original_rows = len(df)

    # Step 1: Convert to datetime
    df[col] = pd.to_datetime(df[col], errors='coerce')
    after_parse_invalid = df[col].isna().sum()

    # Step 2: Drop invalid dates
    df = df.dropna(subset=[col])
    after_drop_rows = len(df)

    # Step 3: Normalize to remove time
    df[col] = df[col].dt.normalize()

    # Step 4: Remove timezone
    if df[col].dt.tz is not None:
        df[col] = df[col].dt.tz_localize(None)

    # Step 5: Set index
    df.set_index(col, inplace=True)

    # Logging
    print(f"🧼 Cleaning '{col}' column:")
    print(f"   - Original rows: {original_rows}")
    print(f"   - Invalid dates parsed (NaT): {after_parse_invalid}")
    print(f"   - Rows remaining after cleaning: {after_drop_rows}")

    return df


In [6]:
# Load sentiment CSV
sentiment_df = pd.read_csv("../src/sentiment_with_polarity.csv")

# Filter for a specific ticker
sentiment_df = sentiment_df[sentiment_df['stock'] == 'GOOG']

print(f"✅ Sentiment data cleaned. Rows remaining: {len(sentiment_df)}")

# Clean the 'date' column
sentiment_df = fix_date_column(sentiment_df, col='date')

# Print number of valid rows remaining
print(f"✅ Sentiment data cleaned. Rows remaining: {len(sentiment_df)}")

# Group by date to get average sentiment per day
daily_sentiment = sentiment_df.groupby(sentiment_df.index)['polarity'].mean().to_frame()

✅ Sentiment data cleaned. Rows remaining: 1199
🧼 Cleaning 'date' column:
   - Original rows: 1199
   - Invalid dates parsed (NaT): 1189
   - Rows remaining after cleaning: 10
✅ Sentiment data cleaned. Rows remaining: 10
