In [9]:
import pandas as pd
import numpy as np

def create_mock_data():
    """Build two small demo DataFrames: weekly stock prices and sentiment scores.

    Returns:
        A ``(df_prices, df_sentiment)`` tuple.

        ``df_prices`` holds ten consecutive Monday rows with the columns
        ``Date``, ``Stock_Close`` and ``Volume``.

        ``df_sentiment`` holds the columns ``Sentiment_Score`` and ``Date``
        for every other price date only, so a join on ``Date`` leaves half
        of the price rows without a score.
    """
    # Ten Mondays starting with the first one on or after 2025-01-01.
    mondays = pd.date_range('2025-01-01', periods=10, freq='W-MON')

    closing_prices = [150.0, 152.5, 149.0, 155.0, 154.5, 160.0, 161.0, 158.0, 163.0, 165.0]
    # Volumes are spelled out as plain integers (not "1.2M"-style strings).
    traded_volumes = [1000000, 1200000, 900000, 1500000, 1100000, 1800000, 1400000, 1300000, 1700000, 2000000]
    df_prices = pd.DataFrame({
        'Date': mondays,
        'Stock_Close': closing_prices,
        'Volume': traded_volumes,
    })

    # Keep only every second date to simulate a source with coarser coverage.
    scores = [0.6, 0.8, 0.3, 0.9, 0.7]
    df_sentiment = pd.DataFrame({
        'Sentiment_Score': scores,
        'Date': mondays[::2],
    })

    return df_prices, df_sentiment
def integrate_and_analyze_financial_data():
    """Run the demo pipeline: merge, impute, standardize, then correlate.

    Fetches the two frames from ``create_mock_data``, left-joins sentiment
    onto prices by ``Date``, replaces missing sentiment scores with the mean
    of the scores that are present, adds a z-score column for
    ``Stock_Close``, and prints each stage together with the correlation
    between the z-score and the imputed sentiment.

    Nothing is returned; every result is written to stdout.
    """
    prices, sentiment = create_mock_data()

    print("--- 1. Initial Data Frames ---")
    print("DF Prices:\n", prices.head())
    print("\nDF Sentiment:\n", sentiment.head())

    # Integration: a left join keeps every price row and attaches a sentiment
    # score only where the dates match; unmatched rows get NaN.
    merged = prices.merge(sentiment, how='left', on='Date')

    print("\n--- 2. Merged Data (Left Join) ---")
    print(merged)

    # Preprocessing: mean imputation. The mean is taken over the scores that
    # exist (NaNs are skipped), then written into the gaps.
    observed_mean = merged['Sentiment_Score'].mean()
    merged['Sentiment_Score'] = merged['Sentiment_Score'].fillna(observed_mean)

    # Analysis: z-score standardization of the closing price, using the
    # column's own mean and standard deviation.
    close = merged['Stock_Close']
    merged['Close_ZScore'] = (close - close.mean()) / close.std()

    print("\n--- 3. Final Preprocessed Data ---")
    report_columns = ['Date', 'Stock_Close', 'Sentiment_Score', 'Close_ZScore']
    print(merged[report_columns])

    # Correlation between the standardized close and the imputed sentiment.
    correlation = merged['Close_ZScore'].corr(merged['Sentiment_Score'])
    print(f"\nCorrelation (Close Z-Score vs. Imputed Sentiment): {correlation:.4f}")

# Run the full demo pipeline; it prints each stage to stdout and returns nothing.
integrate_and_analyze_financial_data()

--- 1. Initial Data Frames ---
DF Prices:
         Date  Stock_Close   Volume
0 2025-01-06        150.0  1000000
1 2025-01-13        152.5  1200000
2 2025-01-20        149.0   900000
3 2025-01-27        155.0  1500000
4 2025-02-03        154.5  1100000

DF Sentiment:
    Sentiment_Score       Date
0              0.6 2025-01-06
1              0.8 2025-01-20
2              0.3 2025-02-03
3              0.9 2025-02-17
4              0.7 2025-03-03

--- 2. Merged Data (Left Join) ---
        Date  Stock_Close   Volume  Sentiment_Score
0 2025-01-06        150.0  1000000              0.6
1 2025-01-13        152.5  1200000              NaN
2 2025-01-20        149.0   900000              0.8
3 2025-01-27        155.0  1500000              NaN
4 2025-02-03        154.5  1100000              0.3
5 2025-02-10        160.0  1800000              NaN
6 2025-02-17        161.0  1400000              0.9
7 2025-02-24        158.0  1300000              NaN
8 2025-03-03        163.0  1700000             