# Task 3: Enhanced Data Preparation & Correlation Analysis

Complete workflow:
1. Data Preparation & Date Normalization
2. Sentiment Analysis
3. Calculate Daily Stock Returns
4. Aggregate Sentiments
5. Merge with Stock Data & Pearson Correlation Analysis
6. Visualizations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy import stats
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

# Apply a matplotlib style sheet, then set seaborn's colour palette, so all
# later figures share the same look.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

## 1. Load & Normalize Dates

In [None]:
# --- News headlines ---
news_df = pd.read_csv('../data/raw_analyst_ratings.csv')
# Parse the timestamps as UTC, then drop the timezone so the column is naive.
utc_stamps = pd.to_datetime(news_df['date'], utc=True)
news_df['date'] = utc_stamps.dt.tz_localize(None)
# Calendar day of each headline, stored as a midnight timestamp.
news_df['trading_date'] = pd.to_datetime(news_df['date'].dt.date)

# --- Stock prices: one CSV per ticker, kept in chronological order ---
tickers = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA']
stock_data = {}
for ticker in tickers:
    prices = pd.read_csv(f'../data/Data/{ticker}.csv')
    prices['trading_date'] = pd.to_datetime(prices['Date'])
    stock_data[ticker] = prices.sort_values('trading_date')

print(f"News: {len(news_df)} records")
print(f"Stocks loaded: {len(tickers)}")

## 2. Sentiment Analysis

In [None]:
vader = SentimentIntensityAnalyzer()


def _textblob_polarity(text):
    """Return TextBlob's polarity for *text*, coercing it to ``str`` first."""
    return TextBlob(str(text)).sentiment.polarity


def _vader_compound(text):
    """Return VADER's compound score for *text*, coercing it to ``str`` first."""
    return vader.polarity_scores(str(text))['compound']


headlines = news_df['headline']
news_df['sentiment_textblob'] = headlines.apply(_textblob_polarity)
news_df['sentiment_vader'] = headlines.apply(_vader_compound)
# Combined score: the plain average of the two analysers' outputs.
news_df['sentiment_score'] = (
    news_df['sentiment_textblob'].add(news_df['sentiment_vader']).div(2)
)

print("Sentiment calculated")
news_df[['headline', 'sentiment_score']].head()

## 3. Calculate Daily Returns

In [None]:
# Add a 'Daily_Return' column to every ticker's frame: the row-over-row
# change in 'Close', expressed in percent (the first row is NaN).
for ticker in tickers:
    prices = stock_data[ticker]
    prices['Daily_Return'] = prices['Close'].pct_change().mul(100)

print("Daily returns calculated")
stock_data['AAPL'][['trading_date', 'Close', 'Daily_Return']].tail()

## 4. Aggregate Daily Sentiment

In [None]:
# One row per calendar day: mean and standard deviation of the combined
# sentiment score, plus the number of scored headlines on that day.
daily_sentiment = (
    news_df
    .groupby('trading_date')
    .agg(
        sentiment_mean=('sentiment_score', 'mean'),
        sentiment_std=('sentiment_score', 'std'),
        news_count=('sentiment_score', 'count'),
    )
    .reset_index()
)

print(f"Aggregated: {len(daily_sentiment)} days")
daily_sentiment.head()

## 5. Merge & Calculate Correlation

In [None]:
# Per-ticker Pearson correlation between daily mean sentiment and daily return.
#   results[ticker]     -> {'Correlation', 'P_Value', 'N'}
#   merged_data[ticker] -> the merged, NaN-free frame the statistic was computed on
results = {}
merged_data = {}

for ticker in tickers:
    # Inner join: keep only days that have both a price row and at least one headline.
    merged = pd.merge(
        stock_data[ticker][['trading_date', 'Close', 'Daily_Return']],
        daily_sentiment[['trading_date', 'sentiment_mean']],
        on='trading_date',
        how='inner',
    ).dropna()
    n_obs = len(merged)

    if n_obs < 2:
        # pearsonr raises ValueError on fewer than two paired observations;
        # record NaN instead so one sparse ticker does not abort the whole loop.
        corr, pval = np.nan, np.nan
    else:
        corr, pval = stats.pearsonr(merged['sentiment_mean'], merged['Daily_Return'])

    results[ticker] = {'Correlation': corr, 'P_Value': pval, 'N': n_obs}
    merged_data[ticker] = merged

    print(f"{ticker}: r={corr:.4f}, p={pval:.4f}, n={n_obs}")

pd.DataFrame(results).T

## 6. Visualizations

In [None]:
# Scatter of daily mean sentiment vs. daily return for every ticker, with a
# least-squares trend line and the correlation statistics in the title.

# Size the grid from the ticker list (two panels per row) instead of assuming
# exactly six tickers; for six tickers this is the same 3x2, 16x15 figure.
n_cols = 2
n_rows = max(1, -(-len(tickers) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, ticker in enumerate(tickers):
    df = merged_data[ticker]
    ax = axes[idx]

    ax.scatter(df['sentiment_mean'], df['Daily_Return'], alpha=0.5)

    # A straight-line fit needs at least two distinct x values; skip the trend
    # line otherwise (np.polyfit fails on an empty frame).
    if df['sentiment_mean'].nunique() >= 2:
        z = np.polyfit(df['sentiment_mean'], df['Daily_Return'], 1)
        p = np.poly1d(z)
        ax.plot(df['sentiment_mean'], p(df['sentiment_mean']), "r--", linewidth=2)

    corr = results[ticker]['Correlation']
    pval = results[ticker]['P_Value']
    ax.set_title(f'{ticker}: r={corr:.3f}, p={pval:.4f}', fontweight='bold')
    ax.set_xlabel('Sentiment Score')
    ax.set_ylabel('Daily Return (%)')
    ax.grid(alpha=0.3)

# Hide any spare panel left over when the ticker count is odd.
for spare_ax in axes[len(tickers):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()