# Task 3: Correlation between News Sentiment and Stock Movement

## Objective
Analyze the correlation between news sentiment and stock price movements:
1. **Date Alignment**: Align news and stock data by dates
2. **Sentiment Analysis**: Quantify news headline sentiment
3. **Correlation Analysis**: Test correlation between sentiment and stock returns

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy import stats
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas/scipy warnings — confirm this is wanted.
warnings.simplefilter('ignore')

# Plot theme: matplotlib style sheet first, then seaborn's 'husl' colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

## Step 1: Load Data

In [None]:
# Read the analyst-ratings news file.
news_df = pd.read_csv('../data/raw_analyst_ratings.csv')

# Parse timestamps as UTC, then drop the timezone so the column is tz-naive
# and can be merged against the price data later on.
# NOTE(review): after this step the calendar date is the UTC date, not the
# exchange-local date — confirm that is acceptable for day-level alignment.
parsed_dates = pd.to_datetime(news_df['date'], utc=True)
news_df['date'] = parsed_dates.dt.tz_localize(None)

# Quick overview of what was loaded.
first_date, last_date = news_df['date'].min(), news_df['date'].max()
stock_count = news_df['stock'].nunique()
publisher_count = news_df['publisher'].nunique()

print(f"News data: {len(news_df)} records")
print(f"Date range: {first_date} to {last_date}")
print(f"Unique stocks: {stock_count}")
print(f"Unique publishers: {publisher_count}")

# Show sample
news_df.head()

In [None]:
# Tickers analysed in this notebook; one price CSV is read per symbol.
tickers = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA']

# Alias table from symbols that may appear in the news data to the tickers
# above: every ticker maps to itself, plus two historical/alternate symbols.
stock_mapping = {
    **{symbol: symbol for symbol in tickers},
    'GOOGL': 'GOOG',  # Google has multiple tickers
    'FB': 'META',     # Facebook renamed to Meta
}

# Per-ticker price frames, sorted by date, with a simple daily return column.
stock_data = {}
for ticker in tickers:
    df = (
        pd.read_csv(f'../data/Data/{ticker}.csv')
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .sort_values('Date')
    )
    # Percentage change of the close relative to the previous row.
    df['Daily_Return'] = df['Close'].pct_change()
    stock_data[ticker] = df

    start, end = df['Date'].min().date(), df['Date'].max().date()
    print(f"{ticker}: {len(df)} records from {start} to {end}")

print(f"\nStock data loaded for {len(tickers)} tickers")
stock_data['AAPL'].head()

## Step 2: Sentiment Analysis on News Headlines

In [None]:
# Initialize sentiment analyzers
# One VADER analyzer instance is created here; get_vader_sentiment reads it
# as a module-level name on every call.
vader = SentimentIntensityAnalyzer()

def get_textblob_sentiment(text):
    """Return the TextBlob polarity score for ``text``.

    The input is coerced with ``str`` before scoring, so non-string values
    (including missing values) are scored on their string form.

    Args:
        text: Headline (or any object) to score.

    Returns:
        The ``sentiment.polarity`` value reported by TextBlob, or ``0`` when
        scoring raises an ordinary exception.
    """
    try:
        return TextBlob(str(text)).sentiment.polarity
    except Exception:
        # Best-effort fallback to a neutral score. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate, so a
        # long ``.apply`` over many headlines can still be interrupted.
        return 0

def get_vader_sentiment(text):
    """Return the VADER ``compound`` score for ``text``.

    The input is coerced with ``str`` before scoring. Uses the module-level
    ``vader`` analyzer instance.

    Args:
        text: Headline (or any object) to score.

    Returns:
        The ``'compound'`` entry of ``vader.polarity_scores(...)``, or ``0``
        when scoring raises an ordinary exception.
    """
    try:
        return vader.polarity_scores(str(text))['compound']
    except Exception:
        # Best-effort fallback to a neutral score. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate, so a
        # long ``.apply`` over many headlines can still be interrupted.
        return 0

# Calculate sentiment scores: one column per analyzer, scored headline by headline.
print("Calculating sentiment scores...")
news_df['sentiment_textblob'] = news_df['headline'].apply(get_textblob_sentiment)
news_df['sentiment_vader'] = news_df['headline'].apply(get_vader_sentiment)

# Average sentiment score: plain mean of the two analyzer scores.
news_df['sentiment_avg'] = (news_df['sentiment_textblob'] + news_df['sentiment_vader']) / 2

# Sentiment category: (-0.05, 0.05] is Neutral, below is Negative, above is Positive.
# include_lowest=True closes the first bin on the left so a score of exactly -1
# is labelled 'Negative' instead of falling outside the bins and becoming NaN.
news_df['sentiment_category'] = pd.cut(
    news_df['sentiment_avg'],
    bins=[-1, -0.05, 0.05, 1],
    labels=['Negative', 'Neutral', 'Positive'],
    include_lowest=True,
)

print("Sentiment analysis complete")
news_df[['headline', 'sentiment_textblob', 'sentiment_vader', 'sentiment_avg', 'sentiment_category']].head(10)

In [None]:
# Sentiment distribution: score histogram (left) and category counts (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
hist_ax, count_ax = axes

# Left panel: histogram of the averaged score with a marker at zero.
hist_ax.hist(news_df['sentiment_avg'], bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(0, color='red', linestyle='--', linewidth=2)
hist_ax.set_title('Distribution of Sentiment Scores', fontsize=12, fontweight='bold')
hist_ax.set(xlabel='Sentiment Score', ylabel='Frequency')
hist_ax.grid(alpha=0.3)

# Right panel: number of headlines per sentiment category.
sentiment_counts = news_df['sentiment_category'].value_counts()
count_ax.bar(sentiment_counts.index, sentiment_counts.values, edgecolor='black', alpha=0.7)
count_ax.set_title('Sentiment Category Distribution', fontsize=12, fontweight='bold')
count_ax.set(ylabel='Count')
count_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Text summary of the same information.
sentiment_summary = news_df['sentiment_avg'].describe()
print(f"\nSentiment Statistics:")
print(sentiment_summary)
print(f"\nSentiment Categories:")
print(sentiment_counts)

## Step 3: Aggregate Daily Sentiment Scores

In [None]:
# Aggregate sentiment by calendar date.
# Two views are built: one over all headlines, and one per analysed ticker.

# Overall daily sentiment (all news): mean/std/count of the averaged score plus
# the per-analyzer daily means.
daily_sentiment_all = news_df.groupby(news_df['date'].dt.date).agg(
    sentiment_mean=('sentiment_avg', 'mean'),
    sentiment_std=('sentiment_avg', 'std'),
    news_count=('sentiment_avg', 'count'),
    textblob_mean=('sentiment_textblob', 'mean'),
    vader_mean=('sentiment_vader', 'mean'),
).reset_index()
# The group key holds plain date objects; convert back to datetimes for merging.
daily_sentiment_all['date'] = pd.to_datetime(daily_sentiment_all['date'])

print(f"Overall daily sentiment aggregated: {len(daily_sentiment_all)} days")

# Canonical ticker per headline: aliases listed in stock_mapping (GOOGL, FB) are
# folded into GOOG/META so their headlines are not dropped. Symbols that are
# not in the mapping become NaN and therefore never match a ticker below.
canonical_stock = news_df['stock'].map(stock_mapping)

# Stock-specific daily sentiment
stock_sentiment = {}
for ticker in tickers:
    # Headlines whose (mapped) symbol is this ticker.
    stock_news = news_df[canonical_stock == ticker].copy()

    if stock_news.empty:
        print(f"{ticker}: No specific news found")
        continue

    daily = stock_news.groupby(stock_news['date'].dt.date).agg(
        sentiment_mean=('sentiment_avg', 'mean'),
        news_count=('sentiment_avg', 'count'),
    ).reset_index()
    daily['date'] = pd.to_datetime(daily['date'])
    stock_sentiment[ticker] = daily
    print(f"{ticker}: {len(stock_news)} news articles, {len(daily)} unique days")

print("\nUsing overall daily sentiment for correlation analysis")
daily_sentiment_all.head(10)

In [None]:
# Plot daily sentiment over time: mean score (top) and article volume (bottom).
# The data comes from daily_sentiment_all, the all-headline daily aggregate;
# the previous name `daily_sentiment` was never defined and raised NameError.
fig, axes = plt.subplots(2, 1, figsize=(16, 10))

# Sentiment mean, shaded towards the zero line.
axes[0].plot(daily_sentiment_all['date'], daily_sentiment_all['sentiment_mean'], linewidth=1.5)
axes[0].axhline(0, color='red', linestyle='--', alpha=0.5)
axes[0].fill_between(daily_sentiment_all['date'], daily_sentiment_all['sentiment_mean'], 0, alpha=0.3)
axes[0].set_title('Daily Average Sentiment Score', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Sentiment Score')
axes[0].grid(alpha=0.3)

# News count per day.
axes[1].bar(daily_sentiment_all['date'], daily_sentiment_all['news_count'], alpha=0.7, width=1)
axes[1].set_title('Daily News Article Count', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Date')
axes[1].set_ylabel('Number of Articles')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## Step 4: Merge Sentiment with Stock Returns

In [None]:
# Join each ticker's prices with a daily sentiment series on the calendar date.
merged_data = {}
price_columns = ['date', 'Close', 'Daily_Return', 'Volume']
sentiment_columns = ['date', 'sentiment_mean', 'news_count']

for ticker in tickers:
    stock_df = stock_data[ticker].copy()
    # Date-only key (time of day stripped) to match the sentiment frames.
    stock_df['date'] = pd.to_datetime(stock_df['Date'].dt.date)

    # Prefer the ticker's own sentiment when it covers more than 10 days;
    # otherwise use the all-headline daily sentiment.
    own_sentiment = stock_sentiment.get(ticker)
    if own_sentiment is not None and len(own_sentiment) > 10:
        sentiment_to_use, sentiment_type = own_sentiment, "stock-specific"
    else:
        sentiment_to_use, sentiment_type = daily_sentiment_all, "overall market"

    # Inner join keeps only dates present on both sides; rows with any
    # missing value are then dropped.
    merged = (
        stock_df[price_columns]
        .merge(sentiment_to_use[sentiment_columns], on='date', how='inner')
        .dropna()
    )

    merged_data[ticker] = merged
    print(f"{ticker}: {len(merged)} days with both sentiment ({sentiment_type}) and stock data")

print("\nSample merged data for AAPL:")
merged_data['AAPL'].head()

## Step 5: Correlation Analysis

In [None]:
# Calculate same-day correlations between daily sentiment and daily return
# for each stock, using both Pearson (linear) and Spearman (rank) measures.
correlation_results = {}

for ticker in tickers:
    df = merged_data[ticker]
    sample_size = len(df)

    if sample_size < 2:
        # A correlation needs at least two paired observations; pearsonr
        # raises on shorter input. Record NaN so the remaining tickers are
        # still processed.
        corr_pearson = p_value_pearson = np.nan
        corr_spearman = p_value_spearman = np.nan
    else:
        # Pearson correlation
        corr_pearson, p_value_pearson = stats.pearsonr(df['sentiment_mean'], df['Daily_Return'])

        # Spearman correlation
        corr_spearman, p_value_spearman = stats.spearmanr(df['sentiment_mean'], df['Daily_Return'])

    correlation_results[ticker] = {
        'Pearson_Corr': corr_pearson,
        'Pearson_PValue': p_value_pearson,
        'Spearman_Corr': corr_spearman,
        'Spearman_PValue': p_value_spearman,
        'Sample_Size': sample_size
    }

# One row per ticker, one column per statistic.
corr_df = pd.DataFrame(correlation_results).T
print("\nCorrelation Analysis Results:")
corr_df

In [None]:
# Grouped bar chart: Pearson and Spearman coefficients side by side per ticker.
fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(tickers))
width = 0.35
half_width = width / 2

# Each measure is offset half a bar width to either side of the tick.
pearson_values = corr_df['Pearson_Corr']
spearman_values = corr_df['Spearman_Corr']
bars1 = ax.bar(x - half_width, pearson_values, width, label='Pearson', alpha=0.8)
bars2 = ax.bar(x + half_width, spearman_values, width, label='Spearman', alpha=0.8)

ax.set(xlabel='Stock Ticker', ylabel='Correlation Coefficient')
ax.set_title('Correlation between News Sentiment and Stock Returns', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(tickers)
ax.legend()
ax.axhline(0, color='black', linewidth=0.5)  # zero reference line
ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Step 6: Scatter Plots - Sentiment vs Returns

In [None]:
# One scatter panel per ticker (3x2 grid): daily sentiment against daily return in percent.
fig, axes = plt.subplots(3, 2, figsize=(16, 15))
axes = axes.flatten()

for idx, ticker in enumerate(tickers):
    ax = axes[idx]
    df = merged_data[ticker]
    sentiment = df['sentiment_mean']
    returns_pct = df['Daily_Return'] * 100

    # Raw observations.
    ax.scatter(sentiment, returns_pct, alpha=0.5, s=20)

    # Degree-1 least-squares fit drawn over the observed sentiment values.
    z = np.polyfit(sentiment, returns_pct, 1)
    p = np.poly1d(z)
    ax.plot(sentiment, p(sentiment), "r--", linewidth=2, alpha=0.8)

    # Annotate the panel with the Pearson statistics computed earlier.
    ticker_stats = correlation_results[ticker]
    corr = ticker_stats['Pearson_Corr']
    pval = ticker_stats['Pearson_PValue']

    ax.set_title(f'{ticker} - Sentiment vs Returns\nCorr={corr:.3f}, p={pval:.4f}', fontweight='bold')
    ax.set(xlabel='Daily Sentiment Score', ylabel='Daily Return (%)')
    # Reference lines through the origin.
    ax.axhline(0, color='gray', linestyle='--', alpha=0.5)
    ax.axvline(0, color='gray', linestyle='--', alpha=0.5)
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## Step 7: Lagged Correlation Analysis

In [None]:
# Test lagged correlations (sentiment today vs returns N rows later, N = 0..5).
# NOTE: the shift runs over the rows of the merged frame, which only contains
# dates that have both sentiment and price data, so a lag counts merged rows
# rather than calendar days.
lagged_correlations = {}

for ticker in tickers:
    df = merged_data[ticker].copy()

    lags = {}
    for lag in range(0, 6):  # 0 to 5 days lag
        lag_column = f'Return_lag{lag}'
        # Negative shift pulls the return from `lag` rows ahead onto today's row;
        # the last `lag` rows have no future return and become NaN.
        df[lag_column] = df['Daily_Return'].shift(-lag)

        # Drop incomplete pairs jointly so both inputs keep the same length and
        # stay row-aligned (dropping NaNs per column made the lengths differ
        # for lag >= 1, which pearsonr rejects).
        paired = df[['sentiment_mean', lag_column]].dropna()
        if len(paired) < 2:
            # Not enough paired observations for a correlation.
            lags[f'Lag_{lag}'] = np.nan
            continue

        corr, _ = stats.pearsonr(paired['sentiment_mean'], paired[lag_column])
        lags[f'Lag_{lag}'] = corr

    lagged_correlations[ticker] = lags

# One row per ticker, one column per lag.
lagged_df = pd.DataFrame(lagged_correlations).T
print("\nLagged Correlation Analysis:")
print("(Sentiment today vs Returns in N days)")
lagged_df

In [None]:
# Line chart of the lagged correlations: one line per ticker, lag on the x-axis.
plt.figure(figsize=(14, 7))

lag_positions = range(6)
for ticker in tickers:
    ticker_lags = lagged_df.loc[ticker]
    plt.plot(lag_positions, ticker_lags, marker='o', label=ticker, linewidth=2)

plt.title('Lagged Correlation: Sentiment vs Future Returns', fontsize=14, fontweight='bold')
plt.xlabel('Lag (Days)')
plt.ylabel('Correlation Coefficient')
plt.legend()
# Zero reference line.
plt.axhline(0, color='black', linestyle='--', linewidth=0.5)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## Summary and Insights

In [None]:
# Text report: headline sentiment statistics, per-ticker Pearson results and key findings.
banner = "=" * 80
print(banner)
print("CORRELATION ANALYSIS SUMMARY")
print(banner)

avg_scores = news_df['sentiment_avg']
print("\n1. SENTIMENT STATISTICS:")
print(f"   - Total news articles analyzed: {len(news_df)}")
print(f"   - Average sentiment score: {avg_scores.mean():.4f}")
print(f"   - Sentiment std deviation: {avg_scores.std():.4f}")

# Significance markers, checked from the strictest threshold down;
# anything not below 0.05 is reported as "ns".
significance_levels = ((0.001, "***"), (0.01, "**"), (0.05, "*"))

print("\n2. CORRELATION STRENGTH (Pearson):")
for ticker in tickers:
    ticker_stats = correlation_results[ticker]
    corr = ticker_stats['Pearson_Corr']
    pval = ticker_stats['Pearson_PValue']
    sig = next((mark for cutoff, mark in significance_levels if pval < cutoff), "ns")
    print(f"   {ticker}: {corr:+.4f} ({sig})")

print("\n3. KEY FINDINGS:")
pearson = corr_df['Pearson_Corr']
# Ticker with the largest absolute Pearson coefficient.
strongest = pearson.abs().idxmax()
strongest_corr = corr_df.loc[strongest, 'Pearson_Corr']
significant_count = (corr_df['Pearson_PValue'] < 0.05).sum()
print(f"   - Strongest correlation: {strongest} ({strongest_corr:.4f})")
print(f"   - Average correlation: {pearson.mean():.4f}")
print(f"   - Significant correlations (p<0.05): {significant_count}/{len(tickers)}")

print("\nNote: *** p<0.001, ** p<0.01, * p<0.05, ns = not significant")
print(banner)