In [3]:
import pandas as pd
from textblob import TextBlob

# Load raw news dataset
df_news = pd.read_csv('../data/raw_analyst_ratings.csv/raw_analyst_ratings.csv')  # Adjust path as needed

# Convert date column to datetime
df_news['date'] = pd.to_datetime(df_news['date'], errors='coerce')

# Define a function to calculate sentiment polarity using TextBlob
def get_sentiment(text):
    if pd.isna(text):
        return 0.0  # Neutral if no text
    return TextBlob(text).sentiment.polarity  # returns value between -1 (negative) and 1 (positive)

# Apply sentiment function on headlines
df_news['sentiment_score'] = df_news['headline'].apply(get_sentiment)

# Select relevant columns for output
df_sentiment = df_news[['date', 'stock', 'headline', 'sentiment_score']]

# Save to CSV
df_sentiment.to_csv('../data/sentiment_data.csv', index=False)

print("Sentiment analysis complete. File saved as 'data/sentiment_data.csv'.")


Sentiment analysis complete. File saved as 'data/sentiment_data.csv'.


In [16]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Directories
data_dir = '../data'
stock_data_subdir = os.path.join(data_dir, 'yfinance_data')
sentiment_file = os.path.join(data_dir, 'sentiment_data.csv')
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

print(" Loading sentiment data...")
sentiment_df = pd.read_csv(sentiment_file)

#  Convert to datetime and remove timezone before normalizing
sentiment_df['date'] = pd.to_datetime(sentiment_df['date']).dt.tz_localize(None)
sentiment_df['Date'] = sentiment_df['date'].dt.normalize()


print(f" Sentiment Data Loaded: {sentiment_df.shape}")
print(f" Sentiment Columns: {sentiment_df.columns}")

correlation_results = []

stock_files = [f for f in os.listdir(stock_data_subdir) if f.endswith('_historical_data.csv')]
print(f" Found stock files: {stock_files}")

for file in stock_files:
    stock_name = file.split('_')[0]
    stock_path = os.path.join(stock_data_subdir, file)
    
    stock_df = pd.read_csv(stock_path)
    stock_df['Date'] = pd.to_datetime(stock_df['Date'])
    stock_df.sort_values('Date', inplace=True)
    stock_df['Return'] = stock_df['Close'].pct_change()

    stock_sentiment = sentiment_df[sentiment_df['stock'] == stock_name]
    if stock_sentiment.empty:
        print(f"{stock_name}: No sentiment data available.")
        continue

    stock_sentiment = stock_sentiment[['Date', 'sentiment_score']]
    # Optionally: Filter out zero sentiment scores (uncomment next line)
    # stock_sentiment = stock_sentiment[stock_sentiment['sentiment_score'] != 0]

    merged_df = pd.merge(
        stock_df[['Date', 'Return']],
        stock_sentiment,
        on='Date',
        how='inner'
    )
    merged_df.rename(columns={'sentiment_score': 'Sentiment'}, inplace=True)
    merged_df.dropna(inplace=True)

    print(f"{stock_name}: Merged rows = {len(merged_df)}")

    # Save merged data to CSV
    merged_csv_path = os.path.join(output_dir, f"{stock_name}_merged.csv")
    merged_df.to_csv(merged_csv_path, index=False)

    # Plot scatter if enough data
    if len(merged_df) > 1:
        correlation = merged_df['Return'].corr(merged_df['Sentiment'])
        correlation_results.append({'Stock': stock_name, 'Correlation': correlation})
        print(f"{stock_name}: Correlation = {correlation:.4f}")

        # Plot
        plt.figure(figsize=(6, 4))
        plt.scatter(merged_df['Sentiment'], merged_df['Return'], alpha=0.6)
        plt.title(f"{stock_name}: Sentiment vs Return (Corr={correlation:.2f})")
        plt.xlabel('Sentiment Score')
        plt.ylabel('Daily Return')
        plt.grid(True)
        plt.tight_layout()
        plot_path = os.path.join(output_dir, f"{stock_name}_scatter.png")
        plt.savefig(plot_path)
        plt.close()
    else:
        print(f"{stock_name}: Not enough data points to calculate correlation.")

# Save final correlations
results_df = pd.DataFrame(correlation_results)
results_csv = os.path.join(output_dir, 'correlation_results.csv')
results_df.to_csv(results_csv, index=False)

print(f"\n All results saved in '{output_dir}' directory.")


 Loading sentiment data...
 Sentiment Data Loaded: (1407328, 5)
 Sentiment Columns: Index(['date', 'stock', 'headline', 'sentiment_score', 'Date'], dtype='object')
 Found stock files: ['AAPL_historical_data.csv', 'AMZN_historical_data.csv', 'GOOG_historical_data.csv', 'META_historical_data.csv', 'MSFT_historical_data.csv', 'NVDA_historical_data.csv', 'TSLA_historical_data.csv']
AAPL: Merged rows = 10
AAPL: Correlation = -0.2130
AMZN: Merged rows = 10
AMZN: Correlation = -0.5381
GOOG: Merged rows = 10
GOOG: Correlation = -0.2829
META: No sentiment data available.
MSFT: No sentiment data available.
NVDA: Merged rows = 9
NVDA: Correlation = 0.0587
TSLA: Merged rows = 10
TSLA: Correlation = nan


  c /= stddev[:, None]
  c /= stddev[None, :]



 All results saved in 'output' directory.
