In [1]:
import pandas as pd
from textblob import TextBlob
from scipy.stats import pearsonr

def load_and_prepare_data(stock_file, news_file):
    """Read the stock and news CSV files into date-indexed DataFrames.

    Args:
        stock_file: Path of the stock CSV. Its 'Date' column is parsed as
            dates and becomes the index.
        news_file: Path of the news CSV. Its 'date' column is parsed as
            dates and becomes the index.

    Returns:
        A ``(stock_data, news_data)`` tuple of DataFrames.
    """
    indexed_frames = []
    for csv_path, date_column in ((stock_file, 'Date'), (news_file, 'date')):
        table = pd.read_csv(csv_path, parse_dates=[date_column])
        indexed_frames.append(table.set_index(date_column))

    stock_data, news_data = indexed_frames
    return stock_data, news_data

# Input CSV locations.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs elsewhere after they are edited.
stock_file = 'C:/Users/nejat/AIM Projects/week1 data/yfinance_data/AMZN_historical_data.csv'
news_file = 'C:/Users/nejat/AIM Projects/week1 data/raw_analyst_ratings/raw_analyst_ratings.csv'
stock_data, news_data = load_and_prepare_data(stock_file, news_file)

# Preview the first rows of each frame as a quick sanity check of the load.
print("Stock Data:", stock_data.head(), sep="\n")
print("\nNews Data:", news_data.head(), sep="\n")


Stock Data:
                Open      High       Low     Close  Adj Close      Volume  \
Date                                                                        
1997-05-15  0.121875  0.125000  0.096354  0.097917   0.097917  1443120000   
1997-05-16  0.098438  0.098958  0.085417  0.086458   0.086458   294000000   
1997-05-19  0.088021  0.088542  0.081250  0.085417   0.085417   122136000   
1997-05-20  0.086458  0.087500  0.081771  0.081771   0.081771   109344000   
1997-05-21  0.081771  0.082292  0.068750  0.071354   0.071354   377064000   

            Dividends  Stock Splits  
Date                                 
1997-05-15        0.0           0.0  
1997-05-16        0.0           0.0  
1997-05-19        0.0           0.0  
1997-05-20        0.0           0.0  
1997-05-21        0.0           0.0  

News Data:
                           Unnamed: 0  \
date                                    
2020-06-05 10:30:54-04:00           0   
2020-06-03 10:45:20-04:00           1   
2020-0

In [2]:
def perform_sentiment_analysis(news_data, sample_size=100, random_state=None):
    """Score a random sample of headlines with TextBlob's polarity.

    Args:
        news_data: DataFrame with a 'headline' column.
        sample_size: Maximum number of rows to sample. It is capped at the
            number of rows that have a non-missing headline.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            sample (and every downstream number) can be reproduced. The
            default ``None`` keeps the previous unseeded behaviour.

    Returns:
        A new DataFrame holding the sampled rows plus a 'Sentiment' column,
        or ``None`` (after printing an error) when ``news_data`` is None or
        empty, lacks a 'headline' column, or has no non-missing headline.
    """
    if news_data is None or news_data.empty:
        print("Error: news_data is either None or empty.")
        return None

    if 'headline' not in news_data.columns:
        print("Error: 'headline' column is missing.")
        return None

    # TextBlob rejects non-string input, so a missing headline (NaN/None)
    # would abort the whole run; drop those rows before sampling.
    usable_news = news_data.dropna(subset=['headline'])
    if usable_news.empty:
        print("Error: news_data has no non-missing headlines.")
        return None

    sample_size = min(sample_size, len(usable_news))
    sampled_news = usable_news.sample(sample_size, random_state=random_state)

    polarity = sampled_news['headline'].astype(str).map(
        lambda text: TextBlob(text).sentiment.polarity
    )
    # Assign positionally (plain array): index labels may repeat, so label
    # alignment is deliberately avoided.
    news_data_sample = sampled_news.assign(Sentiment=polarity.to_numpy())

    print("\nNews Data with Sentiment (Sample):")
    print(news_data_sample[['headline', 'Sentiment']].head())

    return news_data_sample
    
# Score a random sample of at most 100 headlines.
news_data_sample = perform_sentiment_analysis(news_data=news_data, sample_size=100)


News Data with Sentiment (Sample):
                                                              headline  \
date                                                                     
2011-07-07 00:00:00       EXCO Resources Spiking Lower on Heavy Volume   
2015-12-22 00:00:00                   Benzinga's Top #PreMarket Losers   
2020-01-27 00:00:00  DPW Holdings Announces That It has Finalized t...   
2012-07-16 00:00:00                     Earnings Scheduled For July 16   
2011-07-06 00:00:00      Five Forex ETFs Your Broker Forgot To Mention   

                     Sentiment  
date                            
2011-07-07 00:00:00      -0.20  
2015-12-22 00:00:00       0.15  
2020-01-27 00:00:00       0.10  
2012-07-16 00:00:00       0.00  
2011-07-06 00:00:00       0.00  


In [3]:
def calculate_daily_returns(stock_data):
    """Add a 'Daily_Returns' column holding the percentage change of 'Close'.

    Args:
        stock_data: DataFrame with a 'Close' column.

    Returns:
        A new DataFrame equal to ``stock_data`` plus a 'Daily_Returns'
        column. The first row is NaN because it has no previous close.

    Raises:
        ValueError: If ``stock_data`` is None.
    """
    if stock_data is None:
        raise ValueError("stock_data is None. Please check the data loading function.")

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # and re-running the cell cannot pile changes onto earlier state.
    stock_with_returns = stock_data.assign(
        Daily_Returns=stock_data['Close'].pct_change()
    )

    print("\nStock Data with Daily Returns:")
    print(stock_with_returns.head())

    return stock_with_returns

# Add the day-over-day percentage change of the closing price.
stock_data = calculate_daily_returns(stock_data=stock_data)



Stock Data with Daily Returns:
                Open      High       Low     Close  Adj Close      Volume  \
Date                                                                        
1997-05-15  0.121875  0.125000  0.096354  0.097917   0.097917  1443120000   
1997-05-16  0.098438  0.098958  0.085417  0.086458   0.086458   294000000   
1997-05-19  0.088021  0.088542  0.081250  0.085417   0.085417   122136000   
1997-05-20  0.086458  0.087500  0.081771  0.081771   0.081771   109344000   
1997-05-21  0.081771  0.082292  0.068750  0.071354   0.071354   377064000   

            Dividends  Stock Splits  Daily_Returns  
Date                                                
1997-05-15        0.0           0.0            NaN  
1997-05-16        0.0           0.0      -0.117028  
1997-05-19        0.0           0.0      -0.012040  
1997-05-20        0.0           0.0      -0.042685  
1997-05-21        0.0           0.0      -0.127392  


In [4]:
def _to_calendar_days(labels):
    """Map index labels to tz-naive midnight timestamps (NaT if unparseable).

    Timezone-aware labels keep their own wall-clock date: the offset is
    dropped rather than converted, so '23:30-04:00' stays on the same day.
    """
    if isinstance(labels, pd.DatetimeIndex):
        stamps = labels.tz_localize(None) if labels.tz is not None else labels
        return stamps.normalize()

    def _label_to_day(label):
        try:
            stamp = pd.Timestamp(label)
        except (ValueError, TypeError):
            return pd.NaT
        if pd.isna(stamp):
            return pd.NaT
        if stamp.tzinfo is not None:
            stamp = stamp.tz_localize(None)
        return stamp.normalize()

    # Element-wise parsing copes with an object index that mixes naive and
    # offset-carrying timestamps, which vectorised parsing may reject.
    return pd.DatetimeIndex([_label_to_day(label) for label in labels])


def aggregate_daily_sentiment(news_data_sample):
    """Average the 'Sentiment' scores of all headlines on each calendar day.

    Index labels are truncated to their calendar day before grouping, so
    headlines published at different times on one day share a single row.
    Labels that cannot be read as a timestamp are left out.

    Args:
        news_data_sample: DataFrame indexed by publication timestamp, with a
            'Sentiment' column.

    Returns:
        DataFrame with columns 'Date' (tz-naive midnight timestamps) and
        'Avg_Sentiment', sorted by date, or ``None`` (after printing an
        error) when the input is None or has no 'Sentiment' column.
    """
    if news_data_sample is None or 'Sentiment' not in news_data_sample.columns:
        print("Error: 'Sentiment' column is missing.")
        return None

    calendar_days = _to_calendar_days(news_data_sample.index)
    daily_mean = news_data_sample['Sentiment'].groupby(calendar_days).mean()

    avg_daily_sentiment = daily_mean.reset_index()
    avg_daily_sentiment.columns = ['Date', 'Avg_Sentiment']
    print("\nAverage Daily Sentiment Scores:")
    print(avg_daily_sentiment.head())

    return avg_daily_sentiment
    
# Collapse the scored headlines into one average sentiment value per index label.
avg_daily_sentiment = aggregate_daily_sentiment(news_data_sample=news_data_sample)


Average Daily Sentiment Scores:
                  Date  Avg_Sentiment
0  2010-03-15 00:00:00            0.0
1  2010-07-09 00:00:00            0.0
2  2010-08-09 00:00:00            0.0
3  2010-09-09 00:00:00            0.0
4  2010-12-15 00:00:00            0.3


In [5]:
def _calendar_day_column(values):
    """Parse ``values`` as datetimes and truncate them to tz-naive midnight.

    Unparseable entries become NaT. If parsing does not produce a datetime
    dtype, the parsed values are returned unchanged.
    """
    days = pd.to_datetime(values, errors='coerce')
    if not pd.api.types.is_datetime64_any_dtype(days):
        return days
    if days.dt.tz is not None:
        # Drop the offset but keep the wall-clock date, so the key can be
        # compared with tz-naive dates.
        days = days.dt.tz_localize(None)
    return days.dt.normalize()


def merge_stock_sentiment(stock_data, avg_daily_sentiment):
    """Inner-join daily returns with average sentiment on the calendar day.

    Both join keys are truncated to their calendar day first, so a sentiment
    row stamped during the day still matches that day's stock row. Sentiment
    rows that land on the same day are averaged into one.

    Args:
        stock_data: DataFrame with a 'Daily_Returns' column whose index
            becomes a 'Date' column after ``reset_index()``.
        avg_daily_sentiment: DataFrame with 'Date' and 'Avg_Sentiment'
            columns ('index' / 'Sentiment' are accepted as legacy names).

    Returns:
        DataFrame with columns 'Date', 'Daily_Returns' and 'Avg_Sentiment',
        one row per day present in both inputs. Neither input is modified.

    Raises:
        ValueError: If either argument is None.
    """
    if stock_data is None or avg_daily_sentiment is None:
        raise ValueError("stock_data and avg_daily_sentiment must not be None.")

    stock = stock_data.reset_index()
    sentiment = avg_daily_sentiment.rename(
        columns={'index': 'Date', 'Sentiment': 'Avg_Sentiment'}
    )

    stock = stock.assign(Date=_calendar_day_column(stock['Date'])).dropna(subset=['Date'])
    sentiment = sentiment.assign(
        Date=_calendar_day_column(sentiment['Date'])
    ).dropna(subset=['Date'])

    # After truncation several sentiment rows can share a day; average them
    # so the join cannot duplicate stock rows.
    sentiment = sentiment.groupby('Date', as_index=False)['Avg_Sentiment'].mean()

    merged_data = pd.merge(
        stock[['Date', 'Daily_Returns']],
        sentiment[['Date', 'Avg_Sentiment']],
        on='Date',
        how='inner',
    )
    print("\nMerged Data:")
    print(merged_data.head())

    return merged_data

# Join daily returns with the averaged sentiment on their shared dates.
merged_data = merge_stock_sentiment(stock_data=stock_data, avg_daily_sentiment=avg_daily_sentiment)



Merged Data:
        Date  Daily_Returns  Avg_Sentiment
0 2010-03-15      -0.005234            0.0
1 2010-07-09       0.008949            0.0
2 2010-08-09       0.003974            0.0
3 2010-09-09       0.008912            0.0
4 2010-12-15       0.009371            0.3


In [6]:
def correlation_analysis(merged_data):
    """Compute the Pearson correlation between daily returns and sentiment.

    Rows where either value is missing are discarded as a pair, so every
    return stays matched with the sentiment of the same row.

    Args:
        merged_data: DataFrame with 'Daily_Returns' and 'Avg_Sentiment'
            columns.

    Returns:
        A ``(correlation, p_value)`` tuple as reported by
        ``scipy.stats.pearsonr``.

    Raises:
        ValueError: If a required column is missing, or fewer than two
            complete rows remain after discarding missing values.
    """
    if 'Daily_Returns' not in merged_data.columns or 'Avg_Sentiment' not in merged_data.columns:
        raise ValueError("Merged data is missing required columns.")

    # Drop NaNs row-wise. Dropping them per column would shift one series
    # against the other or leave the two with different lengths.
    paired = merged_data[['Daily_Returns', 'Avg_Sentiment']].dropna()
    if len(paired) < 2:
        raise ValueError(
            "At least two rows with both Daily_Returns and Avg_Sentiment are required."
        )

    correlation, p_value = pearsonr(paired['Daily_Returns'], paired['Avg_Sentiment'])

    print("\nCorrelation Analysis:")
    print(f"Pearson Correlation Coefficient: {correlation:.4f}")
    print(f"P-value: {p_value:.4f}")

    return correlation, p_value

# Left as the cell's final expression so the (correlation, p_value) tuple is displayed.
correlation_analysis(merged_data=merged_data)


Correlation Analysis:
Pearson Correlation Coefficient: 0.1171
P-value: 0.2507


(0.11712840647562921, 0.25072367709026766)