In [7]:
import pandas as pd
from textblob import TextBlob
from scipy.stats import pearsonr

def load_and_prepare_data(stock_file, news_file):
    stock_data = pd.read_csv(stock_file, parse_dates=['Date'])
    news_data = pd.read_csv(news_file, parse_dates=['date'])

    stock_data.set_index('Date', inplace=True)
    news_data.set_index('date', inplace=True)

    return stock_data, news_data

stock_file = 'C:/Users/nejat/AIM Projects/week1 data/yfinance_data/AAPL_historical_data.csv'
news_file = 'C:/Users/nejat/AIM Projects/week1 data/raw_analyst_ratings/raw_analyst_ratings.csv'
stock_data, news_data = load_and_prepare_data(stock_file, news_file)

print("Stock Data:")
print(stock_data.head())  

print("\nNews Data:")
print(news_data.head()) 


Stock Data:
                Open      High       Low     Close  Adj Close     Volume  \
Date                                                                       
1980-12-12  0.128348  0.128906  0.128348  0.128348   0.098943  469033600   
1980-12-15  0.122210  0.122210  0.121652  0.121652   0.093781  175884800   
1980-12-16  0.113281  0.113281  0.112723  0.112723   0.086898  105728000   
1980-12-17  0.115513  0.116071  0.115513  0.115513   0.089049   86441600   
1980-12-18  0.118862  0.119420  0.118862  0.118862   0.091630   73449600   

            Dividends  Stock Splits  
Date                                 
1980-12-12        0.0           0.0  
1980-12-15        0.0           0.0  
1980-12-16        0.0           0.0  
1980-12-17        0.0           0.0  
1980-12-18        0.0           0.0  

News Data:
                           Unnamed: 0  \
date                                    
2020-06-05 10:30:54-04:00           0   
2020-06-03 10:45:20-04:00           1   
2020-05-26 04

In [8]:
def perform_sentiment_analysis(news_data, sample_size=100):
    def get_sentiment(text):
        analysis = TextBlob(text)
        return analysis.sentiment.polarity  

    if news_data is None or news_data.empty:
        print("Error: news_data is either None or empty.")
        return None
    
    sample_size = min(sample_size, len(news_data))
    news_data_sample = news_data.sample(sample_size)
    
    news_data_sample['Sentiment'] = news_data_sample['headline'].apply(get_sentiment)
    
    print("\nNews Data with Sentiment (Sample):")
    print(news_data_sample[['headline', 'Sentiment']].head())
    
    return news_data_sample
    
news_data_sample = perform_sentiment_analysis(news_data, sample_size=100)


News Data with Sentiment (Sample):
                                                                    headline  \
date                                                                           
2019-10-25 00:00:00        Tata Motors shares are trading higher after th...   
2017-04-18 00:00:00        Motorola Solutions Files Patent Infringement C...   
2011-05-13 00:00:00        Top day traders Stocks and nalyst Upgrades & D...   
2020-05-08 07:52:01-04:00  Cramer Shares His Thoughts On DraftKings, Well...   
2012-07-11 00:00:00        UPDATE: Stifel Nicolaus Downgrades hhgregg to ...   

                           Sentiment  
date                                  
2019-10-25 00:00:00             0.25  
2017-04-18 00:00:00             0.00  
2011-05-13 00:00:00             0.50  
2020-05-08 07:52:01-04:00       0.50  
2012-07-11 00:00:00             0.00  


In [9]:
def calculate_daily_returns(stock_data):
    if stock_data is None:
        raise ValueError("stock_data is None. Please check the data loading function.")
    
    stock_data['Daily_Returns'] = stock_data['Close'].pct_change()
    print("\nStock Data with Daily Returns:")
    print(stock_data.head())
    
    return stock_data

stock_data = calculate_daily_returns(stock_data)



Stock Data with Daily Returns:
                Open      High       Low     Close  Adj Close     Volume  \
Date                                                                       
1980-12-12  0.128348  0.128906  0.128348  0.128348   0.098943  469033600   
1980-12-15  0.122210  0.122210  0.121652  0.121652   0.093781  175884800   
1980-12-16  0.113281  0.113281  0.112723  0.112723   0.086898  105728000   
1980-12-17  0.115513  0.116071  0.115513  0.115513   0.089049   86441600   
1980-12-18  0.118862  0.119420  0.118862  0.118862   0.091630   73449600   

            Dividends  Stock Splits  Daily_Returns  
Date                                                
1980-12-12        0.0           0.0            NaN  
1980-12-15        0.0           0.0      -0.052171  
1980-12-16        0.0           0.0      -0.073398  
1980-12-17        0.0           0.0       0.024751  
1980-12-18        0.0           0.0       0.028992  


In [10]:
def aggregate_daily_sentiment(news_data_sample):
    if 'Sentiment' not in news_data_sample.columns:
        print("Error: 'Sentiment' column is missing.")
        return None

    avg_daily_sentiment = news_data_sample.groupby(news_data_sample.index)['Sentiment'].mean().reset_index()
    avg_daily_sentiment.columns = ['Date', 'Avg_Sentiment']
    print("\nAverage Daily Sentiment Scores:")
    print(avg_daily_sentiment.head())
    
    return avg_daily_sentiment
    
avg_daily_sentiment = aggregate_daily_sentiment(news_data_sample)


Average Daily Sentiment Scores:
                  Date  Avg_Sentiment
0  2010-08-04 00:00:00            0.0
1  2010-10-26 00:00:00            0.0
2  2010-11-30 00:00:00            0.5
3  2010-12-18 00:00:00            0.0
4  2011-01-21 00:00:00            0.0


In [11]:
def merge_stock_sentiment(stock_data, avg_daily_sentiment):
    stock_data = stock_data.reset_index().rename(columns={'Date': 'Date'})
    avg_daily_sentiment = avg_daily_sentiment.rename(columns={'index': 'Date', 'Sentiment': 'Avg_Sentiment'})

    stock_data['Date'] = pd.to_datetime(stock_data['Date'], errors='coerce')
    avg_daily_sentiment['Date'] = pd.to_datetime(avg_daily_sentiment['Date'], errors='coerce')

    stock_data.dropna(subset=['Date'], inplace=True)
    avg_daily_sentiment.dropna(subset=['Date'], inplace=True)

    merged_data = pd.merge(stock_data[['Date', 'Daily_Returns']], avg_daily_sentiment[['Date', 'Avg_Sentiment']], on='Date', how='inner')
    print("\nMerged Data:")
    print(merged_data.head())
    
    return merged_data 

merged_data = merge_stock_sentiment(stock_data, avg_daily_sentiment)



Merged Data:
        Date  Daily_Returns  Avg_Sentiment
0 2010-08-04       0.004009           0.00
1 2010-10-26      -0.002558           0.00
2 2010-11-30      -0.018052           0.50
3 2011-01-21      -0.017915           0.00
4 2011-03-31      -0.000344          -0.15


In [12]:
def correlation_analysis(merged_data):
    if 'Daily_Returns' not in merged_data.columns or 'Avg_Sentiment' not in merged_data.columns:
        raise ValueError("Merged data is missing required columns.")
    
    correlation, p_value = pearsonr(merged_data['Daily_Returns'].dropna(), merged_data['Avg_Sentiment'].dropna())
    
    print("\nCorrelation Analysis:")
    print(f"Pearson Correlation Coefficient: {correlation:.4f}")
    print(f"P-value: {p_value:.4f}")
    
    return correlation, p_value

correlation_analysis(merged_data)


Correlation Analysis:
Pearson Correlation Coefficient: 0.0752
P-value: 0.4712


(0.07521394492755487, 0.4712229167360318)