<a href="https://colab.research.google.com/github/megcp/Research-project-sentiment-driven-stock-market-prediction/blob/main/RP4_Feature_Engineering_and_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install torch



In [None]:
# importing dependencies
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import plotly.express as px
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used in this notebook live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Aggregate sentiment scores for each day

In [None]:
# load dataframe with LLM sentiments (per-title sentiment label and class probabilities)
df = pd.read_csv('/content/drive/My Drive/Research Project/apple_sentiment_probabilites_2022_2024.csv')
# do not truncate long cell contents (e.g. titles) when displaying dataframes
pd.set_option('max_colwidth', None)
df.head()

Unnamed: 0,Time Published,Title,Sentiment_title,Neutral_Prob_title,Positive_Prob_title,Negative_Prob_title
0,2022-03-07 15:03:00,"GOOGL Boosts Podcast Efforts: Should SPOT, AMZN & AAPL Worry?",positive,1e-06,0.999999,1.425253e-07
1,2022-03-07 17:05:00,What to Expect From Apple's First Big Event of The Year,neutral,0.999801,7.3e-05,0.0001254924
2,2022-03-08 03:16:11,Apple TV+ Enters Comcast Xfinity Platform: Is iPhone Maker's Streaming Push Turning Into A Shove?,neutral,0.997162,0.000518,0.002319331
3,2022-03-08 14:33:00,Apple Event 2022: New Low-End iPhone Expected to Have 5G Service,neutral,0.999533,0.000448,1.881107e-05
4,2022-03-08 16:15:00,What To Expect From Apple's Spring Product Launch Event,neutral,0.997772,0.000206,0.002022169


In [None]:
# create a sentiment score for each article
# positive probability is * by 1 to reflect its overall positive influence
# negative probability is * by -1 to reflect its overall negative influence
# neutral is * by 0, ultimately to reflect its lack of influence
# => the score is effectively positive probability minus negative probability
df['Sentiment Score'] = df['Positive_Prob_title'] * 1 + df['Negative_Prob_title'] * -1 + df['Neutral_Prob_title'] * 0

# turn time published into datetime
df['Time Published'] = pd.to_datetime(df['Time Published'])

# extract the date so it can be used to aggregate articles published on the same day
df['Date'] = df['Time Published'].dt.date
df.head()

Unnamed: 0,Time Published,Title,Sentiment_title,Neutral_Prob_title,Positive_Prob_title,Negative_Prob_title,Sentiment Score,Date
0,2022-03-07 15:03:00,"GOOGL Boosts Podcast Efforts: Should SPOT, AMZN & AAPL Worry?",positive,1e-06,0.999999,1.425253e-07,0.999999,2022-03-07
1,2022-03-07 17:05:00,What to Expect From Apple's First Big Event of The Year,neutral,0.999801,7.3e-05,0.0001254924,-5.2e-05,2022-03-07
2,2022-03-08 03:16:11,Apple TV+ Enters Comcast Xfinity Platform: Is iPhone Maker's Streaming Push Turning Into A Shove?,neutral,0.997162,0.000518,0.002319331,-0.001801,2022-03-08
3,2022-03-08 14:33:00,Apple Event 2022: New Low-End iPhone Expected to Have 5G Service,neutral,0.999533,0.000448,1.881107e-05,0.000429,2022-03-08
4,2022-03-08 16:15:00,What To Expect From Apple's Spring Product Launch Event,neutral,0.997772,0.000206,0.002022169,-0.001816,2022-03-08


In [None]:
# aggregate per day: sum the sentiment scores and count the number of articles published each day
# ('size' on 'Title' returns the number of rows in each group, so 'Title' becomes the article count)
daily_sentiment = df.groupby(df['Date']).agg({
    'Sentiment Score': 'sum',
    'Title': 'size'
}).reset_index()

In [None]:
# view the aggregated dataframe
daily_sentiment

Unnamed: 0,Date,Sentiment Score,Title
0,2022-03-07,0.999947,2
1,2022-03-08,4.052418,15
2,2022-03-09,3.897055,10
3,2022-03-10,1.999892,4
4,2022-03-11,-0.010033,2
...,...,...,...
745,2024-06-01,0.999999,1
746,2024-06-02,0.265059,2
747,2024-06-03,0.987534,5
748,2024-06-04,1.206946,8


In [None]:
# calculate average sentiment score for the day:
# daily sum of scores divided by the per-day article count held in 'Title'
daily_sentiment['Average Sentiment Score'] = daily_sentiment['Sentiment Score'] / daily_sentiment['Title']

In [None]:
daily_sentiment

Unnamed: 0,Date,Sentiment Score,Title,Average Sentiment Score
0,2022-03-07,0.999947,2,0.499973
1,2022-03-08,4.052418,15,0.270161
2,2022-03-09,3.897055,10,0.389705
3,2022-03-10,1.999892,4,0.499973
4,2022-03-11,-0.010033,2,-0.005017
...,...,...,...,...
745,2024-06-01,0.999999,1,0.999999
746,2024-06-02,0.265059,2,0.132530
747,2024-06-03,0.987534,5,0.197507
748,2024-06-04,1.206946,8,0.150868


# Merge sentiment data with stock data

In [None]:
# load stock data and view it
# index_col=False: do not use any column of the CSV as the DataFrame index
stock= pd.read_csv('/content/drive/My Drive/Research Project/clean_apple_stocks_2022_2024.csv', index_col=False)
stock

Unnamed: 0,timestamp,open,high,low,close,volume
0,2024-06-05,195.400,196.90,194.8700,195.87,54156785
1,2024-06-04,194.635,195.32,193.0342,194.35,47471445
2,2024-06-03,192.900,194.99,192.5200,194.03,50080539
3,2024-05-31,191.440,192.57,189.9100,192.25,75158277
4,2024-05-30,190.760,192.18,190.6300,191.29,49947941
...,...,...,...,...,...,...
561,2022-03-11,158.930,159.28,154.5000,154.73,96970102
562,2022-03-10,160.200,160.39,155.9800,158.52,105342033
563,2022-03-09,161.475,163.41,159.4100,162.95,91454905
564,2022-03-08,158.820,162.88,155.8000,157.44,131148280


In [None]:
# like sentiment, create a datetime 'Date' column to be used for merging, then sort ascending by date
# so the rows align with the sentiment dataframe (the file is displayed newest-first when loaded)
stock['Date'] = pd.to_datetime(stock['timestamp'])
stock = stock.sort_values(['Date'])

In [None]:
# The aggregated sentiment dates must be pandas datetimes to join against the calendar and the stock frame.
daily_sentiment['Date'] = pd.to_datetime(daily_sentiment['Date'])

# Build a calendar with one row for every single day between the first and the last sentiment date,
# so that the merged frame has a row for each day in that span.
full_date_range = pd.date_range(
    start=daily_sentiment['Date'].min(),
    end=daily_sentiment['Date'].max(),
    freq='D',
)
full_date_df = pd.DataFrame({'Date': full_date_range})

# First attach the daily sentiment to the calendar, then attach the stock rows by 'Date'.
# Days missing from either source are left as NaN in that source's columns.
merged_df = (
    full_date_df
    .merge(daily_sentiment, on='Date', how='outer')
    .merge(stock, on='Date', how='left')
)

In [None]:
merged_df

Unnamed: 0,Date,Sentiment Score,Title,Average Sentiment Score,timestamp,open,high,low,close,volume
0,2022-03-07,0.999947,2.0,0.499973,2022-03-07,163.360,165.02,159.0400,159.30,96418845.0
1,2022-03-08,4.052418,15.0,0.270161,2022-03-08,158.820,162.88,155.8000,157.44,131148280.0
2,2022-03-09,3.897055,10.0,0.389705,2022-03-09,161.475,163.41,159.4100,162.95,91454905.0
3,2022-03-10,1.999892,4.0,0.499973,2022-03-10,160.200,160.39,155.9800,158.52,105342033.0
4,2022-03-11,-0.010033,2.0,-0.005017,2022-03-11,158.930,159.28,154.5000,154.73,96970102.0
...,...,...,...,...,...,...,...,...,...,...
817,2024-06-01,0.999999,1.0,0.999999,,,,,,
818,2024-06-02,0.265059,2.0,0.132530,,,,,,
819,2024-06-03,0.987534,5.0,0.197507,2024-06-03,192.900,194.99,192.5200,194.03,50080539.0
820,2024-06-04,1.206946,8.0,0.150868,2024-06-04,194.635,195.32,193.0342,194.35,47471445.0


In [None]:
# Remove the columns that are no longer needed after the merge (the summed score,
# the per-day article count in 'Title', and the original 'timestamp'), then preview.
merged_df = merged_df.drop(columns=['Sentiment Score', 'Title', 'timestamp'])
merged_df.head()

Unnamed: 0,Date,Average Sentiment Score,open,high,low,close,volume
0,2022-03-07,0.499973,163.36,165.02,159.04,159.3,96418845.0
1,2022-03-08,0.270161,158.82,162.88,155.8,157.44,131148280.0
2,2022-03-09,0.389705,161.475,163.41,159.41,162.95,91454905.0
3,2022-03-10,0.499973,160.2,160.39,155.98,158.52,105342033.0
4,2022-03-11,-0.005017,158.93,159.28,154.5,154.73,96970102.0


# Handling missing data

In [None]:
# check what is missing
merged_df.isnull().sum()

Date                         0
Average Sentiment Score     72
open                       256
high                       256
low                        256
close                      256
volume                     256
dtype: int64

In [None]:
# Fill missing average sentiment with 0 - to reflect neutral sentiment, chosen to avoid biases
# (rows are sorted by date first; NaN sentiment comes from calendar days absent from daily_sentiment)
merged_df = merged_df.sort_values(['Date'])
merged_df['Average Sentiment Score'] = merged_df['Average Sentiment Score'].fillna(0)
merged_df.isnull().sum()

Date                         0
Average Sentiment Score      0
open                       256
high                       256
low                        256
close                      256
volume                     256
dtype: int64

In [None]:
# Drop every row (day) that still contains a NaN, i.e. days without stock data -
# presumably weekends and market holidays, when no prices are available.
merged_df = merged_df.dropna()
merged_df.isnull().sum()

Date                       0
Average Sentiment Score    0
open                       0
high                       0
low                        0
close                      0
volume                     0
dtype: int64

In [None]:
# save the data frame
# NOTE: index=False is not passed, so the DataFrame index is written as an extra unnamed first
# column; it comes back as 'Unnamed: 0' when the file is read again.
merged_df.to_csv('/content/drive/My Drive/Research Project/merged_df_16062024.csv')

In [None]:
# reload it
# NOTE: this rebinds `stock` (previously the raw price frame) to the merged sentiment + price data
stock = pd.read_csv('/content/drive/My Drive/Research Project/merged_df_16062024.csv')

In [None]:
stock

Unnamed: 0.1,Unnamed: 0,Date,Average Sentiment Score,open,high,low,close,volume
0,0,2022-03-07,0.499973,163.360,165.02,159.0400,159.30,96418845.0
1,1,2022-03-08,0.270161,158.820,162.88,155.8000,157.44,131148280.0
2,2,2022-03-09,0.389705,161.475,163.41,159.4100,162.95,91454905.0
3,3,2022-03-10,0.499973,160.200,160.39,155.9800,158.52,105342033.0
4,4,2022-03-11,-0.005017,158.930,159.28,154.5000,154.73,96970102.0
...,...,...,...,...,...,...,...,...
561,815,2024-05-30,0.097855,190.760,192.18,190.6300,191.29,49947941.0
562,816,2024-05-31,-0.058481,191.440,192.57,189.9100,192.25,75158277.0
563,819,2024-06-03,0.197507,192.900,194.99,192.5200,194.03,50080539.0
564,820,2024-06-04,0.150868,194.635,195.32,193.0342,194.35,47471445.0


# Feature Engineering

In [None]:
# daily return: percentage change of the close versus the previous row
stock['daily return'] = stock['close'].pct_change()

# 7-day rolling average of the close
stock['7-day rolling avg'] = stock['close'].rolling(window=7).mean()

# 14-day exponential moving average of the close
stock['14-day EMA'] = stock['close'].ewm(span=14, adjust=False).mean()

# change: absolute difference of the close versus the previous row
stock['change'] = stock['close'].diff()

# gain and loss: the positive part of the change, and the magnitude of the negative part.
# Vectorised with clip() instead of a per-row lambda. Unlike the lambda, this keeps the
# first row (whose change is NaN) as NaN rather than counting it as a zero gain/loss
# in the 14-row averages below.
stock['gain'] = stock['change'].clip(lower=0)
stock['loss'] = stock['change'].clip(upper=0).abs()

# avg gain and loss (simple rolling mean over 14 rows)
stock['avg gain'] = stock['gain'].rolling(window=14).mean()
stock['avg loss'] = stock['loss'].rolling(window=14).mean()

# RS: ratio of average gain to average loss
stock['RS'] = stock['avg gain'] / stock['avg loss']

# RSI
stock['RSI'] = 100 - (100/ (1+stock['RS']))

# month, day of week and year
stock['Date']= pd.to_datetime(stock['Date'])
stock['month'] = stock['Date'].dt.month
stock['day_of_week'] = stock['Date'].dt.dayofweek
stock['year'] = stock['Date'].dt.year

# daily variation (volatility): high-low range relative to the open
stock['daily variation'] = (stock['high'] - stock['low']) / stock['open']

# MACD: 12-day EMA minus 26-day EMA, with a 9-day EMA of the MACD as signal line
stock['12-day EMA'] = stock['close'].ewm(span=12, adjust=False).mean()
stock['26-day EMA'] = stock['close'].ewm(span=26, adjust=False).mean()
stock['MACD'] = stock['12-day EMA'] - stock['26-day EMA']
stock['MACD Signal Line'] = stock['MACD'].ewm(span=9, adjust=False).mean()

# stochastic oscillator: position of the close within the 14-row low/high range (%K),
# and the 3-row rolling mean of %K (%D)
stock['14 Day Low'] = stock['low'].rolling(window=14).min()
stock['14 Day High'] = stock['high'].rolling(window=14).max()
stock['%K'] = (stock['close'] - stock['14 Day Low']) / (stock['14 Day High'] - stock['14 Day Low']) * 100
stock['%D'] = stock['%K'].rolling(window=3).mean()

# encoded sentiment
def encode_sentiment(score):
    """Bucket a sentiment score into one of three classes.

    Returns 1 when the score is strictly greater than 0.1, -1 when it is
    strictly less than -0.1, and 0 for everything else (including exactly
    0.1 and -0.1).
    """
    if score > 0.1:
        return 1
    return -1 if score < -0.1 else 0

# bucket each day's average sentiment score with encode_sentiment (element-wise)
stock['encoded sentiment'] = stock['Average Sentiment Score'].apply(encode_sentiment)

# create target variable
def movement(daily_return):
    """Label the direction of a daily return.

    Returns 1 for a return greater than 0 and -1 for a return less than 0.
    For any other value (a return of exactly 0, or NaN) no label is
    produced and None is returned.
    """
    if daily_return > 0:
        return 1
    if daily_return < 0:
        return -1
    return None

# label each row from its own daily return; returns that are neither > 0 nor < 0
# (exactly 0, or NaN) get no label and are left as missing values in 'movement'
stock['movement'] = stock['daily return'].apply(movement)

In [None]:
#check for imbalanced data- fairly balanced
stock['encoded sentiment'].value_counts()

encoded sentiment
 0    211
 1    203
-1    152
Name: count, dtype: int64

In [None]:
# drop the additional rows created when making SMAs and EMAs
# (this removes every row containing any NaN, so rows with a missing 'movement' label are dropped too)
stock.dropna(inplace=True)

In [None]:
stock.head()

Unnamed: 0.1,Unnamed: 0,Date,Average Sentiment Score,open,high,low,close,volume,daily return,7-day rolling avg,...,12-day EMA,26-day EMA,MACD,MACD Signal Line,14 Day Low,14 Day High,%K,%D,encoded sentiment,movement
15,21,2022-03-28,0.210294,172.17,175.73,172.0,175.6,90371916.0,0.005037,170.397143,...,167.467427,163.687068,3.780359,1.844811,150.1,175.73,99.492782,98.992538,1,1.0
16,22,2022-03-29,0.497262,176.69,179.01,176.34,178.96,100589440.0,0.019134,172.537143,...,169.235516,164.818396,4.417119,2.359273,150.1,179.01,99.827049,99.031948,1,1.0
17,23,2022-03-30,0.002144,178.55,179.61,176.7,177.77,92633154.0,-0.00665,174.307143,...,170.548513,165.777774,4.770739,2.841566,150.1,179.61,93.764825,97.694886,0,-1.0
18,24,2022-03-31,0.00665,177.84,178.03,174.4,174.61,103049285.0,-0.017776,175.134286,...,171.173357,166.432013,4.741344,3.221522,150.1,179.61,83.056591,92.216155,0,-1.0
19,25,2022-04-01,0.203714,174.03,174.88,171.94,174.31,78751328.0,-0.001718,175.72,...,171.655918,167.015568,4.64035,3.505287,150.38,179.61,81.867944,86.229787,1,-1.0


In [None]:
# get the column names
stock.columns

Index(['Unnamed: 0', 'Date', 'Average Sentiment Score', 'open', 'high', 'low',
       'close', 'volume', 'daily return', '7-day rolling avg', '14-day EMA',
       'change', 'gain', 'loss', 'avg gain', 'avg loss', 'RS', 'RSI', 'month',
       'day_of_week', 'year', 'daily variation', '12-day EMA', '26-day EMA',
       'MACD', 'MACD Signal Line', '14 Day Low', '14 Day High', '%K', '%D',
       'encoded sentiment', 'movement'],
      dtype='object')

In [None]:
# Remove columns that are not needed in the final dataset: the saved index column
# and the intermediate columns used to build RSI, MACD and the stochastic oscillator.
stock = stock.drop(columns=[
    'Unnamed: 0',
    'RS', 'gain', 'loss',
    '14 Day Low', '14 Day High',
    '12-day EMA', '26-day EMA',
])

In [None]:
stock.head()

Unnamed: 0,Date,Average Sentiment Score,open,high,low,close,volume,daily return,7-day rolling avg,14-day EMA,...,month,day_of_week,year,daily variation,MACD,MACD Signal Line,%K,%D,encoded sentiment,movement
15,2022-03-28,0.210294,172.17,175.73,172.0,175.6,90371916.0,0.005037,170.397143,166.601017,...,3,0,2022,0.021665,3.780359,1.844811,99.492782,98.992538,1,1.0
16,2022-03-29,0.497262,176.69,179.01,176.34,178.96,100589440.0,0.019134,172.537143,168.248882,...,3,1,2022,0.015111,4.417119,2.359273,99.827049,99.031948,1,1.0
17,2022-03-30,0.002144,178.55,179.61,176.7,177.77,92633154.0,-0.00665,174.307143,169.518364,...,3,2,2022,0.016298,4.770739,2.841566,93.764825,97.694886,0,-1.0
18,2022-03-31,0.00665,177.84,178.03,174.4,174.61,103049285.0,-0.017776,175.134286,170.197249,...,3,3,2022,0.020412,4.741344,3.221522,83.056591,92.216155,0,-1.0
19,2022-04-01,0.203714,174.03,174.88,171.94,174.31,78751328.0,-0.001718,175.72,170.745616,...,4,4,2022,0.016894,4.64035,3.505287,81.867944,86.229787,1,-1.0


In [None]:
#checking for nulls again
stock.isnull().sum()

Date                       0
Average Sentiment Score    0
open                       0
high                       0
low                        0
close                      0
volume                     0
daily return               0
7-day rolling avg          0
14-day EMA                 0
change                     0
avg gain                   0
avg loss                   0
RSI                        0
month                      0
day_of_week                0
year                       0
daily variation            0
MACD                       0
MACD Signal Line           0
%K                         0
%D                         0
encoded sentiment          0
movement                   0
dtype: int64

In [None]:
# save the dataset with the engineered technical features
# DataFrame.to_csv returns None when it is given a path, so the result must not be
# assigned back to `stock` - doing so would replace the DataFrame with None.
stock.to_csv('/content/drive/My Drive/Research Project/apple_stock_prices_feature_engineering_2022_2024_FINALLL.csv', index= False)

# Creating lagged features for sentiment

In [None]:
stock = pd.read_csv('/content/drive/My Drive/Research Project/apple_stock_prices_feature_engineering_2022_2024_FINALLL.csv')

In [None]:
stock.head()

Unnamed: 0,Date,Average Sentiment Score,open,high,low,close,volume,daily return,7-day rolling avg,14-day EMA,...,month,day_of_week,year,daily variation,MACD,MACD Signal Line,%K,%D,encoded sentiment,movement
0,2022-03-28,0.210294,172.17,175.73,172.0,175.6,90371916.0,0.005037,170.397143,166.601017,...,3,0,2022,0.021665,3.780359,1.844811,99.492782,98.992538,1,1.0
1,2022-03-29,0.497262,176.69,179.01,176.34,178.96,100589440.0,0.019134,172.537143,168.248882,...,3,1,2022,0.015111,4.417119,2.359273,99.827049,99.031948,1,1.0
2,2022-03-30,0.002144,178.55,179.61,176.7,177.77,92633154.0,-0.00665,174.307143,169.518364,...,3,2,2022,0.016298,4.770739,2.841566,93.764825,97.694886,0,-1.0
3,2022-03-31,0.00665,177.84,178.03,174.4,174.61,103049285.0,-0.017776,175.134286,170.197249,...,3,3,2022,0.020412,4.741344,3.221522,83.056591,92.216155,0,-1.0
4,2022-04-01,0.203714,174.03,174.88,171.94,174.31,78751328.0,-0.001718,175.72,170.745616,...,4,4,2022,0.016894,4.64035,3.505287,81.867944,86.229787,1,-1.0


In [None]:
# Lagged continuous sentiment: the average sentiment score from N rows earlier
# (lags count rows of the dataframe, not calendar days).
for lag, column in [
    (1, 'sentiment 1D shift'),
    (5, 'sentiment 5D shift'),
    (7, 'sentiment 7D shift'),
    (10, 'sentiment 10D shift'),
    (14, 'sentiment 14D shift'),
    (30, 'sentiment 30D shift'),
]:
    stock[column] = stock['Average Sentiment Score'].shift(lag)

# Trailing rolling means of the average sentiment score (window includes the current row).
for window, column in [
    (5, 'Sentiment 5D Rolling Avg'),
    (10, 'Sentiment 10D Rolling Avg'),
    (14, 'Sentiment 14D Rolling Avg'),
]:
    stock[column] = stock['Average Sentiment Score'].rolling(window=window).mean()

# Lagged encoded (-1 / 0 / 1) sentiment.
# NOTE: the 7D column name is misspelled ('enocded'); it is kept exactly as-is so that
# anything reading the saved CSV by column name keeps working.
for lag, column in [
    (1, 'encoded sentiment 1D shift'),
    (5, 'encoded sentiment 5D shift'),
    (7, 'enocded sentiment 7D shift'),
    (10, 'encoded sentiment 10D shift'),
    (14, 'encoded sentiment 14D shift'),
]:
    stock[column] = stock['encoded sentiment'].shift(lag)

In [None]:
# drop NaN values created when creating lagged features
# (removes every row containing any NaN; the remaining rows keep their original index labels)
stock.dropna(inplace=True)

In [None]:
stock

Unnamed: 0,Date,Average Sentiment Score,open,high,low,close,volume,daily return,7-day rolling avg,14-day EMA,...,sentiment 14D shift,sentiment 30D shift,Sentiment 5D Rolling Avg,Sentiment 10D Rolling Avg,Sentiment 14D Rolling Avg,encoded sentiment 1D shift,encoded sentiment 5D shift,enocded sentiment 7D shift,encoded sentiment 10D shift,encoded sentiment 14D shift
30,2022-05-10,0.057865,155.520,156.7400,152.9300,154.51,115366736.0,0.016112,157.725714,159.349249,...,-0.001057,0.210294,0.003697,0.032672,0.071413,0.0,0.0,-1.0,0.0,0.0
31,2022-05-11,0.000212,153.500,155.4500,145.8100,146.50,142689825.0,-0.051841,156.088571,157.636016,...,0.001353,0.497262,0.003439,0.018439,0.071331,0.0,0.0,-1.0,1.0,0.0
32,2022-05-12,-0.492591,142.770,146.2000,138.8000,142.56,182602041.0,-0.026894,153.671429,155.625881,...,0.968269,0.002144,-0.095079,-0.091322,-0.033016,0.0,0.0,0.0,1.0,1.0
33,2022-05-13,-0.000451,144.590,148.1050,143.1100,147.11,113990852.0,0.031916,150.970000,154.490430,...,-0.296904,0.006650,-0.095159,-0.071573,-0.011841,-1.0,0.0,0.0,-1.0,-1.0
34,2022-05-16,0.189421,145.550,147.5199,144.1800,145.54,86643781.0,-0.010672,149.365714,153.297039,...,0.000342,0.203714,-0.049109,-0.028492,0.001665,0.0,0.0,0.0,-1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,2024-05-30,0.097855,190.760,192.1800,190.6300,191.29,49947941.0,0.005255,190.240000,188.099045,...,-0.044156,-0.076579,0.154333,0.188640,0.174935,1.0,1.0,1.0,-1.0,0.0
545,2024-05-31,-0.058481,191.440,192.5700,189.9100,192.25,75158277.0,0.005019,190.225714,188.652506,...,0.051371,-0.260085,0.125944,0.176233,0.167089,0.0,0.0,1.0,0.0,0.0
546,2024-06-03,0.197507,192.900,194.9900,192.5200,194.03,50080539.0,0.009259,190.672857,189.369505,...,0.499243,-0.197776,0.159457,0.156254,0.145536,0.0,0.0,1.0,1.0,1.0
547,2024-06-04,0.150868,194.635,195.3200,193.0342,194.35,47471445.0,0.001649,191.740000,190.033571,...,0.244076,0.313320,0.172818,0.156738,0.138878,1.0,0.0,0.0,1.0,1.0


In [None]:
stock.to_csv('/content/drive/My Drive/Research Project/apple_dataframe_ready_for_ML_3.csv', index= False)

In [None]:
stock.head()

Unnamed: 0,Date,Average Sentiment Score,open,high,low,close,volume,daily return,7-day rolling avg,14-day EMA,...,sentiment 14D shift,sentiment 30D shift,Sentiment 5D Rolling Avg,Sentiment 10D Rolling Avg,Sentiment 14D Rolling Avg,encoded sentiment 1D shift,encoded sentiment 5D shift,enocded sentiment 7D shift,encoded sentiment 10D shift,encoded sentiment 14D shift
30,2022-05-10,0.057865,155.52,156.74,152.93,154.51,115366736.0,0.016112,157.725714,159.349249,...,-0.001057,0.210294,0.003697,0.032672,0.071413,0.0,0.0,-1.0,0.0,0.0
31,2022-05-11,0.000212,153.5,155.45,145.81,146.5,142689825.0,-0.051841,156.088571,157.636016,...,0.001353,0.497262,0.003439,0.018439,0.071331,0.0,0.0,-1.0,1.0,0.0
32,2022-05-12,-0.492591,142.77,146.2,138.8,142.56,182602041.0,-0.026894,153.671429,155.625881,...,0.968269,0.002144,-0.095079,-0.091322,-0.033016,0.0,0.0,0.0,1.0,1.0
33,2022-05-13,-0.000451,144.59,148.105,143.11,147.11,113990852.0,0.031916,150.97,154.49043,...,-0.296904,0.00665,-0.095159,-0.071573,-0.011841,-1.0,0.0,0.0,-1.0,-1.0
34,2022-05-16,0.189421,145.55,147.5199,144.18,145.54,86643781.0,-0.010672,149.365714,153.297039,...,0.000342,0.203714,-0.049109,-0.028492,0.001665,0.0,0.0,0.0,-1.0,0.0


# EDA

In [None]:
stock = pd.read_csv('/content/drive/My Drive/Research Project/apple_dataframe_ready_for_ML_3.csv')

In [None]:
# Line chart of the daily closing price over time.
fig = px.line(stock, x='Date', y='close', title='Daily close of AAPL Stock')

# Show a range slider on the x-axis and a range selector with a single "all" button.
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector={"buttons": [{"step": "all"}]},
)

In [None]:
# Line chart of the daily average sentiment score over time.
fig = px.line(stock, x='Date', y='Average Sentiment Score', title='Average Sentiment Score of AAPL Stock')

# Show a range slider on the x-axis and a range selector with a single "all" button.
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector={"buttons": [{"step": "all"}]},
)

In [None]:
# Line chart of the daily return over time.
fig = px.line(stock, x='Date', y='daily return', title='daily return of AAPL Stock')

# Show a range slider on the x-axis and a range selector with a single "all" button.
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector={"buttons": [{"step": "all"}]},
)

## Correlation

In [None]:
# converting date into datetime, then computing pairwise correlations between the columns
# NOTE(review): corr() is called on the whole frame, including 'Date'; presumably the datetime
# conversion is what allows 'Date' to take part in the correlation - confirm this is intended.
stock['Date'] = pd.to_datetime(stock['Date'])
correlation_matrix = stock.corr()

In [None]:
# heatmap of the correlation matrix; text_auto=True asks plotly to print the value in each cell
fig = px.imshow(correlation_matrix,
                text_auto=True,
                aspect="auto",
                title="Correlation Heatmap")
fig.show()

In [None]:
# Second rendering of the correlation matrix using plotly's figure factory,
# with each cell annotated by the correlation rounded to 2 decimals.
import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(z=correlation_matrix.values,
                                  x=list(correlation_matrix.columns),
                                  y=list(correlation_matrix.index),
                                  colorscale='Viridis',
                                  annotation_text=correlation_matrix.round(2).values,
                                  showscale=True)

# blank title and a fixed 1600x1000 figure size
fig.update_layout(title=' ',
                  xaxis_nticks=36,
                  width=1600, height=1000)

fig.show()

In [None]:
## value counts

# Number of rows per encoded-sentiment class, ordered by class value,
# reshaped into a two-column frame for plotting.
value_counts = stock['encoded sentiment'].value_counts().sort_index()
value_counts_df = value_counts.rename_axis('encoded sentiment').reset_index(name='count')

# change encoded sentiment to categorical (string) labels
value_counts_df['encoded sentiment'] = value_counts_df['encoded sentiment'].astype(str)

# Bar chart of the class counts (panel "A."), drawn at a fixed 500x500 size.
fig = px.bar(
    value_counts_df,
    x='encoded sentiment',
    y='count',
    title='A.',
    color_discrete_sequence=['#636EFA'],
).update_layout(width=500, height=500)
fig.show()

In [None]:

# Same count-and-plot steps for the target variable 'movement'.
value_counts = stock['movement'].value_counts().sort_index()
value_counts_df = value_counts.rename_axis('movement').reset_index(name='count')

# use string labels for the movement classes
value_counts_df['movement'] = value_counts_df['movement'].astype(str)

# Bar chart of the class counts (panel "B."), drawn at a fixed 500x500 size.
fig = px.bar(
    value_counts_df,
    x='movement',
    y='count',
    title='B.',
    color_discrete_sequence=['#636EFA'],
).update_layout(width=500, height=500)
fig.show()