# Import Relevant Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Prepare inputs and targets

In [2]:
# Load the Google Twitter-sentiment time series (one row per date, several
# aggregated sentiment scores per row) and preview the first rows.
sentiment_csv = "../data/Twitter_sentiment/google_sentiment_time_series.csv"
google_sentiment = pd.read_csv(sentiment_csv)
google_sentiment.head()

Unnamed: 0,date,binary_mean,binary_squared_mean,binary_sqrt_mean,prob_mean,prob_squared_mean,prob_sqrt_mean
0,1/3/2020,1.0,1.0,0.999,0.173,0.172,0.173
1,1/4/2020,-1.0,-1.0,-0.999,-0.618,-0.624,-0.614
2,1/5/2020,1.0,1.0,0.999,0.191,0.172,0.201
3,1/6/2020,1.0,1.0,0.999,0.453,0.478,0.439
4,1/7/2020,-0.561,-0.618,-0.531,-0.433,-0.466,-0.416


In [3]:
# Load the GOOGL LSTM output (columns `Pred` and `Actual` per `Date`) and
# preview the first rows.
lstm_csv = "../data/LSTM/GOOGL_LSTM_predictions.csv"
google_lstm = pd.read_csv(lstm_csv)
google_lstm.head()

Unnamed: 0.1,Unnamed: 0,Date,Pred,Actual
0,3868,1/2/2020,1328.691596,1367.369995
1,3869,1/3/2020,1361.228533,1360.660034
2,3870,1/6/2020,1349.106885,1394.209961
3,3871,1/7/2020,1385.93088,1393.339966
4,3872,1/8/2020,1379.950078,1404.319946


In [4]:
# Inner-join sentiment rows with LSTM rows on the calendar date. Neither CSV
# was read with date parsing, so the keys are compared as raw strings; dates
# missing from either frame are dropped from the result.
merged = google_sentiment.merge(
    google_lstm,
    how='inner',
    left_on='date',
    right_on='Date',
)

In [5]:
# Keep only the columns used downstream (date, one sentiment feature, the
# LSTM prediction, and the actual value) and discard rows with any missing
# value among them.
model_columns = ['date', 'prob_sqrt_mean', 'Pred', 'Actual']
merged_changed = merged[model_columns].dropna()

In [6]:
# First regressor: the `prob_sqrt_mean` sentiment score as a NumPy array.
inputs_1 = merged_changed['prob_sqrt_mean'].values
inputs_1

array([ 0.173,  0.439, -0.416,  0.527,  0.382,  0.048, -0.193, -0.393,
       -0.043, -0.13 ,  0.245, -0.279, -0.017,  0.163, -0.256, -0.294,
        0.306, -0.327,  0.191, -0.084,  0.147,  0.031, -0.062,  0.144,
       -0.025,  0.099, -0.07 ,  0.066,  0.069,  0.518,  0.132, -0.001,
       -0.033,  0.007,  0.451,  0.131,  0.491,  0.161, -0.102, -0.275,
        0.097,  0.623,  0.11 ,  0.037,  0.18 ,  0.376, -0.165,  0.492,
        0.27 ,  0.472,  0.009,  0.327,  0.217, -0.204, -0.043,  0.034,
       -0.055, -0.046,  0.147, -0.226,  0.051,  0.141,  0.694,  0.498,
       -0.506, -0.099,  0.335,  0.041,  0.199,  0.288, -0.122, -0.544,
        0.019,  0.028, -0.03 ,  0.276, -0.052, -0.103,  0.123,  0.329,
        0.445,  0.309, -0.063,  0.111,  0.14 ,  0.041,  0.028,  0.009,
        0.165,  0.1  ,  0.123, -0.037, -0.011, -0.31 , -0.   ,  0.093,
       -0.332, -0.091, -0.181, -0.044,  0.059,  0.052,  0.145,  0.092,
        0.086, -0.059,  0.012, -0.196, -0.263,  0.177,  0.078,  0.021,
      

In [7]:
# Second regressor: the LSTM prediction (`Pred`) as a NumPy array.
inputs_2 = merged_changed['Pred'].values
inputs_2.shape

(213,)

In [8]:
# Place the two 1-D regressors side by side as the columns of one 2-D
# feature matrix (one row per observation).
feature_columns = [inputs_1, inputs_2]
inputs = np.stack(feature_columns, axis=1)
inputs.shape

(213, 2)

In [9]:
# Regression target: the `Actual` column. Selected by name rather than by
# position so that reordering the column list used to build `merged_changed`
# cannot silently swap in a different column.
targets = merged_changed['Actual'].values

# Fit a model

In [10]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

# Standardize the two regressors before fitting.
# The raw `inputs` matrix is deliberately left untouched: the earlier version
# of this cell overwrote `inputs` with the scaled and constant-augmented
# matrix, so running the cell a second time re-scaled already transformed data
# and fitted a different regression. Using a new name per stage keeps the cell
# idempotent.
sc = StandardScaler()
inputs_scaled = sc.fit_transform(inputs)

# Add an intercept column, then fit ordinary least squares of the targets on
# the standardized regressors.
design_matrix = sm.add_constant(inputs_scaled)
model = sm.OLS(targets, design_matrix)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.942
Model:                            OLS   Adj. R-squared:                  0.942
Method:                 Least Squares   F-statistic:                     1707.
Date:                Mon, 15 Feb 2021   Prob (F-statistic):          1.29e-130
Time:                        22:23:55   Log-Likelihood:                -1055.5
No. Observations:                 213   AIC:                             2117.
Df Residuals:                     210   BIC:                             2127.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1446.5477      2.370    610.296      0.0

# Save Results

In [11]:
# In-sample predictions from the fitted regression. No new regressors are
# supplied, so (per the statsmodels API) the model's own design matrix is used.
predictions = results.predict(exog=None)

In [12]:
# Put the date, the regression prediction, and the actual value side by side.
# The two Series carry the row index of `merged_changed`; `predictions` is
# presumably a plain array, which pandas matches to those rows by position.
prediction_columns = {
    'Date': merged_changed['date'],
    'Predictions': predictions,
    'Actual': merged_changed['Actual'],
}
google_stock_prediction = pd.DataFrame(prediction_columns)

# NOTE(review): to_csv also writes the row index as an unnamed first column
# (the source of the "Unnamed: 0" columns seen in the input CSVs). Left as-is
# because downstream readers may rely on the current file layout.
google_stock_prediction.to_csv("../Stock Price Predictions with Twitter/google_stock_prediction_twitter.csv")