# Models embedded with Sentiment Scores

In [1]:
import pandas as pd
import numpy as np

## Loading and merging the data

### CoCo Index 

In [2]:
ts_data = pd.read_csv('CocoStudy.csv')
coco_data = ts_data.sort_values('Date')
coco_data = coco_data.drop(columns = ['Stock Index', 'Stock Returns', 'Bond Index', 'Bond Returns', 'Outlier Score'])
coco_data = coco_data.rename(columns={'Coco Index': 'Coco'})
coco_data = coco_data.rename(columns={'Coco Returns': 'd_Coco'})
coco_data.head()

Unnamed: 0,Date,Coco,d_Coco
1222,2014-06-04,119.260403,0.001599
673,2014-06-05,119.744212,0.004057
220,2014-06-06,120.607632,0.007211
271,2014-06-09,121.569545,0.007976
992,2014-06-10,121.921731,0.002897


### Euro Stoxx 50

In [3]:
es50_data = pd.read_csv('EuroStoxx50.csv')

In [4]:
es50_data['d_ES50'] = es50_data['Close'].pct_change(1)
es50_data = es50_data[['Date', 'Close', 'd_ES50']]
es50_data = es50_data.rename(columns={'Close': 'ES50'})

In [5]:
data_temp = pd.merge(coco_data, es50_data, how='inner', on = 'Date')

### iTraxx

In [6]:
itraxx_data = pd.read_csv('iTraxx.csv')

In [7]:
from datetime import datetime
itraxx_data['Date'] = itraxx_data['Date'].map(lambda x: datetime.strptime(x, '%b %d, %Y'))
itraxx_data['d_iTraxx'] = itraxx_data['Price'].pct_change(1)

In [8]:
itraxx_data = itraxx_data[['Date', 'Price', 'd_iTraxx']]
itraxx_data = itraxx_data.rename(columns={'Price': 'iTraxx'})

In [9]:
data_temp['Date'] = pd.to_datetime(data_temp['Date'])
data_temp = pd.merge(data_temp, itraxx_data, how='inner', on = 'Date')

### US 5 year Bonds Yields

In [10]:
r_data = pd.read_csv('US_5Year_Bond_Yield.csv')

In [11]:
from datetime import datetime
r_data['Date'] = r_data['Date'].map(lambda x: datetime.strptime(x, '%b %d, %Y'))
r_data['d_r'] = r_data['Price'].pct_change(1)

In [12]:
r_data = r_data[['Date', 'Price', 'd_r']]
r_data = r_data.rename(columns={'Price': 'r'})

In [13]:
df = pd.merge(data_temp, r_data, how='inner', on = 'Date')

In [14]:
df.head()

Unnamed: 0,Date,Coco,d_Coco,ES50,d_ES50,iTraxx,d_iTraxx,r,d_r
0,2014-06-04,119.260403,0.001599,3237.929932,-0.00096,115.74,-0.001897,1.644,0.011692
1,2014-06-05,119.744212,0.004057,3267.050049,0.008993,115.96,-0.000776,1.625,-0.014554
2,2014-06-06,120.607632,0.007211,3294.280029,0.008335,116.05,-0.001806,1.649,0.000607
3,2014-06-10,121.921731,0.002897,3313.800049,0.005925,116.2,0.000603,1.711,0.007063
4,2014-06-11,121.605514,-0.002594,3289.090088,-0.007457,116.13,0.000517,1.699,0.009507


### Sentiment predictions

In [15]:
#sentiment_data = pd.read_json('prediction_LIN.json')
sentiment_data = pd.read_json('prediction_RBF.json')
#sentiment_data = pd.read_csv('pred_SEMI.csv')
#sentiment = pd.read_csv('pred_TRANSD.csv')

sentiment_data = sentiment_data.sort_values('DATE')
sentiment_data = sentiment_data[['DATE', 'SENTIMENT']]
sentiment_data.head()

Unnamed: 0,DATE,SENTIMENT
1228,2004-10-18,1
1233,2009-10-30,0
1209,2009-11-06,-1
1215,2009-11-11,-1
321,2009-11-13,-1


In [16]:
df.loc[:, 'Sentiment'] = np.zeros(df.shape[0])

In [17]:
i = 0

for date_return in df.Date:
    j = 0
    
    for date_sent in sentiment_data.DATE:
        if str(date_sent)==str(date_return):
            df.loc[i, 'Sentiment'] += sentiment_data.loc[j, 'SENTIMENT']
            
        j+=1
    i+=1

In [18]:
df['d_Sentiment'] = df['Sentiment'] - df['Sentiment'].shift()
df['lag_Sentiment'] = df['Sentiment'].shift()
df['d_ES50_squared'] = df['d_ES50']**2

In [19]:
df[['d_Sentiment']] = df['d_Sentiment'].apply(lambda x: x/100)

In [20]:
df.head()

Unnamed: 0,Date,Coco,d_Coco,ES50,d_ES50,iTraxx,d_iTraxx,r,d_r,Sentiment,d_Sentiment,lag_Sentiment,d_ES50_squared
0,2014-06-04,119.260403,0.001599,3237.929932,-0.00096,115.74,-0.001897,1.644,0.011692,0.0,,,9.208351e-07
1,2014-06-05,119.744212,0.004057,3267.050049,0.008993,115.96,-0.000776,1.625,-0.014554,0.0,0.0,0.0,8.08819e-05
2,2014-06-06,120.607632,0.007211,3294.280029,0.008335,116.05,-0.001806,1.649,0.000607,0.0,0.0,0.0,6.946772e-05
3,2014-06-10,121.921731,0.002897,3313.800049,0.005925,116.2,0.000603,1.711,0.007063,0.0,0.0,0.0,3.51107e-05
4,2014-06-11,121.605514,-0.002594,3289.090088,-0.007457,116.13,0.000517,1.699,0.009507,0.0,0.0,0.0,5.560214e-05


In [21]:
df.tail()

Unnamed: 0,Date,Coco,d_Coco,ES50,d_ES50,iTraxx,d_iTraxx,r,d_r,Sentiment,d_Sentiment,lag_Sentiment,d_ES50_squared
1279,2019-08-05,182.816682,-0.009691,3310.929932,-0.019309,119.83,-0.00125,1.528,0.001311,0.0,0.0,0.0,0.000373
1280,2019-08-06,183.213285,0.002169,3291.659912,-0.00582,119.98,0.000334,1.526,-0.010376,-1.0,-0.01,0.0,3.4e-05
1281,2019-08-07,183.127889,-0.000466,3309.98999,0.005569,119.94,-0.001914,1.542,-0.001295,0.0,0.01,-1.0,3.1e-05
1282,2019-08-08,183.666437,0.002941,3375.379883,0.019755,120.17,0.001584,1.544,-0.022166,0.0,0.0,0.0,0.00039
1283,2019-08-09,183.355698,-0.001692,3333.73999,-0.012336,119.98,0.000751,1.579,0.0,0.0,0.0,0.0,0.000152


In [22]:
df = df.dropna()

In [29]:
df.to_csv('regression_data.csv')

## Linear Regression

In [23]:
import statsmodels.formula.api as smf

In [24]:
mod = smf.ols('d_Coco ~  d_ES50 + d_ES50_squared + d_iTraxx + d_r  + d_Sentiment ', data=df).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                 d_Coco   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.335
Method:                 Least Squares   F-statistic:                     127.4
Date:                Sun, 17 May 2020   Prob (F-statistic):          4.16e-109
Time:                        11:22:45   Log-Likelihood:                 5114.5
No. Observations:                1254   AIC:                        -1.022e+04
Df Residuals:                    1248   BIC:                        -1.019e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0004      0.000      2.

In [25]:
mod = smf.ols('d_Coco ~  d_ES50 + d_ES50_squared + d_iTraxx + d_r  + d_Sentiment ', data=df).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                 d_Coco   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.335
Method:                 Least Squares   F-statistic:                     127.4
Date:                Sun, 17 May 2020   Prob (F-statistic):          4.16e-109
Time:                        11:22:45   Log-Likelihood:                 5114.5
No. Observations:                1254   AIC:                        -1.022e+04
Df Residuals:                    1248   BIC:                        -1.019e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0004      0.000      2.

## Granger Causality

In [26]:
from statsmodels.tsa.stattools import grangercausalitytests

Does Sentiment granger cause CoCo?

In [27]:
granger_data = df[[ 'd_Coco', 'd_Sentiment']]
gtest = grangercausalitytests(granger_data, 3)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2832  , p=0.5947  , df_denom=1250, df_num=1
ssr based chi2 test:   chi2=0.2839  , p=0.5941  , df=1
likelihood ratio test: chi2=0.2839  , p=0.5942  , df=1
parameter F test:         F=0.2832  , p=0.5947  , df_denom=1250, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.9728  , p=0.3783  , df_denom=1247, df_num=2
ssr based chi2 test:   chi2=1.9535  , p=0.3765  , df=2
likelihood ratio test: chi2=1.9519  , p=0.3768  , df=2
parameter F test:         F=0.9728  , p=0.3783  , df_denom=1247, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.7446  , p=0.5255  , df_denom=1244, df_num=3
ssr based chi2 test:   chi2=2.2463  , p=0.5229  , df=3
likelihood ratio test: chi2=2.2443  , p=0.5233  , df=3
parameter F test:         F=0.7446  , p=0.5255  , df_denom=1244, df_num=3


***
Does CoCo granger cause Sentiment?

In [28]:
granger_data = df[[ 'd_Sentiment', 'd_Coco']]
gtest = grangercausalitytests(granger_data, 3)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0950  , p=0.7579  , df_denom=1250, df_num=1
ssr based chi2 test:   chi2=0.0953  , p=0.7576  , df=1
likelihood ratio test: chi2=0.0953  , p=0.7576  , df=1
parameter F test:         F=0.0950  , p=0.7579  , df_denom=1250, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.2209  , p=0.8018  , df_denom=1247, df_num=2
ssr based chi2 test:   chi2=0.4435  , p=0.8011  , df=2
likelihood ratio test: chi2=0.4435  , p=0.8011  , df=2
parameter F test:         F=0.2209  , p=0.8018  , df_denom=1247, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.5970  , p=0.1883  , df_denom=1244, df_num=3
ssr based chi2 test:   chi2=4.8179  , p=0.1856  , df=3
likelihood ratio test: chi2=4.8086  , p=0.1864  , df=3
parameter F test:         F=1.5970  , p=0.1883  , df_denom=1244, df_num=3


## Final Remarks

Coefficient for Sentiment is significant at 10%. Ganger causality suggest that time series of sentiment is not useful in predicting the time series of CoCo returns