In [1]:
# The libraries for this experiment are imported into the Python environment.
import os

import numpy as np
import scipy as sp
import pandas as pd

import statsmodels.api as sm

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression

import holoviews as hv
import hvplot.pandas

import datetime

import datashader as ds
from holoviews.operation.datashader import datashade, shade, dynspread, rasterize

In [2]:
hv.extension('bokeh')
#hv.archive.auto() 

In [3]:
news_volume_path = os.path.join(".",'news_volume.csv')
news_volume = pd.read_csv(news_volume_path)
news_volume.columns = ['index','date']

In [4]:
news_volume['Articles'] = 1

In [5]:
news_volume = news_volume.groupby('date').sum().loc[:,['Articles']]

In [6]:
news_volume = news_volume.rolling(90).sum()

In [7]:
news_volume = news_volume.reset_index()

In [8]:
news_volume.date = pd.to_datetime(news_volume.date)

In [12]:
%%opts Scatter [width=800 height=400]
%%output filename="./media/News Volume" fig="png"
news_volume.hvplot.scatter(x='date', y='Articles')

meta data 

In [4]:
dates_path = os.path.join(".",'12_dates.csv')
dates = pd.to_datetime(pd.read_csv(dates_path).iloc[:,1])

In [5]:
portfolios_path = os.path.join(".",'13092018_portfolios.csv')
portfolios = pd.read_csv(portfolios_path).iloc[:,1:].T

association

In [6]:
association_path = os.path.join(".",'13092018_association_risk.csv')
association = pd.read_csv(association_path)

In [7]:
association = association.iloc[:-1,1:]

In [8]:
association = association**-1

In [9]:
association.index = dates

In [10]:
%%opts Histogram [width=800 height=400] 

def association_plot(time):
    """Histogram of the reciprocal of one row of `association`.

    `time` is looked up as a label in `dates`; the value found there is then
    used as the row label into `association`.
    """
    row_label = dates.loc[time]
    selected_row = association.loc[row_label, :]
    return (1 / selected_row).hvplot.hist()

hv.DynamicMap(association_plot, kdims=['time']).redim.range(time=(0,3914))

In [11]:
association = association.reset_index()

In [12]:
association_scatter = pd.melt(association, id_vars=['2003-05-16'])

In [13]:
association_scatter.variable = association_scatter.variable.astype(int)

In [15]:
association_scatter = association_scatter.sample(frac=0.1).merge(portfolios.T, how='left', right_index=True, left_on='variable')

In [15]:
association_scatter = association_scatter.loc[:,['2003-05-16','value']]
association_scatter.columns = ['Time', 'Association']
association_scatter.Association = association_scatter.Association

association_scatter['Time'] = pd.to_datetime(association_scatter['Time'])

In [17]:
%%opts RGB [width=800 height=400]
%%output filename="./media/Association Over Time" fig="png"
datashade(association_scatter.hvplot.scatter(x='Time', y='Association'))

In [16]:
var_path = os.path.join(".",'13092018_portfolio_var.csv')
var = pd.read_csv(var_path)
var = var.iloc[:,1:]

In [19]:
%%opts Histogram [width=800 height=400]

def var_plot(time):
    """Histogram of the values in the row of `var` at position `time`."""
    selected_row = var.iloc[time, :]
    return selected_row.hvplot.hist()

hv.DynamicMap(var_plot, kdims=['time']).redim.range(time=(0,3915))

In [20]:
# Reshape `var` (without its first row) to long form: one output row per
# (original row index, column) pair, keeping only the row index and the value.
var_scatter = pd.melt(var.iloc[1:,:].reset_index(), id_vars=['index']).loc[:,['index','value']]
var_scatter.columns = ['Time', 'Std']

# Overwrite the integer row index with dates: `dates` is repeated once per
# column of `var`, matching melt's column-after-column stacking order.
# NOTE(review): this only lines up if len(dates) == len(var) - 1 — confirm.
var_scatter['Time'] = pd.Series(np.tile(dates, var.shape[1])).values

In [21]:
%%opts RGB [width=800 height=400]
%%output filename="./media/Volatility Over Time time" fig="png"

datashade(var_scatter.hvplot.scatter(x='Time', y='Std'))

# Rolling ANCOVA

In [25]:
# Rolling cross-sectional regressions: one OLS fit per row of `association`
# from position 1825 onward, collecting the coefficients and p-values of each fit.
p_values = []
coefficient =[]

for i in range(association.shape[0]-1825):
    # Regressor: reciprocal of row i+1825 of `association`, skipping column 0
    # (the former date index, moved into a column by the earlier reset_index()).
    m = association.iloc[i+1825,1:]**-1
    
    # add_constant prepends an intercept column, so params and pvalues are
    # ordered [constant, slope].
    X = sm.add_constant(m.values.tolist())
    # NOTE(review): the response is row i of `var`, while the regressor is
    # row i+1825 of `association` — confirm this offset is intended.
    model = sm.OLS((var.iloc[i,:]).tolist(),X)
    results = model.fit()
    coefficient.append(results.params.tolist())
    p_values.append(results.pvalues.tolist())

  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [26]:
%%opts Curve [width=1000 height=500 tools=['hover']] (line_width=0.5 line_alpha=0.5)
%%output filename="./media/P-Values with constant over time" fig="png"
p_values_frame = pd.DataFrame(p_values).iloc[1:,:]
p_values_frame.index = dates.iloc[1825+1:]
p_values_frame.iloc[:,0].hvplot(label='ANCOVA constant p-value')   * \
p_values_frame.iloc[:,1].hvplot(label='ANCOVA coefficient p-value')   * \
p_values_frame.iloc[:,1].rolling(90).mean().hvplot(label='ANCOVA coefficient p-value rolling 90-day mean') *\
hv.HLine(0.05) * \
hv.Text(dates.quantile(0.5),0.1, "0.05 level of significance", fontsize=10).options( color='#0066CC')

In [27]:
%%opts Curve [width=1000 height=500] (line_alpha=0.5 line_width=0.5)
%%output filename="./media/Coefficients with constant over time" fig="png"
coefficient_frame = pd.DataFrame(coefficient).iloc[1:,:]
coefficient_frame.index = dates.iloc[1825+1:]
coefficient_frame.iloc[:,0].hvplot(label='ANCOVA constant') * \
coefficient_frame.iloc[:,1].hvplot(label='ANCOVA coefficient') * \
hv.VLine(pd.to_datetime('08-08-2008')) * \
hv.Text(pd.to_datetime('06-06-2008'),6e-6, "Financial Crisis", fontsize=10, rotation=90).options( color='#0066CC') *\
hv.VLine(pd.to_datetime('01-01-2016')) * \
hv.Text(pd.to_datetime('10-10-2015'),6e-6, "Zuma-Gate", fontsize=10, rotation=90).options( color='orange')

# Ancova of portfolio over time with constant

In [28]:
# Per-portfolio regressions WITH an intercept: for every portfolio column j,
# regress its `var` column on the reciprocal of its `association` column and
# collect [constant, slope] coefficients and p-values.
t_p_values = []
t_coefficient = []

for j in range(association.shape[1]-1):
    # Column 0 of `association` is the former date index (hence 1+j).
    # pd.concat(axis=1) aligns the two Series on their index labels; rows
    # present in only one of them become NaN and are removed by dropna.
    # `axis` is passed by keyword because pandas 2.0 made dropna's arguments
    # keyword-only, so the positional form `dropna(0)` raises TypeError there.
    val = pd.concat([1/association.iloc[1825:,1+j], var.iloc[1825+1:,j]], axis=1).dropna(axis=0)

    X = sm.add_constant(val.iloc[:,0].tolist())
    model = sm.OLS(val.iloc[:,1].tolist(), X)
    results = model.fit()
    t_coefficient.append(results.params.tolist())
    t_p_values.append(results.pvalues.tolist())

In [29]:
%%opts Histogram [width=1000 height=500] (alpha=0.5)
%%output filename="./media/Coefficeint P-value with contant accross portfolio" fig="png"
# Distribution of the slope p-values across portfolios. The source is
# `t_p_values` (one [constant, slope] pair per portfolio); `p_values` holds the
# rolling per-date fits and would plot a different population.
t_values_frame = pd.DataFrame(t_p_values)
hv.Histogram(np.histogram(t_values_frame.iloc[:,1].dropna(), bins=100), label='ANCOVA coefficient p-value')

In [30]:
%%opts Histogram [width=1000 height=500] (alpha=0.5)
%%output filename="./media/Constant P-value with contant accross portfolio" fig="png"
# Distribution of the intercept p-values across portfolios. The source is
# `t_p_values` (one [constant, slope] pair per portfolio); `p_values` holds the
# rolling per-date fits and would plot a different population.
t_values_frame = pd.DataFrame(t_p_values)
hv.Histogram(np.histogram(t_values_frame.iloc[:,0].dropna(), bins=100), label='ANCOVA constant p-value')

In [31]:
%%opts Histogram [width=1000 height=500] (alpha=0.5)
%%output filename="./media/Coefficeint value with contant accross portfolio" fig="png"

t_coefficient_frame = pd.DataFrame(t_coefficient).iloc[:,:]
hv.Histogram(np.histogram(t_coefficient_frame.iloc[:,0].dropna(), bins=100), label='ANCOVA constant')

In [32]:
%%opts Histogram [width=1000 height=500] (alpha=0.5)
%%output filename="./media/Coefficeint Value with contant accross portfolio" fig="png"

hv.Histogram(np.histogram(t_coefficient_frame.iloc[:,1].dropna(), bins=100), label='ANCOVA coefficient')

# ANCOVA over time without constant

In [33]:
# Per-portfolio regressions WITHOUT an intercept: the design matrix is the
# single reciprocal-association column, so each fit yields exactly one
# coefficient and one p-value.
t_p_values = []
t_coefficient = []

for j in range(association.shape[1]-1):
    # Column 0 of `association` is the former date index (hence 1+j).
    # pd.concat(axis=1) aligns on index labels; unmatched rows become NaN and
    # are dropped. `axis` is passed by keyword because pandas 2.0 made dropna's
    # arguments keyword-only, so `dropna(0)` raises TypeError there.
    val = pd.concat([1/association.iloc[1825:,1+j], var.iloc[1825+1:,j]], axis=1).dropna(axis=0)

    X = val.iloc[:,0].tolist()
    model = sm.OLS(val.iloc[:,1].tolist(), X)
    results = model.fit()
    t_coefficient.append(results.params.tolist())
    t_p_values.append(results.pvalues.tolist())

In [34]:
%%opts Histogram [width=1000 height=500] (alpha=0.5)
%%output filename="./media/Coefficeint P-value without contant accross portfolios" fig="png"
# Distribution of the slope p-values from the no-intercept per-portfolio fits.
# The source is `t_p_values` (one entry per portfolio), not `p_values`, which
# holds the rolling per-date fits. Column 0 is the only column: the slope.
t_values_frame = pd.DataFrame(t_p_values)
hv.Histogram(np.histogram(t_values_frame.iloc[:,0].dropna(), bins=100), label='ANCOVA coefficient p-value')

In [35]:
%%opts Histogram [width=1000 height=500] (alpha=0.5)
%%output filename="./media/Coefficeint Value without contant accross portfolio" fig="png"

# Distribution of the fitted slopes from the no-intercept per-portfolio fits.
# These models have no constant term, so column 0 is the slope and the legend
# label says "coefficient" rather than "constant".
t_coefficient_frame = pd.DataFrame(t_coefficient)
hv.Histogram(np.histogram(t_coefficient_frame.iloc[:,0].dropna(), bins=100), label='ANCOVA coefficient')

# Single Portfolio Example

In [45]:
# Single-portfolio example: regress one portfolio's `var` column (position 250)
# on the reciprocal of its `association` column (position 1+250, because
# column 0 of `association` is the former date index).
# NOTE(review): the plot labels in the following cells call this the "500th"
# portfolio, while column position 250 is selected here — confirm which is meant.
# `axis` is passed by keyword because pandas 2.0 made dropna's arguments
# keyword-only, so the positional form `dropna(0)` raises TypeError there.
val = pd.concat([1/association.iloc[1825:,1+250], var.iloc[1825+1:,250]], axis=1).dropna(axis=0)
val.columns = ['association','volatility']

# add_constant prepends the intercept column, so the summary lists the constant first.
X = sm.add_constant(val.iloc[:,0].tolist())
model = sm.OLS(val.iloc[:,1].tolist(), X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.068
Model:,OLS,Adj. R-squared:,0.068
Method:,Least Squares,F-statistic:,150.8
Date:,"Mon, 17 Sep 2018",Prob (F-statistic):,1.7e-33
Time:,17:43:13,Log-Likelihood:,9334.1
No. Observations:,2058,AIC:,-18660.0
Df Residuals:,2056,BIC:,-18650.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0075,0.000,72.687,0.000,0.007,0.008
x1,0.0114,0.001,12.279,0.000,0.010,0.013

0,1,2,3
Omnibus:,227.096,Durbin-Watson:,0.065
Prob(Omnibus):,0.0,Jarque-Bera (JB):,309.939
Skew:,0.874,Prob(JB):,4.979999999999999e-68
Kurtosis:,3.75,Cond. No.,16.4


In [47]:
#results.summary().as_latex()

In [48]:
%%opts Scatter [width=1000 height=500]
%%output filename="./media/Scatter Plot of Residuals of 500th Portofolio" fig="png"

pd.concat([pd.DataFrame(results.resid, columns=['residuals']).reset_index().loc[:,['residuals']], val.loc[:,['association']].reset_index().loc[:,['association']]], axis=1).hvplot.scatter(y='residuals', x='association', label='Residuals of 500th Portfolio', size=4)

In [49]:
%%opts Histogram [width=1000 height=500]
%%output filename="./media/Histogram of Residuals of 500th Portofolio" fig="png"

pd.DataFrame(results.resid, columns=['residuals']).hvplot.hist(label='Distribution of Errors')

In [38]:
%%opts Scatter [width=1000 height=500]
%%output filename="./media/Scatter Plot of 500th Portofolio" fig="png"

val.hvplot.scatter(x='association',y='volatility', label='500th Portfolio', size=3)

# Single Point in Time

![image](./media/Association Computation Diagram.png)

In [42]:
# Cross-section at a single point in time: reciprocal of `association` row
# 1825+1003 (skipping column 0, the former date index) next to `var` row
# 1825+1004. The two rows are aligned by column label; labels present in only
# one of them become NaN and are dropped.
# `axis` is passed by keyword because pandas 2.0 made dropna's arguments
# keyword-only, so the positional form `dropna(0)` raises TypeError there.
val = pd.concat([1/association.iloc[1825+1003,1:], var.iloc[1825+1004,:]], axis=1).dropna(axis=0)
val.columns = ['association','volatility']

In [43]:
X = sm.add_constant(val.iloc[:,0].tolist())
model = sm.OLS((val.iloc[:,1]).tolist(),X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,4.486
Date:,"Mon, 17 Sep 2018",Prob (F-statistic):,0.0344
Time:,17:42:21,Log-Likelihood:,5684.4
No. Observations:,1000,AIC:,-11360.0
Df Residuals:,998,BIC:,-11360.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0058,0.000,32.799,0.000,0.005,0.006
x1,0.0064,0.003,2.118,0.034,0.000,0.012

0,1,2,3
Omnibus:,0.648,Durbin-Watson:,2.11
Prob(Omnibus):,0.723,Jarque-Bera (JB):,0.724
Skew:,0.005,Prob(JB):,0.696
Kurtosis:,2.869,Cond. No.,117.0


In [37]:
#pd.Series(results.summary().as_html()).str.replace('\n','')[0]

In [41]:
%%opts Scatter [width=1000 height=500]
%%output filename="./media/Scatter Plot of Portfolios on 2014-03-21" fig="png"

val.hvplot.scatter(x='association',y='volatility', label='2325 Date', size=3)

In [44]:
%%opts Histogram [width=1000 height=500]
%%output filename="./media/Histogram of Residuals of of Portfolios on 2014-03-21" fig="png"

pd.DataFrame(results.resid, columns=['residuals']).hvplot.hist(label='Distribution of Errors')

In [42]:
%%opts Scatter [width=1000 height=500]
%%output filename="./media/Scatter Plot of Residuals of Portfolios on 2014-03-21" fig="png"

pd.concat([pd.DataFrame(results.resid, columns=['residuals']).reset_index().loc[:,['residuals']], val.reset_index().loc[:,['association']]], axis=1).hvplot.scatter(y='residuals', x='association', label='Residuals of Portfolios on 2014-03-21', size=4)

# Complete Model

metadata

In [3]:
portfolios_path = os.path.join(".",'13092018_portfolios.csv')
portfolios = pd.read_csv(portfolios_path).iloc[:,1:].T

In [4]:
dates_path = os.path.join(".",'12_dates.csv')
dates = pd.to_datetime(pd.read_csv(dates_path).iloc[-500:,1])#.iloc[-500:,1])

association

In [5]:
association_path = os.path.join(".",'13092018_association_risk.csv')
association = pd.read_csv(association_path).iloc[:,1:]

In [6]:
association = association.iloc[:,:]

In [7]:
association.loc[:,'dates'] = dates

In [8]:
association = pd.melt(association, 'dates')

In [9]:
# Cast the merge keys to plain strings. Whole-column assignment is used instead
# of `.loc[:, cols] = ...` because, on pandas >= 2.0, the `.loc` form first tries
# to write into the existing columns in place and can leave 'dates' with its
# datetime dtype, which would not match the string key used in the later merge.
association[['dates', 'variable']] = association[['dates', 'variable']].astype(str)

In [10]:
association = pd.merge(association, portfolios, how='left', left_on='variable', right_index=True)

volatility

In [11]:
var_path = os.path.join(".",'13092018_portfolio_var.csv')
var = pd.read_csv(var_path).iloc[:,:]

In [12]:
var = pd.melt(var, 'Unnamed: 0')

In [13]:
# Cast the merge keys to plain strings. Whole-column assignment replaces the
# columns outright; `.loc[:, cols] = ...` on pandas >= 2.0 first tries to write
# into the existing columns in place, so the resulting dtype is not guaranteed.
var[['Unnamed: 0', 'variable']] = var[['Unnamed: 0', 'variable']].astype(str)

join

In [14]:
joined = pd.merge(association, var, how='inner', left_on=['dates', 'variable'], right_on=['Unnamed: 0', 'variable'])

In [15]:
del association, var

In [16]:
# Drop incomplete rows, then keep a 5% random sample for estimation.
# - `axis` is passed by keyword: pandas 2.0 made dropna's arguments
#   keyword-only, so the positional form `dropna(0)` raises TypeError there.
# - `random_state` pins the sample so the fitted model is reproducible.
# NOTE: this cell rebinds `joined`; re-running it samples 5% of the previous
# sample, so re-run from the merge cell instead.
joined = joined.dropna(axis=0).sample(frac=0.05, random_state=0)

In [17]:
joined['association'] = joined.value_x

In [18]:
# Design matrix, assembled column-wise from three pieces of `joined`:
#   1. the reciprocal of the 'association' column,
#   2. one indicator column per distinct value of 'dates',
#   3. every remaining column once the keys, raw values and target are dropped.
# NOTE(review): a complete set of date indicators sums to one on every row, so
# together with the all-ones 'bias' column added in a later cell the design is
# linearly dependent — confirm this is intended.
x = pd.concat(
    [
        joined[['association']].pow(-1),
        pd.get_dummies(joined['dates']),
        joined.drop(columns=['value_x', 'variable', 'dates', 'association', 'Unnamed: 0', 'value_y']),
    ],
    axis=1,
)

In [19]:
#x = x.reset_index()

In [20]:
#x['association * time'] = x.index * x.association

In [21]:
x['bias'] = 1

Model estimation

In [22]:
model = sm.OLS(joined.iloc[:,:].value_y,x.iloc[:,:])

Clean workspace

In [23]:
del joined, x

estimation

$\beta$

In [24]:
#Final two years
# results = model.fit()
# results.summary()

0,1,2,3
Dep. Variable:,value_y,R-squared:,0.579
Model:,OLS,Adj. R-squared:,0.578
Method:,Least Squares,F-statistic:,1170.0
Date:,"Tue, 18 Sep 2018",Prob (F-statistic):,0.0
Time:,01:42:49,Log-Likelihood:,2366800.0
No. Observations:,469000,AIC:,-4732000.0
Df Residuals:,468449,BIC:,-4726000.0
Df Model:,550,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
association,0.0003,5.32e-05,4.968,0.000,0.000,0.000
2016-06-17,0.0038,4.95e-05,75.797,0.000,0.004,0.004
2016-06-20,0.0039,4.92e-05,78.994,0.000,0.004,0.004
2016-06-21,0.0038,4.92e-05,77.515,0.000,0.004,0.004
2016-06-22,0.0038,4.92e-05,78.214,0.000,0.004,0.004
2016-06-23,0.0027,4.93e-05,55.704,0.000,0.003,0.003
2016-06-24,0.0008,4.93e-05,16.030,0.000,0.001,0.001
2016-06-27,6.673e-06,5.08e-05,0.131,0.895,-9.29e-05,0.000
2016-06-28,-0.0001,4.92e-05,-2.831,0.005,-0.000,-4.29e-05

0,1,2,3
Omnibus:,114281.425,Durbin-Watson:,0.044
Prob(Omnibus):,0.0,Jarque-Bera (JB):,538763.072
Skew:,1.113,Prob(JB):,0.0
Kurtosis:,7.756,Cond. No.,3.62e+16


In [131]:
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,value_y,R-squared:,0.794
Model:,OLS,Adj. R-squared:,0.79
Method:,Least Squares,F-statistic:,182.6
Date:,"Mon, 17 Sep 2018",Prob (F-statistic):,0.0
Time:,23:56:13,Log-Likelihood:,621520.0
No. Observations:,123450,AIC:,-1238000.0
Df Residuals:,120899,BIC:,-1213000.0
Df Model:,2550,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
association,0.0007,0.000,3.529,0.000,0.000,0.001
2008-10-17,0.0147,0.000,68.352,0.000,0.014,0.015
2008-10-20,0.0147,0.000,56.307,0.000,0.014,0.015
2008-10-21,0.0135,0.000,55.509,0.000,0.013,0.014
2008-10-22,0.0142,0.000,57.134,0.000,0.014,0.015
2008-10-23,0.0135,0.000,55.449,0.000,0.013,0.014
2008-10-24,0.0132,0.000,64.336,0.000,0.013,0.014
2008-10-27,0.0135,0.000,58.063,0.000,0.013,0.014
2008-10-28,0.0120,0.000,52.368,0.000,0.012,0.012

0,1,2,3
Omnibus:,25356.151,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,117583.806
Skew:,0.933,Prob(JB):,0.0
Kurtosis:,7.402,Cond. No.,1.72e+17


In [132]:
# Persist the fitted model's summary as HTML and as LaTeX. Context managers
# ensure each file is flushed and closed even if rendering the summary raises.
with open("blocking summary html.txt", "w") as html_file:
    html_file.write(results.summary().as_html())

with open("blocking summary latex.txt", "w") as latex_file:
    latex_file.write(results.summary().as_latex())

In [133]:
blocking_residuals = pd.DataFrame(results.resid, columns=['residuals']).reset_index()

In [134]:
%%opts Scatter [width=1000 height=500]
%%output filename="./media/Scatter Plot of Residuals of Blocking Ancova" fig="png"

pd.concat([blocking_residuals, pd.DataFrame(model.data.exog[:,0], columns=['association'])], axis=1).hvplot.scatter(y='residuals', x='association', size=1)

In [135]:
%%opts Histogram [width=1000 height=500]
%%output filename="./media/Histogram of Residuals of Blocking Ancova" fig="png"

blocking_residuals.hvplot.hist(bins=100)

In [136]:
blocking_pvalues = pd.DataFrame(results.pvalues, columns=['p-value'])

In [137]:
%%opts Scatter [width=1000 height=500]
%%output filename="./media/Scatter Plot of P-values for dates of Blocking Ancova" fig="png"

dates_pvalues = pd.merge(pd.DataFrame(dates.dt.date.astype(str).values, columns=['dates']), blocking_pvalues, how='left', left_on='dates', right_index=True).dropna()
dates_pvalues.dates = pd.to_datetime(dates_pvalues.dates)
    
dates_pvalues.hvplot.scatter(x='dates', y='p-value')

In [138]:
%%opts Scatter [width=1000 height=500]
%%output filename="./media/Scatter Plot of P-values for shares in a portfolio of Blocking Ancova" fig="png"

shares_pvalues = blocking_pvalues.iloc[-83:-1].reset_index()

shares_pvalues.columns =  ['share','p-value']

    
shares_pvalues.hvplot.scatter(x='share', y='p-value')

# Time Backtesting

In [3]:
# association_path = os.path.join(".",'13092018_association_risk.csv')
# association = pd.read_csv(association_path).iloc[:,1:]

In [4]:
# var_path = os.path.join(".",'13092018_portfolio_var.csv')
# var = pd.read_csv(var_path).iloc[:,1:]

In [32]:
# tscv = TimeSeriesSplit(n_splits=10)

# X = association.iloc[1825:2058,:].reset_index(drop=True)
# y = var.iloc[1825:2058,:].reset_index(drop=True)

# scores = []
# for train_index, test_index in tscv.split(X.iloc[:,1]):
#     portfolios = []
#     for portfolio in range(X.shape[1]):
#         X_train, X_test = X.iloc[train_index, portfolio], X.iloc[test_index, portfolio]
#         y_train, y_test = y.iloc[train_index, portfolio], y.iloc[test_index, portfolio]
        
#         lm = LinearRegression()
        
#         lm.fit(X=X_train.values.reshape(-1, 1), y=y_train.values.reshape(-1, 1))
        
#         R2 = lm.score(X=X_test.values.reshape(-1, 1), y=y_test.values.reshape(-1, 1))
    
#         portfolios.append(R2)
#     scores.append(portfolios)

In [33]:
# scores = pd.DataFrame(scores).reset_index()

# scores_melt = pd.melt(scores, id_vars=['index']).loc[:,['index','value']]
# scores_melt.columns = ['Time Split','R2']

In [None]:
# %%output filename="./media/R2 over Time" fig="png"

# hv.Scatter(scores_melt, kdims='Time Split', vdims='R2').options(width=800, height=400)

Association Computation Diagram

|                   |                  | table 4    |          |                   |            |        |  
|-------------------|------------------|------------|----------|-------------------|------------|--------|  
| Dep. Variable     | y                |            |          | R-squared         | 0.794      |        |  
| Method:           | Least Squares    |            |          | Adj. R-squared    | 0.790      |        |  
| No. Observations: | 123450           |            |          | F-statistic       | 182.6      |        |  
| DF Residuals:     | 120899           |            |          | Prob(F-statistic) | 0.000      |        |  
| Df Model:         | 2550             |            |          | Log-likelihood    | 6.2152e+05 |        |  
| Covariance type   | non-robust       |            |          | AIC:              | -1.238e+06 |        |  
| Date              | Mon, 17 Sep 2018 |            |          | BIC:              | -1.213e+06 |        |  
| Time              | 23:56:13         |            |          |                   |            |        |  
|                   | Coef             | std err    | t        | $p>\lvert t\rvert$ | [0.025     | 0.975] |  
| Association       | 0.0007           | 0.000      | 3.529    | 0.000             | 0.000      | 0.001  |  
| ...               |                  |            |          |                   |            |        |  
| Constant          | 0.0024           | 4.84e-06   | 500.510  | 0.000             | 0.002      | 0.002      |  
|                   |                  |            |          |                   |            |        |  
| Omnibus:          | 25356.151        | Skew:      | 0.933    | Durbin-Watson     | 1.999      |          |  
| Prob(Omnibus):    | 0.000            | Kurtosis:  | 7.402    | Jarque-Bera (JB): |117583.806 |        |
| Prob(JB):         | 0.000            | Cond. No.  | 1.72e+17 |                   |            |        |  


![image](./media/Association_Computation_Diagram.png "Voyage to the moon")\

[tsne_diagram]: ./media/Association_Computation_Diagram.png "Diagram of  TSNE Plot with" 
![Alt text][tsne_diagram] 
A reference to the [TSNE Diagram](#some_tag).

![This is the caption\label{mylabel}](./media/Association_Computation_Diagram.png)
See figure [TSNE Diagram](mylabel).

 ![Caption.](./media/Association_Computation_Diagram.png)

[TSNE Diagram](Caption.)