# CPS600 - Python Programming for Finance 
###  
<img src="https://www.syracuse.edu/wp-content/themes/g6-carbon/img/syracuse-university-seal.svg?ver=6.3.9" style="width: 200px;"/>

## Financial Time Series

###  October 25, 2018



In [1]:
import pandas as pd
import numpy as np

**First Steps with the `DataFrame`**

In [None]:
# The pandas DataFrame class manages indexed, labeled data.
# The data itself can be supplied in different shapes and types
# (list, tuple, ndarray, and dict objects).

# One column called 'numbers' whose rows are labeled 'a' through 'd'.
df = pd.DataFrame(
    data={'numbers': [10, 20, 30, 40]},
    index=list('abcd'),
)
df

In [None]:
# There is an index that can take on different formats 
# (e.g., numbers, strings, time information).

df.index  # the index values

In [None]:
# Data is organized in columns, which can have custom names

df.columns  # the column names

In [None]:
df.loc['c']  # label-based selection of one row (.ix was removed in pandas 1.0)

In [None]:
df.loc[['a', 'd']]  # label-based selection of multiple rows (.ix was removed in pandas 1.0)

In [None]:
df.loc[df.index[1:3]]  # selection via Index object (.ix was removed in pandas 1.0)

In [None]:
df.sum()  # sum per column

In [None]:
df.apply(lambda x: x ** 2)  # square of every element

In [None]:
df ** 2  # again square, this time NumPy-like

In [None]:
df['floats'] = (1.5, 2.5, 3.5, 4.5)
  # new column is generated
df

In [None]:
df['floats']  # selection of column

In [None]:
# Attach a column of names.  The values carry their own labels
# ('d', 'a', 'b', 'c'), so they are matched to df's rows by index label
# rather than by position.
df['names'] = pd.Series(['Yves', 'Guido', 'Peter', 'Travis'],
                        index=['d', 'a', 'b', 'c'])
df

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# The dict is wrapped in a one-row DataFrame so it can be concatenated.
pd.concat([df, pd.DataFrame([{'numbers': 100, 'floats': 5.75,
                              'names': 'Henry'}])],
          ignore_index=True)
    # temporary object; df not changed
    # the index gets replaced by a simple numbered index
    # avoid ignoring index

In [None]:
# It is better to append a DataFrame object that provides the
# appropriate index information.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.

df = pd.concat([df, pd.DataFrame({'numbers': 100, 'floats': 5.75,
                                  'names': 'Henry'}, index=['z'])])
df

In [None]:
# Join a new column whose index differs slightly from df's.
# By default pandas accepts values only for the index labels that already
# exist in df: the value for label 'y' is lost, and label 'z' receives
# NaN (i.e., "Not a Number").

df.join(pd.DataFrame({'squares': [1, 4, 9, 16, 25]},
                     index=list('abcdy')))
  # temporary object

In [None]:
# how='outer' uses the union of all labels from both indices

df = df.join(pd.DataFrame({'squares': [1, 4, 9, 16, 25]},
                          index=list('abcdy')),
             how='outer')
df

In [None]:
df[['numbers', 'squares']].mean()
  # column-wise mean

In [None]:
df[['numbers', 'squares']].std()
  # column-wise standard deviation

### Second Steps with DataFrame Class

In [22]:
import numpy as np
import pandas as pd

In [None]:
# Generate a numpy.ndarray of pseudorandom, standard normally distributed
# numbers with nine rows and four columns.

a = np.random.standard_normal(size=(9, 4))
np.round(a, 6)

In [None]:
df = pd.DataFrame(a)
df

In [7]:
#  Parameters of DataFrame function
# Parameter Format                   Description
# --------- ------                   -----------
# data      ndarray/dict/DataFrame   Data for DataFrame; dict can contain Series, ndarrays, lists
# index     Index/array-like         Index to use; defaults to range(n)
# columns   Index/array-like         Column headers to use; defaults to range(n)
# dtype     dtype, default None      Data type to use/force; otherwise, it is inferred
# copy      bool, default None       Copy data from inputs

In [None]:
# Assign one flat list of labels.  Wrapping the labels in a second list
# ([[...]]) makes current pandas build a MultiIndex instead of a plain
# Index, which changes how df['No2'] and the later column selections behave.
df.columns = ['No1', 'No2', 'No3', 'No4']
df

In [None]:
df.loc[3, 'No2']  # value in column No2 at index label 3 (one lookup, no chained indexing)

In [None]:
# Assume that the nine data entries in the four columns correspond to
# month-end data, beginning in January 2015.

# periods = 9          number of periods
# freq = MonthEnd()    month-end frequency (see Table 6.3).  The offset
#                      object is passed instead of the alias string: 'M' is
#                      deprecated in pandas 2.2+ (renamed 'ME'), while 'ME'
#                      is not recognised by older releases.

dates = pd.date_range('2015-1-1', periods=9, freq=pd.offsets.MonthEnd())
dates

In [None]:
# we assign the newly generated DatetimeIndex 
# as the new Index object to the DataFrame object

df.index = dates
df

In [None]:
# You can generate a DataFrame object in general from an ndarray object. 
# You can also generate an ndarray object out of a DataFrame by using the function array of NumPy.

np.array(df).round(6)

### Basic Analytics

In [None]:
df.sum()   # column-wise sums

In [None]:
df.mean()   # column-wise means

In [None]:
df.cumsum()  #column-wise cumulative sums

In [None]:
df.describe()   # a number of often-used statistics for numerical data sets

In [None]:
# You can  apply the majority of NumPy universal functions to DataFrame objects

np.sqrt(df)

In [None]:
# pandas leaves out the NaN values and only works with the other available values

np.sqrt(df).sum()

In [None]:
# Plotting of data
# pandas provides a wrapper around matplotplib, specifically designed for DataFrame objects

# Parameters of plot method are listed in table 6.4
#  subplots  Boolean, default False    Plot columns in subplots
#  grid      Boolean, default False    Horizontal and vertical grid lines
#  title     String, default None      Title for the plot
#  legend    Boolean, default True     Legend of labels
#  logx      Boolean, default False    Logarithmic scaling of x-axis
#  logy      Boolean, default False    Logarithmic scaling of y-axis
#  xlim      2-tuple, list             Boundaries for x-axis
#  ylim      2-tuple, list             Boundaries for y-axis
#    etc...

%matplotlib inline
df.cumsum().plot(lw=2.0, # subplots = True
                 grid = True)
# title: Line plot of a DataFrame object

### TimeSeries Class

In [None]:
# we have worked mainly with the pandas DataFrame class

type(df)

In [None]:
# there is  a dedicated Series class. We get a Series object, 
# for example, when selecting a single column from our DataFrame object

df['No1']

In [None]:
type(df['No1'])

In [None]:
# we can plot Series objets
# style = 'b'     blue line   

%matplotlib inline
import matplotlib.pyplot as plt
df['No1'].cumsum().plot(style='b', lw=2.)
plt.xlabel('date')
plt.ylabel('value')
plt.grid()
plt.title("Line plot of a TimeSeries object")
# title: Line plot of a TimeSeries object

### GroupBy Operations

In [None]:
# pandas has powerful and flexible grouping capabilities,
# similar to grouping in SQL and to pivot tables in Microsoft Excel.

# To have something to group by, add a column indicating the quarter
# each row belongs to: three consecutive rows per quarter.

df['Quarter'] = [label for label in ('Q1', 'Q2', 'Q3') for _ in range(3)]
df

In [None]:
# Now, we can group by the “Quarter” column

groups = df.groupby('Quarter')

In [None]:
groups.mean()

In [None]:
groups.max()

In [None]:
groups.size()

In [None]:
# Grouping can also be done with multiple columns.
# Add another column indicating whether the month of the index date is
# odd or even (rows are taken to be months 1 through 9, in order).

df['Odd_Even'] = ['Odd' if month % 2 else 'Even' for month in range(1, 10)]

In [None]:
# grouping based on two columns simultaneously

groups = df.groupby(['Quarter', 'Odd_Even'])

In [None]:
groups.size()

In [None]:
groups.mean()

**Financial Data**

We will use `iexfinance` as before. And we will download some other data from the web.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from iexfinance import get_historical_data
from datetime import datetime

Here, we are downloading *Apple* stock data (historical).

In [10]:
# Request history from 9 February 2014 onward; no end date is passed.
start = datetime(year=2014, month=2, day=9)

AAPL = get_historical_data("AAPL", output_format='pandas', start=start)

In [None]:
AAPL.info()

In [None]:
AAPL.shape

In [None]:
AAPL.head()

In [None]:
AAPL.tail()

We can easily visualize...

In [None]:
AAPL['close'].plot(figsize=(8,5))

We can take advantage of the vectorization provided by `pandas` in computing a new column:

In [12]:
AAPL['return'] = np.log(AAPL['close'] / AAPL['close'].shift(1))

Sanity Check:

In [None]:
AAPL.close.shift(1)

We can now plot these two next to each other with a single line:

In [None]:
AAPL[['close','return']].plot(subplots=True,style='b',figsize=(8,5))

This is allegedly demonstrating some principles in finance (i.e. *volatility clustering* and the *leverage effect*), but I've substituted different data for the textbook's, so you tell me!

One of the most basic moves one makes in analyzing time series data is to calculate a *rolling average* or *rolling mean*. There's a method for that.

In [24]:
# Rolling means of the closing price over windows of 42 and 252 rows.
AAPL['42d'] = AAPL['close'].rolling(42).mean()
AAPL['252d'] = AAPL['close'].rolling(252).mean()

Finally, we can plot the two of these together.

In [None]:
AAPL[['close','42d','252d']].plot(figsize=(8,5))

Another financial metric a trader might want to calculate is the *moving volatility*, which is given by the rolling *standard deviation* of the return.

In [28]:
AAPL['movVol'] = AAPL['return'].rolling(window=252).std()*np.sqrt(252)

In [None]:
AAPL[['close','movVol','return']].plot(subplots=True,style='b',figsize=(8,5))

This seems to do a better job of illustrating those two principles mentioned above.

**Regression**

Let's talk about quantifying and automatically detecting the patterns we find in plots like the ones we made above. This is *regression analysis* and is basically glorified curve-fitting.

The first step is to get our hands on some nice new data from [Stoxx](https://www.stoxx.com).

In [None]:
# Necessary imports (statsmodels is only used later, for the regression)
from urllib.request import urlretrieve
import statsmodels.api as sm

# Download the two raw text files into the working directory.
# NOTE(review): both URLs use plain http -- confirm they are still served.
es_url = 'http://www.stoxx.com/download/historical_values/hbrbcpe.txt'
vs_url = 'http://www.stoxx.com/download/historical_values/h_vstoxx.txt'
urlretrieve(es_url, 'es.txt')  # saved locally as es.txt
urlretrieve(vs_url, 'vs.txt')  # saved locally as vs.txt

(might take a moment)

Now that we have this data...what is it?

* EURO STOXX 50 index
    > Historical daily closing values of the EURO STOXX 50 index, composed of European blue-chip stocks
* VSTOXX
    > Historical daily closing data for the VSTOXX volatility index, calculated on the basis of volatilities implied by options on the EURO STOXX 50 index

The first of these, call it `es`, needs some formatting changes. You can reverse engineer the following cell if you feel like it; I think it is really not interesting.

In [32]:
# Reformat the raw download (es.txt) into a file pandas can parse (es50.txt).
with open('es.txt', 'r') as f:
    lines = f.readlines()

# Remove every blank so that names and values carry no padding.
lines = [line.replace(' ', '') for line in lines]

with open('es50.txt', 'w') as new_file:
    # lines[3] (the fourth line of the original file) holds the column
    # names.  Prefix it with 'date' and insert a ';DEL' placeholder column
    # just before its final character; the result becomes the header row.
    # NOTE(review): presumably the data rows end with a trailing ';' that
    # the placeholder absorbs -- confirm against the raw file.
    header = lines[3]
    new_file.write('date' + header[:-1] + ';DEL' + header[-1])
    # The remaining lines of the original file are copied unchanged.
    new_file.writelines(lines[4:])

# Read the result back inside a with-block so the handle is closed;
# a bare open(...).readlines() leaves it open.
with open('es50.txt', 'r') as check_file:
    new_lines = check_file.readlines()
new_lines[:5]

['date;SX5P;SX5E;SXXP;SXXE;SXXF;SXXA;DK5F;DKXF;DEL\n',
 '31.12.1986;775.00;900.82;82.76;98.58;98.06;69.06;645.26;65.56\n',
 '01.01.1987;775.00;900.82;82.76;98.58;98.06;69.06;645.26;65.56\n',
 '02.01.1987;770.89;891.78;82.57;97.80;97.43;69.37;647.62;65.81\n',
 '05.01.1987;771.89;898.33;82.82;98.60;98.19;69.16;649.94;65.82\n']

It is ready to be read in.

In [33]:
es = pd.read_csv('es50.txt',index_col=0,parse_dates=True,sep=';',dayfirst=True)

In [None]:
np.round(es.tail())

See that little guy on the end? We don't need that little guy.

In [35]:
del es['DEL']

We also want the VSTOXX set. That one is already nice.

In [48]:
vs = pd.read_csv('vs.txt',index_col=0,parse_dates=True,header=2,sep=',',dayfirst=True)

In [39]:
import datetime as dt # Needed for next thing

In [50]:
# EURO STOXX 50 closes (column 'SX5E' of es), restricted to rows dated
# after 1 January 1999 and stored under the column name 'EUROSTOXX'.
data = es.loc[es.index > dt.datetime(1999, 1, 1), 'SX5E'].to_frame('EUROSTOXX')

# Join the VSTOXX values (column 'V2TX' of vs), filtered the same way,
# as a second column named 'VSTOXX'.
data = data.join(
    vs.loc[vs.index > dt.datetime(1999, 1, 1), 'V2TX'].to_frame('VSTOXX')
)

We fill missing values with the last available values from the time series.

We call the `fillna` method, providing `ffill` (for forward fill) as the `method` parameter.


In [51]:
# DataFrame.ffill() is the direct spelling of fillna(method='ffill');
# the `method` argument of fillna is deprecated in pandas 2.1+.
data = data.ffill()

Let's take a look

In [None]:
data.tail()

Looking good! How about a look at the plot?

In [None]:
data.plot(subplots=True, grid=True, style='b', figsize=(8, 6))

We can also quickly look at the log returns of the two quantities in question.

In [None]:
# Log returns of both columns at once (element-wise over the whole frame);
# rows containing NaN, such as the first one, are dropped.
rets = np.log(data / data.shift(1)).dropna()

rets.plot(figsize=(8, 6), style='b', grid=True, subplots=True)

Now we are ready to hit it with the linear regression hammer. We take the `es` values as the *independent* or *predictor* variable, and the `vs` variables as the *dependent* or *response* variable.

There is unfortunately some nasty stuff in there, so let's get rid of it first.

In [87]:
# Drop every row that holds NaN or +/-inf in *either* column.  Checking
# only EUROSTOXX would let an infinite VSTOXX return reach the regression.
rets = rets.replace([np.inf, -np.inf], np.nan).dropna()

In [88]:
xdat = rets['EUROSTOXX']
ydat = rets['VSTOXX']

The model is now very easy to do:

In [105]:
# Ordinary least squares with ydat (VSTOXX returns) as the response and
# xdat (EURO STOXX 50 returns) as the single regressor.
# NOTE(review): sm.OLS does not add an intercept by itself, so as written
# the fitted line is forced through the origin.  If an intercept is wanted,
# wrap the regressor with sm.add_constant(...) here and in the later
# predict call.
model = sm.OLS(ydat, xdat)
olsres = model.fit()

Let's plot the line of predicted values for this sucker together with a scatterplot of the original data.

In [108]:
preds = olsres.predict(xdat) # Getting the predictions.

In [None]:
from bokeh.plotting import figure, output_notebook, show

# Send Bokeh output to the notebook.
output_notebook()

# NOTE(review): plot_width/plot_height appear to be deprecated in newer
# Bokeh releases in favour of width/height -- confirm against the
# installed version.
p = figure(plot_width=400, plot_height=400)

# Scatter of the observed return pairs (xdat against ydat).
p.circle(xdat, ydat, size=20, color="navy", alpha=0.5)

# Regression predictions drawn as a line over the same x values.
p.line(xdat,preds,color='firebrick')

show(p)

Nice!