In [10]:
import pandas as pd
import numpy as np # For random numbers mostly
import matplotlib.pyplot as plt # Matplotlib
from pandas import offsets # For offsets
from datetime import time  # For time functions
from pandas_datareader import data as web # For scraper functions

import statsmodels.formula.api as sm # for OLS regression
    # This is a big module

%matplotlib inline

### Data Alignment Problem for Time Series

Pandas automatically aligns dataframes for arithmetic operations

Pandas functions ignore NaNs - treat them as non-existant (len=0)

### align, resample, reindex, offset, time, concat, combine_first, xrange(), random.choice(choices), statsmodels library for OLS
#### align - forces indices into alignment by using similar concepts to SQL joins
#### resample - downsample or upsample a time or period range
#### reindex - force 1 df time series or period index to match another's
#### offset - versatile way to offset timesamps by common periods
- offset knows about things like business days

### This is important
#### time - used to extract temporal elements from a timestamp
- ts.at_time(time(s, m)) - extract the seconds or minutes from a timestamp
- ts.between_time(time(s, m), time(s, m)) - find all time between two ranges (inclusive of the endpoints)
- ts.asof(time range) - find the last non-NA values at or before each timestamp

#### concat - as if splicing together two data sources
- ts1.combine_first(ts2, overwrite = False) to replace the NaN's in ts1 with any values that exist in ts2

#### xrange() - like range except only generates the list when its called and then throws the list away.

#### choice, xrange, list comprehension - nice code snipper to create 1000 5-character tickers
    N = 1000
    def rands(n):
      choices = string.ascii_uppercase # Get the ASCII Uppercase letters
      return ''.join([random.choice(choices) for _ in xrange(n)]) 
      # Joins ASCII Uppdercase n number of times.
    np.array([rands(5) for _ in xrange(N)])

# This means "do the thing 5 times". xrange is used because it's just-in-time memory usage 
# [rands(5) for _ in xrange(5)]

# Generates 1000 5-digit choices
tickers = np.array([rands(5) for _ in range(N)])

### Can choose to only work with the shared variables using pd.align
Returns a tuple:
    
    prices.align(volume, join = 'inner')

In [11]:
prices = pd.read_csv('ch11/prices.csv')
prices = prices.set_index('Unnamed: 0')
prices.index.name = 'date'

volume = pd.read_csv('ch11/volumes.csv')
volume = volume.set_index('Unnamed: 0')
volume.index.name = 'date'

prices = prices[['AAPL','JNJ','SPX','XOM']]
volume = volume[['AAPL','JNJ','XOM']]
prices = prices['2011-09-06':'2011-09-14']
volume = volume['2011-09-06':'2011-09-12']

print prices
print volume

                       AAPL    JNJ      SPX    XOM
date                                              
2011-09-06 00:00:00  379.74  64.64  1165.24  71.15
2011-09-07 00:00:00  383.93  65.43  1198.62  73.65
2011-09-08 00:00:00  384.14  64.95  1185.90  72.82
2011-09-09 00:00:00  377.48  63.64  1154.23  71.01
2011-09-12 00:00:00  379.94  63.59  1162.27  71.84
2011-09-13 00:00:00  384.62  63.61  1172.87  71.65
                           AAPL         JNJ         XOM
date                                                   
2011-09-06 00:00:00  18173500.0  15848300.0  25416300.0
2011-09-07 00:00:00  12492000.0  10759700.0  23108400.0
2011-09-08 00:00:00  14839800.0  15551500.0  22434800.0
2011-09-09 00:00:00  20171900.0  17008200.0  27969100.0


In [12]:
# Arithmetic operations are automatically aligned
prices * volume

Unnamed: 0_level_0,AAPL,JNJ,SPX,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-09-06 00:00:00,6901205000.0,1024434000.0,,1808370000.0
2011-09-07 00:00:00,4796054000.0,704007200.0,,1701934000.0
2011-09-08 00:00:00,5700561000.0,1010070000.0,,1633702000.0
2011-09-09 00:00:00,7614489000.0,1082402000.0,,1986086000.0
2011-09-12 00:00:00,,,,
2011-09-13 00:00:00,,,,


In [13]:
# Pandas functions also aggregate over NaNs w/o error
# Can easily calculate volume-weighted average price across the time range
vwap = (prices * volume).sum() / volume.sum()
vwap

AAPL    380.837003
JNJ      64.577684
SPX            NaN
XOM      72.073105
dtype: float64

### Can choose to explicitly work with the variables shared across data sets using df1.align(df2)
<b>Returns a tuple with each element corresponding to a given variable)

join = inner means only return elements that share all indices

In [14]:
prices.align(volume, join = 'inner')

(                       AAPL    JNJ    XOM
 date                                     
 2011-09-06 00:00:00  379.74  64.64  71.15
 2011-09-07 00:00:00  383.93  65.43  73.65
 2011-09-08 00:00:00  384.14  64.95  72.82
 2011-09-09 00:00:00  377.48  63.64  71.01,
                            AAPL         JNJ         XOM
 date                                                   
 2011-09-06 00:00:00  18173500.0  15848300.0  25416300.0
 2011-09-07 00:00:00  12492000.0  10759700.0  23108400.0
 2011-09-08 00:00:00  14839800.0  15551500.0  22434800.0
 2011-09-09 00:00:00  20171900.0  17008200.0  27969100.0)

### Example: combine Series with different indices

In [15]:
s1 = pd.Series(range(3), index=['a', 'b', 'c'])
s2 = pd.Series(range(4), index=['d', 'b', 'c', 'e'])
s3 = pd.Series(range(3), index=['f', 'a', 'c'])
pd.DataFrame({'one':s1, 'two':s2, 'three':s3})

Unnamed: 0,one,three,two
a,0.0,1.0,
b,1.0,,1.0
c,2.0,2.0,2.0
d,,,0.0
e,,,3.0
f,,0.0,


#### Explicitly set indices to remove indices that we don't want

In [16]:
pd.DataFrame({'one':s1, 'two':s2, 'three': s3}, index=list('face'))

Unnamed: 0,one,three,two
f,,0.0,
a,0.0,1.0,
c,2.0,2.0,2.0
e,,,3.0


### Using resample and reindex for frequency conversion and realignment

- #### Recall for that 'resample' creates an object; it requires a method to turn it back into a data frame
- #### fillna()

In [19]:
# Weekly on Wednesdays
ts1 = pd.Series(np.random.randn(3), index=pd.date_range('2012-06-13', periods=3, freq='W-WED'))
ts1

2012-06-13   -0.454667
2012-06-20   -0.385863
2012-06-27    0.207826
Freq: W-WED, dtype: float64

In [18]:
pd.date_range('2016-01-01', periods=5, freq='W-TUE')

DatetimeIndex(['2016-01-05', '2016-01-12', '2016-01-19', '2016-01-26',
               '2016-02-02'],
              dtype='datetime64[ns]', freq='W-TUE')

In [23]:
# Resample to business days
# 'Mean' is sort of silly here because we're actually upsampling so no aggregation is happening; 'mean' and 'sum' return the same thing
print ts1.resample('B').sum()
print '-----'
print ts1.resample('B').mean()

2012-06-13   -0.454667
2012-06-14         NaN
2012-06-15         NaN
2012-06-18         NaN
2012-06-19         NaN
2012-06-20   -0.385863
2012-06-21         NaN
2012-06-22         NaN
2012-06-25         NaN
2012-06-26         NaN
2012-06-27    0.207826
Freq: B, dtype: float64
-----
2012-06-13   -0.454667
2012-06-14         NaN
2012-06-15         NaN
2012-06-18         NaN
2012-06-19         NaN
2012-06-20   -0.385863
2012-06-21         NaN
2012-06-22         NaN
2012-06-25         NaN
2012-06-26         NaN
2012-06-27    0.207826
Freq: B, dtype: float64


In [24]:
# Upsample and forward fill na -- 6/13 - 6/27 inclusive
ts1.resample('B').fillna('ffill')

2012-06-13   -0.454667
2012-06-14   -0.454667
2012-06-15   -0.454667
2012-06-18   -0.454667
2012-06-19   -0.454667
2012-06-20   -0.385863
2012-06-21   -0.385863
2012-06-22   -0.385863
2012-06-25   -0.385863
2012-06-26   -0.385863
2012-06-27    0.207826
Freq: B, dtype: float64

In [39]:
# Create a series with 6/12 - 6/29 inclusive with gaps
dates = pd.DatetimeIndex(['2012-06-12', '2012-06-17', '2012-06-18', '2012-06-21', '2012-06-22', '2012-06-29'])
ts2 = pd.Series(np.random.randn(6), index=dates)
ts2

2012-06-12   -0.993051
2012-06-17   -1.235584
2012-06-18   -0.511427
2012-06-21   -1.683188
2012-06-22   -0.288761
2012-06-29    1.159336
dtype: float64

### We want to take the dates in ts2 only - reindex ts1 on ts2's index
#### You can then perform operations on ts2 and any ts1 values that are the same index
#### Powerful because let's you perform operations without needing to care about which index is the highest / lower frequency

In [42]:
print ts1
print '------'
print ts1.reindex(ts2.index, method='ffill')

2012-06-13   -0.454667
2012-06-20   -0.385863
2012-06-27    0.207826
Freq: W-WED, dtype: float64
------
2012-06-12         NaN
2012-06-17   -0.454667
2012-06-18   -0.454667
2012-06-21   -0.385863
2012-06-22   -0.385863
2012-06-29    0.207826
dtype: float64


In [43]:
ts2 + ts1.reindex(ts2.index, method='ffill')

2012-06-12         NaN
2012-06-17   -1.690252
2012-06-18   -0.966095
2012-06-21   -2.069052
2012-06-22   -0.674624
2012-06-29    1.367162
dtype: float64

### Working with Periods rather than Timestamps - Specify period start, # of periods, and interval

In [29]:
gdp = pd.Series([1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46], index=pd.period_range('1984Q2', periods=7, freq='Q-SEP'))
infl = pd.Series([0.025, 0.045, 0.037, 0.04], index=pd.period_range('1982', periods=4, freq='A-DEC'))
# GDP should be quarterly
gdp

1984Q2    1.78
1984Q3    1.94
1984Q4    2.08
1985Q1    2.01
1985Q2    2.15
1985Q3    2.31
1985Q4    2.46
Freq: Q-SEP, dtype: float64

In [30]:
# Inlation is annual basis
infl

1982    0.025
1983    0.045
1984    0.037
1985    0.040
Freq: A-DEC, dtype: float64

### Indexes are periods -- we lack timestamps so we can't just perform operations across time intervals
#### We have to explicitly convert the periods to a common basis

In [56]:
# Recast inflation as Quarterly
infl_q = infl.asfreq('Q-SEP', how='end')
infl_q

1983Q1    0.025
1984Q1    0.045
1985Q1    0.037
1986Q1    0.040
Freq: Q-SEP, dtype: float64

In [35]:
# Reindex to the time basis of GDP and forward-fill the missing pieces
infl_q.reindex(gdp.index, method='ffill')

1984Q2    0.045
1984Q3    0.045
1984Q4    0.045
1985Q1    0.037
1985Q2    0.037
1985Q3    0.037
1985Q4    0.037
Freq: Q-SEP, dtype: float64

#### Example: Extract a price at the same intra-day time, even if the time isn't explicitly present

In [57]:
# Make our intraday time index that has all times
# freq = 'T' --> minute-level frequency

# Step 1 - create the full 5-minute time range for the trading day
rng = pd.date_range('2012-06-01 09:30', '2012-06-01 15:59', freq='T')

# Step 2 - Starting with that date, offset that rng by the next 5 Business Days -- use list comprehension and offset module
rng = rng.append([rng + pd.offsets.BDay(i) for i in range(1,4)])
rng

DatetimeIndex(['2012-06-01 09:30:00', '2012-06-01 09:31:00',
               '2012-06-01 09:32:00', '2012-06-01 09:33:00',
               '2012-06-01 09:34:00', '2012-06-01 09:35:00',
               '2012-06-01 09:36:00', '2012-06-01 09:37:00',
               '2012-06-01 09:38:00', '2012-06-01 09:39:00',
               ...
               '2012-06-06 15:50:00', '2012-06-06 15:51:00',
               '2012-06-06 15:52:00', '2012-06-06 15:53:00',
               '2012-06-06 15:54:00', '2012-06-06 15:55:00',
               '2012-06-06 15:56:00', '2012-06-06 15:57:00',
               '2012-06-06 15:58:00', '2012-06-06 15:59:00'],
              dtype='datetime64[ns]', length=1560, freq=None)

In [38]:
# Step 3 - Create an enumerated series with that index - notice that it has length of 1560 as stated above
ts = pd.Series(np.arange(len(rng), dtype=float), index=rng)
ts

2012-06-01 09:30:00       0.0
2012-06-01 09:31:00       1.0
2012-06-01 09:32:00       2.0
2012-06-01 09:33:00       3.0
2012-06-01 09:34:00       4.0
2012-06-01 09:35:00       5.0
2012-06-01 09:36:00       6.0
2012-06-01 09:37:00       7.0
2012-06-01 09:38:00       8.0
2012-06-01 09:39:00       9.0
2012-06-01 09:40:00      10.0
2012-06-01 09:41:00      11.0
2012-06-01 09:42:00      12.0
2012-06-01 09:43:00      13.0
2012-06-01 09:44:00      14.0
2012-06-01 09:45:00      15.0
2012-06-01 09:46:00      16.0
2012-06-01 09:47:00      17.0
2012-06-01 09:48:00      18.0
2012-06-01 09:49:00      19.0
2012-06-01 09:50:00      20.0
2012-06-01 09:51:00      21.0
2012-06-01 09:52:00      22.0
2012-06-01 09:53:00      23.0
2012-06-01 09:54:00      24.0
2012-06-01 09:55:00      25.0
2012-06-01 09:56:00      26.0
2012-06-01 09:57:00      27.0
2012-06-01 09:58:00      28.0
2012-06-01 09:59:00      29.0
                        ...  
2012-06-06 15:30:00    1530.0
2012-06-06 15:31:00    1531.0
2012-06-06

In [59]:
# Step 4 - Pull out the 10 AM times
# time requires "from datetime import time"
# time objects accessible as: time([hour[, minute[, second[, microsecond[, tzinfo]]]]]) --> a time object
from datetime import time
ts[time(10,0)]

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

In [123]:
# Equivalently
ts.at_time(time(10,0))

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

In [71]:
# Use between_time to find the values between 10:00 and 10:01 inclusive (2 values per day for our data set)
ts.between_time(time(10, 0), time(10, 1))

2012-06-01 10:00:00      30.0
2012-06-01 10:01:00      31.0
2012-06-04 10:00:00     420.0
2012-06-04 10:01:00     421.0
2012-06-05 10:00:00     810.0
2012-06-05 10:01:00     811.0
2012-06-06 10:00:00    1200.0
2012-06-06 10:01:00    1201.0
dtype: float64

In [67]:
len(np.random.permutation(len(ts))[:700])

700

In [84]:
# Step 5 - Find an array of 700 values between 1 and the length of the array
# (Sort them so they're in order -- for aesthetics, not actually analytically necessary)
# Step 6 - Set the index values matching those 700 random values NaN's
np.random.seed() # This will randomize the permutation.
indexer = np.sort(np.random.permutation(len(ts))[:700])
irr_ts = ts.copy()
irr_ts[indexer] = np.nan

# See how this looks for a single day approaching 10 AM.  
# We're hoping that there are some values and some blanks there
irr_ts['2012-06-05 09:50':'2012-06-05 10:00']

2012-06-05 09:50:00    800.0
2012-06-05 09:51:00    801.0
2012-06-05 09:52:00    802.0
2012-06-05 09:53:00    803.0
2012-06-05 09:54:00      NaN
2012-06-05 09:55:00      NaN
2012-06-05 09:56:00      NaN
2012-06-05 09:57:00    807.0
2012-06-05 09:58:00    808.0
2012-06-05 09:59:00      NaN
2012-06-05 10:00:00      NaN
dtype: float64

In [85]:
# Step 7 - Set our selection criteria -- 10 AM for 4 business days on and after 2012-06-01
# with ts.asof, what's returned is just the LAST VALID timestamp AT OR PRIOR TO the selection, even if the value
# of the time series at that index was NaN
selection = pd.date_range('2012-06-01 10:00', periods = 4, freq='B')
irr_ts.asof(selection)

2012-06-01 10:00:00      28.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     808.0
2012-06-06 10:00:00    1200.0
Freq: B, dtype: float64

### Splicing Together Data Sources
#### concat - join  2 data sets
#### combine_first - backfill data from 1 data set with another data set
- Similar functionality to 'update'

#### pct_change, cumprod, resample().prod() as useful shortcut functions


In [152]:
# A data set of '1' values
data1 = pd.DataFrame(np.ones((6, 3), dtype=float),
                     columns=['a', 'b', 'c'],
                     index=pd.date_range('6/12/2012',
                     periods=6))

# A data set of '2' values
data2 = pd.DataFrame(np.ones((6,3), dtype=float) * 2,
                     columns=['a', 'b', 'c'],
                     index=pd.date_range('6/13/2012',
                     periods = 6))

# Take data1 up to 6/14/2012 and data2 for 6/15/2012 and beyond
# syntax: pd.concat([df1, df2, ....])
pd.concat([data1.ix[:'6/14/2012'], data2.ix['2012-06-15':]])

Unnamed: 0,a,b,c
2012-06-12,1.0,1.0,1.0
2012-06-13,1.0,1.0,1.0
2012-06-14,1.0,1.0,1.0
2012-06-15,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0


In [156]:
# If data1 was missing a time series present in data2 -- for example, an entire column
# Then concat will just naively concat the data sets
data2 = pd.DataFrame(np.ones((6, 4), dtype=float) * 2,
                    columns=['a', 'b', 'c', 'd'],
                    index=pd.date_range('6/13/2012', periods = 6))
spliced = pd.concat([data1.ix[:'6/14/2012'], data2.ix['6/15/2012':]])
spliced

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,
2012-06-14,1.0,1.0,1.0,
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


In [158]:
# Use combine_first to backfill data from data2 into data1 where data2 has that index
# If data2 doesn't have the index of data1, then the value will NOT be filled in
spliced_filled = spliced.combine_first(data2)
spliced_filled

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


In [160]:
# The same functionality can be achieved with update and overwrite = False
spliced.update(data2, overwrite = False)
spliced

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


### Percent Changes and Cumulative Changes

In [173]:
# Uses pandas-datareader.data 
#web.get_data_yahoo(ticker, start date for data)
price = web.get_data_yahoo('AAPL', '2011-01-01')['Adj Close']
price[-5:] # last 5 prices

Date
2017-04-03    143.699997
2017-04-04    144.770004
2017-04-05    144.020004
2017-04-06    143.660004
2017-04-07    143.339996
Name: Adj Close, dtype: float64

In [174]:
# Cumulative return of a single day v. another previous day
price['2011-10-03'] / price['2011-03-01'] - 1

0.072399896359515159

In [193]:
# Cumlative returns over all days -- how much money you would have made on each day if you invested $1 on Day 1.
returns = price.pct_change() # Cumulative day-over-day
ret_index = (1 + returns)    # Add 1 so we have a multiple not just a gain
ret_index = ret_index.cumprod() # Multiply all the multiples together to get the final product up to each time
ret_index[0] = 1 # If we make the first value to be 1, then all resultant days' values are the given value
ret_index
m_returns = ret_index.resample('BM').last().pct_change()   # last-day-of-month-over-last-day-of-month percent changes
m_returns['2012']

Date
2012-01-31    0.127111
2012-02-29    0.188311
2012-03-30    0.105284
2012-04-30   -0.025970
2012-05-31   -0.010702
2012-06-29    0.010853
2012-07-31    0.045822
2012-08-31    0.093877
2012-09-28    0.002796
2012-10-31   -0.107600
2012-11-30   -0.012375
2012-12-31   -0.090743
Freq: BM, Name: Adj Close, dtype: float64

In [194]:
# The same result achieved via downsampling to monthly and taking the product as the aggregator
m_rets = (1 + returns).resample('M', kind = 'period').prod() - 1
m_rets['2012']

Date
2012-01    0.127111
2012-02    0.188311
2012-03    0.105284
2012-04   -0.025970
2012-05   -0.010702
2012-06    0.010853
2012-07    0.045822
2012-08    0.093877
2012-09    0.002796
2012-10   -0.107600
2012-11   -0.012375
2012-12   -0.090743
Freq: M, Name: Adj Close, dtype: float64

## Group Transforms and Analysis

In [250]:
import random; random.seed(0)
import string
N = 1000
def rands(n):
    choices = string.ascii_uppercase # Get the ASCII Uppercase letters
    return ''.join([random.choice(choices) for _ in xrange(n)]) # Joins ASCII Uppdercase n number of times.

# This means "do the thing 5 times". xrange is used because it's just-in-time memory usage 
# [rands(5) for _ in xrange(5)]

# Generates 1000 5-digit choices
tickers = np.array([rands(5) for _ in range(N)])
tickers[:5]

array(['VTKGN', 'KUHMP', 'XNHTQ', 'GXZVX', 'ISXRM'], 
      dtype='|S5')

In [217]:
# Create a dataframe of three random, hypothetical portfolios for 500 of the tickers
# Create a dataframe of random industry classifications for the tickers
M = 500
df = pd.DataFrame({'Momentum': np.random.randn(M) / 200 + 0.03,
                   'Value': np.random.randn(M) / 200 + 0.08,
                   'ShortInterest': np.random.randn(M) / 200 - 0.02},
                 index = tickers[:M])
ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N) # Decide if each place is a 1 or a 0
industries = pd.Series(ind_names[sampler], index=tickers, name='industry')
industries

VTKGN    FINANCIAL
KUHMP         TECH
XNHTQ    FINANCIAL
GXZVX         TECH
ISXRM         TECH
CLPXZ         TECH
MWGUO    FINANCIAL
ASKVR         TECH
AMWGI    FINANCIAL
WEOGZ         TECH
ULCIN         TECH
YCOSO         TECH
VOZPP         TECH
LPKOH         TECH
EEPRM    FINANCIAL
CTWYV    FINANCIAL
XYOKS    FINANCIAL
HVWXP         TECH
YPLRZ    FINANCIAL
XUCPM    FINANCIAL
QVGTD    FINANCIAL
FUIVC         TECH
DSBOX    FINANCIAL
NRAQP         TECH
OKJZA    FINANCIAL
AYEDF         TECH
UYALC    FINANCIAL
GFQJE    FINANCIAL
NBCZF    FINANCIAL
JTVXE         TECH
           ...    
GPNYZ    FINANCIAL
UWRUN         TECH
SLXXZ         TECH
GFQJK         TECH
URWVM    FINANCIAL
CKUJV         TECH
KVTRA         TECH
YJCAW    FINANCIAL
KPZYQ         TECH
TQKDF    FINANCIAL
HFYFG    FINANCIAL
SZQHV    FINANCIAL
EQMTM    FINANCIAL
QAVSF         TECH
YZQVZ    FINANCIAL
HEIZH         TECH
LWELG    FINANCIAL
KYLBX         TECH
WIBMQ         TECH
LBCTN    FINANCIAL
QHLIR         TECH
LCRZK       

In [221]:
# Aggregate the portfolios dataframe over the values in the industries dataframe
by_industry = df.groupby(industries) # Split
#  by_industry.mean() # Apply, Combine
by_industry.describe() # Apply, Combine

Unnamed: 0_level_0,Unnamed: 1_level_0,Momentum,ShortInterest,Value
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FINANCIAL,count,264.0,264.0,264.0
FINANCIAL,mean,0.030774,-0.020041,0.080245
FINANCIAL,std,0.004998,0.004798,0.005017
FINANCIAL,min,0.016389,-0.032104,0.065535
FINANCIAL,25%,0.027884,-0.023685,0.076884
FINANCIAL,50%,0.030531,-0.019707,0.080279
FINANCIAL,75%,0.033944,-0.016955,0.083831
FINANCIAL,max,0.045915,-0.006732,0.093865
TECH,count,236.0,236.0,236.0
TECH,mean,0.030212,-0.020456,0.080191


In [222]:
# Standardize the z-score within each industry
def zscore(group):
    return (group - group.mean()) / group.std()
df_stand = by_industry.apply(zscore)
df_stand

Unnamed: 0,Momentum,ShortInterest,Value
VTKGN,0.187140,1.147173,-1.210089
KUHMP,1.477522,1.591198,-0.669357
XNHTQ,-2.214231,-2.084997,0.745669
GXZVX,0.450109,0.216695,-0.465904
ISXRM,-0.449655,0.986529,-0.843883
CLPXZ,0.572958,-0.457501,-0.962520
MWGUO,-0.175183,0.341141,0.516723
ASKVR,0.566358,0.346740,1.278944
AMWGI,-0.810510,1.029095,-1.860343
WEOGZ,-1.163121,-0.207462,-0.569017


In [224]:
# Verify that df_stand standardizes the means and std of the two sectors
df_stand.groupby(industries).agg(['mean', 'std'])

Unnamed: 0_level_0,Momentum,Momentum,ShortInterest,ShortInterest,Value,Value
Unnamed: 0_level_1,mean,std,mean,std,mean,std
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
FINANCIAL,3.258126e-15,1.0,7.250093e-16,1.0,1.359771e-14,1.0
TECH,-2.85509e-16,1.0,-1.5195e-15,1.0,3.214354e-15,1.0


In [231]:
# Use "Rank" to see the rank of the row-column entry in the column space
ind_rank = by_industry.rank(ascending=False)

# Finding the min and max ranks will of course just give 1 and N, where N is the number of entries
ind_rank.groupby(industries).agg(['min', 'max'])

Unnamed: 0_level_0,Momentum,Momentum,ShortInterest,ShortInterest,Value,Value
Unnamed: 0_level_1,min,max,min,max,min,max
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
FINANCIAL,1.0,264.0,1.0,264.0,1.0,264.0
TECH,1.0,236.0,1.0,236.0,1.0,236.0


#### Rank and Standardize

In [236]:
# Industry rank and standardize
# "Rank the row and then find its zscore -- i.e relative return of each stock by each strategy"
by_industry.apply(lambda x: zscore(x.rank()))

Unnamed: 0,Momentum,ShortInterest,Value
VTKGN,0.386356,1.290036,-1.355520
KUHMP,1.545300,1.559948,-0.783636
XNHTQ,-1.696038,-1.682941,0.936422
GXZVX,0.490688,0.241682,-0.519983
ISXRM,-0.578572,1.135173,-0.959404
CLPXZ,0.710399,-0.607867,-1.091231
MWGUO,-0.229194,0.412550,0.648292
ASKVR,0.681104,0.476040,1.369532
AMWGI,-1.041197,1.263843,-1.604360
WEOGZ,-1.296295,-0.300272,-0.725046


### Group Factor Exposures

#### Want to recover the statistical correlations of moving values on each other
#### These othe rmoving values are called "factors" or (confusingly) "risk factors"

In [247]:
# 3 imaginary factors and some weights
fac1, fac2, fac3 = np.random.rand(3, 1000)
fac1, fac2, fac3

In [273]:
# Get our numpy array of 5 random tickers
# Rewriting this just to remember how to do it
N = 1000
def rands(n):
    choices = string.ascii_uppercase
    return ''.join([random.choice(choices) for _ in xrange(n)])
tickers = np.array([rands(5) for _ in xrange(N)])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
0    1000 non-null object
dtypes: object(1)
memory usage: 7.9+ KB


In [279]:
# Get the tickers in a random order
ticker_subset = tickers.take(np.random.permutation(N)[:1000])
ticker_subset[:10]

array(['CAXGT', 'BYKPG', 'DCMNP', 'KCEKY', 'GPUQG', 'MCFAX', 'RRSSH',
       'PBLWZ', 'GCOWQ', 'MNXDQ'], 
      dtype='|S5')

#### Define portfolio as a weighted sum of factors plus noise
#### The goal of a factor analysis is the recover the weights that we're putting into the equation below

In [335]:
# Correlation shows correct rough order of normalized magnitude, but not real magnitude
portfolio = pd.Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + np.random.rand(1000),
                index = ticker_subset, name='portfolio')
factors = pd.DataFrame({'f1':fac1, 'f2':fac2, 'f3':fac3},
                index = ticker_subset)
portfolio
pd.concat([pd.DataFrame(portfolio), factors], axis=1)
#factors
#factors.corrwith(port)

Unnamed: 0,portfolio,f1,f2,f3
CAXGT,0.721904,0.950042,0.767583,0.453695
BYKPG,0.208846,0.822100,0.545149,0.450633
DCMNP,0.057752,0.174294,0.201980,0.030661
KCEKY,0.141601,0.918203,0.945727,0.800020
GPUQG,0.251059,0.150396,0.643855,0.332862
MCFAX,1.342596,0.741253,0.132295,0.606908
RRSSH,0.963161,0.627664,0.151453,0.500297
PBLWZ,-0.168629,0.742000,0.869108,0.534585
GCOWQ,0.903861,0.728885,0.273407,0.139559
MNXDQ,0.161744,0.229414,0.863350,0.393065


In [328]:
# Conduct an OLS regression and see if we recover the factors
# This uses "import statsmodels.formula.api as sm"
# Note the order of the passed variable columns matters, not the name in the formula

result = sm.ols(formula = "portfolio ~ fac1 + fac2 + fac3", 
       data = pd.concat([portfolio, factors], axis=1)).fit()
result.params

Intercept    0.508704
fac1         0.674471
fac2        -1.191594
fac3         0.299553
dtype: float64

### Much material in this section can't be done because
### the pandas libraries have been deprecated in factor of statsmodels
### and the syntax is different

In [353]:
# Compute exposure industry-by-industry
# The term for this correlation is "beta"
#def beta_exposure(chunk, factors=None):
#    return sm.ols(formula = "portfolio ~ fac1 + fac2 + fac3", 
#                  data = pd.Dataframe(pd.concat([chunk, factors], axis=1)).beta

sm.ols(formula = "portfolio ~ f1 + f2 + f3", 
    data = pd.concat([portfolio, factors], axis=1)).fit()

#def beta_exposure(chunk, factor=None):
#    return sm.ols(formula = "portfolio ~ fac1 + fac2 + fac3", 
#                 data = pd.concat([chunk, factor], axis=1)).beta

#by_ind = portfolio.groupby(industries)
#exposures = by_ind.apply(beta_exposure, factors = factors)
#exposures.unstack()

TypeError: cannot concatenate a non-NDFrame object