In [29]:
import os
import sys
import types
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt

In [2]:
# Reads data from csv file
def symbol_to_path(symbol, base_dir="data"):
    """Build the path of the CSV file holding the quotes for ``symbol``.

    The result is ``<base_dir>/<symbol>.csv``. ``symbol`` is passed through
    ``str()`` first, so non-string tickers are accepted as well.
    """
    file_name = str(symbol) + ".csv"
    return os.path.join(base_dir, file_name)


def read_quote_data( symbol, new_path="data" ):
    """Load the quote CSV of a single ticker symbol into a DataFrame.

    The file location is resolved with ``symbol_to_path(symbol, new_path)``.
    The ``Date`` column becomes the index and is parsed as dates; the
    literal string ``nan`` is read as a missing value. All remaining
    columns of the file are returned unchanged.
    """
    csv_path = symbol_to_path(symbol, new_path)
    read_options = {
        'index_col': 'Date',
        'parse_dates': True,
        'na_values': ['nan'],
    }
    return pd.read_csv(csv_path, **read_options)

In [34]:
# Forward fill quote gaps
# Then backfill
def fill_quote_gaps( inp_df ):
    """Fill missing values of ``inp_df`` in place, column by column.

    Two passes are made along the rows:

    1. back-fill: every NaN takes the next valid value further down the
       same column;
    2. forward-fill: NaNs that are still left (trailing rows with nothing
       valid below them) take the last valid value above them.

    NOTE(review): the sample quotes in this notebook are ordered
    newest-first; with that ordering the row-wise back-fill carries the
    previous trading day's value forward in time. Confirm the row order
    of other inputs before relying on this.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Frame to repair. It is modified in place.

    Returns
    -------
    None
    """
    # DataFrame.bfill / DataFrame.ffill are the supported spellings;
    # fillna(method=...) is deprecated in current pandas releases.
    inp_df.bfill( axis=0, inplace=True )
    inp_df.ffill( axis=0, inplace=True )

# Find locations where stock dropped by const number
# These stock splits need to be normalized out, divide previous dates by const
def fix_splits( inp_df ):
    """Return a copy of ``inp_df`` with detected stock splits normalised out.

    Detection rule: for every row, the ratio ``Open / Close of the row
    directly above it`` is computed. If that ratio lies within ``1e-4`` of
    one of the candidate split factors (1/5, 1/4, 1/3, 1/2, 2, 3, 4, 5),
    the row is treated as a split boundary.

    Correction: from the boundary row to the end of the frame, the price
    columns (``Close``, ``Open``, ``High``, ``Low``, ``Adj Close``) are
    divided by the factor and ``Volume`` is multiplied by it. The scan is
    then repeated on the corrected data until no boundary is left.

    NOTE(review): the sample data is ordered newest-first, so "from the
    boundary row to the end" means the boundary date and everything
    earlier. Confirm the ordering of other inputs.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Quote frame with the columns ``Open``, ``High``, ``Low``,
        ``Close``, ``Volume`` and ``Adj Close``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Corrected copy. ``Volume`` becomes floating point once a split
        has been applied.
    """
    price_cols = ['Close', 'Open', 'High', 'Low', 'Adj Close']

    # Possible values for splits, tried in this order
    split_list = [ 1/5., 1/4., 1/3., 1/2., 2., 3., 4., 5. ]
    tolerance  = 1e-4

    test_df = inp_df.copy()
    n_rows  = len(test_df)

    while True:

        # Plain numpy array: positions are used below, never index labels.
        # The first entry is NaN (no row above it) and can never match.
        ratios = np.asarray( test_df['Open'] / test_df['Close'].shift(1),
                             dtype=float )

        boundary = None
        with np.errstate( invalid='ignore' ):
            for split in split_list:
                is_hit = np.abs( ratios - split ) < tolerance
                if is_hit.any():
                    # argmax on a boolean mask gives the first True position
                    boundary = ( int( is_hit.argmax() ), split )
                    break

        # No split left to correct
        if boundary is None:
            break

        pos, split = boundary

        # Per-row factor: 1 above the boundary, `split` from it downwards
        scale = np.ones( n_rows )
        scale[pos:] = split

        # Whole-column assignment lets a column change dtype (e.g. integer
        # Volume -> float) without writing floats into an integer slice.
        test_df[price_cols] = test_df[price_cols].values / scale[:, None]

        # Volume behaves opposite
        test_df['Volume'] = test_df['Volume'].values * scale

    return test_df

                  Open        High         Low       Close   Volume  \
Date                                                                  
2017-03-01  180.479996  182.550003  180.029999  181.949997  2960000   
2017-02-28  179.380005  180.630005  179.350006  179.820007  3272500   
2017-02-27  181.190002  181.250000  179.279999  179.399994  3672000   
2017-02-24  180.210007  181.490005  179.889999  181.350006  3267800   
2017-02-23  182.000000  182.500000  180.919998  181.649994  2250500   

             Adj Close  
Date                    
2017-03-01  181.949997  
2017-02-28  179.820007  
2017-02-27  179.399994  
2017-02-24  181.350006  
2017-02-23  181.649994  
                 Open       High        Low      Close   Volume  Adj Close
Date                                                                      
2004-01-08  93.209999  93.209999  92.029999  93.040001  6179800  72.133910
2004-01-07  93.139999  93.379997  92.470001  92.779999  4927600  71.932330
2004-01-06  92.199997  93.1

In [4]:
# Generate measures of (close-open)/open, 
#                        (high-low)/open,
#                  (adj_close-open)/open, 
#                 (vol_tod-vol_yes)/vol_yes
def generate_differentials( inp_df ):
    """Compute per-row relative differences from a quote frame.

    Columns of the result:

    * ``diff_co`` -- ``Close / Open - 1``
    * ``diff_ao`` -- ``Adj Close / Open - 1``
    * ``diff_hl`` -- ``(High - Low) / Open``
    * ``diff_v``  -- ``Volume / Volume of the next row - 1``; the last row
      has no next row and is set to 0.

    NOTE(review): with the newest-first ordering of the sample data the
    "next row" is the previous trading day, which matches the
    (vol_tod - vol_yes) / vol_yes intent. Confirm the ordering of other
    inputs.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Frame with ``Open``, ``High``, ``Low``, ``Close``, ``Adj Close``
        and ``Volume`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The four ``diff_*`` columns, indexed like ``inp_df``.
    """
    new_df  = inp_df.copy()
    open_px = new_df['Open']

    # Relative move from the open to the close / adjusted close
    new_df['diff_co'] = new_df[    'Close'] / open_px - 1.0
    new_df['diff_ao'] = new_df['Adj Close'] / open_px - 1.0

    # Breadth of the high-low range, relative to the open
    new_df['diff_hl'] = ( new_df['High'] - new_df['Low'] ) / open_px

    # Relative volume change against the following row
    volume = new_df['Volume'].astype(float)
    diff_v = volume / volume.shift(-1) - 1.0
    if len( diff_v ) > 0:
        # The last row has nothing to compare against
        diff_v.iloc[-1] = 0
    new_df['diff_v'] = diff_v

    return new_df[ ['diff_co', 'diff_ao', 'diff_hl', 'diff_v'] ]

In [9]:
# Generate some rolling values of the data
def generate_rolling_close( inp_df, inp_list ):
    """Compute rolling mean and standard deviation of ``Close``.

    The frame is reversed row-wise before rolling and reversed back
    afterwards, so each window covers a row and the rows *below* it in
    the original order. Rows without a full window are NaN.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Frame with a ``Close`` column. It is not modified.
    inp_list : int or list/tuple of int
        Window length(s), counted in rows. A single value is treated as a
        one-element list.

    Returns
    -------
    pandas.DataFrame
        Columns ``Close_mean_<n>`` and ``Close_std_<n>`` for every window
        ``n``, in the row order of ``inp_df``.
    """
    # Make sure we are working with a list
    if isinstance( inp_list, (list, tuple) ):
        my_days = list( inp_list )
    else:
        my_days = [ inp_list ]

    # Reverse the rows: rolling() looks backwards along the frame
    new_df = inp_df.iloc[::-1].copy()

    labelList = []

    # Generate rolling mean and std for each window length
    for day in my_days:

        mean_label = 'Close_mean_' + str(day)
        std_label  = 'Close_std_'  + str(day)

        window = new_df['Close'].rolling(day)
        new_df[ mean_label ] = window.mean()
        new_df[ std_label  ] = window.std()

        labelList.extend( [ mean_label, std_label ] )

    # Restore the caller's row order and keep only the generated columns
    return new_df[ labelList ].iloc[::-1]

In [18]:
# Generate momentum list, momentum is calculated as day/oldDay - 1
def generate_momentum_close( inp_df, inp_list ):
    """Compute momentum of ``Close`` over one or more row offsets.

    For an offset ``n`` the momentum of a row is
    ``Close / Close of the row n positions below - 1``. The last ``n``
    rows have no such partner and are set to 0.

    NOTE(review): with the newest-first ordering of the sample data the
    row ``n`` positions below is ``n`` trading days earlier. Confirm the
    ordering of other inputs.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Frame with a ``Close`` column. It is not modified.
    inp_list : int or list/tuple of int
        Offset(s) in rows. A single value is treated as a one-element
        list.

    Returns
    -------
    pandas.DataFrame
        One ``Momentum_<n>`` column per offset, indexed like ``inp_df``.
    """
    # Make sure we are working with a list
    if isinstance( inp_list, (list, tuple) ):
        my_days = list( inp_list )
    else:
        my_days = [ inp_list ]

    new_df = inp_df.copy()
    close  = new_df['Close'].astype(float)

    labelList = []

    # Generate one momentum column for each offset
    for day in my_days:

        label = 'Momentum_' + str(day)

        # shift(-day) lines every row up with the row `day` positions below
        momentum = close / close.shift(-day) - 1.0

        # Rows without a partner `day` positions below carry no momentum
        momentum.iloc[-day:] = 0

        new_df[ label ] = momentum
        labelList.append( label )

    return new_df[ labelList ]

In [7]:
# Window lengths (in rows) passed to the rolling and momentum generators
rolling_list = [
    3, 5, 10, 15, 20, 25, 30,
]

In [35]:
# Load the IBM quotes and normalise splits; gaps are then filled in place
tempDf = fix_splits( read_quote_data('IBM') )
fill_quote_gaps( tempDf )

# Derive the feature tables from the cleaned quotes
diffs, rolls, moms = (
    generate_differentials(  tempDf ),
    generate_rolling_close(  tempDf, rolling_list ),
    generate_momentum_close( tempDf, rolling_list ),
)

In [36]:
# Preview the first rows of tempDf (output of fix_splits, then gap-filled)
tempDf.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-03-01,180.479996,182.550003,180.029999,181.949997,2960000,181.949997
2017-02-28,179.380005,180.630005,179.350006,179.820007,3272500,179.820007
2017-02-27,181.190002,181.25,179.279999,179.399994,3672000,179.399994
2017-02-24,180.210007,181.490005,179.889999,181.350006,3267800,181.350006
2017-02-23,182.0,182.5,180.919998,181.649994,2250500,181.649994


In [None]:
# TODO: comb through the quote history looking for stock splits