In [1]:
# 16/06/2020
# TASI.AI
# Raphael Mourad

# Feature engineering: how to make features that best capture and summarize
# the key information from the huge option data.

# Explanations: Option data are too large to be used directly for prediction.
# Instead, features are built that capture and summarize the key information
# needed to predict the stock price and its probability distribution.
# This is where knowledge of the stock market and of option pricing is necessary.

# https://en.wikipedia.org/wiki/Feature_engineering
# https://medium.com/predict/feature-engineering-for-stock-price-movement-54669d4a7efc

###### IMPORT LIBRARIES AND SET UP PARAMETERS

# Import libraries
import os
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import weighted
from pandas_datareader import DataReader

  from pandas.util.testing import assert_frame_equal


In [2]:
# Set up directory
# The paths used further down ("data/...", "results/...") are relative, so the
# working directory is moved to the project root first.
# NOTE(review): this absolute path is machine-specific, and the variable name
# `dir` shadows Python's built-in dir() for the rest of the session.
dir="/media/mourad/diskSave/MCF_Toulouse/recherche/ConsulProj/RamCiri"
os.chdir(dir)
print(os.getcwd())

/media/mourad/diskSave/MCF_Toulouse/recherche/ConsulProj/RamCiri


In [3]:
# Parameters
symbol="CDLX" # Ticker of the underlying; used to build the input and output file names
kdays=30 # Number of days for forecasting; also the minimum time to expiration of the options kept
kdaysExpi=180 # Maximum number of days until expiration date of the options kept

In [4]:
# Create folder for symbol
# makedirs also creates the missing parent folders ("results/ARIMA_optionstats")
# and, with exist_ok=True, does nothing when the folder is already there.
symbolFolder="results/ARIMA_optionstats/"+symbol
os.makedirs(symbolFolder, mode=0o777, exist_ok=True)

In [5]:
### STOCK OPTION PRICE
# Import option
pathOption="data/options/samples_L3_options_"+symbol+".csv.gz"
readCsvArgs=dict(compression='gzip', header=0, sep=',', quotechar='"')
try:
    # pandas >= 1.3: malformed rows are skipped and reported as warnings
    # (same behaviour as the legacy error_bad_lines=False / warn_bad_lines=True).
    data_option = pd.read_csv(pathOption, on_bad_lines='warn', **readCsvArgs)
except TypeError:
    # Older pandas does not know `on_bad_lines`: fall back to the legacy flag,
    # which was removed in pandas 2.0.
    data_option = pd.read_csv(pathOption, error_bad_lines=False, **readCsvArgs)

# Parse the date columns BEFORE sorting: sorting the raw strings is only
# chronological if the file stores ISO dates (YYYY-MM-DD), which is not
# guaranteed here.
data_option["DataDate"]=pd.to_datetime(data_option["DataDate"])
data_option["Expiration"]=pd.to_datetime(data_option["Expiration"])
data_option=data_option.sort_values(["Expiration","DataDate","Type","Strike"])

# Compute mean of bid and ask prices
data_option["BidAskMean"]=(data_option["Bid"]+data_option["Ask"])/2
print(data_option.columns.values)

['UnderlyingSymbol' 'UnderlyingPrice' 'Flags' 'OptionSymbol' 'Type'
 'Expiration' 'DataDate' 'Strike' 'Last' 'Bid' 'Ask' 'Volume'
 'OpenInterest' 'T1OpenInterest' 'IVMean' 'IVBid' 'IVAsk' 'Delta' 'Gamma'
 'Theta' 'Vega' 'AKA' 'BidAskMean']


In [6]:
### BUILD FEATURES FROM OPTION DATA

# Feature: Put to Call Ratio (PCR) [CHECKED]
# PCR < 1, the stock price is anticipated to increase in the future.
# PCR > 1, the stock price is anticipated to decrease in the future.
# Put and call volumes are pooled over all retained expiration dates.

# Output columns, in the order the features are computed below
featureNames=["DataDate","PCR","SM","SWM","WMEP","MROI","WMD","WMG","WMT","WMV"]

# Loop over every date in DataDate.
# Rows are collected in a plain list and the DataFrame is built once at the
# end: enlarging a DataFrame with .loc[i]=... re-allocates it at every step.
dataDate=np.sort(np.unique(data_option["DataDate"]))
featureRows=[]
for i in range(0, len(dataDate)):

    # Keep the quotes of this DataDate whose expiration date lies between
    # DataDate + kdays and DataDate + kdaysExpi (both bounds included)
    data_option_i=data_option[data_option["DataDate"]==dataDate[i]]
    priceUnderlying=np.unique(data_option_i["UnderlyingPrice"])
    rowToKeep=((data_option_i["Expiration"]>= (data_option_i["DataDate"]+pd.DateOffset(days=kdays)))
        & (data_option_i["Expiration"]<= (data_option_i["DataDate"]+pd.DateOffset(days=kdaysExpi))))
    data_option_i=data_option_i[rowToKeep]
    data_option_i=data_option_i.drop(columns=["UnderlyingSymbol","Flags","OptionSymbol","DataDate"])

    # Row masks for puts and calls
    isPut=data_option_i["Type"]=="put"
    isCall=data_option_i["Type"]=="call"

    # Put to Call Ratio
    # 1 is added to avoid +Inf values
    PCRi=round((np.sum(data_option_i.loc[isPut,"Volume"])+1)/(np.sum(data_option_i.loc[isCall,"Volume"])+1),3)

    # No option expires inside the window for this date: the medians below are
    # undefined on empty input, so the date is kept with NaN features
    # (PCR keeps its smoothed value of 1).
    if len(data_option_i)==0:
        featureRows.append([dataDate[i],PCRi]+[np.nan]*(len(featureNames)-2))
        continue

    # Distribution of expected prices (strike price + call price, strike price - put price)
    # I noted "BAMS".
    data_option_i["BAMS"]=data_option_i["Strike"]+data_option_i["BidAskMean"]
    data_option_i.loc[isPut,"BAMS"]=(data_option_i.loc[isPut,"Strike"]
                                     -data_option_i.loc[isPut,"BidAskMean"])

    # Weights shared by all weighted medians: volume + 1, so that contracts
    # with zero volume still carry a (small) weight
    volumeWeights=data_option_i["Volume"]+1

    # Straddle cost: median (plain and volume-weighted) absolute gap between
    # the expected price and the underlying price
    absGapToUnderlying=abs(data_option_i["BAMS"]-priceUnderlying)
    SMi=round(np.median(absGapToUnderlying),3)
    SWMi=round(weighted.median(absGapToUnderlying,weights=volumeWeights),3)

    # Weighted median expected price
    WMEPi=round(weighted.median(data_option_i["BAMS"],weights=volumeWeights),3)

    # Median open interest
    MROIi=round(np.median(data_option_i["OpenInterest"]),3)

    # Weighted median Greeks
    WMDi=round(weighted.median(data_option_i["Delta"],weights=volumeWeights),3)
    WMGi=round(weighted.median(data_option_i["Gamma"],weights=volumeWeights),3)
    WMTi=round(weighted.median(data_option_i["Theta"],weights=volumeWeights),3)
    WMVi=round(weighted.median(data_option_i["Vega"],weights=volumeWeights),3)

    # Feature list
    featListi=[dataDate[i],PCRi,SMi,SWMi,WMEPi,MROIi,WMDi,WMGi,WMTi,WMVi]

    # Store features for this date
    featureRows.append(featListi)

# One row per DataDate, indexed 0..n-1
matOptionFeatures=pd.DataFrame(featureRows, columns=featureNames)



In [7]:
### EXPORT FEATURES FROM OPTION DATA
# NOTE(review): the file name encodes kdaysExpi only; runs with a different
# kdays write to the same file.
pathOptionFeatures="data/options_features/samples_L3_options_features_"+symbol+"_"+str(kdaysExpi)+"daysExpi.csv.gz"
# Make sure the output folder exists before writing (no-op when it already does)
os.makedirs(os.path.dirname(pathOptionFeatures), exist_ok=True)
matOptionFeatures.to_csv(pathOptionFeatures, compression='gzip', index=False, sep=',', quotechar='"')

In [8]:
# Show the feature table (one row per DataDate); pandas abbreviates long frames when printing
print(matOptionFeatures)

      DataDate    PCR      SM     SWM    WMEP  MROI    WMD    WMG     WMT  \
0   2019-09-05  1.000   4.750   4.750  36.550   0.0  0.107  0.025  -9.060   
1   2019-09-06  0.062   4.150   3.607  41.325   0.0  0.405  0.031  -8.569   
2   2019-09-09  0.022   4.325   4.379  42.937   0.0  0.476  0.030  -8.130   
3   2019-09-10  0.030   3.675   6.394  40.464   0.0  0.224  0.045  -9.165   
4   2019-09-11  1.667   4.012   4.050  36.462   0.0  0.045  0.029  -7.140   
..         ...    ...     ...     ...     ...   ...    ...    ...     ...   
180 2020-05-22  0.099  11.325  11.089  78.657   8.0  0.451  0.011 -17.252   
181 2020-05-26  0.060  10.815  25.128  91.932   8.5  0.198  0.012 -16.817   
182 2020-05-27  0.308  10.300  10.549  68.958   9.0  0.170  0.011 -15.553   
183 2020-05-28  0.188  10.400  15.626  71.220   9.0  0.174  0.011 -13.750   
184 2020-05-29  0.120  11.125   9.150  71.979   9.0  0.245  0.012 -16.178   

        WMV  
0     5.835  
1     5.695  
2     6.444  
3     3.298  
4    