In [1]:
# 16/06/2020
# TASI.AI
# Raphael Mourad

# Feature engineering: how to make features that best capture and summarize
# the key information from the huge option data.

# Explanations: Option data are too large to be used directly for prediction.
# Instead, features are built that capture and summarize the key information
# needed to predict the stock price and its probability distribution.
# This is where knowledge about the stock market and option pricing is necessary.

# https://en.wikipedia.org/wiki/Feature_engineering
# https://medium.com/predict/feature-engineering-for-stock-price-movement-54669d4a7efc

###### IMPORT LIBRARIES AND SET UP PARAMETERS

# Import libraries
import os
import pandas as pd
import numpy as np
import sklearn.metrics as mt
import random
import datetime
import seaborn as sns
import weighted
from pandas_datareader import DataReader
from matplotlib import pyplot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.arima_model import ARIMA
from pmdarima import auto_arima
from scipy.stats import norm

  from pandas.util.testing import assert_frame_equal


In [2]:
# Set up directory
# The project root defaults to the original author's path; set the
# TASI_PROJECT_DIR environment variable to run the notebook on another machine.
# (The variable is not called "dir" so that the builtin dir() stays usable.)
projectDir=os.environ.get("TASI_PROJECT_DIR",
                          "/media/mourad/diskSave/MCF_Toulouse/recherche/ConsulProj/RamCiri")
os.chdir(projectDir)
print(os.getcwd())

/media/mourad/diskSave/MCF_Toulouse/recherche/ConsulProj/RamCiri


In [3]:
# Parameters (edit these to run the notebook for another ticker / horizon)
symbol = "GOOG"  # Ticker symbol; used to build the input and output file names
kdays = 30       # Forecast horizon in days; also the lower bound (exclusive) of the
                 # expiration window used when building the option features

In [4]:
# Create folder for symbol
# makedirs also creates the missing parent folders ("results/ARIMA_optionstats")
# and, with exist_ok=True, does nothing when the folder is already there.
symbolFolder="results/ARIMA_optionstats/"+symbol
os.makedirs(symbolFolder, mode=0o777, exist_ok=True)

In [5]:
### STOCK OPTION PRICE
# Import option
pathOption="data/options/samples_L3_options_"+symbol+".csv.gz"

# Malformed rows are skipped. The keyword doing that was renamed in pandas 1.3
# (error_bad_lines=False -> on_bad_lines="skip") and the old one was removed in
# pandas 2.0, so use whichever the installed version supports.
pandasVersion=tuple(int(part) for part in pd.__version__.split(".")[:2])
skipBadLines={"on_bad_lines":"skip"} if pandasVersion>=(1,3) else {"error_bad_lines":False}
data_option = pd.read_csv(pathOption, compression='gzip', header=0, sep=',', quotechar='"', **skipBadLines)

# Parse the dates BEFORE sorting, so that rows are ordered chronologically and
# not as text (the two only coincide for ISO-formatted date strings).
data_option["DataDate"]=pd.to_datetime(data_option["DataDate"])
data_option["Expiration"]=pd.to_datetime(data_option["Expiration"])
data_option=data_option.sort_values(["Expiration","DataDate","Type","Strike"])

# Compute mean of bid and ask prices
data_option["BidAskMean"]=(data_option["Bid"]+data_option["Ask"])/2
print(data_option.columns.values)

['UnderlyingSymbol' 'UnderlyingPrice' 'Flags' 'OptionSymbol' 'Type'
 'Expiration' 'DataDate' 'Strike' 'Last' 'Bid' 'Ask' 'Volume'
 'OpenInterest' 'T1OpenInterest' 'IVMean' 'IVBid' 'IVAsk' 'Delta' 'Gamma'
 'Theta' 'Vega' 'AKA' 'BidAskMean']


In [18]:
### BUILD FEATURES FROM OPTION DATA

# For every quote date (DataDate), the options expiring in the window
# (DataDate + kdays, DataDate + maxExpDays] are summarised into one row:
#   PCR        : Put to Call Ratio (put volume / call volume) [CHECKED]
#                PCR < 1, the stock price is anticipated to increase in the future.
#                PCR > 1, the stock price is anticipated to decrease in the future.
#   SM, SWM    : "straddle cost": median and volume-weighted median of
#                |expected price - underlying price|
#   WAEP, WMEP : volume-weighted average / median expected price
#   WAD, WAG, WAT, WAV : volume-weighted average Delta, Gamma, Theta, Vega
# The "expected price" (noted "BAMS") is strike price + call price for calls
# and strike price - put price for puts, using the bid/ask mean as the price.

maxExpDays=120 # Upper bound (days after DataDate, inclusive) of the expiration window
featureNames=["PCR","SM","SWM","WAEP","WMEP","WAD","WAG","WAT","WAV"]


def computeOptionFeatures(data_option_i, minDays, maxDays):
    """Summarise the option quotes of a single DataDate into a dict of features.

    Parameters
    ----------
    data_option_i : DataFrame
        Rows of data_option that share one DataDate.
    minDays, maxDays : int
        Only options with DataDate + minDays < Expiration <= DataDate + maxDays
        are used.

    Returns
    -------
    dict
        One entry per name in featureNames, rounded to 3 decimals. A feature
        that cannot be computed (no option in the window, no call volume for
        PCR, no traded volume for the volume-weighted features) is NaN instead
        of raising or being infinite.
    """
    features=dict.fromkeys(featureNames, np.nan)

    # Underlying price of the day as a scalar, taken before the window filter.
    # A single value per date is expected; the median keeps the result
    # well-defined should a date carry several distinct values.
    priceUnderlying=data_option_i["UnderlyingPrice"].median()

    # Keep the options expiring within the window
    minExpiration=data_option_i["DataDate"]+pd.DateOffset(days=minDays)
    maxExpiration=data_option_i["DataDate"]+pd.DateOffset(days=maxDays)
    inWindow=((data_option_i["Expiration"]>minExpiration)
        & (data_option_i["Expiration"]<=maxExpiration))
    optWindow=data_option_i.loc[inWindow]
    if optWindow.empty:
        return features

    isPut=optWindow["Type"]=="put"
    isCall=optWindow["Type"]=="call"
    volume=optWindow["Volume"]

    # Put to Call Ratio (undefined, hence NaN, without call volume)
    callVolume=volume[isCall].sum()
    if callVolume>0:
        features["PCR"]=round(volume[isPut].sum()/callVolume,3)

    # Distribution of expected prices (strike price + call price, strike price - put price)
    priceSign=np.where(isPut,-1.0,1.0)
    BAMS=optWindow["Strike"]+priceSign*optWindow["BidAskMean"]

    # Straddle cost (unweighted)
    distToUnderlying=(BAMS-priceUnderlying).abs()
    features["SM"]=round(np.median(distToUnderlying),3)

    # Volume-weighted features need a strictly positive total volume:
    # np.average raises when the weights sum to zero.
    if volume.sum()>0:
        features["SWM"]=round(weighted.median(distToUnderlying,weights=volume),3)

        # Weighted average/median expected price
        features["WAEP"]=round(np.average(BAMS,weights=volume),3)
        features["WMEP"]=round(weighted.median(BAMS,weights=volume),3)

        # Weighted average Greeks
        for featName,greek in [("WAD","Delta"),("WAG","Gamma"),("WAT","Theta"),("WAV","Vega")]:
            features[featName]=round(np.average(optWindow[greek],weights=volume),3)

    return features


# Sorted quote dates (kept as a variable for later cells)
dataDate=np.sort(np.unique(data_option["DataDate"]))

# One pass over the data: groupby yields the rows of each DataDate in
# chronological order, instead of re-scanning the whole table for every date.
featRows=[]
for dateI, data_option_i in data_option.groupby("DataDate", sort=True):
    featRow={"DataDate":dateI}
    featRow.update(computeOptionFeatures(data_option_i, minDays=kdays, maxDays=maxExpDays))
    featRows.append(featRow)

# Build the table once from all rows (index 0..n-1, one row per DataDate)
matOptionFeatures=pd.DataFrame(featRows, columns=["DataDate"]+featureNames)



In [25]:
### EXPORT FEATURES FROM OPTION DATA
print(matOptionFeatures)
pathOptionFeatures="data/options_features/samples_L3_options_features_"+symbol+".csv.gz"
# to_csv does not create folders: make sure the output folder exists first
os.makedirs(os.path.dirname(pathOptionFeatures), exist_ok=True)
matOptionFeatures.to_csv(pathOptionFeatures, compression='gzip', index=False, sep=',', quotechar='"')



       DataDate    PCR      SM      SWM      WAEP      WMEP    WAD    WAG  \
0    2015-05-01  1.461  24.890   29.337   524.503   513.368  0.025  0.007   
1    2015-05-04  0.706  23.280   33.616   543.213   555.014  0.083  0.008   
2    2015-05-05  1.185  23.100   26.606   532.228   518.185 -0.046  0.008   
3    2015-05-06  1.890  26.880   26.327   516.727   510.070 -0.111  0.008   
4    2015-05-07  0.882  22.200   24.745   533.762   539.700  0.034  0.008   
...         ...    ...     ...      ...       ...       ...    ...    ...   
1273 2020-05-22  0.719  68.425   83.550  1371.501  1449.930  0.143  0.002   
1274 2020-05-26  1.697  65.975  104.408  1355.689  1317.386 -0.009  0.002   
1275 2020-05-27  0.984  71.275  111.510  1384.709  1425.200  0.070  0.002   
1276 2020-05-28  0.643  68.300   97.683  1403.772  1475.809  0.152  0.002   
1277 2020-05-29  1.777  66.000   98.570  1387.149  1376.584 -0.053  0.002   

          WAT      WAV  
0     -41.369   56.162  
1     -44.042   56.609  


In [None]:
# Features per expiration date: put/call ratio of traded volume (PCR) and of
# open interest (PCRoi), one value per distinct expiration date.
# NOTE(review): data_optionX is not defined in any cell visible in this notebook;
# presumably it is a subset of data_option (e.g. a single DataDate) - confirm
# and define it before running this cell.

def putCallRatio(optionFrame, column):
    """Return the sum of `column` over the puts divided by its sum over the calls."""
    putTotal=np.sum(optionFrame.loc[optionFrame["Type"]=="put",column])
    callTotal=np.sum(optionFrame.loc[optionFrame["Type"]=="call",column])
    return putTotal/callTotal

expDates=np.unique(data_optionX["Expiration"])
perExpiration=[data_optionX[data_optionX["Expiration"]==expDate] for expDate in expDates]
PCR=[putCallRatio(optionFrame,"Volume") for optionFrame in perExpiration]
PCRoi=[putCallRatio(optionFrame,"OpenInterest") for optionFrame in perExpiration]

# Ratios indexed by expiration date
matPCR=pd.DataFrame(data=PCR,index=expDates,columns=["PCR"])
matPCRoi=pd.DataFrame(data=PCRoi,index=expDates,columns=["PCRoi"])
print(matPCR)
print(matPCRoi)


In [None]:
# Store 