In [1]:
# 16/06/2020
# TASI.AI
# Raphael Mourad

# Feature engineering: how to make features that best capture and summarize
# the key information from the huge option data.

# Explanations: Raw option data are too large to be used directly for prediction.
# Instead, we build features that capture and summarize the key information
# needed to predict the stock price and its probability distribution.
# This is where knowledge of the stock market and option pricing is necessary.

# https://en.wikipedia.org/wiki/Feature_engineering
# https://medium.com/predict/feature-engineering-for-stock-price-movement-54669d4a7efc

###### IMPORT LIBRARIES AND SET UP PARAMETERS

# Import libraries
import os
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import weighted
from pandas_datareader import DataReader

  from pandas.util.testing import assert_frame_equal


In [2]:
# Set up directory
# The project root can be overridden with the TASI_PROJECT_DIR environment
# variable so the notebook can run on machines other than the author's; the
# original hard-coded location remains the default.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because other cells may reference it.
dir=os.environ.get("TASI_PROJECT_DIR","/media/mourad/diskSave/MCF_Toulouse/recherche/ConsulProj/RamCiri")
# The relative data/ and results/ paths used in this notebook resolve against this directory.
os.chdir(dir)
print(os.getcwd())

/media/mourad/diskSave/MCF_Toulouse/recherche/ConsulProj/RamCiri


In [3]:
# Parameters
symbol = "AMZN"    # Ticker of the underlying stock to process
kdays = 30         # Forecast horizon in days; also the minimum days-to-expiration of the options kept
kdaysExpi = 90     # Maximum days-to-expiration of the options kept

In [4]:
# Create folder for symbol
# os.makedirs also creates the missing parent folders ("results/ARIMA_optionstats"),
# which os.mkdir refuses to do, and exist_ok=True keeps the cell safe to re-run.
symbolFolder="results/ARIMA_optionstats/"+symbol
os.makedirs(symbolFolder, mode=0o777, exist_ok=True)

In [5]:
### STOCK OPTION PRICE
# Import option
pathOption="data/options/samples_L3_options_"+symbol+".csv.gz"
# Malformed lines are skipped with a warning. `error_bad_lines` was deprecated in
# pandas 1.3 and removed in 2.0 in favour of `on_bad_lines`, so the current keyword
# is tried first and the legacy one is used as a fallback on older pandas
# (which rejects the unknown keyword with a TypeError before reading anything).
readCsvArgs=dict(compression='gzip', header=0, sep=',', quotechar='"')
try:
    data_option = pd.read_csv(pathOption, on_bad_lines='warn', **readCsvArgs)
except TypeError:
    data_option = pd.read_csv(pathOption, error_bad_lines=False, **readCsvArgs)

# Parse the dates BEFORE sorting so that rows are ordered chronologically rather
# than lexicographically on the raw date strings (which only coincide when the
# strings happen to be in a sortable format such as ISO 8601).
data_option["DataDate"]=pd.to_datetime(data_option["DataDate"])
data_option["Expiration"]=pd.to_datetime(data_option["Expiration"])
data_option=data_option.sort_values(["Expiration","DataDate","Type","Strike"])

# Compute mean of bid and ask prices (mid price)
data_option["BidAskMean"]=(data_option["Bid"]+data_option["Ask"])/2
print(data_option.columns.values)

['UnderlyingSymbol' 'UnderlyingPrice' 'Flags' 'OptionSymbol' 'Type'
 'Expiration' 'DataDate' 'Strike' 'Last' 'Bid' 'Ask' 'Volume'
 'OpenInterest' 'T1OpenInterest' 'IVMean' 'IVBid' 'IVAsk' 'Delta' 'Gamma'
 'Theta' 'Vega' 'AKA' 'BidAskMean']


In [6]:
### BUILD FEATURES FROM OPTION DATA
# One row of features is computed per quote date (DataDate), using only the
# options whose expiration lies between kdays and kdaysExpi days after that date.
#
# Features:
#   PCR  : Put to Call Ratio = total put volume / total call volume [CHECKED]
#          PCR < 1, the stock price is anticipated to increase in the future.
#          PCR > 1, the stock price is anticipated to decrease in the future.
#          Volumes are pooled over all kept expiration dates.
#   SM   : median of |BAMS - underlying price|
#   SWM  : volume-weighted median of |BAMS - underlying price|
#   WMEP : volume-weighted median of BAMS
#   MROI : median open interest (raw values, not normalised)
#   WMD, WMG, WMT, WMV : volume-weighted medians of Delta, Gamma, Theta, Vega
# where BAMS is strike + mid price for calls and strike - mid price for puts.

featureColumns=["DataDate","PCR","SM","SWM","WMEP","MROI","WMD","WMG","WMT","WMV"]

# Loop over every date in DataDate
dataDate=np.sort(np.unique(data_option["DataDate"]))
# Rows are collected in a plain list and turned into a DataFrame once at the end,
# instead of enlarging the DataFrame with .loc[i]=... at every iteration.
featureRows=[]
for i in range(0, len(dataDate)):

    # Quotes of this data date
    data_option_i=data_option[data_option["DataDate"]==dataDate[i]]
    # NOTE(review): this is an array of the distinct underlying prices; the
    # subtractions below assume it holds a single value per data date - confirm.
    priceUnderlying=np.unique(data_option_i["UnderlyingPrice"])

    # Keep expiration dates within [DataDate + kdays, DataDate + kdaysExpi]
    rowToKeep=((data_option_i["Expiration"]>= (data_option_i["DataDate"]+pd.DateOffset(days=kdays)))
        & (data_option_i["Expiration"]<= (data_option_i["DataDate"]+pd.DateOffset(days=kdaysExpi))))
    data_option_i=data_option_i[rowToKeep]
    data_option_i=data_option_i.drop(columns=["UnderlyingSymbol","Flags","OptionSymbol","DataDate"])
    # NOTE(review): if no option falls in the window, the statistics below are
    # computed on empty inputs - behaviour of weighted.median in that case not verified.

    # Put / call selectors, computed once and reused below
    isPut=data_option_i["Type"]=="put"
    isCall=data_option_i["Type"]=="call"

    # Put to Call Ratio on traded volume
    PCRi=round((np.sum(data_option_i.loc[isPut,"Volume"])
     /np.sum(data_option_i.loc[isCall,"Volume"])),3)

    # Distribution of expected prices (strike price + call price, strike price - put price)
    # I noted "BAMS".
    data_option_i["BAMS"]=data_option_i["Strike"]+data_option_i["BidAskMean"]
    data_option_i.loc[isPut,"BAMS"]=(data_option_i.loc[isPut,"Strike"]
                                     -data_option_i.loc[isPut,"BidAskMean"])

    # Absolute distance between BAMS and the underlying price
    # (labelled "straddle cost" by the author): plain and volume-weighted medians
    distToUnderlying=abs(data_option_i["BAMS"]-priceUnderlying)
    SMi=round(np.median(distToUnderlying),3)
    SWMi=round(weighted.median(distToUnderlying,weights=data_option_i["Volume"]),3)

    # Weighted median expected price
    WMEPi=round(weighted.median(data_option_i["BAMS"],weights=data_option_i["Volume"]),3)

    # Median open interest
    MROIi=round(np.median(data_option_i["OpenInterest"]),3)

    # Weighted median Greeks
    WMDi=round(weighted.median(data_option_i["Delta"],weights=data_option_i["Volume"]),3)
    WMGi=round(weighted.median(data_option_i["Gamma"],weights=data_option_i["Volume"]),3)
    WMTi=round(weighted.median(data_option_i["Theta"],weights=data_option_i["Volume"]),3)
    WMVi=round(weighted.median(data_option_i["Vega"],weights=data_option_i["Volume"]),3)

    # Feature list, in the same order as featureColumns
    featListi=[dataDate[i],PCRi,SMi,SWMi,WMEPi,MROIi,WMDi,WMGi,WMTi,WMVi]
    featureRows.append(featListi)

# Store features in matOptionFeatures (one row per data date, index 0..n-1)
matOptionFeatures=pd.DataFrame(featureRows, columns=featureColumns)
# Make the date column dtype explicit rather than relying on type inference
matOptionFeatures["DataDate"]=pd.to_datetime(matOptionFeatures["DataDate"])



In [7]:
### EXPORT FEATURES FROM OPTION DATA
# NOTE: the file name encodes the symbol and kdaysExpi only, so runs that differ
# only in kdays write to (and overwrite) the same file.
pathOptionFeatures="data/options_features/samples_L3_options_features_"+symbol+"_"+str(kdaysExpi)+"daysExpi.csv.gz"
# Make sure the output folder exists: to_csv does not create missing directories.
os.makedirs(os.path.dirname(pathOptionFeatures), exist_ok=True)
matOptionFeatures.to_csv(pathOptionFeatures, compression='gzip', index=False, sep=',', quotechar='"')

In [8]:
# Show the feature table (pandas abbreviates long frames in the printed output)
print(matOptionFeatures)

       DataDate    PCR       SM      SWM      WMEP  MROI    WMD    WMG  \
0    2015-05-01  0.713   14.325   25.244   436.484   3.0  0.145  0.008   
1    2015-05-04  1.190   13.550   31.753   412.027   3.0 -0.011  0.008   
2    2015-05-05  0.904   13.948   30.918   431.280   6.5  0.034  0.008   
3    2015-05-06  0.893   13.575   25.729   429.648   8.0  0.071  0.008   
4    2015-05-07  0.579   15.005   29.659   442.782   8.0  0.150  0.008   
...         ...    ...      ...      ...       ...   ...    ...    ...   
1270 2020-05-22  0.555   92.600  178.830  2550.080   2.0  0.117  0.001   
1271 2020-05-26  0.602  109.662  144.090  2491.648  10.0  0.168  0.001   
1272 2020-05-27  0.467  108.587  152.241  2489.298  10.0  0.243  0.001   
1273 2020-05-28  0.431  110.438  131.325  2420.467  17.0  0.369  0.001   
1274 2020-05-29  0.857  104.550  178.384  2442.649   8.0  0.017  0.001   

          WMT      WMV  
0     -45.243   56.536  
1     -42.813   46.612  
2     -42.931   48.290  
3     -45.5