In [3]:
! pip3 install yfinance --upgrade --no-cache-dir
! pip3 install yahoo_fin
! pip3 install multiprocess

! pip3 install pyarrow

# ML packages
! pip3 install keras
! pip3 install tensorflow
! pip3 install sklearn
! pip3 install matplotlib

Requirement already up-to-date: yfinance in c:\users\katri\anaconda3\lib\site-packages (0.1.59)




In [4]:
import json
import pandas as pd
import numpy as np
from yahoo_fin import stock_info
import yahoo_fin
from multiprocessing import Pool
import multiprocess as mp
import datetime
from collections import Counter
from get_historical_data import get_historical_data

import keras
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
# we have transactions data
all_transactions_parquet_file_name = '../collected_data/house_stock_watcher_data_all_transactions.parquet'
all_transactions = pd.read_parquet(all_transactions_parquet_file_name)

In [6]:
end_date = datetime.datetime.now().date()
start_date = end_date - datetime.timedelta(days=365 * 2)

def get_historical_data_batch(tickers, start_date, end_date):
    historical_data = []
    with Pool(32) as p:
        historical_data = p.map(get_historical_data, tickers)
    return historical_data


In [7]:
%%time
# stock_info.get_data('amzn')
get_historical_data_batch(['amzn'],start_date, end_date)

Wall time: 3.27 s


[          date         open         high          low        close  \
 0   2019-06-10  1822.000000  1884.869995  1818.000000  1860.630005   
 1   2019-06-11  1883.250000  1893.699951  1858.000000  1863.699951   
 2   2019-06-12  1853.979980  1865.000000  1844.380005  1855.319946   
 3   2019-06-13  1866.719971  1883.089966  1862.219971  1870.300049   
 4   2019-06-14  1864.000000  1876.000000  1859.000000  1869.670044   
 ..         ...          ...          ...          ...          ...   
 499 2021-06-02  3223.100098  3235.000000  3208.000000  3233.989990   
 500 2021-06-03  3204.229980  3214.439941  3184.030029  3187.010010   
 501 2021-06-04  3212.000000  3221.000000  3198.810059  3206.219971   
 502 2021-06-07  3197.330078  3208.000000  3172.199951  3198.010010   
 503 2021-06-08  3222.610107  3279.530029  3218.010010  3264.110107   
 
         adjclose   volume ticker  
 0    1860.630005  5371000   AMZN  
 1    1863.699951  4042700   AMZN  
 2    1855.319946  2678300   AMZN  
 3

In [8]:
%%time

# a couple of hard-coded company->ticker values
company_to_ticker = {
    'Broadcom Inc.': 'AVGO',
    'AD': 'ADM',
    'ADDDYY': 'ADDYY',
    'AOBC': 'AOUT',
    'APCD': 'APCD.VI',
    'APPL': 'AAPL',
    'AZSEY': 'ALIZY',
    'BAMXY': 'BMWYY',
    'BBCBX': 'BBCPX',
    'BOA': 'BAC',
    'BRK.A': 'BRK-A',
    'BRK.B': 'BRK-B',
    'BXS$A': 'BXS-PA',
    'CCC': 'CLVT',
    'CTL': 'LUMN',
    'CWEN.A': 'CWEN-A',
    'DESY': 'DSEY',
    'EBJ': 'ERJ',
    'EQUIX': 'EFX',
    'EVGB': 'EVBG',
    'FFHRX': 'FFRHX',
    'FII': 'FHI',
    'FMCMF': 'FMS',
    'FXM': 'FMX',
    'GEAGY': 'GEAGF',
    'HCN': 'WELL',
    'HS': 'HSIC',
    'HTZ': 'HTZGQ',
    'IDXXX': 'IDXX',
    'INTL': 'INTC',
    'LAM': 'LRCX',
    'LBDAV': 'LBRDA',
    'LCRX': 'LRCX',
    'LEN.B': 'LEN-B',
    'LTD': 'LB',
    'MRCK': 'MRK',
    'MSBHY': 'MSBHF',
    'NGLS': 'NGL',
    'ORLT': 'ORLY',
    'POL': 'AVNT',
    'RBS': 'NWG',
    'RDS.A': 'RDS-A',
    'RDS.B': 'RDS-B',
    'RF$A': 'RF-A',
    'SERV': 'TMX',
    'SVCBY': 'SVCBF',
    'TDDC': 'TDC',
    'URGO': 'UGRO',
    'WFC$V': 'WFC',
    'WYND': 'TNL',
    'XMO': 'XOM',
    'ZOOM': 'ZM'
}
all_transactions['cleaned_ticker'] = all_transactions['ticker'].apply(lambda ticker: company_to_ticker.get(ticker, ticker))
# replace '.' w/ '-' to make things compatible
all_transactions['cleaned_ticker'] = all_transactions['cleaned_ticker']

# get the set of tickers
tickers = sorted(set(all_transactions['cleaned_ticker']))
# '--' is the null ticker, remove it
if '--' in tickers:
    tickers.remove('--')

historical_data = get_historical_data_batch(tickers, start_date, end_date)


Wall time: 1min 30s


In [9]:
Counter([isinstance(el, tuple) and 'ERROR' == el[0] for el in historical_data])

Counter({False: 1678, True: 58})

In [10]:
lemon_tickers = [el[1] for el in historical_data if isinstance(el, tuple)]

In [11]:
lemon_tickers


['AGN',
 'AMTD',
 'APC',
 'BAC$K',
 'BGG',
 'BPHSPX',
 'BRK',
 'CBS',
 'CCXX',
 'CELG',
 'CELO',
 'CHL',
 'CHV',
 'CTRCF',
 'DCMYY',
 'DEACU',
 'DNKN',
 'DWDP',
 'ETFC',
 'ETP',
 'FNRVGX',
 'GLIBA',
 'HDS',
 'HZD',
 'JPM$G',
 'LLL',
 'LPT',
 'MINI',
 'MNTA',
 'MYL',
 'MZOR',
 'NBL',
 'NLG',
 'PEGI',
 'PS',
 'QCHR',
 'RF-A',
 'RTN',
 'SKVKY',
 'SNE',
 'SPDR',
 'SPN',
 'SPYS',
 'SZEVY',
 'TF',
 'TIF',
 'TMK',
 'TMUSR',
 'UTX',
 'VIAB',
 'VMN',
 'VRTU',
 'VSLR',
 'WAIR',
 'WCG',
 'WLTL',
 'WMGI',
 'WPX']

In [12]:
# Katrina's Notes:
# 'AGN',	company purchased by Abbvie (ABBV) in May 2020
#  'AMTD',	company purchased by Charles Schwab (SCHW)
#  'AOBC',	AOUT
#  'APC',	
#  'APCD',	APCD.VI
#  'APPL',	AAPL
#  'AZSEY',	ALIZY
#  'BAC$K',	can't find this ticker: "Bank of America Corporation Depositary Shares"
#  'BAMXY',	BMWYY
#  'BBCBX',	BBCPX
#  'BGG',	
#  'BOA',	BAC
#  'BPHSPX',	can't find this ticker: "S&P 500 Bullish Percent Index"
#  'BRK.A',	BRK-A
#  'BRK.B',	BRK-B
#  'BXS$A',	BXS-PA
#  'CBS',	
#  'CCC',	CLVT
#  'CCXX',	
#  'CELG',	
#  'CELO',	Not sure about this one. Might be CELO-USD
#  'CHL',	Delisted because of Trump. Chinese company.
#  'CHV',	Not sure about this one. There are multiple Chevrons on yahoo finance.
#  'CTL',	LUMN
#  'CTRCF',	Don't think this is a US stock
#  'CWEN.A',	CWEN-A
#  'DCMYY',	Privatized
#  'DEACU',	
#  'DESY',	DSEY
#  'DNKN',	
#  'DWDP',	split into 3 companies
#  'EBJ',	ERJ
#  'EQUIX',	EFX
#  'ETFC',	
#  'ETP',	Not sure about this one. Multiple options for Energy Transfer Partners.
#  'EVGB',	EVBG
#  'FFHRX',	FFRHX
#  'FII',	FHI
#  'FMCMF',	FMS
#  'FNRVGX',	This trust was terminated
#  'FXM',	FMX
#  'GEAGY',	GEAGF
#  'GLIBA',	
#  'HCN',	WELL
#  'HDS',	
#  'HS',	HSIC
#  'HTZ',	HTZGQ
#  'HZD',	
#  'IDXXX',	IDXX
#  'INTL',	INTC
#  'JPM$G',	couldn't find "J P Morgan Chase & Co Depositary Shares" on yahoo fin
#  'LAM',	LRCX


# Amber's Notes:
#LBDAV -> LBRDA
#LCRX -> LRCX
#LEN.B -> LEN-B
#LLL : only in CNSX
#LPT (seems okay, not on Yahoo Finance though)
#LTD -> LB
#MINI : combined with Willscot (WSC)
#MNTA : sold to Johnson & Johnson
#MRCK -> MRK
#MSBHY -> MSBHF
#MYL : merged with Pfizer's Upjohn to form Viatris (VTRS)
#MZOR : acquired by Medtronic (MDT)
#NBL : acquired by Chevron
#NGLS -> NGL
#NLG : Nam Long Investment Corp (not in USD)
#ORLT -> ORLY
#PEGI : takeover by Canada Pension Plan Investment Board (CPPIB), now a private company
#POL -> AVNT (aka Avient Corp)
#QCHR : only in NASDAQ
#RBS -> NWG (formerly Royal Bank of Scotland, now NatWest)
#RDS.A -> RDS-A
#RDS.B -> RDS-B (based in UK, no tax-witholding penalty)
#RF$A -> RF-A
#RTN : only in JSE (Africa)
#SERV -> TMX (changed name to Terminix)
#SKVKY -> SEB-A.ST (currency in Swedish money)
#SPDR : is an ETF
#SPN : only in CVE (Canada)
#SPYS : SPY? is an ETF
#SVCBY -> SVCBF
#SZEVY : In France (SEV.PA)
#TDDC -> TDC
#TF : In Canada (TF.TO)
#TIF : purchased by Louis Vitton
#TMK : only in ASX (Australia)
#TMUSR : right to buy 0.05 shares of TMUS (T-mobile) at a discounted price
#URGO -> UGRO
#UTX : merged with Raytheon (RTX), plus spinoffs?
#VIAB : merged with CBS, shares converted to CBS
#VMN : could be VNM (Vietnam), or an ETF?
#VRTU : acquired and privatized by Baring Private Equity Asia (BPEA)
#VSLR : acquired by Sunrun Inc (RUN)
#WAIR : acquired and privatized by Platinum Equity affiliate ()
#WCG : only in ASX (Australia)
#WFC$V -> WFC
#WLTL : only in NASDAQ
#WMGI : only in NASDAQ
#WPX : merged with Devon Energy, 0.5165 shares of DVN for each WPX
#WYND -> TNL (aka Travel + Leisure Co.)
#XMO -> XOM
#ZOOM -> ZM

In [13]:
all_transactions

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd,cleaned_ticker
0,2020,04/28/2020,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,False,BHC
1,2020,04/28/2020,2020-03-06,joint,BAH,Booz Allen Hamilton Holding Corporation,sale_full,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,True,BAH
2,2020,04/28/2020,2020-03-23,joint,BYD,Boyd Gaming Corporation,sale_full,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,False,BYD
3,2020,04/28/2020,2020-02-04,joint,CTLT,"Catalent, Inc.",purchase,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,False,CTLT
4,2020,04/28/2020,2020-03-18,joint,CBRE,CBRE Group Inc Common Stock Class A,sale_partial,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,False,CBRE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10055,2020,06/10/2020,2020-04-09,--,SWK,"Stanley Black & Decker, Inc.",sale_partial,"$1,001 - $15,000",Hon. Ed Perlmutter,CO07,https://disclosures-clerk.house.gov/public_dis...,False,SWK
10056,2020,06/10/2020,2020-04-09,--,USB,U.S. Bancorp,sale_partial,"$1,001 - $15,000",Hon. Ed Perlmutter,CO07,https://disclosures-clerk.house.gov/public_dis...,False,USB
10057,2020,06/10/2020,2020-03-13,,BMY,Bristol-Myers Squibb Company,sale_full,"$100,001 - $250,000",Hon. Nicholas Van Taylor,TX03,https://disclosures-clerk.house.gov/public_dis...,False,BMY
10058,2020,06/10/2020,2020-03-13,,LLY,Eli Lilly and Company,sale_full,"$500,001 - $1,000,000",Hon. Nicholas Van Taylor,TX03,https://disclosures-clerk.house.gov/public_dis...,False,LLY


In [14]:
# get a filter for some of the historical data
ticker_to_historical_data = {ticker_table['ticker'].iloc[0]: ticker_table.sort_values('date')
                             for ticker_table in historical_data if not isinstance(ticker_table, tuple)}
successful_ticker_set = set(ticker_to_historical_data.keys())
transactions_with_stock_info = all_transactions[all_transactions['cleaned_ticker'].apply(lambda x: x in successful_ticker_set)]

string_to_date = lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')

transactions_with_stock_info['transaction_date'] = transactions_with_stock_info['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
transactions_with_stock_info['disclosure_date'] = transactions_with_stock_info['disclosure_date'].apply(lambda x: datetime.datetime.strptime(x, '%m/%d/%Y'))

print('number of transactions retained:', len(transactions_with_stock_info))
print('number of transactions total:', len(all_transactions))
print('ratio of transactions retained:', len(transactions_with_stock_info) / len(all_transactions))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions_with_stock_info['transaction_date'] = transactions_with_stock_info['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions_with_stock_info['disclosure_date'] = transactions_with_stock_info['disclosure_date'].apply(lambda x: datetime.datetime.strptime(x, '%m/%d/%Y'))


number of transactions retained: 9070
number of transactions total: 10060
ratio of transactions retained: 0.9015904572564613


In [15]:
transactions_with_stock_info

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd,cleaned_ticker
0,2020,2020-04-28,2020-02-24,joint,BHC,Bausch Health Companies Inc.,purchase,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,False,BHC
1,2020,2020-04-28,2020-03-06,joint,BAH,Booz Allen Hamilton Holding Corporation,sale_full,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,True,BAH
2,2020,2020-04-28,2020-03-23,joint,BYD,Boyd Gaming Corporation,sale_full,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,False,BYD
3,2020,2020-04-28,2020-02-04,joint,CTLT,"Catalent, Inc.",purchase,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,False,CTLT
4,2020,2020-04-28,2020-03-18,joint,CBRE,CBRE Group Inc Common Stock Class A,sale_partial,"$1,001 - $15,000",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,False,CBRE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10055,2020,2020-06-10,2020-04-09,--,SWK,"Stanley Black & Decker, Inc.",sale_partial,"$1,001 - $15,000",Hon. Ed Perlmutter,CO07,https://disclosures-clerk.house.gov/public_dis...,False,SWK
10056,2020,2020-06-10,2020-04-09,--,USB,U.S. Bancorp,sale_partial,"$1,001 - $15,000",Hon. Ed Perlmutter,CO07,https://disclosures-clerk.house.gov/public_dis...,False,USB
10057,2020,2020-06-10,2020-03-13,,BMY,Bristol-Myers Squibb Company,sale_full,"$100,001 - $250,000",Hon. Nicholas Van Taylor,TX03,https://disclosures-clerk.house.gov/public_dis...,False,BMY
10058,2020,2020-06-10,2020-03-13,,LLY,Eli Lilly and Company,sale_full,"$500,001 - $1,000,000",Hon. Nicholas Van Taylor,TX03,https://disclosures-clerk.house.gov/public_dis...,False,LLY


In [16]:
set(transactions_with_stock_info['type'])

{'exchange', 'purchase', 'sale_full', 'sale_partial'}

In [17]:
set(transactions_with_stock_info['amount'])

{'$1,000,001 - $5,000,000',
 '$1,001 -',
 '$1,001 - $15,000',
 '$100,001 - $250,000',
 '$15,001 - $50,000',
 '$250,001 - $500,000',
 '$5,000,001 - $25,000,000',
 '$50,000,000 +',
 '$50,001 - $100,000',
 '$500,001 - $1,000,000'}

In [18]:
transactions_with_stock_info[transactions_with_stock_info['type'] == 'exchange']['ptr_link'].values[1]

'https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/20018701.pdf'

In [19]:
transactions_with_stock_info[transactions_with_stock_info['type'] == 'exchange']

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd,cleaned_ticker
219,2021,2021-04-23,2021-03-16,,SON,Sonoco Products Company,exchange,"$1,001 - $15,000",Hon. Lois Frankel,FL21,https://disclosures-clerk.house.gov/public_dis...,False,SON
388,2021,2021-05-10,2021-04-21,joint,MRVL,Marvell Technology Inc,exchange,"$1,001 -",Hon. Susie Lee,NV03,https://disclosures-clerk.house.gov/public_dis...,False,MRVL
594,2020,2020-08-11,2019-07-01,self,LHX,"L3Harris Technologies, Inc.",exchange,"$1,001 - $15,000",Hon. Grace Meng,NY06,https://disclosures-clerk.house.gov/public_dis...,False,LHX
916,2020,2020-04-10,2020-03-02,--,IR,Ingersoll Rand Inc.,exchange,"$1,001 - $15,000",Hon. Dean Phillips,MN03,https://disclosures-clerk.house.gov/public_dis...,False,IR
1235,2020,2020-03-10,2020-02-04,self,PLD,"ProLogis, Inc.",exchange,"$1,001 - $15,000",Hon. Zoe Lofgren,CA19,https://disclosures-clerk.house.gov/public_dis...,False,PLD
1566,2021,2021-03-10,2021-03-01,,MS,Morgan Stanley,exchange,"$15,001 - $50,000",Hon. Debbie Dingell,MI12,https://disclosures-clerk.house.gov/public_dis...,False,MS
2212,2020,2020-04-16,2020-04-03,,CARR,Carrier Global Corporation,exchange,"$1,001 - $15,000",Hon. Debbie Dingell,MI12,https://disclosures-clerk.house.gov/public_dis...,False,CARR
2213,2020,2020-04-16,2020-04-03,,OTIS,Otis Worldwide Corporation,exchange,"$1,001 - $15,000",Hon. Debbie Dingell,MI12,https://disclosures-clerk.house.gov/public_dis...,False,OTIS
2474,2020,2020-12-15,2020-11-17,,VTRS,Viatris,exchange,"$1,001 - $15,000",Hon. Gilbert Cisneros,CA39,https://disclosures-clerk.house.gov/public_dis...,False,VTRS
2759,2020,2020-12-01,2020-11-17,--,VTRS,Viatris Inc.,exchange,"$1,001 - $15,000",Hon. Debbie Dingell,MI12,https://disclosures-clerk.house.gov/public_dis...,False,VTRS


In [20]:
%%time

# build the features matrix
feature_matrix = []
label_matrix = []

# super slow but I don't care for toy data
# returns the ratio of the two prices
def find_stock_price(ticker, date, days_future):
    historical_ticker_data = ticker_to_historical_data[ticker]
    current_stock_data = historical_ticker_data[historical_ticker_data['date'] >= date]
    future_stock_data = historical_ticker_data[historical_ticker_data['date'] >= date + datetime.timedelta(days=days_future)]
    if len(future_stock_data) == 0:
        return None
    
    current_price = current_stock_data.iloc[0]['close']
    future_price = future_stock_data.iloc[0]['close']
    return np.log(future_price / current_price)
    
    
# TODO: look up tensorflow/keras one-hot encoding scheme
transaction_types = set(transactions_with_stock_info['type'])
transaction_types_to_index = {transaction_type: i for i, transaction_type in enumerate(sorted(transaction_types))}
def get_one_hot_transaction_variable(transaction_type):
    feature_vector = [0 for i in range(len(transaction_types_to_index))]
    feature_vector[transaction_types_to_index[transaction_type]] = 1
    return feature_vector

amounts = set(transactions_with_stock_info['amount'])
amounts = sorted(amounts, key=lambda x: int(x.split('$')[1].split('-')[0].strip().replace(',', '').replace('+', '')))
amount_to_index = {amount: i for i, amount in enumerate(amounts)}

def get_one_hot_transaction_and_index_amount_variable(amount):
    feature_vector = [0 for i in range(len(amount_to_index))]
    feature_vector[amount_to_index[amount]] = 1
    return feature_vector

# slow but for now who cares?
for i, row in transactions_with_stock_info.iterrows():
    if i % 1000 == 0:
        print(f'processed {i} rows')
    feature_vector = []
    feature_vector.extend(get_one_hot_transaction_variable(row['type']))
    feature_vector.extend(get_one_hot_transaction_and_index_amount_variable(row['amount']))
    feature_matrix.append(feature_vector)
    
    label_vector = []
    label_vector.append(find_stock_price(row['cleaned_ticker'], row['disclosure_date'], 7))
    label_vector.append(find_stock_price(row['cleaned_ticker'], row['transaction_date'], 7))
    label_matrix.append(label_vector)

feature_matrix = np.array(feature_matrix)
label_matrix = np.array(label_matrix)


processed 0 rows
processed 1000 rows
processed 2000 rows
processed 3000 rows
processed 4000 rows
processed 5000 rows
processed 6000 rows
processed 8000 rows
processed 9000 rows
processed 10000 rows
Wall time: 1min 7s


In [21]:
# split the data, build the keras models
train_matrix, test_matrix, train_labels, test_labels = train_test_split(feature_matrix, label_matrix)

num_params = train_matrix.shape[1]
input_layer = keras.layers.Input(shape=(num_params,))
batch_norm = keras.layers.BatchNormalization()(input_layer)
prev_fc_layer = batch_norm
fc_layer = None
for i in range(4):
    fc_layer = keras.layers.Dense(num_params, activation='selu', name=f'fc_{i}')(prev_fc_layer)
    prev_fc_layer = fc_layer

output_layer = keras.layers.Dense(1, name=f'output')(prev_fc_layer)
    
disclosure_date_model = keras.Model(inputs=[input_layer], outputs=[output_layer], name='disclosure_model')
disclosure_date_model.compile(loss='mean_squared_error', optimizer="adam", metrics=["mean_squared_error"])

num_params = train_matrix.shape[1]
input_layer = keras.layers.Input(shape=(num_params,))
batch_norm = keras.layers.BatchNormalization()(input_layer)
prev_fc_layer = batch_norm
for i in range(4):
    fc_layer = keras.layers.Dense(num_params, activation='selu', name=f'fc_{i}')(prev_fc_layer)
    prev_fc_layer = fc_layer
    
output_layer = keras.layers.Dense(1, name=f'output')(prev_fc_layer)

transaction_date_model = keras.Model(inputs=[input_layer], outputs=[output_layer], name='transaction_model')
transaction_date_model.compile(loss='mean_squared_error', optimizer="adam", metrics=["mean_squared_error"])

In [22]:
%%time
# train the models

# have some mask for the nan values
disclosure_mask = [not np.isnan(el) for el in train_labels[:, 0]]
transaction_mask = [not np.isnan(el) for el in train_labels[:, 1]]

disclosure_date_model.fit(
    train_matrix[disclosure_mask],
    train_labels[disclosure_mask][:, 0].flatten(),
    batch_size=(1 << 6),
    epochs=200, validation_split=0.1, verbose=2)

transaction_date_model.fit(
    train_matrix[transaction_mask],
    train_labels[transaction_mask][:, 0].flatten(),
    batch_size=(1 << 6),
    epochs=200, validation_split=0.1, verbose=2)

Epoch 1/200
96/96 - 33s - loss: 0.7725 - mean_squared_error: 0.7725 - val_loss: 0.1280 - val_mean_squared_error: 0.1280
Epoch 2/200
96/96 - 0s - loss: 0.0244 - mean_squared_error: 0.0244 - val_loss: 0.0309 - val_mean_squared_error: 0.0309
Epoch 3/200
96/96 - 0s - loss: 0.0170 - mean_squared_error: 0.0170 - val_loss: 0.0103 - val_mean_squared_error: 0.0103
Epoch 4/200
96/96 - 0s - loss: 0.0134 - mean_squared_error: 0.0134 - val_loss: 0.0067 - val_mean_squared_error: 0.0067
Epoch 5/200
96/96 - 0s - loss: 0.0095 - mean_squared_error: 0.0095 - val_loss: 0.0063 - val_mean_squared_error: 0.0063
Epoch 6/200
96/96 - 0s - loss: 0.0090 - mean_squared_error: 0.0090 - val_loss: 0.0070 - val_mean_squared_error: 0.0070
Epoch 7/200
96/96 - 0s - loss: 0.0083 - mean_squared_error: 0.0083 - val_loss: 0.0066 - val_mean_squared_error: 0.0066
Epoch 8/200
96/96 - 0s - loss: 0.0079 - mean_squared_error: 0.0079 - val_loss: 0.0060 - val_mean_squared_error: 0.0060
Epoch 9/200
96/96 - 0s - loss: 0.0070 - mean_sq

Epoch 70/200
96/96 - 0s - loss: 0.0052 - mean_squared_error: 0.0052 - val_loss: 0.0058 - val_mean_squared_error: 0.0058
Epoch 71/200
96/96 - 0s - loss: 0.0053 - mean_squared_error: 0.0053 - val_loss: 0.0055 - val_mean_squared_error: 0.0055
Epoch 72/200
96/96 - 0s - loss: 0.0052 - mean_squared_error: 0.0052 - val_loss: 0.0052 - val_mean_squared_error: 0.0052
Epoch 73/200
96/96 - 0s - loss: 0.0052 - mean_squared_error: 0.0052 - val_loss: 0.0053 - val_mean_squared_error: 0.0053
Epoch 74/200
96/96 - 0s - loss: 0.0052 - mean_squared_error: 0.0052 - val_loss: 0.0054 - val_mean_squared_error: 0.0054
Epoch 75/200
96/96 - 0s - loss: 0.0052 - mean_squared_error: 0.0052 - val_loss: 0.0056 - val_mean_squared_error: 0.0056
Epoch 76/200
96/96 - 0s - loss: 0.0052 - mean_squared_error: 0.0052 - val_loss: 0.0052 - val_mean_squared_error: 0.0052
Epoch 77/200
96/96 - 0s - loss: 0.0052 - mean_squared_error: 0.0052 - val_loss: 0.0057 - val_mean_squared_error: 0.0057
Epoch 78/200
96/96 - 0s - loss: 0.0052 -

Epoch 138/200
96/96 - 0s - loss: 0.0051 - mean_squared_error: 0.0051 - val_loss: 0.0053 - val_mean_squared_error: 0.0053
Epoch 139/200
96/96 - 0s - loss: 0.0050 - mean_squared_error: 0.0050 - val_loss: 0.0052 - val_mean_squared_error: 0.0052
Epoch 140/200
96/96 - 0s - loss: 0.0051 - mean_squared_error: 0.0051 - val_loss: 0.0053 - val_mean_squared_error: 0.0053
Epoch 141/200
96/96 - 0s - loss: 0.0051 - mean_squared_error: 0.0051 - val_loss: 0.0053 - val_mean_squared_error: 0.0053
Epoch 142/200
96/96 - 0s - loss: 0.0051 - mean_squared_error: 0.0051 - val_loss: 0.0052 - val_mean_squared_error: 0.0052
Epoch 143/200
96/96 - 0s - loss: 0.0051 - mean_squared_error: 0.0051 - val_loss: 0.0052 - val_mean_squared_error: 0.0052
Epoch 144/200
96/96 - 0s - loss: 0.0051 - mean_squared_error: 0.0051 - val_loss: 0.0053 - val_mean_squared_error: 0.0053
Epoch 145/200
96/96 - 0s - loss: 0.0051 - mean_squared_error: 0.0051 - val_loss: 0.0053 - val_mean_squared_error: 0.0053
Epoch 146/200
96/96 - 0s - loss:

Epoch 7/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 8/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 9/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 10/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 11/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 12/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 13/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 14/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 15/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 16/200
96/96 - 0s - loss:

Epoch 83/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 84/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 85/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 86/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 87/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 88/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 89/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 90/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 91/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 92/200
96/96 - 0s - lo

Epoch 159/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 160/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 161/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 162/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 163/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 164/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 165/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 166/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 167/200
96/96 - 0s - loss: nan - mean_squared_error: nan - val_loss: nan - val_mean_squared_error: nan
Epoch 168/200
96/96

<keras.callbacks.History at 0x18331ec1340>

In [23]:
# use the test set to get some idea of the error distribution
test_disclosure_mask = [not np.isnan(el) for el in test_labels[:, 0]]
test_transaction_mask = [not np.isnan(el) for el in test_labels[:, 1]]

test_disclosure_scores = disclosure_date_model.predict(test_matrix[test_disclosure_mask]).flatten()
test_transaction_scores = disclosure_date_model.predict(test_matrix[test_transaction_mask]).flatten()

test_disclosure_labels = test_labels[test_disclosure_mask, 0]
test_transaction_labels = test_labels[test_disclosure_mask, 1]

test_disclosure_errors = test_disclosure_scores - test_disclosure_labels
test_transaction_errors = test_transaction_scores - test_transaction_labels


ValueError: operands could not be broadcast together with shapes (2266,) (2267,) 

In [24]:
disclosure_base_error = np.var(test_disclosure_labels)
disclosure_model_error = np.sum(test_disclosure_errors ** 2) / np.shape(test_disclosure_errors)[0]
# disclosure_model_error = np.var(test_disclosure_errors)

print('disclosure model')
print(np.mean(test_disclosure_labels))
print(
    disclosure_base_error,
    disclosure_model_error,
    disclosure_model_error / disclosure_base_error)

transaction_base_error = np.var(test_transaction_labels)
transaction_model_error = np.sum(test_transaction_errors ** 2) / np.shape(test_transaction_errors)[0]
# transaction_model_error = np.var(test_transaction_errors)

print('transaction model')
print(np.mean(test_transaction_labels))
print(
    transaction_base_error,
    transaction_model_error,
    transaction_model_error / transaction_base_error)


disclosure model
0.0027694127967911957
0.004999244210615829 0.005041954528043004 1.0085433548808198


NameError: name 'test_transaction_errors' is not defined