# Purpose

The purpose of this notebook is to illustrate how the data from `data.app_data.DataAlpacaPocCat` can be used for training a classification model.

In [31]:
import os
import sys
import pandas as pd
import numpy as np
import datetime
import pymongo
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from supervised.automl import AutoML

sys.path.append("../src/")
from tradingdmp.data.clf.price_perc_chg.app_data import DataAlpacaPocCat
from tradingdmp.model.clf.app_model import MljarAutoMl

In [2]:
pd.set_option('display.max_columns', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
mongodbkey = "" # PUT YOUR KEY HERE

# Functions

In [4]:
# Define function for evaluation
def evaluate(y_te, pred, labels, return_data = False):
    
    cm = confusion_matrix(y_te, pred, labels = labels)

    print("\nConfusion matrix (absolute):")
    cm_abs = pd.DataFrame(data = cm, index = labels, columns = labels)
    cm_abs = pd.concat([cm_abs], keys=['True'], names=[''], axis = 0)
    cm_abs = pd.concat([cm_abs], keys=['Predicted'], names=[''], axis = 1)
    display(cm_abs)

    print("\nConfusion matrix (relative):")
    cm_rel = pd.DataFrame(data = cm / cm.sum(), index = labels, columns = labels)
    cm_rel = pd.concat([cm_rel], keys=['True'], names=[''], axis = 0)
    cm_rel = pd.concat([cm_rel], keys=['Predicted'], names=[''], axis = 1)
    cm_rel = round(cm_rel * 100, 2)
    display(cm_rel)

    print("\nClassification report:\n")
    class_report = classification_report(y_true = y_te, y_pred = pred)
    print(class_report)

    acc = round(accuracy_score(y_true = y_te, y_pred = pred), 4)
    print("Overall accuracy: {}".format(acc))
    f1 = round(f1_score(y_true = y_te, y_pred = pred, average='weighted'), 4)
    print("Overall f1_score: {}".format(f1))
    
    if return_data == True:
        return({'cm_rel': cm_rel, 'cm_abs': cm_abs, 'class_report': class_report, 'acc': acc, 'f1': f1})

# Get Data

In [5]:
# Prepare arguments describing which data we want to get
adata = DataAlpacaPocCat(mongodbkey)
ticker_list = ['AAP', 'ABC', 'ACI', 'ACM', 'ADNT', 'AE', 'AFG', 'AGX', 'AIM', 'AIT', 'ALC', 'ALK', 'ALL', 'AMBA', 'AMCI', 'AMED', 'AMRC', 'AN', 'ANDE', 'ARD', 'ARLP', 'ARTNA', 'ASGN', 'ATEC', 'AX', 'AXR', 'AXTI', 'BBI', 'BBW', 'BBY', 'BCEI', 'BECN', 'BIOL', 'BKD', 'BL', 'BLMN', 'BRY', 'BTU', 'BWA', 'BXS', 'BZH', 'CAH', 'CBPO', 'CC', 'CGC', 'CHCI', 'CHE', 'CHEF', 'CHRS', 'CHTR', 'CINF', 'CJJD', 'CLR', 'CMLS', 'COG', 'CONE', 'CONN', 'COST', 'CPRT', 'CRAI', 'CREE', 'CRY', 'CSPI', 'CTO', 'CTSO', 'CVX', 'CWST', 'CXDC', 'CXO', 'CXP', 'CYH', 'DHI', 'DKS', 'DPZ', 'DVA', 'DY', 'EA', 'EDUC', 'EGP', 'EGY', 'EIGI', 'ENTX', 'ENZ', 'EOG', 'EPD', 'EQIX', 'EQT', 'ESI', 'ESS', 'ET', 'EVC', 'EWBC', 'FAST', 'FBHS', 'FCX', 'FE', 'FIX', 'FL', 'FLIR', 'FLR', 'FMC', 'FNCB', 'FSFG', 'FSM', 'FSP', 'FSTR', 'FVE', 'GFN', 'GLDD', 'GOGO', 'GPC', 'GPI', 'GPS', 'GTN', 'GV', 'GVA', 'GWW', 'HA', 'HAL', 'HBCP', 'HBP', 'HIW', 'HL', 'HMSY', 'HOV', 'HTLD', 'HTLF', 'HUBG', 'HWCC', 'HWKN', 'ICFI', 'IDRA', 'INCY', 'INOD', 'INSP', 'INT', 'JAX', 'JILL', 'JJSF', 'KBH', 'KBR', 'KEYS', 'KFY', 'KINS', 'KMX', 'KOPN', 'KSU', 'LEN', 'LGIH', 'LHCG', 'LII', 'LKQ', 'LL', 'LNT', 'LOPE', 'LOV', 'LSTR', 'MAG', 'MAT', 'MCF', 'MCK', 'MDC', 'MDU', 'MELI', 'MESA', 'MESO', 'MGI', 'MHO', 'MLM', 'MMC', 'MMP', 'MNR', 'MPLX', 'MPW', 'MPWR', 'MRO', 'MRTN', 'MSM', 'MTH', 'MTRX', 'MTZ', 'MVO', 'MYGN', 'NATH', 'NDAQ', 'NEM', 'NEP', 'NG', 'NGL', 'NLY', 'NOC', 'NOW', 'NS', 'NSIT', 'NSYS', 'NUS', 'NVR', 'NWHM', 'OGE', 'OLED', 'OMEX', 'OMI', 'OPHC', 'ORLY', 'ORN', 'OSTK', 'OXY', 'PAA', 'PBI', 'PCYG', 'PDCE', 'PEBO', 'PEN', 'PETS', 'PFGC', 'PHM', 'PKI', 'PLPC', 'PLUS', 'PLX', 'PNC', 'POOL', 'POR', 'PRAA', 'PRMW', 'PSMT', 'PSXP', 'PVG', 'PWR', 'PXD', 'QCRH', 'QEP', 'QNST', 'RDI', 'RE', 'RH', 'RLGT', 'RMG', 'RNET', 'ROCK', 'ROST', 'RRC', 'RRD', 'RRR', 'RS', 'SABR', 'SAFM', 'SAIC', 'SALM', 'SANW', 'SCCO', 'SCHN', 'SD', 'SEB', 'SFST', 'SHIP', 'SIG', 'SIGI', 'SIRI', 'SIVB', 'SKYW', 'SLB', 'SM', 'SNPS', 'SNX', 'SO', 'SONO', 'SPTN', 'SR', 'SRLP', 'SSD', 'STKS', 'STRL', 'STT', 'STX', 'SWTX', 'SYK', 'SYY', 'TA', 'TAIT', 'TBBK', 'TBNK', 'TC', 'TDS', 'TEAM', 'TECH', 'TEL', 'TESS', 'TIF', 'TITN', 'TOL', 'TPH', 'TSCO', 'TTI', 'TXN', 'UBA', 'USEG', 'UTSI', 'UVE', 'VEEV', 'VUZI', 'WEYS', 'WMC', 'WSO', 'AAPL', 'ADBE', 'ADI', 'ADP', 'ADSK', 'ALGN', 'ALXN', 'AMAT', 'AMD', 'AMGN', 'AMZN', 'ANSS', 'ASML', 'ATVI', 'AVGO', 'BIDU', 'BIIB', 'BKNG', 'BMRN', 'CDNS', 'CDW', 'CERN', 'CHKP', 'CMCSA', 'CSCO', 'CSX', 'CTAS', 'CTSH', 'CTXS', 'DLTR', 'DOCU', 'DXCM', 'EBAY', 'EXC', 'EXPE', 'FB', 'FISV', 'FOX', 'GILD', 'GOOG', 'IDXX', 'ILMN', 'INTC', 'INTU', 'ISRG', 'JD', 'KDP', 'KHC', 'KLAC', 'LBTYA', 'LRCX', 'LULU', 'MAR', 'MCHP', 'MDLZ', 'MNST', 'MRNA', 'MSFT', 'MU', 'MXIM', 'NFLX', 'NTES', 'NVDA', 'NXPI', 'PAYX', 'PCAR', 'PDD', 'PEP', 'PYPL', 'QCOM', 'REGN', 'SBUX', 'SGEN', 'SPLK', 'SWKS', 'TCOM', 'TMUS', 'TSLA', 'TTWO', 'ULTA', 'VRSK', 'VRSN', 'VRTX', 'WBA', 'WDAY', 'XEL', 'XLNX', 'ZM']
dt_start = datetime.datetime.strptime("2020-06-01", "%Y-%m-%d")
dt_end = datetime.datetime.strptime("2021-03-12", "%Y-%m-%d")

In [6]:
%%time
# Get data from our mongodb
df_x, df_y = adata.get_data(ticker_list, dt_start, dt_end, 
                            dt_end_required = True,
                            n_ppc_per_row = 10,
                            return_last_date_only = False,
                            return_date_col = True,
                            return_training_dfs = True)

CPU times: user 2min 37s, sys: 3.8 s, total: 2min 41s
Wall time: 3min 58s


In [7]:
# Take a glimpse at the data
print("Shape df_x: {}".format(df_x.shape))
display(df_x.head())

print("Shape df_y: {}".format(df_y.shape))
display(df_y.head())

Shape df_x: (30766, 185)


Unnamed: 0,date,av_open,av_close,av_high,av_low,av_volume,av_adjusted_close,av_dividend_amount,av_split_coefficient,yh_regularMarketChange,yh_regularMarketChangePercent,yh_regularMarketPrice,yh_regularMarketDayHigh,yh_regularMarketDayLow,yh_regularMarketVolume,yh_regularMarketPreviousClose,yh_bid,yh_ask,yh_bidSize,yh_askSize,yh_regularMarketOpen,yh_averageDailyVolume3Month,yh_averageDailyVolume10Day,yh_fiftyTwoWeekLowChange,yh_fiftyTwoWeekLowChangePercent,yh_fiftyTwoWeekHighChange,yh_fiftyTwoWeekHighChangePercent,yh_fiftyTwoWeekLow,yh_fiftyTwoWeekHigh,yh_sharesOutstanding,yh_fiftyDayAverage,yh_fiftyDayAverageChange,yh_fiftyDayAverageChangePercent,yh_twoHundredDayAverage,yh_twoHundredDayAverageChange,yh_twoHundredDayAverageChangePercent,yh_marketCap,yh_price,fv_Sector,fv_Industry,fv_PE,fv_EPSttm,fv_InsiderOwn,fv_ShsOutstand,fv_PerfWeek,fv_MarketCap,fv_ForwardPE,fv_EPSnextY,fv_InsiderTrans,fv_ShsFloat,fv_PerfMonth,fv_Income,fv_PEG,fv_EPSnextQ,fv_InstOwn,fv_ShortFloat,fv_PerfQuarter,fv_Sales,fv_PS,fv_EPSthisY,fv_ShortRatio,fv_PerfHalfY,fv_Booksh,fv_PB,fv_ROA,fv_TargetPrice,fv_PerfYear,fv_Cashsh,fv_PC,fv_EPSnext5Y,fv_ROE,fv_PerfYTD,fv_Dividend,fv_PFCF,fv_EPSpast5Y,fv_ROI,fv_52WHigh,fv_Beta,fv_QuickRatio,fv_Salespast5Y,fv_GrossMargin,fv_52WLow,fv_ATR,fv_Employees,fv_CurrentRatio,fv_SalesQQ,fv_OperMargin,fv_RSI14,fv_Optionable,fv_DebtEq,fv_EPSQQ,fv_ProfitMargin,fv_RelVolume,fv_PrevClose,fv_Shortable,fv_LTDebtEq,fv_Payout,fv_AvgVolume,fv_Price,fv_Recom,fv_SMA20,fv_SMA50,fv_SMA200,fv_Volume,fv_Change,av_open_0,av_close_0,av_high_0,av_low_0,av_volume_0,av_adjusted_close_0,av_dividend_amount_0,av_split_coefficient_0,av_open_1,av_close_1,av_high_1,av_low_1,av_volume_1,av_adjusted_close_1,av_dividend_amount_1,av_split_coefficient_1,av_open_2,av_close_2,av_high_2,av_low_2,av_volume_2,av_adjusted_close_2,av_dividend_amount_2,av_split_coefficient_2,av_open_3,av_close_3,av_high_3,av_low_3,av_volume_3,av_adjusted_close_3,av_dividend_amount_3,av_split_coefficient_3,av_open_4,av_close_4,av_high_4,av_low_4,av_volume_4,av_adjusted_close_4,av_dividend_amount_4,av_split_coefficient_4,av_open_5,av_close_5,av_high_5,av_low_5,av_volume_5,av_adjusted_close_5,av_dividend_amount_5,av_split_coefficient_5,av_open_6,av_close_6,av_high_6,av_low_6,av_volume_6,av_adjusted_close_6,av_dividend_amount_6,av_split_coefficient_6,av_open_7,av_close_7,av_high_7,av_low_7,av_volume_7,av_adjusted_close_7,av_dividend_amount_7,av_split_coefficient_7,av_open_8,av_close_8,av_high_8,av_low_8,av_volume_8,av_adjusted_close_8,av_dividend_amount_8,av_split_coefficient_8,av_open_9,av_close_9,av_high_9,av_low_9,av_volume_9,av_adjusted_close_9,av_dividend_amount_9,av_split_coefficient_9
0,2020-11-03,153.48,157.43,158.12,152.68,826738.0,157.43,0.0,1.0,6.049988,3.996557,157.43,158.12,152.68,826650.0,151.38,158.01,160.91,9.0,8.0,153.48,770232.0,772550.0,86.09999,1.207066,-14.0,-0.081666,71.33,171.43,69138800.0,153.11166,4.318329,0.028204,144.31396,13.116028,0.090885,10884520000.0,157.43,Consumer Cyclical,Specialty Retail,22.68,6.5,0.4,69120000.0,-4.81,10070000000.0,16.07,10.69,2.43,68890000.0,-4.89,453100000.0,1.91,2.64,0.0,3.32,-3.45,9620000000.0,1.05,21.1,2.95,22.62,53.92,2.73,3.9,174.79,-9.35,16.7,8.82,11.9,12.7,-8.04,0.68,20.93,0.4,12.3,-14.09,1.41,0.5,-0.3,43.8,106.48,3.68,39000.0,1.4,7.3,6.6,36.51,Yes,0.0,58.1,4.7,1.05,146.0,Yes,0.33,10.4,775580.0,147.28,2.3,-4.4,-4.36,7.57,811640.0,0.88,-0.028953,0.010634,-0.014148,0.001437,0.273049,0.010634,0.0,0.0,0.014008,0.005421,-3e-06,-0.001081,-0.620516,0.005421,0.0,0.0,0.004943,-0.014842,0.000505,-0.00029,1.077878,-0.014842,0.0,0.0,-0.002649,0.008112,0.003784,-0.001806,-0.307744,0.008112,0.0,0.0,-0.012962,-0.021267,-0.011465,-0.01305,0.037908,-0.021267,0.0,0.0,-0.017231,0.009592,-0.013918,-0.00648,-0.099791,0.009592,0.0,0.0,-0.050124,-0.05636,-0.04927,-0.049743,0.307998,-0.05636,0.0,0.0,-0.005901,0.008767,-0.00078,0.002565,0.113691,0.008767,0.0,0.0,0.031891,0.027838,0.034361,0.028769,0.239062,0.027838,0.0,0.0,0.026691,0.039966,0.037056,0.026351,-0.157326,0.039966,0.0,0.0
1,2020-11-04,158.37,155.53,159.55,154.86,761328.0,155.53,0.0,1.0,-1.899994,-1.206882,155.53,159.55,154.86,761268.0,157.43,155.68,156.13,9.0,8.0,158.37,775606.0,783271.0,84.2,1.180429,-15.899994,-0.092749,71.33,171.43,69138800.0,153.115,2.414993,0.015772,144.59145,10.938553,0.075651,10753160000.0,155.53,Consumer Cyclical,Specialty Retail,22.68,6.5,0.4,69120000.0,-4.81,10070000000.0,16.07,10.69,2.43,68890000.0,-4.89,453100000.0,1.91,2.64,0.0,3.32,-3.45,9620000000.0,1.05,21.1,2.95,22.62,53.92,2.73,3.9,174.79,-9.35,16.7,8.82,11.9,12.7,-8.04,0.68,20.93,0.4,12.3,-14.09,1.41,0.5,-0.3,43.8,106.48,3.68,39000.0,1.4,7.3,6.6,36.51,Yes,0.0,58.1,4.7,1.05,146.0,Yes,0.33,10.4,775580.0,147.28,2.3,-4.4,-4.36,7.57,811640.0,0.88,0.014008,0.005421,-3e-06,-0.001081,-0.620516,0.005421,0.0,0.0,0.004943,-0.014842,0.000505,-0.00029,1.077878,-0.014842,0.0,0.0,-0.002649,0.008112,0.003784,-0.001806,-0.307744,0.008112,0.0,0.0,-0.012962,-0.021267,-0.011465,-0.01305,0.037908,-0.021267,0.0,0.0,-0.017231,0.009592,-0.013918,-0.00648,-0.099791,0.009592,0.0,0.0,-0.050124,-0.05636,-0.04927,-0.049743,0.307998,-0.05636,0.0,0.0,-0.005901,0.008767,-0.00078,0.002565,0.113691,0.008767,0.0,0.0,0.031891,0.027838,0.034361,0.028769,0.239062,0.027838,0.0,0.0,0.026691,0.039966,0.037056,0.026351,-0.157326,0.039966,0.0,0.0,0.031861,-0.012069,0.009044,0.014278,-0.079118,-0.012069,0.0,0.0
2,2020-11-05,157.78,156.43,158.84,155.28,524181.0,156.43,0.0,1.0,0.899994,0.578663,156.43,158.84,155.28,524181.0,155.53,149.44,160.88,9.0,8.0,157.78,781635.0,784350.0,85.09999,1.193046,-15.0,-0.087499,71.33,171.43,69138800.0,153.0775,3.352493,0.021901,144.66957,11.760422,0.081292,10815380000.0,156.43,Consumer Cyclical,Specialty Retail,22.68,6.5,0.4,69120000.0,-4.81,10070000000.0,16.07,10.69,2.43,68890000.0,-4.89,453100000.0,1.91,2.64,0.0,3.32,-3.45,9620000000.0,1.05,21.1,2.95,22.62,53.92,2.73,3.9,174.79,-9.35,16.7,8.82,11.9,12.7,-8.04,0.68,20.93,0.4,12.3,-14.09,1.41,0.5,-0.3,43.8,106.48,3.68,39000.0,1.4,7.3,6.6,36.51,Yes,0.0,58.1,4.7,1.05,146.0,Yes,0.33,10.4,775580.0,147.28,2.3,-4.4,-4.36,7.57,811640.0,0.88,0.004943,-0.014842,0.000505,-0.00029,1.077878,-0.014842,0.0,0.0,-0.002649,0.008112,0.003784,-0.001806,-0.307744,0.008112,0.0,0.0,-0.012962,-0.021267,-0.011465,-0.01305,0.037908,-0.021267,0.0,0.0,-0.017231,0.009592,-0.013918,-0.00648,-0.099791,0.009592,0.0,0.0,-0.050124,-0.05636,-0.04927,-0.049743,0.307998,-0.05636,0.0,0.0,-0.005901,0.008767,-0.00078,0.002565,0.113691,0.008767,0.0,0.0,0.031891,0.027838,0.034361,0.028769,0.239062,0.027838,0.0,0.0,0.026691,0.039966,0.037056,0.026351,-0.157326,0.039966,0.0,0.0,0.031861,-0.012069,0.009044,0.014278,-0.079118,-0.012069,0.0,0.0,-0.003725,0.005787,-0.00445,0.002712,-0.311491,0.005787,0.0,0.0
3,2020-11-06,156.05,156.38,157.54,154.315,700283.0,156.38,0.0,1.0,-0.049988,-0.031955,156.38,157.54,154.315,700099.0,156.43,156.91,156.97,9.0,8.0,156.05,776827.0,775000.0,85.05,1.192346,-15.049988,-0.087791,71.33,171.43,69138800.0,153.12723,3.252777,0.021242,144.75298,11.627029,0.080323,10811930000.0,156.38,Consumer Cyclical,Specialty Retail,22.68,6.5,0.4,69120000.0,-4.81,10070000000.0,16.07,10.69,2.43,68890000.0,-4.89,453100000.0,1.91,2.64,0.0,3.32,-3.45,9620000000.0,1.05,21.1,2.95,22.62,53.92,2.73,3.9,174.79,-9.35,16.7,8.82,11.9,12.7,-8.04,0.68,20.93,0.4,12.3,-14.09,1.41,0.5,-0.3,43.8,106.48,3.68,39000.0,1.4,7.3,6.6,36.51,Yes,0.0,58.1,4.7,1.05,146.0,Yes,0.33,10.4,775580.0,147.28,2.3,-4.4,-4.36,7.57,811640.0,0.88,-0.002649,0.008112,0.003784,-0.001806,-0.307744,0.008112,0.0,0.0,-0.012962,-0.021267,-0.011465,-0.01305,0.037908,-0.021267,0.0,0.0,-0.017231,0.009592,-0.013918,-0.00648,-0.099791,0.009592,0.0,0.0,-0.050124,-0.05636,-0.04927,-0.049743,0.307998,-0.05636,0.0,0.0,-0.005901,0.008767,-0.00078,0.002565,0.113691,0.008767,0.0,0.0,0.031891,0.027838,0.034361,0.028769,0.239062,0.027838,0.0,0.0,0.026691,0.039966,0.037056,0.026351,-0.157326,0.039966,0.0,0.0,0.031861,-0.012069,0.009044,0.014278,-0.079118,-0.012069,0.0,0.0,-0.003725,0.005787,-0.00445,0.002712,-0.311491,0.005787,0.0,0.0,-0.010965,-0.00032,-0.008184,-0.006215,0.335956,-0.00032,0.0,0.0
4,2020-11-09,161.59,156.63,164.08,156.33,1877573.0,156.63,0.0,1.0,1.87999,1.202193,158.26,164.08,156.5624,644297.0,156.38,157.11,157.33,9.0,8.0,161.59,771843.0,790283.0,86.92999,1.218702,-13.169998,-0.076824,71.33,171.43,69138800.0,153.15201,5.107986,0.033352,145.50087,12.759125,0.087691,10941910000.0,158.26,Consumer Cyclical,Specialty Retail,24.08,6.5,0.4,69120000.0,6.18,10820000000.0,17.05,10.33,2.4,68890000.0,-0.81,453100000.0,2.02,2.66,0.0,3.45,2.0,9620000000.0,1.12,21.1,3.04,24.17,53.92,2.9,3.9,170.88,-7.62,16.51,9.47,11.9,12.7,-2.36,0.64,22.48,0.4,12.3,-8.14,1.38,0.5,-0.3,43.8,120.78,3.89,39000.0,1.4,7.3,6.6,57.42,Yes,0.0,58.1,4.7,2.0,156.38,Yes,0.33,10.4,781900.0,157.48,2.3,2.61,2.44,14.88,623279.0,0.7,-0.012962,-0.021267,-0.011465,-0.01305,0.037908,-0.021267,0.0,0.0,-0.017231,0.009592,-0.013918,-0.00648,-0.099791,0.009592,0.0,0.0,-0.050124,-0.05636,-0.04927,-0.049743,0.307998,-0.05636,0.0,0.0,-0.005901,0.008767,-0.00078,0.002565,0.113691,0.008767,0.0,0.0,0.031891,0.027838,0.034361,0.028769,0.239062,0.027838,0.0,0.0,0.026691,0.039966,0.037056,0.026351,-0.157326,0.039966,0.0,0.0,0.031861,-0.012069,0.009044,0.014278,-0.079118,-0.012069,0.0,0.0,-0.003725,0.005787,-0.00445,0.002712,-0.311491,0.005787,0.0,0.0,-0.010965,-0.00032,-0.008184,-0.006215,0.335956,-0.00032,0.0,0.0,0.035501,0.001599,0.041513,0.013058,1.681163,0.001599,0.0,0.0


Shape df_y: (30766, 1)


Unnamed: 0,y
0,sm_dec
1,no_chg
2,no_chg
3,no_chg
4,no_chg


# Split Data

We split in training, validation and test set - by `date`. Afterwards, we remove the `date` column since it's only meant for splitting and not for training.

In [8]:
# Define percentages
perc_tr = 0.7
perc_va = 0.15
perc_te = 1 - perc_tr - perc_va

In [9]:
# Bring together df_x and df_y
df_all = pd.merge(df_x, df_y, left_index=True, right_index=True)

In [10]:
# Sort by date
df_all = df_all.sort_values("date").reset_index(drop = True)

In [11]:
# Get the dates, where validation and test data start respectively
N = len(df_all)
start_idx_va = round(N * perc_tr)
start_idx_te = round(N * (perc_tr + perc_va))
start_date_va = df_all.loc[start_idx_va, "date"]
start_date_te = df_all.loc[start_idx_te, "date"]

In [12]:
# Split into training, test and validation
df_tr = df_all.loc[df_all.date < start_date_va, :]
df_va = df_all.loc[(df_all.date >= start_date_va) & (df_all.date < start_date_te) , :]
df_te = df_all.loc[df_all.date > start_date_te, :]

In [13]:
# Drop date column
df_tr = df_tr.drop(columns = ["date"])
df_va = df_va.drop(columns = ["date"])
df_te = df_te.drop(columns = ["date"])

In [14]:
# Split each into x and y
x_tr = df_tr.drop(columns = ["y"])
y_tr = df_tr.loc[:, ["y"]].to_numpy().flatten()
x_va = df_va.drop(columns = ["y"])
y_va = df_va.loc[:, ["y"]].to_numpy().flatten()
x_te = df_te.drop(columns = ["y"])
y_te = df_te.loc[:, ["y"]].to_numpy().flatten()

In [15]:
# Check percentages
print("Train: {}".format(round(len(x_tr) / N, 4)))
print("Valid: {}".format(round(len(x_va) / N, 4)))
print("Test: {}".format(round(len(x_te) / N, 4)))

Train: 0.6989
Valid: 0.1506
Test: 0.138


# Fit Models

## a) Baselines

**Majority Classifier**

In [16]:
%%time
model_majority = DummyClassifier(strategy = "most_frequent")
model_majority.fit(x_tr, y_tr)

CPU times: user 9.37 ms, sys: 0 ns, total: 9.37 ms
Wall time: 9.09 ms


DummyClassifier(strategy='most_frequent')

**Logistic Regression**

In [17]:
%%time
model_logistic = Pipeline(steps=[('OneHotEncoder', OneHotEncoder(handle_unknown = 'ignore')),
                                 ('LogisticRegression', LogisticRegression())])
model_logistic.fit(x_tr, y_tr)

CPU times: user 7min 13s, sys: 5min 4s, total: 12min 17s
Wall time: 2min 37s


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('OneHotEncoder', OneHotEncoder(handle_unknown='ignore')),
                ('LogisticRegression', LogisticRegression())])

## b) AutoML

In [18]:
%%time
automl = MljarAutoMl()
automl.fit(x_tr, y_tr)

Linear algorithm was disabled.
AutoML directory: AutoML_1
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble availabe models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree logloss 1.475456 trained in 1.29 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree logloss 1.487704 trained in 11.14 seconds
2_DecisionTree logloss 1.484089 trained in 15

# Evaluate Models

Note: We're only evaluating on validation data (and not test data) here because this notebook does not contain any "final" models to be published. Let's keep the test data for fitting and evaluating the final model that we will in the end deploy with mlflow (in a different notebook, similar to this one).

In [19]:
labels = ["lg_dec", "sm_dec", "no_chg", "sm_inc", "lg_inc"]

## a) Baselines

**Majority Classifier**

In [20]:
# Make Predictions
pred = model_majority.predict(x_va)

In [21]:
# Get evaluation metrics
evaluate(y_va, pred, labels)


Confusion matrix (absolute):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted,Predicted,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,lg_dec,sm_dec,no_chg,sm_inc,lg_inc
,,,,,,
True,lg_dec,0.0,0.0,414.0,0.0,0.0
True,sm_dec,0.0,0.0,881.0,0.0,0.0
True,no_chg,0.0,0.0,1767.0,0.0,0.0
True,sm_inc,0.0,0.0,951.0,0.0,0.0
True,lg_inc,0.0,0.0,619.0,0.0,0.0



Confusion matrix (relative):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted,Predicted,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,lg_dec,sm_dec,no_chg,sm_inc,lg_inc
,,,,,,
True,lg_dec,0.0,0.0,8.94,0.0,0.0
True,sm_dec,0.0,0.0,19.02,0.0,0.0
True,no_chg,0.0,0.0,38.15,0.0,0.0
True,sm_inc,0.0,0.0,20.53,0.0,0.0
True,lg_inc,0.0,0.0,13.36,0.0,0.0



Classification report:

              precision    recall  f1-score   support

      lg_dec       0.00      0.00      0.00       414
      lg_inc       0.00      0.00      0.00       619
      no_chg       0.38      1.00      0.55      1767
      sm_dec       0.00      0.00      0.00       881
      sm_inc       0.00      0.00      0.00       951

    accuracy                           0.38      4632
   macro avg       0.08      0.20      0.11      4632
weighted avg       0.15      0.38      0.21      4632

Overall accuracy: 0.3815
Overall f1_score: 0.2107


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


**Logistic Regression**

In [22]:
# Make Predictions
pred = model_logistic.predict(x_va)

In [23]:
# Get evaluation metrics
evaluate(y_va, pred, labels)


Confusion matrix (absolute):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted,Predicted,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,lg_dec,sm_dec,no_chg,sm_inc,lg_inc
,,,,,,
True,lg_dec,17.0,23.0,215.0,48.0,111.0
True,sm_dec,14.0,49.0,661.0,79.0,78.0
True,no_chg,17.0,84.0,1439.0,122.0,105.0
True,sm_inc,6.0,47.0,723.0,92.0,83.0
True,lg_inc,24.0,53.0,299.0,76.0,167.0



Confusion matrix (relative):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted,Predicted,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,lg_dec,sm_dec,no_chg,sm_inc,lg_inc
,,,,,,
True,lg_dec,0.37,0.5,4.64,1.04,2.4
True,sm_dec,0.3,1.06,14.27,1.71,1.68
True,no_chg,0.37,1.81,31.07,2.63,2.27
True,sm_inc,0.13,1.01,15.61,1.99,1.79
True,lg_inc,0.52,1.14,6.46,1.64,3.61



Classification report:

              precision    recall  f1-score   support

      lg_dec       0.22      0.04      0.07       414
      lg_inc       0.31      0.27      0.29       619
      no_chg       0.43      0.81      0.56      1767
      sm_dec       0.19      0.06      0.09       881
      sm_inc       0.22      0.10      0.13       951

    accuracy                           0.38      4632
   macro avg       0.27      0.26      0.23      4632
weighted avg       0.31      0.38      0.30      4632

Overall accuracy: 0.3808
Overall f1_score: 0.3037


## b) AutoML

In [24]:
# Make class probability predictions
pred = automl.predict_proba(x_va)
pred

array([[0.0537677 , 0.10036561, 0.33586681, 0.2061538 , 0.30384606],
       [0.0865998 , 0.12075621, 0.32052002, 0.27034332, 0.20178065],
       [0.05333277, 0.02632116, 0.47439443, 0.29301323, 0.15293844],
       ...,
       [0.02788391, 0.08168676, 0.44139462, 0.14785052, 0.30118419],
       [0.05634383, 0.10620194, 0.39213747, 0.1647025 , 0.28061424],
       [0.03605465, 0.07577007, 0.43078251, 0.18547255, 0.27192021]])

In [25]:
# Make class predictions
pred = automl.predict(x_va)
pred

array(['no_chg', 'no_chg', 'no_chg', ..., 'no_chg', 'no_chg', 'no_chg'],
      dtype=object)

In [26]:
# Get evaluation metrics
res = evaluate(y_va, pred, labels, return_data = True)


Confusion matrix (absolute):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted,Predicted,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,lg_dec,sm_dec,no_chg,sm_inc,lg_inc
,,,,,,
True,lg_dec,66.0,19.0,205.0,18.0,106.0
True,sm_dec,31.0,46.0,658.0,57.0,89.0
True,no_chg,40.0,69.0,1471.0,66.0,121.0
True,sm_inc,24.0,42.0,747.0,50.0,88.0
True,lg_inc,60.0,34.0,356.0,15.0,154.0



Confusion matrix (relative):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted,Predicted,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,lg_dec,sm_dec,no_chg,sm_inc,lg_inc
,,,,,,
True,lg_dec,1.42,0.41,4.43,0.39,2.29
True,sm_dec,0.67,0.99,14.21,1.23,1.92
True,no_chg,0.86,1.49,31.76,1.42,2.61
True,sm_inc,0.52,0.91,16.13,1.08,1.9
True,lg_inc,1.3,0.73,7.69,0.32,3.32



Classification report:

              precision    recall  f1-score   support

      lg_dec       0.30      0.16      0.21       414
      lg_inc       0.28      0.25      0.26       619
      no_chg       0.43      0.83      0.57      1767
      sm_dec       0.22      0.05      0.08       881
      sm_inc       0.24      0.05      0.09       951

    accuracy                           0.39      4632
   macro avg       0.29      0.27      0.24      4632
weighted avg       0.32      0.39      0.30      4632

Overall accuracy: 0.3858
Overall f1_score: 0.303


# Sandbox

In [27]:
# Percentage of lg_inc, sm_inc among all predicted as lg_inc
den = res["cm_abs"].iloc[:, -1].sum()
num = res["cm_abs"].iloc[(-2):, -1].sum()
num/den

0.4336917562724014

In [28]:
# Percentage of lg_inc, sm_inc, no_inc among all predicted as lg_inc
den = res["cm_abs"].iloc[:, -1].sum()
num = res["cm_abs"].iloc[(-3):, -1].sum()
num/den

0.6505376344086021

In [29]:
# Percentage of lg_inc, sm_inc among all predicted as sm_inc
den = res["cm_abs"].iloc[:, -2].sum()
num = res["cm_abs"].iloc[(-2):, -2].sum()
num/den

0.3155339805825243

In [30]:
# Percentage of lg_inc, sm_inc, no_inc among all predicted as sm_inc
den = res["cm_abs"].iloc[:, -2].sum()
num = res["cm_abs"].iloc[(-3):, -2].sum()
num/den

0.6359223300970874