# Purpose

This notebook illustrates how the data provided by `tradingdmp.data.clf.price_perc_chg.app_data.DataAlpacaPocCat` can be used to train a classification model.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import datetime
import pymongo
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from supervised.automl import AutoML
import json

sys.path.append("../src/")
from tradingdmp.data.clf.price_perc_chg.app_data import DataAlpacaPocCat
from tradingdmp.model.clf.app_model import MljarAutoMl

In [2]:
from IPython.core.interactiveshell import InteractiveShell

# Notebook display settings:
#  - never truncate the column list when rendering wide DataFrames;
#  - render the value of every top-level expression in a cell, not only the last one.
pd.set_option("display.max_columns", None)
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# MongoDB key handed to DataAlpacaPocCat below.
# Prefer exporting MONGODB_KEY in the environment that launches Jupyter so the
# secret never ends up in the saved notebook. When the variable is not set this
# falls back to the empty-string placeholder; if you paste a key here instead,
# do not commit the notebook.
mongodbkey = os.environ.get("MONGODB_KEY", "")

# Functions

# Get data with cache

In [4]:
# Read the run settings from the JSON config file in the working directory.
# The file handle is only needed while parsing, so the lookups happen afterwards.
with open("config_model.json") as f:
    data = json.load(f)

# Train/test split setting (presumably forwarded to train_test_split later on).
test_size = data["test_size"]

# Data selection: date window (parsed as "%Y-%m-%d" strings when the data is
# fetched), tickers, and the n_ppc_per_row setting passed to the data loader.
dt_start = data["dt_start"]
dt_end = data["dt_end"]
ticker_list = data["ticker_list"]
n_ppc_per_row = data["n_ppc_per_row"]

# Model-search settings (presumably consumed by the AutoML model further down;
# confirm against the training cell).
mode = data["mode"]
eval_metric = data["eval_metric"]
total_time_limit = data["total_time_limit"]
algorithms = data["algorithms"]

In [5]:
# Convert the configured "YYYY-MM-DD" strings into datetime objects up front so
# the loader call below stays readable.
date_format = "%Y-%m-%d"
window_start = datetime.datetime.strptime(dt_start, date_format)
window_end = datetime.datetime.strptime(dt_end, date_format)

# Fetch the data for the configured tickers and date window.
# NOTE(review): df_x / df_y are presumably the feature and target frames, and
# get_data_cached presumably reuses a cache when one exists; confirm both
# against DataAlpacaPocCat.
adata = DataAlpacaPocCat(mongodbkey)
df_x, df_y = adata.get_data_cached(
    ticker_list,
    window_start,
    window_end,
    dt_end_required=True,
    n_ppc_per_row=n_ppc_per_row,
    return_last_date_only=False,
    return_date_col=True,
    return_training_dfs=True,
)