In [1]:

import numpy as np
import os
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

import datetime
from dateutil.relativedelta import relativedelta
from dags.virgo_functions.configs import low_finder_configs

main_path = os.getcwd()



In [2]:
main_path

'c:\\Users\\Miguel\\virgo_airflow'

In [3]:

class slice_predict():

    def __init__(self, data, code, features, exeptions, scale_features, dates_back = 60 ):
    
        self.df = data
        self.code = code
        self.my_features = features
        self.exeptions = exeptions
        self.scale_features = scale_features
        self.dates_back = dates_back
        
        self.price = [ x for x in self.df.columns if '_price' in x ][0]
        self.target = [ x for x in self.df.columns if '_logdif' in x ][0]
        self.std_col = [ x for x in self.df.columns if '_stv' in x ][0]
        self.volume_col = [ x for x in self.df.columns if '_Volume' in x ][0]
        self.roll_mean_col = [ x for x in self.df.columns if '_roll_mean' in x ][0]

    def feature_engineering(self):
        df = (self.df
            .assign(up_yield = np.where(self.df[self.target] > 0, 1,0))
            .assign(low_yield = np.where(self.df[self.target] <= 0, 1,0))
        )
        
        df = df.rename(columns = {self.price:'price'})
        df["roll_up_yield"] = df.sort_values('Date')["up_yield"].transform(lambda x: x.rolling(10, min_periods=1).sum())
        df["roll_low_yield"] = df.sort_index()["low_yield"].transform(lambda x: x.rolling(10, min_periods=1).sum())
        df["roll_std"] = df.sort_index()[self.std_col].transform(lambda x: x.rolling(10, min_periods=1).mean())
        df['log_Volume'] = np.log(df[self.volume_col])
        df["roll_log_Volume"] = df.sort_index()['log_Volume'].transform(lambda x: x.rolling(5, min_periods=1).mean())
        df['Date'] = pd.to_datetime(df['Date']).dt.date
        self.df = df
    def get_slice(self):
        
        begin_date = datetime.date.today()- relativedelta(days = self.dates_back)
        
        ds = self.df[self.df.Date >= begin_date]
        ds = ds.rename(columns = {self.price:'price'})

        ds_max = ds[ds[self.roll_mean_col] == ds[self.roll_mean_col].max()].head(1).Date.values[0]
        ds_min = ds[ds[self.roll_mean_col] == ds[self.roll_mean_col].min()].head(1).Date.values[0]
        ds['time_to_max'] = pd.to_numeric((self.df.Date - ds_max).dt.days,downcast='float')
        ds['time_to_min'] = pd.to_numeric((self.df.Date - ds_min).dt.days,downcast='float')
        
        ### apply pipeline sklearn

        X_train = ds[self.my_features + self.exeptions]

        pipeline = Pipeline([
            ('scaler', ColumnTransformer([('scaling', StandardScaler(), self.scale_features)], remainder='passthrough'))
        ])

        pipeline.fit(X_train)
        self.pipeline = pipeline
        self.ds = ds
        
        self.X_train_transformed = pipeline.transform(X_train)


features = low_finder_configs.features
exeptions = low_finder_configs.exeptions
scale_features = low_finder_configs.scale_features

In [4]:


raw_data = pd.read_csv(f'{main_path}/tmp_data/raw_data_bidfinder.csv')
stock_codes = raw_data.stock_code.unique()

dataframes = list()

for code in stock_codes:
    stock_raw_data = raw_data[raw_data.stock_code == code]
    stock_raw_data = stock_raw_data.drop(columns = ['stock_code'])

    data_to_predict = slice_predict( stock_raw_data, code, features, exeptions, scale_features)
    data_to_predict.feature_engineering()
    data_to_predict.get_slice()
    dataset_to_predict = pd.DataFrame(data_to_predict.X_train_transformed, columns = features + exeptions)
    dataset_to_predict['stock_code'] = code
    
    dataframes.append(dataset_to_predict)

target_data_export = pd.concat(dataframes)
target_data_export.to_csv(f'{main_path}/tmp_data/dataset_to_predict_bidfinder.csv', header = True, index = False)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [5]:
data_to_predict.df

Unnamed: 0,Date,price,PFE_stv,PFE_logdif,PFE_Volume,PFE_roll_mean,PCAR_price,PCAR_stv,PCAR_logdif,PCAR_Volume,PCAR_roll_mean,up_yield,low_yield,roll_up_yield,roll_low_yield,roll_std,log_Volume,roll_log_Volume
654,2020-08-26,,,,,,52.951744,0.463762,-0.003562,1654650.0,53.209498,0,0,0.0,0.0,,,
655,2020-08-27,,,,,,52.622581,0.359254,-0.007961,1658550.0,53.015312,0,0,0.0,0.0,,,
656,2020-08-28,,,,,,52.720112,0.201237,-0.004729,1724850.0,52.874244,0,0,0.0,0.0,,,
657,2020-08-31,,,,,,52.323910,0.251507,-0.006618,2153400.0,52.757559,0,0,0.0,0.0,,,
658,2020-09-01,,,,,,53.061466,0.255039,0.002070,2595600.0,52.760172,0,0,0.0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,2023-03-27,,,,,,70.940002,0.880227,-0.007023,2439800.0,70.210000,0,0,0.0,0.0,,,
1304,2023-03-28,,,,,,71.839996,0.772205,0.021101,1797600.0,70.659999,0,0,0.0,0.0,,,
1305,2023-03-29,,,,,,72.349998,0.980338,0.035738,2227100.0,70.945714,0,0,0.0,0.0,,,
1306,2023-03-30,,,,,,72.209999,1.082879,0.032513,1858600.0,71.055713,0,0,0.0,0.0,,,


In [6]:
raw_data

Unnamed: 0,Date,PFE_price,PFE_stv,PFE_logdif,PFE_Volume,PFE_roll_mean,stock_code,PCAR_price,PCAR_stv,PCAR_logdif,PCAR_Volume,PCAR_roll_mean
0,2020-08-26,33.002964,0.273399,-0.017455,24083689.0,33.395749,PFE,,,,,
1,2020-08-27,32.838154,0.345729,-0.026585,22156556.0,33.333794,PFE,,,,,
2,2020-08-28,32.881531,0.384369,-0.024236,32830519.0,33.290427,PFE,,,,,
3,2020-08-31,32.777439,0.402161,-0.016274,30032465.0,33.175191,PFE,,,,,
4,2020-09-01,31.988152,0.524345,-0.031232,36145560.0,32.927375,PFE,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1303,2023-03-27,,,,,,PCAR,70.940002,0.880227,-0.007023,2439800.0,70.210000
1304,2023-03-28,,,,,,PCAR,71.839996,0.772205,0.021101,1797600.0,70.659999
1305,2023-03-29,,,,,,PCAR,72.349998,0.980338,0.035738,2227100.0,70.945714
1306,2023-03-30,,,,,,PCAR,72.209999,1.082879,0.032513,1858600.0,71.055713
