In [1]:
! pip install yfinance



In [2]:
import numpy as np
from sklearn.metrics import accuracy_score
import numba
import pandas as pd
import random
from datetime import datetime, timezone, timedelta
from enum import Enum
from typing import List, Tuple, Dict, Any
import matplotlib.pyplot as plt
import pytz
import requests
from DataImportfromYf import DataImportfromYf
from Classes import Frequency, Position, Config, PositionType, Weight, BaseWeightComputation
from model_validation import get_train_val_test_idx_rolling, get_train_val_test_idx_regular
from Features import add_return, binarize_label, standardize_feat_basket
from keras.layers import LSTM, Dense, Dropout, Input, BatchNormalization 
from keras.models import load_model, model_from_json, Sequential, Model
from keras.callbacks import EarlyStopping
from keras.utils import plot_model
from keras.optimizers import Adam
from keras.initializers import RandomUniform, GlorotUniform, GlorotNormal
from LSTM_model import LongShortTermMemory
from Backtest_classes import PositionGenerator, Backtester


# Data Loading 
Define the universe, the time range, and the sampling interval considered (2002-05-01 to 2020-01-01).

In [4]:
# Ticker symbols that make up the investment universe (28 names, all with the ".ST" suffix).
universe = [
    "GETI-B.ST", "ATCO-A.ST", "VOLV-B.ST", "TEL2-B.ST",
    "ELUX-B.ST", "TELIA.ST", "HM-B.ST", "SKA-B.ST",
    "ALFA.ST", "ERIC-B.ST", "SKF-B.ST", "SAND.ST",
    "AZN.ST", "SECU-B.ST", "INVE-B.ST", "ABB.ST",
    "SEB-A.ST", "SHB-A.ST", "SCA-B.ST", "SSAB-A.ST",
    "ASSA-B.ST", "ALIV-SDB.ST", "HEXA-B.ST", "SWED-A.ST",
    "ATCO-B.ST", "KINV-B.ST", "BOL.ST", "NDA-SE.ST",
]

# Requested time window and bar size.
start_date = datetime(year=2002, month=5, day=1)
end_date = datetime(year=2020, month=1, day=1)
interval = "1d"

# Build the project's download helper and fetch the price data.
# NOTE(review): ignore_tz=True presumably strips timezone info from the timestamps — confirm in DataImportfromYf.
obj = DataImportfromYf(
    universe=universe,
    start_ts=start_date,
    end_ts=end_date,
    interval=interval,
    ignore_tz=True,
)
data = obj.get_data()

[*********************100%***********************]  28 of 28 completed


In [5]:
# Preview the first five rows (five is the default row count of head()).
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,adj_close,close,high,low,open,volume
ts,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2002-05-01,ABB.ST,68.324242,73.443237,73.443237,73.443237,73.443237,0.0
2002-05-01,ALIV-SDB.ST,190.604477,228.5,228.5,228.5,228.5,0.0
2002-05-01,ATCO-A.ST,3.608177,7.640396,7.640396,7.640396,7.640396,0.0
2002-05-01,ATCO-B.ST,3.110864,7.141966,7.141966,7.141966,7.141966,0.0
2002-05-01,AZN.ST,402.793365,482.0,482.0,482.0,482.0,0.0


## Data cleaning

In [6]:
# Are there any missing values? Show every row that has a NaN in at least one column.
data.loc[data.isna().any(axis="columns")]

Unnamed: 0_level_0,Unnamed: 1_level_0,adj_close,close,high,low,open,volume
ts,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


### Add past and future returns and the target

In [7]:
# Past returns: add_return on "close" with deltas=[1], by="symbol".
data = add_return(
    data,
    target="close",
    deltas=[1],
    by="symbol",
    disable_tqdm=True,
)

# Future returns: same call with deltas=[-1].
data = add_return(
    data,
    target="close",
    deltas=[-1],
    by="symbol",
    disable_tqdm=True,
)

In [8]:
# Discard rows where either the past or the future 1-day return is missing.
data = data.dropna(
    subset=[
        "past_ret_1D_close",
        "fut_ret_1D_close",
    ]
)

In [9]:
# Target (y label): the future 1-day close return column, passed through binarize_label.
label_name = "fut_ret_1D_close"
label_fit = binarize_label(data=data[label_name])

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  data = data.groupby("ts").apply(lambda df: pd.Series((df > df.median()).astype(int), index=df.index))


In [10]:
# Standardize the past-return feature with the project helper, using by=["ts"].
data = standardize_feat_basket(
    data=data,
    features_names=["past_ret_1D_close"],
    by=["ts"],
)

In [11]:
# Number of rows that contain at least one missing value.
int(data.isnull().any(axis=1).sum())

891

In [12]:
# Drop the rows whose return columns are NaN after the standardization step.
data = data.dropna(
    subset=[
        "past_ret_1D_close",
        "fut_ret_1D_close",
    ]
)