In [15]:
import pandas as pd
import os
from ta import add_all_ta_features
from ta.utils import dropna
import yfinance as yf

In [4]:
# ---------------------------------------------------------------------------
# Project layout. The working directory is switched to the project root
# before the local `_0_process_data` package is imported further down.
# ---------------------------------------------------------------------------
project_dir = "/home/jupyter-tfg2425paula/prediction_project_v2"
os.chdir(project_dir)

# NOTE(review): "_02_data_strctured" looks like a typo for "structured"; it is
# kept verbatim because it presumably has to match the folder on disk - confirm.
raw_data_dir, transformed_data_dir, structured_data_dir = (
    os.path.join(project_dir, sub_dir)
    for sub_dir in ("_00_data_raw", "_01_data_transformed", "_02_data_strctured")
)

# ---------------------------------------------------------------------------
# Raw input: one semicolon-separated CSV per security, comma as decimal mark.
# ---------------------------------------------------------------------------
securities = "single_names"
stocks_folder = os.path.join(raw_data_dir, securities)

stock = "EQNR"
filename = f"{stock}_Close.csv"

stock_csv_path = os.path.join(stocks_folder, filename)
df = pd.read_csv(stock_csv_path, sep=";", decimal=",")

# Project-local preprocessing helpers (imported after the chdir above).
from _0_process_data._00_preprocess_data import (
    preprocess_data,
    appropiate_date_format,
    create_return_column,
)

clean_stocks_folder = os.path.join(raw_data_dir, "cleaned_" + str(securities))

# Column names used throughout the preprocessing calls.
date_col_name, target_col_name, return_col = "Date", "Close", "Return"

# Accepted values: "standard", "minmax" or None.
scaling_method = "standard"

# Parse the date column (source format is day/month/two-digit-year), then add
# the return column while keeping the close price.
df = appropiate_date_format(df, date_col_name, date_format="%d/%m/%y")
df = create_return_column(df, target_col_name, remove_close=False)

# Missing-value handling covers every column; scaling covers all but the date.
selected_na_cols = df.columns.tolist()
selected_scale_cols = df.columns.drop(date_col_name).tolist()

df_clean = preprocess_data(
    df,
    selected_na_cols,
    return_col,
    selected_scale_cols,
    scaling_method,
)
df_clean.head()

Missing values in each selected column before handling:
Date      1
Close     1
Return    1
dtype: int64

Rows with missing values in the selected columns:
           Date  Close  Return
0    2001-06-18   69.0     NaN
6113        NaT    NaN     0.0

Missing values in each selected column after handling:
Date      0
Close     0
Return    0
dtype: int64
Number of outliers eliminated: 296
Minimum extreme outlier value: -17.715532921062206


  df["Return"] = df[target_col_name].pct_change(fill_method="pad") * 100


Unnamed: 0,Date,Close,Return
1,2001-06-19,-1.29366,0.964005
2,2001-06-20,-1.279442,0.949936
3,2001-06-21,-1.29366,-0.977837
4,2001-06-22,-1.300769,-0.506148
5,2001-06-25,-1.300769,-0.020787


There are many possible indicators.

**Momentum indicators**

- RSI (Relative Strength Index): Measures the speed and change of price movements.
- Stochastic Oscillator: Compares the closing price to a price range over a period.
- Williams %R: Indicates overbought/oversold levels.
- Awesome Oscillator: Measures momentum using two SMAs (simple moving averages).
- KAMA (Kaufman’s Adaptive Moving Average): Adaptive moving average based on volatility.
- PPO (Percentage Price Oscillator): Measures the difference between two EMAs as a percentage of the longer-period (slower) EMA.
- PVO (Percentage Volume Oscillator): Similar to PPO but based on volume.
- ROC (Rate of Change): Measures the percentage change in price.

**Trend indicators**

- MACD (Moving Average Convergence Divergence): Identifies trend direction and strength.
- SMA (Simple Moving Average): Calculates the average price over a period.
- EMA (Exponential Moving Average): Weighted moving average that gives more weight to recent prices.
- WMA (Weighted Moving Average): Similar to SMA but with a weighting factor.
- DEMA (Double Exponential Moving Average): Reduces lag by applying EMA twice.
- TEMA (Triple Exponential Moving Average): Further reduces lag compared to DEMA.
- TRIX: The rate of change of a triple-smoothed exponential moving average, used to identify trends.
- ADX (Average Directional Movement Index): Measures trend strength.
- Aroon Indicator: Measures the time since the highest/lowest point over a period.
- PSAR (Parabolic Stop and Reverse): Provides potential reversal points in a trend.
- Ichimoku Cloud: Identifies support, resistance, and trend strength.

**Volatility indicators**

- Bollinger Bands: Measures price volatility and potential breakouts.
- ATR (Average True Range): Measures market volatility.
- Donchian Channels: Identifies breakout levels over a period.
- Keltner Channels: Combines ATR and EMA to define price range.

**Volume indicators**

- OBV (On-Balance Volume): Combines volume and price movements to identify trends.
- CMF (Chaikin Money Flow): Measures money flow volume over a period.
- VWAP (Volume Weighted Average Price): Average price weighted by volume.
- ADI (Accumulation/Distribution Index): Tracks supply and demand using volume and price.
- EOM (Ease of Movement): Relates price movement to volume.
- MFI (Money Flow Index): Combines price and volume to identify overbought/oversold levels.

In [13]:
# NOTE(review): within the visible notebook `yf_data` is first assigned by the
# yf.download(...) cell that appears *below* this one, so on a fresh kernel run
# top-to-bottom this cell would presumably raise NameError - confirm and move
# this display after the download cell.
yf_data

Price,Date,Adj Close,Close,High,Low,Open,Volume
0,2001-06-19,2.359710,7.620000,7.730000,7.550000,7.550000,1410700
1,2001-06-20,2.365904,7.640000,7.690000,7.610000,7.680000,550400
2,2001-06-21,2.344226,7.570000,7.600000,7.550000,7.570000,643600
3,2001-06-22,2.325646,7.510000,7.550000,7.480000,7.520000,822600
4,2001-06-25,2.322548,7.500000,7.550000,7.500000,7.510000,488500
...,...,...,...,...,...,...,...
5888,2024-11-13,22.014797,22.670000,22.809999,22.309999,22.530001,3444600
5889,2024-11-14,23.005318,23.690001,23.820000,23.500000,23.620001,9806700
5890,2024-11-15,23.267513,23.959999,24.350000,23.610001,23.709999,6133200
5891,2024-11-18,23.519999,24.219999,24.389999,24.100000,24.170000,3141700


In [19]:
# Download OHLCV history for the selected stock over the date span covered by
# df_clean, then derive the full set of `ta` technical indicators from it.
ticker = stock

# yfinance treats `end` as exclusive, so ask for one extra day; otherwise the
# last date present in df_clean would never be downloaded.
download_start = df_clean["Date"].min()
download_end = df_clean["Date"].max() + pd.Timedelta(days=1)
yf_data = yf.download(ticker, start=download_start.strftime('%Y-%m-%d'),
                      end=download_end.strftime('%Y-%m-%d'))

# Depending on the yfinance version/options the columns are either flat or a
# two-level (field, ticker) MultiIndex; only flatten when a second level exists
# so the cell does not fail on flat columns.
if isinstance(yf_data.columns, pd.MultiIndex):
    yf_data.columns = yf_data.columns.droplevel(1)
yf_data = yf_data.reset_index()

df_with_indicators = add_all_ta_features(
    yf_data,
    open="Open",      # real open price from the download
    high="High",      # real high price from the download
    low="Low",        # real low price from the download
    close="Close",    # real close price from the download
    volume="Volume",  # real traded volume from the download
    fillna=False      # leave NaNs in place; they are handled explicitly below
)

# Fill gaps by linear interpolation.
# NOTE(review): interior gaps are filled using the *following* observation as
# well as the preceding one - confirm this is acceptable for a forecasting setup.
df_with_indicators = df_with_indicators.interpolate(method="linear")

# Whatever is still NaN is expected to sit only at the edges of the series
# (beginning or end), so those rows are simply dropped.
df_with_indicators = df_with_indicators.dropna()
df_with_indicators

[*********************100%***********************]  1 of 1 completed


Price,Date,Adj Close,Close,High,Low,Open,Volume
0,2001-06-19,2.359710,7.620000,7.730000,7.550000,7.550000,1410700
1,2001-06-20,2.365904,7.640000,7.690000,7.610000,7.680000,550400
2,2001-06-21,2.344226,7.570000,7.600000,7.550000,7.570000,643600
3,2001-06-22,2.325646,7.510000,7.550000,7.480000,7.520000,822600
4,2001-06-25,2.322548,7.500000,7.550000,7.500000,7.510000,488500
...,...,...,...,...,...,...,...
5888,2024-11-13,22.014797,22.670000,22.809999,22.309999,22.530001,3444600
5889,2024-11-14,23.005318,23.690001,23.820000,23.500000,23.620001,9806700
5890,2024-11-15,23.267513,23.959999,24.350000,23.610001,23.709999,6133200
5891,2024-11-18,23.519999,24.219999,24.389999,24.100000,24.170000,3141700
