Feature extraction:

Understood. In that case, here are three feature engineering techniques that you can focus on:

Lag Features: Creating lag features by including past exchange rates as predictors are valuable. By including lagged values of the exchange rate as features, we capture the autocorrelation and temporal dependencies in the data. 

Seasonality Indicators: Given that our data consists of monthly observations, incorporating seasonality indicators can be useful. We include dummy variables for each month to capture any systematic variations in exchange rates that occur seasonally. 

Moving Averages: moving averages of the exchange rate over different time windows. Moving averages can smooth out short-term fluctuations and highlight longer-term trends in the data.

In [58]:
import pandas as pd
import numpy as np

In [59]:
path = 'data/processed-data/processed_data_2024-03-24.csv'
df = pd.read_csv(path)

In [60]:
df['date'] = pd.to_datetime(df['date'])

In [61]:
# 1. Lag Features

def create_lag_features(data, lag_features):
    for lag in lag_features:
        data[f'lag_{lag}'] = data['rate'].shift(lag)
    
    #data = data.fillna(method='ffill', inplace=True)
    return data

lag_features = [1, 3, 6, 12]  # Experiment with different lag lengths
df = create_lag_features(df, lag_features)
df.head()

Unnamed: 0,rate,date,lag_1,lag_3,lag_6,lag_12
0,0.000122,2007-01-01,,,,
1,0.000122,2007-02-01,0.000122,,,
2,0.000122,2007-03-01,0.000122,,,
3,0.000121,2007-04-01,0.000122,0.000122,,
4,0.000121,2007-05-01,0.000121,0.000122,,


In [62]:
df

Unnamed: 0,rate,date,lag_1,lag_3,lag_6,lag_12
0,0.000122,2007-01-01,,,,
1,0.000122,2007-02-01,0.000122,,,
2,0.000122,2007-03-01,0.000122,,,
3,0.000121,2007-04-01,0.000122,0.000122,,
4,0.000121,2007-05-01,0.000121,0.000122,,
...,...,...,...,...,...,...
197,5063.853413,2023-07-01,5230.238379,967.528175,662.653591,334.371318
198,4581.357543,2023-08-01,5063.853413,1389.422391,829.140475,392.679524
199,4922.946548,2023-09-01,4581.357543,5230.238379,903.554391,479.995783
200,5643.509091,2023-10-01,4922.946548,5063.853413,967.528175,583.728455


Adding Seasonal Indicators

In [63]:
# Convert 'date' column to datetime if it's not already in datetime format
df['date'] = pd.to_datetime(df['date'])

# Create 'month' and 'quarter' columns
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter

# adding the seasonal trend features
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['quarter_sin'] = np.sin(2 * np.pi * df['quarter'] / 4)
df['quarter_cos'] = np.cos(2 * np.pi * df['quarter'] / 4)

df.head()



Unnamed: 0,rate,date,lag_1,lag_3,lag_6,lag_12,month,quarter,month_sin,month_cos,quarter_sin,quarter_cos
0,0.000122,2007-01-01,,,,,1,1,0.5,0.8660254,1.0,6.123234000000001e-17
1,0.000122,2007-02-01,0.000122,,,,2,1,0.866025,0.5,1.0,6.123234000000001e-17
2,0.000122,2007-03-01,0.000122,,,,3,1,1.0,6.123234000000001e-17,1.0,6.123234000000001e-17
3,0.000121,2007-04-01,0.000122,0.000122,,,4,2,0.866025,-0.5,1.224647e-16,-1.0
4,0.000121,2007-05-01,0.000121,0.000122,,,5,2,0.5,-0.8660254,1.224647e-16,-1.0


Adding Moving Averages

In [65]:
# Define the window size for the moving averages
window_sizes = [3, 6, 12]  # window sizes: 3 months, 6 months, and 12 months

# Calculate moving averages for each window size
for window_size in window_sizes:

    moving_average_column_name = f'moving_avg_{window_size}'

    # Calculate the moving average and assign it to a new column
    df[moving_average_column_name] = df['rate'].rolling(window=window_size).mean()


df.head()


Unnamed: 0,rate,date,lag_1,lag_3,lag_6,lag_12,month,quarter,month_sin,month_cos,quarter_sin,quarter_cos,moving_avg_3,moving_avg_6,moving_avg_12
0,0.000122,2007-01-01,,,,,1,1,0.5,0.8660254,1.0,6.123234000000001e-17,,,
1,0.000122,2007-02-01,0.000122,,,,2,1,0.866025,0.5,1.0,6.123234000000001e-17,,,
2,0.000122,2007-03-01,0.000122,,,,3,1,1.0,6.123234000000001e-17,1.0,6.123234000000001e-17,0.000122,,
3,0.000121,2007-04-01,0.000122,0.000122,,,4,2,0.866025,-0.5,1.224647e-16,-1.0,0.000122,,
4,0.000121,2007-05-01,0.000121,0.000122,,,5,2,0.5,-0.8660254,1.224647e-16,-1.0,0.000121,,


In [67]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202 entries, 0 to 201
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   rate           202 non-null    float64       
 1   date           202 non-null    datetime64[ns]
 2   lag_1          201 non-null    float64       
 3   lag_3          199 non-null    float64       
 4   lag_6          196 non-null    float64       
 5   lag_12         190 non-null    float64       
 6   month          202 non-null    int32         
 7   quarter        202 non-null    int32         
 8   month_sin      202 non-null    float64       
 9   month_cos      202 non-null    float64       
 10  quarter_sin    202 non-null    float64       
 11  quarter_cos    202 non-null    float64       
 12  moving_avg_3   200 non-null    float64       
 13  moving_avg_6   197 non-null    float64       
 14  moving_avg_12  191 non-null    float64       
dtypes: datetime64[ns](1), f

In [68]:
df.to_csv('data/processed-data/festure_engineered_data.csv')