### Goals:
1. Check the quality of data
2. Explore the connections in data
3. Explore periodicity of time series data
4. Explore stationarity

In [1]:
from sqlalchemy import create_engine

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Default matplotlib figure size (width, height) used by the plots in this notebook.
plt.rcParams['figure.figsize'] = (6, 6)

In [2]:
# Load the three source tables from the `forex` PostgreSQL database into DataFrames.
# NOTE(review): the database credentials are embedded in this URL; consider
# supplying them through the environment before sharing the notebook.
engine = create_engine("postgresql://airflow:airflow@localhost:5454/forex")
with engine.connect() as con:
  # pandas is given the connection object exposed by the SQLAlchemy connection.
  connection = con.connection

  # Read each table exactly once; repeating the queries only doubles the load time.
  ex_pairs = pd.read_sql_query('SELECT * FROM ex_pairs', con=connection)
  macro = pd.read_sql_query('SELECT * FROM macro', con=connection)
  entities = pd.read_sql_query('SELECT * FROM entity_dimension_final', con=connection)


#### Quality checks
- Check for duplicates
- Check missing values
- Check for outliers (wrong values)
- Check the availability of data:
  - What is the range of time in which data is available for each entity

In [3]:
# True when any (currency_1, currency_2, date) combination occurs more than once.
bool(ex_pairs.duplicated(subset=['currency_1', 'currency_2', 'date']).any())

False

In [4]:
# True when any (index, measure, inr_measure, date) combination occurs more than once.
bool(macro.duplicated(subset=['index', 'measure', 'inr_measure', 'date']).any())

False

In [5]:
# Print column dtypes, non-null counts and memory usage of ex_pairs.
ex_pairs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1409483 entries, 0 to 1409482
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   currency_1  1409483 non-null  int64         
 1   currency_2  1409483 non-null  int64         
 2   date        1409483 non-null  datetime64[ns]
 3   rate        1409483 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 43.0 MB


In [6]:
# Print column dtypes, non-null counts and memory usage of macro.
macro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9423 entries, 0 to 9422
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   index        9423 non-null   int64         
 1   measure      9423 non-null   int64         
 2   inr_measure  8829 non-null   float64       
 3   date         9423 non-null   datetime64[ns]
 4   bop_value    9423 non-null   float64       
 5   inr_value    8829 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(2)
memory usage: 441.8 KB


In [7]:
# Rows of `macro` whose inr_measure is missing.
miss_na = macro.loc[macro['inr_measure'].isna()]

# Report the earliest and latest dates on which a value is missing.
missing_dates = miss_na['date']
print("First: {0},\nLast: {1}".format(missing_dates.min(), missing_dates.max()))

First: 2015-01-01 00:00:00,
Last: 2024-07-01 00:00:00


In [8]:
# Distinct `index` identifiers that have at least one row with a missing inr_measure.
has_missing = miss_na['index'].unique()

In [9]:
# Flag identifiers whose inr_value is only partly missing, i.e. at least
# one of their rows in `macro` does carry a value.
for ix in has_missing:
  inr_values = macro.loc[macro['index'] == ix, 'inr_value']
  if inr_values.notna().any():
    print(f"Index {ix} has non-missing values also")

Index 45015 has non-missing values also


In [10]:
# Dates on which index 45015 has no inr_measure.
# The mask is built from miss_na itself: a mask taken from the full `macro`
# frame carries a different index, so pandas has to reindex it and emits a
# "Boolean Series key will be reindexed" UserWarning.
miss_na.loc[miss_na['index'] == 45015, 'date'].unique()

  miss_na[macro['index'] == 45015]['date'].unique()


<DatetimeArray>
['2023-01-01 00:00:00', '2023-04-01 00:00:00', '2023-07-01 00:00:00',
 '2023-10-01 00:00:00']
Length: 4, dtype: datetime64[ns]

In [11]:
# Display the rows with a missing inr_measure.
miss_na

Unnamed: 0,index,measure,inr_measure,date,bop_value,inr_value
0,42001,5,,2015-01-01,-1535.791,
1,42001,6,,2015-01-01,-5543.862,
2,42001,3,,2015-01-01,-865.751,
87,11017,6,,2015-01-01,-11024.350,
88,11017,3,,2015-01-01,11579.390,
...,...,...,...,...,...,...
9322,1027,3,,2024-07-01,89339.700,
9323,1027,6,,2024-07-01,139550.900,
9411,28050,3,,2024-07-01,-353279.900,
9412,28050,6,,2024-07-01,-238724.800,


In [14]:
# Look up the area name recorded for entity 45015.
entities.loc[entities['index'] == 45015, 'area_name']

27    Türkiye
Name: area_name, dtype: object

#### The only missing values come from countries that have no interest-rate records in the OECD database for the past 10 years. The only exception is Turkey, which lacks interest-rate data in the OECD database only for the year 2023.

### Correlation analysis

- Correlation matrix with data in wide format
- Cross-correlation functions
- Correlation after feature engineering
- Cross-correlation after feature engineering

**Feature engineering**

The main idea of the following feature engineering is to pair the data.

- Get the approximate exchange rate between the two currencies of a pair
- Get the differences and ratios between indicators at all dimensions 

### Periodicity of the time series

- Explore the ACF and periodogram of each of the exchange rates

### Stationarity tests

- Augmented Dickey–Fuller (ADF) test