In [2]:
# Load libraries 
import pandas as pd

# **src**

In [None]:
def get_days_in_week(df, date_column='date'):
    """
    Return the distinct weekday names that occur in a DataFrame's date column.

    Parameters:
        df (pd.DataFrame): DataFrame containing the date column. It is not modified.
        date_column (str): Name of the date column in the DataFrame.

    Returns:
        set: Unique day names (e.g. {'Monday', 'Friday'}) found in the column.
            Missing dates (NaT) are ignored.
    """
    # Parse only the column that is needed instead of copying the whole frame;
    # nothing is ever assigned back to the caller's DataFrame.
    dates = pd.to_datetime(df[date_column])

    # A missing date has no weekday (day_name() yields a missing value for it),
    # so drop those entries to keep the result a clean set of day-name strings.
    return set(dates.dt.day_name().dropna())

In [None]:
def get_days(df, date_column='date', day_name="Sunday"):
    """
    Return the rows of a DataFrame whose date falls on a given day of the week.

    Parameters:
        df (pd.DataFrame): DataFrame containing the date column. It is not modified.
        date_column (str): Name of the date column in the DataFrame.
        day_name (str): Weekday name to keep, e.g. "Sunday". Matching ignores
            letter case and surrounding whitespace.

    Returns:
        pd.DataFrame: A new DataFrame holding only the matching rows, keeping
            their original index labels, with the date column converted to
            datetime and an added 'days_in_week' column giving each row's
            weekday name.
    """
    dataframe = df.copy()
    # Convert the date column to datetime if not already
    dataframe[date_column] = pd.to_datetime(dataframe[date_column])

    # Extract the day of the week
    dataframe['days_in_week'] = dataframe[date_column].dt.day_name()

    # Compare case-insensitively so 'sunday' or ' Sunday ' does not silently
    # produce an empty result; rows with a missing date never match.
    wanted = str(day_name).strip().casefold()
    is_wanted_day = dataframe['days_in_week'].str.casefold() == wanted

    return dataframe[is_wanted_day]

In [None]:
def convert_to_datetime(date):
  """
  Parse a 'YYYY-MM-DD' string into a ``datetime.date``.

  Despite the function name, the result is a date object (no time component).

  Parameters:
      date (str): Date text in the format '%Y-%m-%d', e.g. '2024-02-06'.

  Returns:
      datetime.date: The parsed calendar date.

  Raises:
      ValueError: If the text does not match '%Y-%m-%d' (raised by strptime).
  """
  # Kept as a local import so the function does not depend on the import cell.
  from datetime import datetime
  # Deliberately no warnings.filterwarnings("ignore") here: it silenced every
  # warning in the whole kernel as a hidden side effect of parsing one date.
  return datetime.strptime(date, '%Y-%m-%d').date()

In [None]:
def resample_to_daily(input_df, column_date='date', freq='D'):
    """
    Put a DataFrame on a regular time grid by resampling on its date column.

    Parameters:
        input_df (pandas.DataFrame): Frame holding the date column; it is copied, not modified.
        column_date (str): Name of the date column to index by. Default is 'date'.
        freq (str): Target frequency passed to ``resample``. Default is 'D' for daily.

    Returns:
        pandas.DataFrame: Frame indexed by the date column at the requested frequency.
            Periods without a source row are left as NaN (nothing is filled).
    """
    # Work on a copy so the caller's frame keeps its original column and dtype
    indexed = input_df.copy()
    indexed[column_date] = pd.to_datetime(indexed[column_date])
    indexed = indexed.set_index(column_date)

    # asfreq() only places the rows on the regular grid: no aggregation, no filling
    return indexed.resample(freq).asfreq()

# **Processing**

## **SJC data**

In [6]:
# Load the raw SJC price data
# NOTE(review): the file name suggests Ho Chi Minh City quotes -- confirm the source
df_sjc = pd.read_csv("../data/raw_datasets/sjc_price_hochiminh.csv")
# Bare last expression so the notebook renders the frame
df_sjc

Unnamed: 0,purchase_price,selling_price,date_actual
0,76.50,78.70,2024-02-06
1,76.20,78.40,2024-02-05
2,75.90,78.30,2024-02-03
3,76.40,78.70,2024-02-02
4,76.20,78.40,2024-02-01
...,...,...,...
3079,35.10,35.25,2014-01-05
3080,35.08,35.18,2014-01-04
3081,35.02,35.10,2014-01-03
3082,34.95,35.05,2014-01-02


In [None]:
# Check for duplicates: count rows whose 'date_actual' repeats an earlier row
# (0 means every date occurs only once)
df_sjc['date_actual'].duplicated().sum()

0

In [None]:
# Standardise the date column name to 'date', then order the rows by ascending date
df_sjc = (
    df_sjc
    .rename(columns={'date_actual': 'date'})
    .sort_values(by='date', ascending=True)
)

In [None]:
# Check which days of the week appear in the SJC data (distinct weekday names)
get_days_in_week(df_sjc)

{'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'}

- The SJC gold tael bar series has records on weekends (both Saturday and Sunday appear).

In [None]:
# Check for possible missing values on weekends:
# list the rows dated on a Sunday to see how often a Sunday price was recorded
get_days(df=df_sjc, day_name='Sunday')

Unnamed: 0,purchase_price,selling_price,date,days_in_week
3079,35.1,35.25,2014-01-05,Sunday
3054,35.43,35.52,2014-02-09,Sunday
3047,36.1,36.25,2014-02-16,Sunday
3022,36.36,36.56,2014-03-16,Sunday
3003,35.57,35.67,2014-04-06,Sunday
2967,36.4,36.6,2014-05-18,Sunday
2849,35.53,35.63,2014-10-05,Sunday
2824,35.35,35.43,2014-11-02,Sunday
1815,36.73,36.93,2018-02-11,Sunday
1220,44.2,44.95,2020-02-02,Sunday


## **Exchange rate data**

In [7]:
# Load the raw exchange rate data
df_exchange_rates = pd.read_csv("../data/raw_datasets/data_exchange_rates.csv")
# Bare last expression so the notebook renders the frame
df_exchange_rates

Unnamed: 0,Date,exchange_rate
0,2014-01-01,20835.0
1,2014-01-02,21055.0
2,2014-01-03,21050.0
3,2014-01-06,21050.0
4,2014-01-07,21045.0
...,...,...
2629,2024-01-31,24395.0
2630,2024-02-01,24415.0
2631,2024-02-02,24400.0
2632,2024-02-05,24335.0


In [None]:
# Check for duplicates: count rows whose 'Date' repeats an earlier row
# (0 means every date occurs only once)
df_exchange_rates['Date'].duplicated().sum()

0

In [None]:
# Rename 'Date' to 'date' so it matches the helpers' default date column name
# and the 'date' key used when the frames are merged
df_exchange_rates = df_exchange_rates.rename(columns={'Date':'date'})

In [None]:
# Check which days of the week appear in the exchange rate data (distinct weekday names)
get_days_in_week(df_exchange_rates)

{'Friday', 'Monday', 'Thursday', 'Tuesday', 'Wednesday'}

- The exchange rate series has no weekend records (only Monday to Friday appear).

## **XAUUSD data**

In [8]:
# Load the raw XAUUSD gold price data
df_xauusd= pd.read_csv("../data/raw_datasets/data_xauusd.csv")
# Bare last expression so the notebook renders the frame
df_xauusd

Unnamed: 0.1,Unnamed: 0,date,xauusd
0,0,2014-01-01,1209.006691
1,1,2014-01-02,1224.088499
2,2,2014-01-03,1237.028331
3,3,2014-01-04,1237.462953
4,4,2014-01-05,1237.600787
...,...,...,...
3685,3685,2024-02-03,2039.833460
3686,3686,2024-02-04,2040.174296
3687,3687,2024-02-05,2024.960063
3688,3688,2024-02-06,2036.074351


In [None]:
# Check for duplicates: count rows whose 'date' repeats an earlier row
# (0 means every date occurs only once)
df_xauusd['date'].duplicated().sum()

0

In [None]:
# Drop the redundant 'Unnamed: 0' column (it appears to duplicate the row index)
df_xauusd = df_xauusd.drop(columns=['Unnamed: 0'])

In [None]:
# Check which days of the week appear in the XAUUSD data (distinct weekday names)
get_days_in_week(df_xauusd)

{'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'}

- The XAUUSD gold price series has records on weekends (both Saturday and Sunday appear).

# **Resampling**

In [None]:
# Resample the data to ensure the interval between two consecutive dates is exactly one day (the time series is evenly spaced)
# Dates that are absent from the raw series become rows of NaN; nothing is filled in at this step
df_sjc_resampled = resample_to_daily(df_sjc)

In [None]:
# Preview the resampled SJC frame (now indexed by date)
df_sjc_resampled

Unnamed: 0_level_0,purchase_price,selling_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-01-01,34.70,34.95
2014-01-02,34.95,35.05
2014-01-03,35.02,35.10
2014-01-04,35.08,35.18
2014-01-05,35.10,35.25
...,...,...
2024-02-02,76.40,78.70
2024-02-03,75.90,78.30
2024-02-04,,
2024-02-05,76.20,78.40


In [None]:
# Resample exchange rates series onto the same one-day grid
# (dates without a raw record become NaN rows; nothing is filled in at this step)
df_exchange_rates_resampled = resample_to_daily(df_exchange_rates)

In [None]:
# Preview the resampled exchange rate frame (now indexed by date)
df_exchange_rates_resampled

Unnamed: 0_level_0,exchange_rate
date,Unnamed: 1_level_1
2014-01-01,20835.0
2014-01-02,21055.0
2014-01-03,21050.0
2014-01-04,
2014-01-05,
...,...
2024-02-02,24400.0
2024-02-03,
2024-02-04,
2024-02-05,24335.0


In [None]:
# Resample xauusd series
# [:-1] drops the final row by position, so this depends on the row order of the loaded CSV.
# NOTE(review): presumably the last row lies beyond the date range of the other series -- confirm,
# and consider an explicit date cutoff instead of a positional slice
df_xauusd_resampled = resample_to_daily(df_xauusd[:-1]) # omit the excessive row

In [None]:
# Preview the resampled XAUUSD frame (now indexed by date)
df_xauusd_resampled

Unnamed: 0_level_0,xauusd
date,Unnamed: 1_level_1
2014-01-01,1209.006691
2014-01-02,1224.088499
2014-01-03,1237.028331
2014-01-04,1237.462953
2014-01-05,1237.600787
...,...
2024-02-02,2039.099737
2024-02-03,2039.833460
2024-02-04,2040.174296
2024-02-05,2024.960063


# **Merging**

In [None]:
# Left-join the exchange rate and XAUUSD frames onto the SJC frame by 'date',
# so the result keeps the dates of the resampled SJC series
merged_df = (
    df_sjc_resampled
    .merge(df_exchange_rates_resampled, on='date', how='left')
    .merge(df_xauusd_resampled, on='date', how='left')
)

In [None]:
# Preview the merged frame: SJC prices, exchange rate and XAUUSD side by side per date
merged_df

Unnamed: 0_level_0,purchase_price,selling_price,exchange_rate,xauusd
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-01,34.70,34.95,20835.0,1209.006691
2014-01-02,34.95,35.05,21055.0,1224.088499
2014-01-03,35.02,35.10,21050.0,1237.028331
2014-01-04,35.08,35.18,,1237.462953
2014-01-05,35.10,35.25,,1237.600787
...,...,...,...,...
2024-02-02,76.40,78.70,24400.0,2039.099737
2024-02-03,75.90,78.30,,2039.833460
2024-02-04,,,,2040.174296
2024-02-05,76.20,78.40,24335.0,2024.960063


In [None]:
# Export merged data (the date index is written as the first CSV column).
# Plain string literal: the f-prefix was redundant because the path has no placeholders.
# NOTE(review): this derived dataset is written into the raw_datasets folder; the path is
# left unchanged because other code may read it from there -- confirm before moving it.
merged_df.to_csv("../data/raw_datasets/dataset_full.csv")