### Transforming US treasury yields to hourly for matching with other data

In [13]:
import pandas as pd
import numpy as np
import os

path_directory = os.getcwd()

file_path = f"{path_directory}\\US_yield.csv"

df = pd.read_csv(file_path, sep=",")

df.head()


Unnamed: 0,date,realtime_start,realtime_end,value
0,2001-07-31,2024-10-28,2024-10-28,3.67
1,2001-08-01,2024-10-28,2024-10-28,3.65
2,2001-08-02,2024-10-28,2024-10-28,3.65
3,2001-08-03,2024-10-28,2024-10-28,3.63
4,2001-08-06,2024-10-28,2024-10-28,3.62


In [2]:
df_2020 = df[df["date"]>= "2020-01-08"].copy()

del df

print(df_2020.head())

            date realtime_start realtime_end value
4811  2020-01-08     2024-10-28   2024-10-28  1.50
4812  2020-01-09     2024-10-28   2024-10-28  1.53
4813  2020-01-10     2024-10-28   2024-10-28  1.52
4814  2020-01-13     2024-10-28   2024-10-28  1.54
4815  2020-01-14     2024-10-28   2024-10-28  1.53


In [3]:
df_2020.drop(columns=["realtime_start", "realtime_end"], inplace=True)

print(df_2020.head())

            date value
4811  2020-01-08  1.50
4812  2020-01-09  1.53
4813  2020-01-10  1.52
4814  2020-01-13  1.54
4815  2020-01-14  1.53


In [4]:
df_2020.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1252 entries, 4811 to 6062
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    1252 non-null   object
 1   value   1252 non-null   object
dtypes: object(2)
memory usage: 29.3+ KB


In [5]:
df_2020["date"] = pd.to_datetime(df_2020["date"])
df_2020["value"] = pd.to_numeric(df_2020['value'], errors='coerce')

print(df_2020.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1252 entries, 4811 to 6062
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1252 non-null   datetime64[ns]
 1   value   1202 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 29.3 KB
None


In [6]:
print(df_2020.head())

           date  value
4811 2020-01-08   1.50
4812 2020-01-09   1.53
4813 2020-01-10   1.52
4814 2020-01-13   1.54
4815 2020-01-14   1.53


In [7]:
nan_count = df_2020['value'].isna().sum()
print(f"Number of NaN values: {nan_count}")

Number of NaN values: 50


In [8]:
print("Rows with NaN values:")
print(df_2020[df_2020['value'].isna()])

Rows with NaN values:
           date  value
4819 2020-01-20    NaN
4839 2020-02-17    NaN
4878 2020-04-10    NaN
4909 2020-05-25    NaN
4938 2020-07-03    NaN
4984 2020-09-07    NaN
5009 2020-10-12    NaN
5031 2020-11-11    NaN
5042 2020-11-26    NaN
5063 2020-12-25    NaN
5068 2021-01-01    NaN
5079 2021-01-18    NaN
5099 2021-02-15    NaN
5174 2021-05-31    NaN
5199 2021-07-05    NaN
5244 2021-09-06    NaN
5269 2021-10-11    NaN
5292 2021-11-11    NaN
5302 2021-11-25    NaN
5323 2021-12-24    NaN
5339 2022-01-17    NaN
5364 2022-02-21    NaN
5403 2022-04-15    NaN
5434 2022-05-30    NaN
5449 2022-06-20    NaN
5459 2022-07-04    NaN
5504 2022-09-05    NaN
5529 2022-10-10    NaN
5553 2022-11-11    NaN
5562 2022-11-24    NaN
5584 2022-12-26    NaN
5589 2023-01-02    NaN
5599 2023-01-16    NaN
5624 2023-02-20    NaN
5694 2023-05-29    NaN
5709 2023-06-19    NaN
5720 2023-07-04    NaN
5764 2023-09-04    NaN
5789 2023-10-09    NaN
5822 2023-11-23    NaN
5844 2023-12-25    NaN
5849 2024-01

In [9]:
df_2020['value'] = df_2020['value'].interpolate(method='linear')

# Verify if there are still any NaNs left (just to check)
print(df_2020['value'].isna().sum())

0


In [10]:
full_date_range = pd.date_range(start=df_2020['date'].min(), end=df_2020['date'].max())

# Reindex the DataFrame to include all dates
df_2020 = df_2020.set_index('date').reindex(full_date_range)

# Interpolate values
df_2020['value'] = df_2020['value'].interpolate(method='linear')

# Reset the index to make 'date' a column again
df_2020 = df_2020.reset_index()
df_2020.rename(columns={'index': 'date'}, inplace=True)

# Display the final DataFrame
print(df_2020)

           date     value
0    2020-01-08  1.500000
1    2020-01-09  1.530000
2    2020-01-10  1.520000
3    2020-01-11  1.526667
4    2020-01-12  1.533333
...         ...       ...
1747 2024-10-20  4.920000
1748 2024-10-21  4.920000
1749 2024-10-22  4.890000
1750 2024-10-23  4.880000
1751 2024-10-24  4.870000

[1752 rows x 2 columns]


In [11]:
hourly_data = []

# Expand each date to 24 hours
for index, row in df_2020.iterrows():
    for hour in range(24):  # 0 to 23
        hourly_data.append({
            'datetime': row['date'] + pd.Timedelta(hours=hour),
            'value': row['value']
        })

# Create a new DataFrame
df_hourly = pd.DataFrame(hourly_data)

# Display the new DataFrame
print(df_hourly)

                 datetime  value
0     2020-01-08 00:00:00   1.50
1     2020-01-08 01:00:00   1.50
2     2020-01-08 02:00:00   1.50
3     2020-01-08 03:00:00   1.50
4     2020-01-08 04:00:00   1.50
...                   ...    ...
42043 2024-10-24 19:00:00   4.87
42044 2024-10-24 20:00:00   4.87
42045 2024-10-24 21:00:00   4.87
42046 2024-10-24 22:00:00   4.87
42047 2024-10-24 23:00:00   4.87

[42048 rows x 2 columns]


In [12]:
file_path_1 = f"{path_directory}\\US_yield_hourly.csv"

df_2020.to_csv(file_path_1, sep=",")