This notebook downloads daily yield curve data from the U.S. Treasury website. The data is served in XML format and is fetched one year at a time.

In [None]:
import requests
import xml.dom.minidom

import numpy as np
import pandas as pd

In [None]:
# Treasury feed endpoint; the query is completed by appending the year
BASE_URL = (
    'http://data.treasury.gov/feed.svc/DailyTreasuryYieldCurveRateData'
    '?$filter=year(NEW_DATE)%20eq%20'
)

# curve maturities, shortest first
TENORS = ['1M', '3M', '6M', '1Y', '2Y', '3Y', '5Y', '7Y', '10Y', '20Y', '30Y']

# field name -> xml tag name inside each 'content' DOM node.
# Tenor tags follow one pattern: '3M' -> 'd:BC_3MONTH', '10Y' -> 'd:BC_10YEAR'
CONTENT_TAG_MAP = dict(
    [('date', 'd:NEW_DATE')]
    + [
        (tenor, 'd:BC_' + tenor[:-1] + ('MONTH' if tenor.endswith('M') else 'YEAR'))
        for tenor in TENORS
    ]
)

In [None]:
def get_tag_value(content, tagname):
    '''
    get a specific tag value from the 'content' dom node

    content: DOM element that is searched for the tag
    tagname: qualified tag name to look up, e.g. 'd:BC_1MONTH'

    returns the text of the first matching element as a string, or
    np.nan when that element is flagged m:null="true" or has no text.
    raises IndexError when no element named tagname exists.
    '''
    el = content.getElementsByTagName(tagname)[0]
    if el.getAttribute('m:null') == 'true':
        return np.nan
    # an empty element (<tag/> or <tag></tag>) has no child node at all;
    # report it as missing instead of failing on None.nodeValue
    text_node = el.firstChild
    return np.nan if text_node is None else text_node.nodeValue

def parse_content(content):
    '''
    parse one 'content' DOM node into a dict keyed by the field names
    of CONTENT_TAG_MAP, each value read with get_tag_value
    '''
    parsed = {}
    for field, tag in CONTENT_TAG_MAP.items():
        parsed[field] = get_tag_value(content, tag)
    return parsed

def get_yc_data_for_year(year):
    '''
    download and parse the yield curve data of a single year

    year: calendar year, appended to BASE_URL to build the request

    returns a DataFrame indexed by date (ascending) with one float
    column per entry of TENORS; the downloaded values are divided
    by 100. A year without any 'content' node gives an empty frame
    with the same columns.
    raises requests.HTTPError when the server answers with an error
    status.
    '''
    # bound the wait so a stalled connection cannot hang the notebook
    response = requests.get(BASE_URL + str(year), timeout=30)
    # fail here with the HTTP status rather than later with an XML
    # parse error on an error page
    response.raise_for_status()
    # hand the raw bytes to the parser so it applies the encoding
    # declared by the document itself
    dom = xml.dom.minidom.parseString(response.content)
    records = [parse_content(content)
               for content in dom.getElementsByTagName('content')]

    # fixing the columns up front keeps 'date' present even when
    # there are no records
    yc_df = pd.DataFrame(records, columns=['date'] + TENORS)
    yc_df.index = pd.to_datetime(yc_df['date'])
    yc_df.index.name = None
    # cast only the tenor columns: the 'date' column holds strings and
    # cannot be converted to float
    return yc_df[TENORS].astype('float').sort_index() / 100.

def get_yield_curve_data(start_year=2006, end_year=2018):
    '''
    download the yield curve data of every year from start_year to
    end_year (both inclusive) and stack the yearly frames into one
    '''
    years = range(start_year, end_year + 1)
    yearly_frames = list(map(get_yc_data_for_year, years))
    return pd.concat(yearly_frames)

In [None]:
# download the whole default range (2006-2018), one HTTP request per year
yc_time_series = get_yield_curve_data()

In [None]:
# IPython magic (not plain Python): draw figures inside the notebook
%matplotlib inline
import matplotlib.pyplot as plt

# quick visual check of the raw download before any cleanup;
# each column (tenor) is drawn as its own series
yc_time_series.plot(figsize=[20, 10], grid=True)

In [None]:
# data cleanup

# keep rows dated 2006-03-01 onwards (30Y yields are missing till 2006-02-08)
yc_time_series = yc_time_series.loc['2006-03-01':]

# count of missing values per tenor
yc_time_series.isna().sum()

In [None]:
# show the rows where every tenor is exactly zero
yc_time_series[(yc_time_series == 0).all(axis=1)]

In [None]:
# 1. drop dates on which no tenor has a value
# 2. drop the 2017-04-14 row (presumably the all-zero row listed by the
#    previous cell - TODO confirm); errors='ignore' because the hard-coded
#    date is not in the index when the downloaded range excludes it, and a
#    plain drop would then raise KeyError
# 3. fill the remaining individual gaps with 0
yc_time_series = yc_time_series.dropna(how='all')\
    .drop(pd.to_datetime('2017-04-14'), errors='ignore')\
    .fillna(0.)

In [None]:
# save the cleaned series to the working directory; the date index is
# written as the first column
yc_time_series.to_csv('yc_time_series.csv')