# Data Collection


In [54]:
import os
import json
import requests

import pandas as pd

from dotenv import load_dotenv
from tqdm.notebook import tqdm
tqdm.pandas()

from IPython.display import Image

In [55]:
load_dotenv()

# Read the FRED credential from the environment (populated by load_dotenv above).
# Fail fast when it is missing so the problem surfaces here, with a clear message,
# instead of as an API error in a later cell.
api_key = os.getenv("FRED_API_KEY")
if not api_key:
    raise RuntimeError("FRED_API_KEY is not set; add it to your environment or .env file")

In [56]:
def get_GDP_data(api_key, start_date, end_date, sort_order='desc', file_type='json',
                 series_id='GDPCA', timeout=30):
    """
    Fetch GDP data from the FRED API.

    Parameters:
    - api_key (str): FRED API key.
    - start_date (str): The start date for the data (YYYY-MM-DD).
    - end_date (str): The end date for the data (YYYY-MM-DD).
    - sort_order (str): The order of the data based on observation date.
    - file_type (str): The format for the data. The response body is parsed
      with `response.json()`, so a non-JSON format will fail at that step.
    - series_id (str): Identifier of the FRED series to request. Defaults to
      'GDPCA', the series this notebook was written around.
    - timeout (float): Seconds to wait on the server before `requests` gives up,
      so a stalled connection cannot hang the notebook indefinitely.

    Returns:
    - list: A list of GDP data observations.

    Raises:
    - RuntimeError: If the API responds with any status code other than 200.
      Errors raised by `requests` itself (e.g. timeouts) propagate unchanged.
    """

    url = 'https://api.stlouisfed.org/fred/series/observations'

    params = {
        'series_id': series_id,
        'api_key': api_key,
        'file_type': file_type,
        'sort_order': sort_order,
        'observation_start': start_date,
        'observation_end': end_date,
    }
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        # Raise instead of returning the message: a returned string is
        # indistinguishable from data for callers that simply persist the result.
        # The message is built by hand rather than via raise_for_status(), whose
        # text embeds the request URL (and with it the api_key query parameter).
        raise RuntimeError(f"Error: {response.status_code} - {response.text}")

    data = response.json()
    return data['observations']


In [57]:
# Download the annual observations for the full period covered by the analysis.
data = get_GDP_data(api_key, '1929-01-01', '2023-01-01')

# Only persist a real list of observations: anything else (for example an
# error message) would otherwise be written to disk and reported as saved data.
if not isinstance(data, list):
    raise RuntimeError(f"GDP download failed, nothing was saved: {data}")

with open("../data/raw/GDP_data.json", "w") as f:
    json.dump(data, f)

print("Data saved to GDP_data.json")


Data saved to GDP_data.json


# Data Processing

In [58]:
# Load the raw JSON dump and preview its first rows to inspect the structure
filepath = '../data/raw/GDP_data.json'
GDP_df = pd.read_json(path_or_buf=filepath)
GDP_df.head(n=5)


Unnamed: 0,realtime_start,realtime_end,date,value
0,2025-01-21,2025-01-21,2023-01-01,22671.096
1,2025-01-21,2025-01-21,2022-01-01,22034.828
2,2025-01-21,2025-01-21,2021-01-01,21494.798
3,2025-01-21,2025-01-21,2020-01-01,20267.585
4,2025-01-21,2025-01-21,2019-01-01,20715.671


In [59]:
# Columns of the raw frame that the analysis needs, and the display name for the value column.
relevant_columns = ['date', 'value']
gdp_column_names = {'value': 'GDP (billions)'}

# Rename first, then select by the *final* names. rename() ignores labels that
# are absent, so re-running this cell on the already-trimmed frame is a no-op
# instead of a KeyError on the missing 'value' column.
GDP_df = (
    GDP_df
    .rename(columns=gdp_column_names)
    .loc[:, [gdp_column_names.get(col, col) for col in relevant_columns]]
    .copy()
)

GDP_df

Unnamed: 0,date,GDP (billions)
0,2023-01-01,22671.096
1,2022-01-01,22034.828
2,2021-01-01,21494.798
3,2020-01-01,20267.585
4,2019-01-01,20715.671
...,...,...
90,1933-01-01,877.431
91,1932-01-01,888.414
92,1931-01-01,1019.977
93,1930-01-01,1089.785


In [60]:
# Persist the trimmed table as CSV, which is easier for pandas to read back
GDP_df.to_csv(path_or_buf='../data/processed/GDP_data.csv', index=False)

In [61]:
# Total the yearly GDP figures per decade so they line up with the Smithsonian API
yearly_gdp = pd.read_csv('../data/processed/GDP_data.csv')

# Calendar year of each observation, floored to the start of its decade (e.g. 1987 -> 1980)
observation_year = pd.to_datetime(yearly_gdp['date']).dt.year
decade_start = observation_year - observation_year % 10

decades_df = (
    yearly_gdp
    .assign(decade=decade_start)
    .groupby('decade', as_index=False)['GDP (billions)']
    .sum()
)

decades_df

Unnamed: 0,decade,GDP (billions)
0,1920,1191.124
1,1930,10885.694
2,1940,21402.316
3,1950,29702.267
4,1960,43691.739
5,1970,62227.444
6,1980,83739.466
7,1990,114391.799
8,2000,155440.685
9,2010,185821.763


In [63]:
# Write the per-decade totals next to the other processed data
decades_df.to_csv(path_or_buf='../data/processed/GDP_by_decades.csv', index=False)