<a href="https://colab.research.google.com/github/maryambahri/recession_analysis/blob/main/scripts/ingestion_WB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries (boto3 for S3 access, requests for HTTP calls)

!pip install boto3 requests


Collecting boto3
  Downloading boto3-1.41.2-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<1.42.0,>=1.41.2 (from boto3)
  Downloading botocore-1.41.2-py3-none-any.whl.metadata (5.9 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.16.0,>=0.15.0 (from boto3)
  Downloading s3transfer-0.15.0-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.41.2-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.41.2-py3-none-any.whl (14.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.3/14.3 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.15.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00

In [None]:
# loading all libraries here and connecting to S3 bucket

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import os
from datetime import datetime
import io

# === EDIT THESE ===
# Credentials are read from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
# environment variables when they are set, so real keys never have to be typed
# into (and accidentally committed with) this notebook. The placeholder
# literals below are only used when the corresponding variable is unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "xxxxxxxxxx")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "xxxxxxxxxxx")
AWS_REGION = "us-east-1"   # or whatever region you use
S3_BUCKET_NAME = "group24-recessionanalysis"  # destination bucket for every upload
S3_PREFIX = "raw_data/"  # key prefix ("folder") prepended to every uploaded file name

# Create the S3 client with the explicit keys and region configured above.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)

print("S3 client created OK")


S3 client created OK


In [None]:
# Download the data directly from the World Bank API and push it to S3

# IMPORTS & AWS / S3 CONFIG
import requests
import xml.etree.ElementTree as ET

# WORLD BANK CONFIG
# Use the HTTPS endpoint so the downloaded figures cannot be tampered with in
# transit (the original plain-HTTP URL offered no integrity protection).
WORLD_BANK_BASE_URL = "https://api.worldbank.org/v2"
COUNTRY_CODE = "US"

# Dataset label -> World Bank indicator code.
INDICATORS = {
    "GDP_USA": "NY.GDP.MKTP.CD",        # GDP (current US$)
    "GDP_GROWTH": "NY.GDP.MKTP.KD.ZG",  # Real GDP growth (annual %)
    "USA_Inflation": "FP.CPI.TOTL.ZG",  # Inflation (annual %)
    "USA_Unemployment": "SL.UEM.TOTL.ZS",  # Unemployment (% of labor force)
}

# Dataset label -> file name used when building the S3 object key.
FILE_NAMES = {
    "GDP_USA": "GDP_USA.csv",
    "USA_Inflation": "USA_Inflation.csv",
    "USA_Unemployment": "USA_Unemployment.csv",
    "USA_Recession": "USA_Recession.csv",
    "USA_Unemployement_XML": "USA_Unemployement.xml",  # requested spelling
}

# Year range passed to the fetch helper.
START_YEAR = 1960
END_YEAR = None  # or 2024 if you want to cut off

# HELPER FUNCTIONS

def fetch_world_bank_indicator(country, indicator, start_year=None, end_year=None):
    """
    Fetch a World Bank indicator for a given country and return a tidy DataFrame.

    Parameters:
        country: Country code placed in the request URL (e.g. "US").
        indicator: World Bank indicator code (e.g. "NY.GDP.MKTP.CD").
        start_year: First year to keep (inclusive); no lower bound when falsy.
        end_year: Last year to keep (inclusive). When falsy and ``start_year``
            is given, the request range is closed with the current year.

    Returns:
        DataFrame with columns ``country, indicator, year, value`` sorted by
        ``year``. The frame is empty (but keeps those columns) when the API
        returns no usable records.

    Raises:
        requests.HTTPError: If the HTTP response status is an error.
        ValueError: If the response body is not the expected
            ``[metadata, records]`` pair (e.g. an API error message).
    """
    from datetime import date

    params = {"format": "json", "per_page": 20000}
    if start_year:
        # A bare "1960" asks the API for that single year only, so an
        # open-ended range has to be closed explicitly with the current year.
        last_year = end_year or date.today().year
        params["date"] = f"{start_year}:{last_year}"

    url = f"{WORLD_BANK_BASE_URL}/country/{country}/indicator/{indicator}"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # data[0] = metadata, data[1] = actual records. Anything else (for example
    # a one-element list carrying an error message) is reported explicitly
    # instead of surfacing as an IndexError.
    if not isinstance(data, list) or len(data) < 2:
        raise ValueError(
            f"Unexpected World Bank API response for indicator {indicator!r}: {data!r}"
        )
    records = data[1] or []  # the record list may be null when nothing matches

    rows = []
    for rec in records:
        year = rec.get("date")
        if year is None:
            continue
        try:
            year_int = int(year)
        except (TypeError, ValueError):
            # Skip entries whose date is not a plain year.
            continue
        # Re-apply the bounds locally in case the API returns extra years.
        if start_year and year_int < start_year:
            continue
        if end_year and year_int > end_year:
            continue
        rows.append(
            {
                "country": (rec.get("country") or {}).get("value"),
                "indicator": indicator,
                "year": year_int,
                "value": rec.get("value"),
            }
        )

    # Naming the columns keeps sort_values("year") valid when rows is empty.
    df = pd.DataFrame(rows, columns=["country", "indicator", "year", "value"])
    return df.sort_values("year").reset_index(drop=True)


def create_recession_table(gdp_growth_df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive a yearly recession indicator from real GDP growth.

    The ``value`` column is renamed to ``gdp_growth_percent`` and an
    ``is_recession`` column is added: 1 when growth is below zero, otherwise 0
    (missing growth values also yield 0). The input frame is left untouched.

    Returns the columns ``country, year, gdp_growth_percent, is_recession``
    ordered by ``year``.
    """

    def _flag(growth):
        # Missing figures cannot be negative, so they count as "no recession".
        if growth is None or pd.isna(growth):
            return 0
        return 1 if growth < 0 else 0

    table = gdp_growth_df.rename(columns={"value": "gdp_growth_percent"})
    table["is_recession"] = table["gdp_growth_percent"].apply(_flag)

    wanted = ["country", "year", "gdp_growth_percent", "is_recession"]
    return table[wanted].sort_values("year")


def build_unemployment_xml(unemp_df: pd.DataFrame) -> bytes:
    """
    Build XML content for USA unemployment data and return as bytes.

    Parameters:
        unemp_df: Frame with a ``year`` column and a ``value`` column holding
            the unemployment percentage; other columns are ignored.

    Returns:
        UTF-8 encoded XML (with an XML declaration). One ``Year`` element is
        written per row; ``unemployment_percent`` is omitted when the value is
        missing.

    Example structure:
    <USA_Unemployment country="USA">
        <Year value="1990" unemployment_percent="5.2" />
        ...
    </USA_Unemployment>
    """
    root = ET.Element("USA_Unemployment", country="USA")

    # Walk the two columns directly rather than using iterrows(): iterrows()
    # builds one Series per row and upcasts integer years to float whenever all
    # columns are numeric, which would serialise the year as "1990.0".
    for year, value in zip(unemp_df["year"], unemp_df["value"]):
        year_el = ET.SubElement(root, "Year")
        year_el.set("value", str(year))
        if pd.notna(value):
            year_el.set("unemployment_percent", str(value))

    return ET.tostring(root, encoding="utf-8", xml_declaration=True)


def upload_bytes_to_s3(data_bytes: bytes, bucket: str, key: str, content_type: str):
    """
    Store an in-memory payload as a single S3 object.

    Sends ``data_bytes`` to ``bucket`` under ``key`` with the given
    ``content_type`` through the module-level ``s3_client``, printing a
    progress line before and after the request.
    """
    put_request = {
        "Bucket": bucket,
        "Key": key,
        "Body": data_bytes,
        "ContentType": content_type,
    }

    print(f"Uploading to s3://{bucket}/{key} ...")
    s3_client.put_object(**put_request)
    print("Upload done.")


# Fetch data from World Bank: one request per indicator, all for COUNTRY_CODE
# and the START_YEAR/END_YEAR range configured above.
gdp_df = fetch_world_bank_indicator(COUNTRY_CODE, INDICATORS["GDP_USA"], START_YEAR, END_YEAR)
gdp_growth_df = fetch_world_bank_indicator(COUNTRY_CODE, INDICATORS["GDP_GROWTH"], START_YEAR, END_YEAR)
infl_df = fetch_world_bank_indicator(COUNTRY_CODE, INDICATORS["USA_Inflation"], START_YEAR, END_YEAR)
unemp_df = fetch_world_bank_indicator(COUNTRY_CODE, INDICATORS["USA_Unemployment"], START_YEAR, END_YEAR)

# Create recession table (GDP growth is only used here; it is not uploaded on its own)
recession_df = create_recession_table(gdp_growth_df)

# Convert DataFrames to CSV IN MEMORY and upload.
# Each section below follows the same steps: keep only year + value, rename
# "value" to a descriptive column name, serialise to a StringIO buffer,
# encode as UTF-8 and upload under S3_PREFIX + the matching FILE_NAMES entry.

# GDP CSV (columns: year, gdp_current_usd)
gdp_csv_buf = io.StringIO()
gdp_df[["year", "value"]].rename(columns={"value": "gdp_current_usd"}).to_csv(gdp_csv_buf, index=False)
gdp_bytes = gdp_csv_buf.getvalue().encode("utf-8")
upload_bytes_to_s3(
    gdp_bytes,
    S3_BUCKET_NAME,
    f"{S3_PREFIX}{FILE_NAMES['GDP_USA']}",
    content_type="text/csv",
)

# Inflation CSV (columns: year, inflation_annual_percent)
infl_csv_buf = io.StringIO()
infl_df[["year", "value"]].rename(columns={"value": "inflation_annual_percent"}).to_csv(infl_csv_buf, index=False)
infl_bytes = infl_csv_buf.getvalue().encode("utf-8")
upload_bytes_to_s3(
    infl_bytes,
    S3_BUCKET_NAME,
    f"{S3_PREFIX}{FILE_NAMES['USA_Inflation']}",
    content_type="text/csv",
)

# Unemployment CSV (columns: year, unemployment_percent)
unemp_csv_buf = io.StringIO()
unemp_df[["year", "value"]].rename(columns={"value": "unemployment_percent"}).to_csv(unemp_csv_buf, index=False)
unemp_bytes = unemp_csv_buf.getvalue().encode("utf-8")
upload_bytes_to_s3(
    unemp_bytes,
    S3_BUCKET_NAME,
    f"{S3_PREFIX}{FILE_NAMES['USA_Unemployment']}",
    content_type="text/csv",
)

# Recession CSV (written with every column returned by create_recession_table)
recession_csv_buf = io.StringIO()
recession_df.to_csv(recession_csv_buf, index=False)
recession_bytes = recession_csv_buf.getvalue().encode("utf-8")
upload_bytes_to_s3(
    recession_bytes,
    S3_BUCKET_NAME,
    f"{S3_PREFIX}{FILE_NAMES['USA_Recession']}",
    content_type="text/csv",
)

# Build unemployment XML IN MEMORY and upload.
# The "Unemployement" spelling in the key and file name is intentional
# (marked "requested spelling" in FILE_NAMES).
xml_bytes = build_unemployment_xml(unemp_df)
upload_bytes_to_s3(
    xml_bytes,
    S3_BUCKET_NAME,
    f"{S3_PREFIX}{FILE_NAMES['USA_Unemployement_XML']}",
    content_type="application/xml",
)
print("All in-memory uploads completed to S3.")


Uploading to s3://group24-recessionanalysis/cleansed_data/GDP_USA.csv ...
Upload done.
Uploading to s3://group24-recessionanalysis/cleansed_data/USA_Inflation.csv ...
Upload done.
Uploading to s3://group24-recessionanalysis/cleansed_data/USA_Unemployment.csv ...
Upload done.
Uploading to s3://group24-recessionanalysis/cleansed_data/USA_Recession.csv ...
Upload done.
Uploading to s3://group24-recessionanalysis/cleansed_data/USA_Unemployement.xml ...
Upload done.
All in-memory uploads completed to S3.


--------------------------------------------------------------------------------