<a href="https://colab.research.google.com/github/maryambahri/recession_analysis/blob/main/scripts/Cleanse_GDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Cleansing Raw GDP file**

data source: WorldBank
retrieved 11/16/2025

In [9]:
# importing all libraries here

import numpy as np
import pandas as pd

!git clone https://github.com/maryambahri/recession_analysis.git


Cloning into 'recession_analysis'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 39 (delta 10), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (39/39), 223.59 KiB | 5.88 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [17]:
# loading the raw data here and displaying the head

path = "/content/recession_analysis/raw_data/GDP_USA.csv"

raw_df = pd.read_csv(
    path,
    skiprows=4   # first 3 rows are messy
)

raw_df=raw_df[raw_df["Country Name"] == "United States"]


raw_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69
251,United States,USA,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,,2.565343,6.129637,4.357286,5.762747,6.498454,...,1.819451,2.457622,2.966505,2.583825,-2.163029,6.055053,2.512375,2.887556,2.79619,


In [22]:
# we neat things up here and transpose years

# list of year columns
year_cols = [c for c in raw_df.columns if c.isdigit()]

usa_gdp = raw_df.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
    value_vars=year_cols,
    var_name="Year",
    value_name="GDP_value"
)

# optional: clean up
usa_gdp["Year"] = usa_gdp["Year"].astype(int)
usa_gdp = usa_gdp.sort_values("Year").reset_index(drop=True)
usa_gdp = usa_gdp.dropna(subset=["GDP_value"])  # drop missing years if you want

usa_gdp.head()



Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,GDP_value
1,United States,USA,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,1961,2.565343
2,United States,USA,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,1962,6.129637
3,United States,USA,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,1963,4.357286
4,United States,USA,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,1964,5.762747
5,United States,USA,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,1965,6.498454


# **Cleansing Raw Unemployment file**

data source: WorldBank
retrieved 11/16/2025

In [23]:
# loading the raw data here and displaying the head

path = "/content/recession_analysis/raw_data/USA_Unemployment.csv"

raw_df = pd.read_csv(
    path,
    skiprows=4   # first 3 rows are messy
)

raw_df=raw_df[raw_df["Country Name"] == "United States"]


raw_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69
251,United States,USA,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,5.5,6.7,5.5,5.7,5.2,4.5,...,4.869,4.355,3.896,3.669,8.055,5.349,3.65,3.638,4.022,


In [24]:
# we neat things up here and transpose years

# list of year columns
year_cols = [c for c in raw_df.columns if c.isdigit()]

usa_unemployment = raw_df.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
    value_vars=year_cols,
    var_name="Year",
    value_name="GDP_value"
)

# cleaning up
usa_unemployment["Year"] = usa_unemployment["Year"].astype(int)
usa_unemployment = usa_unemployment.sort_values("Year").reset_index(drop=True)
usa_unemployment = usa_unemployment.dropna(subset=["GDP_value"])  # drop missing years if you want

usa_unemployment.head()


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,GDP_value
0,United States,USA,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,1960,5.5
1,United States,USA,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,1961,6.7
2,United States,USA,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,1962,5.5
3,United States,USA,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,1963,5.7
4,United States,USA,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,1964,5.2


# **Cleansing Raw Inflation file**

data source: WorldBank
retrieved 11/16/2025

In [25]:
# loading the raw data here and displaying the head

path = "/content/recession_analysis/raw_data/USA_Inflation.csv"

raw_df = pd.read_csv(
    path,
    skiprows=4   # first 3 rows are messy
)

raw_df=raw_df[raw_df["Country Name"] == "United States"]


raw_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69
251,United States,USA,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,1.457976,1.070724,1.198773,1.239669,1.278912,1.585169,...,1.261583,2.13011,2.442583,1.81221,1.233584,4.697859,8.0028,4.116338,2.949525,


In [26]:
# we neat things up here and transpose years

# list of year columns
year_cols = [c for c in raw_df.columns if c.isdigit()]

usa_inflation = raw_df.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
    value_vars=year_cols,
    var_name="Year",
    value_name="GDP_value"
)

# cleaning up
usa_inflation["Year"] = usa_inflation["Year"].astype(int)
usa_inflation = usa_inflation.sort_values("Year").reset_index(drop=True)
usa_inflation = usa_inflation.dropna(subset=["GDP_value"])  # drop missing years if you want

usa_inflation.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,GDP_value
0,United States,USA,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,1960,1.457976
1,United States,USA,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,1961,1.070724
2,United States,USA,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,1962,1.198773
3,United States,USA,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,1963,1.239669
4,United States,USA,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,1964,1.278912


# **Cleansing Raw Recession file**

data source: WorldBank
retrieved 11/16/2025

In [28]:
# loading the raw data here and displaying the head

path = "/content/recession_analysis/raw_data/USA_Recession.csv"

raw_df = pd.read_csv(
    path,

)

raw_df.head()

Unnamed: 0,observation_date,JHDUSRGDPBR
0,1967-10-01,0
1,1968-01-01,0
2,1968-04-01,0
3,1968-07-01,0
4,1968-10-01,0


In [29]:
# Rename columns
usa_recession = raw_df.rename(columns={
    "observation_date": "Year",
    "JHDUSRGDPBR": "Recession_Tag"
})

# Convert date to year only
usa_recession["Year"] = pd.to_datetime(usa_recession["Year"]).dt.year

usa_recession.head()


Unnamed: 0,Year,Recession_Tag
0,1967,0
1,1968,0
2,1968,0
3,1968,0
4,1968,0


----------------------------------------------------------------

# **Preprocessing Data**

In [31]:
# alighning data start years

usa_GDP_startYear=usa_gdp["Year"].min()
usa_GDP_endYear=usa_gdp["Year"].max()

usa_unemployment_startYear=usa_unemployment["Year"].min()
usa_unemployment_endYear=usa_unemployment["Year"].max()

usa_inflation_startYear=usa_inflation["Year"].min()
usa_inflation_endYear=usa_inflation["Year"].max()

usa_recession_startYear=usa_recession["Year"].min()
usa_recession_endYear=usa_recession["Year"].max()

print("USA GDP start year: ",usa_GDP_startYear)
print("USA GDP end year: ",usa_GDP_endYear)
print("USA Unemployment start year: ",usa_unemployment_startYear)
print("USA Unemployment end year: ",usa_unemployment_endYear)
print("USA Inflation start year: ",usa_inflation_startYear)
print("USA Inflation end year: ",usa_inflation_endYear)
print("USA Recession start year: ",usa_recession_startYear)
print("USA Recession end year: ",usa_recession_endYear)


USA GDP start year:  1961
USA GDP end year:  2024
USA Unemployment start year:  1960
USA Unemployment end year:  2024
USA Inflation start year:  1960
USA Inflation end year:  2024
USA Recession start year:  1967
USA Recession end year:  2025


In [33]:
# instead of hardcoding year alighnments, we will do it dynamically as data refresh

common_start = max(usa_GDP_startYear,
               usa_unemployment_startYear,
               usa_inflation_startYear,
               usa_recession_startYear)

common_end = min(usa_GDP_endYear,
               usa_unemployment_endYear,
               usa_inflation_endYear,
               usa_recession_endYear)



usa_gdp = usa_gdp[(usa_gdp["Year"] >= common_start) & (usa_gdp["Year"] <= common_end)]
usa_unemployment = usa_unemployment[(usa_unemployment["Year"] >= common_start) & (usa_unemployment["Year"] <= common_end)]
usa_inflation = usa_inflation[(usa_inflation["Year"] >= common_start) & (usa_inflation["Year"] <= common_end)]
usa_recession = usa_recession[(usa_recession["Year"] >= common_start) & (usa_recession["Year"] <= common_end)]


In [None]:
# implementing the alighnment

common_start = 1967
common_end   = 2024

usa_gdp_df = usa_gdp_df[(usa_gdp_df["Year"] >= common_start) & (usa_gdp_df["Year"] <= common_end)]
usa_unemployment_df = usa_unemployment_df[(usa_unemployment_df["Year"] >= common_start) & (usa_unemployment_df["Year"] <= common_end)]
usa_inflation_df = usa_inflation_df[(usa_inflation_df["Year"] >= common_start) & (usa_inflation_df["Year"] <= common_end)]
usa_recession_df = usa_recession_df[(usa_recession_df["Year"] >= common_start) & (usa_recession_df["Year"] <= common_end)]

# Reset index for cleanliness
usa_gdp_df.reset_index(drop=True, inplace=True)
usa_unemployment_df.reset_index(drop=True, inplace=True)
usa_inflation_df.reset_index(drop=True, inplace=True)
usa_recession_df.reset_index(drop=True, inplace=True)


In [34]:
# recheck alighnment
usa_GDP_startYear=usa_gdp["Year"].min()
usa_GDP_endYear=usa_gdp["Year"].max()

usa_unemployment_startYear=usa_unemployment["Year"].min()
usa_unemployment_endYear=usa_unemployment["Year"].max()

usa_inflation_startYear=usa_inflation["Year"].min()
usa_inflation_endYear=usa_inflation["Year"].max()

usa_recession_startYear=usa_recession["Year"].min()
usa_recession_endYear=usa_recession["Year"].max()

print("USA GDP start year: ",usa_GDP_startYear)
print("USA GDP end year: ",usa_GDP_endYear)
print("USA Unemployment start year: ",usa_unemployment_startYear)
print("USA Unemployment end year: ",usa_unemployment_endYear)
print("USA Inflation start year: ",usa_inflation_startYear)
print("USA Inflation end year: ",usa_inflation_endYear)
print("USA Recession start year: ",usa_recession_startYear)
print("USA Recession end year: ",usa_recession_endYear)


USA GDP start year:  1967
USA GDP end year:  2024
USA Unemployment start year:  1967
USA Unemployment end year:  2024
USA Inflation start year:  1967
USA Inflation end year:  2024
USA Recession start year:  1967
USA Recession end year:  2024


In [None]:
# Brief EDA

usa_gdp_df.reset_index(drop=True, inplace=True)
usa_unemployment_df.reset_index(drop=True, inplace=True)
usa_inflation_df.reset_index(drop=True, inplace=True)
usa_recession_df.reset_index(drop=True, inplace=True)
