# Load Libraries

In [146]:
# Load libraries
library(readxl)
library(dplyr)
library(readr)

# Part 1: Clean Population Data

In [147]:
pop_raw <- read.csv("/workspaces/DUKE_MIDS_QFC_Final_Project/dataset/population.csv")
glimpse(pop_raw)

Rows: 264
Columns: 2
$ DATE   [3m[90m<fct>[39m[23m 1959-01-01, 1959-04-01, 1959-07-01, 1959-10-01, 1960-01-01, 196…
$ POPTHM [3m[90m<fct>[39m[23m 176045.333333333333, 176726.666666666667, 177480.666666666667, …


In [148]:
sum(is.na(pop_raw$POPTHM))

In [149]:
# Creating new column population
pop_raw$population <- pop_raw$POPTHM

# Making column pop into a numeric
pop_raw$population <- as.numeric(as.character(pop_raw$POPTHM))

# Converting pop into the right unit by multiplything by 1000
pop_raw$population <- pop_raw$population * 1000

“NAs introduced by coercion”


In [150]:
# Time as a date
pop_raw$DATE <- as.character(pop_raw$DATE)
pop_raw$time_as_date <- as.Date(pop_raw$DATE, format = "%Y-%m-%d")

# Time as numeric (used in certain regression models or methods)
pop_raw$time_as_numeric <- as.numeric(pop_raw$time_as_date)

In [151]:
# Filter data between 1984-01-01 and 2024-04-01
pop_raw <- pop_raw[pop_raw$time_as_date >= as.Date("1984-01-01") & pop_raw$time_as_date <= as.Date("2024-04-01"), ]

# Keep only the relevant columns
columns_to_keep <- c("time_as_date", "time_as_numeric", "population")
pop_filtered <- pop_raw[, columns_to_keep]

# Check
glimpse(pop_filtered)
sum(is.na(pop_filtered))

Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ population      [3m[90m<dbl>[39m[23m 235604667, 236082333, 236657333, 237232667, 237672667,…


# Part 2: Clean Unemployment Rate

In [152]:
unemployment_rate_raw <- read.csv("/workspaces/DUKE_MIDS_QFC_Final_Project/dataset/unemployement_rate.csv")
glimpse(unemployment_rate_raw)

Rows: 308
Columns: 2
$ DATE   [3m[90m<fct>[39m[23m 1948-01-01, 1948-04-01, 1948-07-01, 1948-10-01, 1949-01-01, 194…
$ UNRATE [3m[90m<fct>[39m[23m 3.7333333333333333, 3.6666666666666667, 3.7666666666666667, 3.8…


In [153]:
# Creating new column for rate
unemployment_rate_raw$rate <- unemployment_rate_raw$UNRATE

# Making column pop into a numeric
unemployment_rate_raw$rate <- as.numeric(as.character(unemployment_rate_raw$UNRATE))

# Converting pop into a decima since it's in percentage 
unemployment_rate_raw$rate <- unemployment_rate_raw$rate / 100

# Time as a date
unemployment_rate_raw$DATE <- as.character(unemployment_rate_raw$DATE)
unemployment_rate_raw$time_as_date <- as.Date(unemployment_rate_raw$DATE, format = "%Y-%m-%d")

# Time as numeric (used in certain regression models or methods)
unemployment_rate_raw$time_as_numeric <- as.numeric(unemployment_rate_raw$time_as_date)

# Filter data between 1984-01-01 and 2024-04-01
unemployment_rate_raw <- unemployment_rate_raw[unemployment_rate_raw$time_as_date >= as.Date("1984-01-01") & unemployment_rate_raw$time_as_date <= as.Date("2024-04-01"), ]

# Keep only the relevant columns
columns_to_keep <- c("time_as_date", "time_as_numeric", "rate")
unemployment_rate_filtered <- unemployment_rate_raw[, columns_to_keep]

# Check
glimpse(unemployment_rate_filtered)
sum(is.na(unemployment_rate_filtered))

“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ rate            [3m[90m<dbl>[39m[23m 0.07866667, 0.07433333, 0.07433333, 0.07300000, 0.0723…


# Part 3: Clean Consumer Price Index

In [154]:
CPI_raw <- read.csv("/workspaces/DUKE_MIDS_QFC_Final_Project/dataset/consumer_price_index.csv")
glimpse(CPI_raw)

Rows: 276
Columns: 2
$ DATE            [3m[90m<fct>[39m[23m 1955-04-01, 1955-07-01, 1955-10-01, 1956-01-01, 1956-0…
$ CPALTT01USQ657N [3m[90m<dbl>[39m[23m 0.0000000, 0.4993758, 0.1242236, -0.2481390, 0.8706468…


In [155]:
# Convert ConsumerPriceIndex into a numeric column
CPI_raw$ConsumerPriceIndex <- as.numeric(as.character(CPI_raw$CPALTT01USQ657N))

# Convert DATE column to a Date format
CPI_raw$DATE <- as.character(CPI_raw$DATE)
CPI_raw$time_as_date <- as.Date(CPI_raw$DATE, format = "%Y-%m-%d")

# Convert time_as_date to numeric for modeling purposes
CPI_raw$time_as_numeric <- as.numeric(CPI_raw$time_as_date)

# Filter data to include only rows between 1984-01-01 and 2024-04-01
CPI_raw <- CPI_raw[CPI_raw$time_as_date >= as.Date("1984-01-01") & 
                     CPI_raw$time_as_date <= as.Date("2024-04-01"), ]

# Keep only the relevant columns
columns_to_keep <- c("time_as_date", "time_as_numeric", "ConsumerPriceIndex")
CPI_filtered <- CPI_raw[, columns_to_keep]

# Handle the missing ConsumerPriceIndex for 2024-04-01 ==> will just put the last value as that one
# Get the last available ConsumerPriceIndex value
last_value <- tail(CPI_filtered$ConsumerPriceIndex, 1)

# Create a new row for 2024-04-01 using the last available value
new_row <- data.frame(
  time_as_date = as.Date("2024-04-01"),
  time_as_numeric = max(CPI_filtered$time_as_numeric, na.rm = TRUE) + 1, # Next numeric value
  ConsumerPriceIndex = last_value
)

# Append the new row to the filtered dataset
CPI_filtered <- rbind(CPI_filtered, new_row)

# Check the updated dataset
glimpse(CPI_filtered)
sum(is.na(CPI_filtered))


Rows: 162
Columns: 3
$ time_as_date       [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 19…
$ time_as_numeric    [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 584…
$ ConsumerPriceIndex [3m[90m<dbl>[39m[23m 1.1202636, 1.0752688, 1.0960671, 0.7334184, 0.63311…


# Part 4: Disposable Personal Income

In [156]:
disposable_income_raw <- read.csv("/workspaces/DUKE_MIDS_QFC_Final_Project/dataset/disposable_personal_income.csv")
glimpse(disposable_income_raw)
# https://fred.stlouisfed.org/series/DSPI

Rows: 260
Columns: 2
$ DATE     [3m[90m<fct>[39m[23m 1960-01-01, 1960-04-01, 1960-07-01, 1960-10-01, 1961-01-01, 1…
$ DSPI_PC1 [3m[90m<fct>[39m[23m 5.13401, 4.26418, 4.34502, 3.47304, 3.31239, 3.64620, 4.81694…


In [157]:
# Creating new column for disposable personal income
disposable_income_raw$DisposablePersonalIncome <- disposable_income_raw$DSPI_PC1

# Making column disposable personal income numeric
disposable_income_raw$DisposablePersonalIncome <- as.numeric(as.character(disposable_income_raw$DSPI_PC1))

# Converting dis income into true unites it is in trillions so multiply by billions
disposable_income_raw$DisposablePersonalIncome <- disposable_income_raw$DisposablePersonalIncome * 1000000000

# Time as a date
disposable_income_raw$DATE <- as.character(disposable_income_raw$DATE)
disposable_income_raw$time_as_date <- as.Date(disposable_income_raw$DATE, format = "%Y-%m-%d")

# Time as numeric (used in certain regression models or methods)
disposable_income_raw$time_as_numeric <- as.numeric(disposable_income_raw$time_as_date)

# Filter data between 1984-01-01 and 2024-04-01
disposable_income_raw <- disposable_income_raw[disposable_income_raw$time_as_date >= as.Date("1984-01-01") & disposable_income_raw$time_as_date <= as.Date("2024-04-01"), ]

# Keep only the relevant columns
columns_to_keep <- c("time_as_date", "time_as_numeric", "DisposablePersonalIncome")
disposable_income_filtered <- disposable_income_raw[, columns_to_keep]

# Check
glimpse(disposable_income_filtered)
sum(is.na(disposable_income_filtered))

“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date             [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-…
$ time_as_numeric          [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 575…
$ DisposablePersonalIncome [3m[90m<dbl>[39m[23m 10729270000, 11779660000, 10982020000, 947008…


# Part 5: Cleaing 10 Year Treasury Yield

In [158]:
treasury_yield_raw <- read.csv("/workspaces/DUKE_MIDS_QFC_Final_Project/dataset/10_year_treasury_yield.csv")
glimpse(treasury_yield_raw)

Rows: 286
Columns: 2
$ DATE            [3m[90m<fct>[39m[23m 1953-04-01, 1953-07-01, 1953-10-01, 1954-01-01, 1954-0…
$ IRLTLT01USQ156N [3m[90m<dbl>[39m[23m 2.996667, 2.916667, 2.643333, 2.440000, 2.346667, 2.34…


In [159]:
# Creating new column for yield
treasury_yield_raw$Yield <- treasury_yield_raw$IRLTLT01USQ156N

# Making column yield numeric
treasury_yield_raw$Yield <- as.numeric(as.character(treasury_yield_raw$IRLTLT01USQ156N))

# Converting dis income into true unites by making it a decimal
treasury_yield_raw$Yield <- treasury_yield_raw$Yield / 100

# Time as a date
treasury_yield_raw$DATE <- as.character(treasury_yield_raw$DATE)
treasury_yield_raw$time_as_date <- as.Date(treasury_yield_raw$DATE, format = "%Y-%m-%d")

# Time as numeric (used in certain regression models or methods)
treasury_yield_raw$time_as_numeric <- as.numeric(treasury_yield_raw$time_as_date)

# Filter data between 1984-01-01 and 2024-04-01
treasury_yield_raw <- treasury_yield_raw[treasury_yield_raw$time_as_date >= as.Date("1984-01-01") & treasury_yield_raw$time_as_date <= as.Date("2024-04-01"), ]

# Keep only the relevant columns
columns_to_keep <- c("time_as_date", "time_as_numeric", "Yield")
treasury_yield_filtered <- treasury_yield_raw[, columns_to_keep]

# Check
glimpse(treasury_yield_filtered)
sum(is.na(treasury_yield_filtered))

Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ Yield           [3m[90m<dbl>[39m[23m 0.11943333, 0.13200000, 0.12866667, 0.11743333, 0.1158…


# Part 6: Clean Fed Funds Rate Data

In [160]:
fed_rate_raw <- read.csv("/workspaces/DUKE_MIDS_QFC_Final_Project/dataset/fed_funds_rate.csv")
glimpse(fed_rate_raw)
# https://fred.stlouisfed.org/series/FEDFUNDS

Rows: 282
Columns: 2
$ DATE     [3m[90m<fct>[39m[23m 1954-07-01, 1954-10-01, 1955-01-01, 1955-04-01, 1955-07-01, 1…
$ FEDFUNDS [3m[90m<fct>[39m[23m 1.03000000000000000000, 0.98666666666666666667, 1.34333333333…


In [161]:
# Creating new column for yield
fed_rate_raw$Rate <- fed_rate_raw$FEDFUNDS

# Making column yield numeric
fed_rate_raw$Rate <- as.numeric(as.character(fed_rate_raw$FEDFUNDS))

# Converting rate into true unites by making it a decimal
fed_rate_raw$Rate <- fed_rate_raw$Rate / 100

# Time as a date
fed_rate_raw$DATE <- as.character(fed_rate_raw$DATE)
fed_rate_raw$time_as_date <- as.Date(fed_rate_raw$DATE, format = "%Y-%m-%d")

# Time as numeric (used in certain regression models or methods)
fed_rate_raw$time_as_numeric <- as.numeric(fed_rate_raw$time_as_date)

# Filter data between 1984-01-01 and 2024-04-01
fed_rate_raw <- fed_rate_raw[fed_rate_raw$time_as_date >= as.Date("1984-01-01") & fed_rate_raw$time_as_date <= as.Date("2024-04-01"), ]

# Keep only the relevant columns
columns_to_keep <- c("time_as_date", "time_as_numeric", "Rate")
fed_rate_filtered <- fed_rate_raw[, columns_to_keep]

# Check
glimpse(fed_rate_filtered)
sum(is.na(fed_rate_filtered))

“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ Rate            [3m[90m<dbl>[39m[23m 0.09686667, 0.10556667, 0.11390000, 0.09266667, 0.0847…


# Part 7: Clean GDP Data

In [162]:
# https://fred.stlouisfed.org/series/GDP
GDP_raw <- read.csv("/workspaces/DUKE_MIDS_QFC_Final_Project/dataset/GDP.csv")
glimpse(GDP_raw)


Rows: 311
Columns: 2
$ DATE [3m[90m<fct>[39m[23m 1947-01-01, 1947-04-01, 1947-07-01, 1947-10-01, 1948-01-01, 1948-…
$ GDP  [3m[90m<dbl>[39m[23m 243.164, 245.968, 249.585, 259.745, 265.742, 272.567, 279.196, 28…


In [163]:

# Making column GDP numeric
GDP_raw$GDP <- as.numeric(as.character(GDP_raw$GDP))

# Converting dis income into true unites by making it a decimal
GDP_raw$GDP <- GDP_raw$GDP * 1000000000

# Time as a date
GDP_raw$DATE <- as.character(GDP_raw$DATE)
GDP_raw$time_as_date <- as.Date(GDP_raw$DATE, format = "%Y-%m-%d")

# Time as numeric (used in certain regression models or methods)
GDP_raw$time_as_numeric <- as.numeric(GDP_raw$time_as_date)

# Filter data between 1984-01-01 and 2024-04-01
GDP_raw <- GDP_raw[GDP_raw$time_as_date >= as.Date("1984-01-01") & GDP_raw$time_as_date <= as.Date("2024-04-01"), ]

# Keep only the relevant columns
columns_to_keep <- c("time_as_date", "time_as_numeric", "GDP")
GDP_filtered <- GDP_raw[, columns_to_keep]

# Check
glimpse(GDP_filtered)
sum(is.na(GDP_filtered))

Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ GDP             [3m[90m<dbl>[39m[23m 3.908054e+12, 4.009601e+12, 4.084250e+12, 4.148551e+12…


# Part 8: Group all data into one dataframe

In [164]:
# Merge two datasets by the 'Date' column
datasets_to_merge <- list(pop_filtered, unemployment_rate_filtered, CPI_filtered, disposable_income_filtered, treasury_yield_filtered, fed_rate_filtered, GDP_filtered)

regressors_data <- Reduce(function(x, y) merge(x, y, by = c("time_as_date", "time_as_numeric")), datasets_to_merge)

glimpse(regressors_data)


Rows: 161
Columns: 9
$ time_as_date             [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-…
$ time_as_numeric          [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 575…
$ population               [3m[90m<dbl>[39m[23m 235604667, 236082333, 236657333, 237232667, 2…
$ rate                     [3m[90m<dbl>[39m[23m 0.07866667, 0.07433333, 0.07433333, 0.0730000…
$ ConsumerPriceIndex       [3m[90m<dbl>[39m[23m 1.1202636, 1.0752688, 1.0960671, 0.7334184, 0…
$ DisposablePersonalIncome [3m[90m<dbl>[39m[23m 10729270000, 11779660000, 10982020000, 947008…
$ Yield                    [3m[90m<dbl>[39m[23m 0.11943333, 0.13200000, 0.12866667, 0.1174333…
$ Rate                     [3m[90m<dbl>[39m[23m 0.09686667, 0.10556667, 0.11390000, 0.0926666…
$ GDP                      [3m[90m<dbl>[39m[23m 3.908054e+12, 4.009601e+12, 4.084250e+12, 4.1…
