In [7]:
# Load libraries
#install.packages(c("readr", "dplyr"))
library(readxl)
library(dplyr)
library(readr)
# Inflation rate: https://fred.stlouisfed.org/series/FPCPITOTLZGUSA

# Part 1: Clean Population Data

In [8]:
# Read the raw population table (series column: POPTHM)
pop_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/population.csv")

# Count entries that are already missing before any type conversion
sum(is.na(pop_raw$POPTHM))

# Coerce POPTHM to numeric and scale it by 1000 in a single step.
# Entries that cannot be parsed as numbers become NA (source of the
# "NAs introduced by coercion" warning).
# NOTE(review): the x1000 factor assumes the raw series is in thousands -- confirm.
pop_raw$population <- as.numeric(as.character(pop_raw$POPTHM)) * 1000

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
pop_raw$DATE <- as.character(pop_raw$DATE)
pop_raw$time_as_date <- as.Date(pop_raw$DATE, format = "%Y-%m-%d")
pop_raw$time_as_numeric <- as.numeric(pop_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
pop_raw <- pop_raw[
  with(
    pop_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "population")
pop_filtered <- pop_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(pop_filtered)
sum(is.na(pop_filtered))

“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ population      [3m[90m<dbl>[39m[23m 235604667, 236082333, 236657333, 237232667, 237672667,…


# Part 2: Clean Unemployment Rate

In [9]:
# Read the raw unemployment-rate table and inspect its columns and types
unemployment_rate_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/unemployement_rate.csv")
glimpse(unemployment_rate_raw)

# UNRATE arrives as text; coerce it to numeric and turn the percentage into a
# decimal fraction in one step. Unparseable entries become NA (with a warning).
unemployment_rate_raw$unemployment_rate <-
  as.numeric(as.character(unemployment_rate_raw$UNRATE)) / 100

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
unemployment_rate_raw$DATE <- as.character(unemployment_rate_raw$DATE)
unemployment_rate_raw$time_as_date <- as.Date(unemployment_rate_raw$DATE, format = "%Y-%m-%d")
unemployment_rate_raw$time_as_numeric <- as.numeric(unemployment_rate_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
unemployment_rate_raw <- unemployment_rate_raw[
  with(
    unemployment_rate_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "unemployment_rate")
unemployment_rate_filtered <- unemployment_rate_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(unemployment_rate_filtered)
sum(is.na(unemployment_rate_filtered))

Rows: 308
Columns: 2
$ DATE   [3m[90m<chr>[39m[23m "1948-01-01", "1948-04-01", "1948-07-01", "1948-10-01", "1949-0…
$ UNRATE [3m[90m<chr>[39m[23m "3.7333333333333333", "3.6666666666666667", "3.7666666666666667…


“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date      [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 198…
$ time_as_numeric   [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844…
$ unemployment_rate [3m[90m<dbl>[39m[23m 0.07866667, 0.07433333, 0.07433333, 0.07300000, 0.07…


# Part 3: Clean Consumer Price Index

In [10]:
# Source: https://fred.stlouisfed.org/series/CPALTT01USM657N#0
# NOTE(review): the URL above names series ...USM..., while the column read
# below is CPALTT01USQ657N -- confirm which series the CSV actually holds.
CPI_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/consumer_price_index.csv")

# Coerce the raw series to numeric under a readable column name
CPI_raw$ConsumerPriceIndex <- as.numeric(as.character(CPI_raw$CPALTT01USQ657N))

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
CPI_raw$DATE <- as.character(CPI_raw$DATE)
CPI_raw$time_as_date <- as.Date(CPI_raw$DATE, format = "%Y-%m-%d")
CPI_raw$time_as_numeric <- as.numeric(CPI_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
CPI_raw <- CPI_raw[
  with(
    CPI_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "ConsumerPriceIndex")
CPI_filtered <- CPI_raw[columns_to_keep]


# Sanity check: structure and total number of NA cells
glimpse(CPI_filtered)
sum(is.na(CPI_filtered))

Rows: 161
Columns: 3
$ time_as_date       [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 19…
$ time_as_numeric    [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 584…
$ ConsumerPriceIndex [3m[90m<dbl>[39m[23m 1.1202636, 1.0752688, 1.0960671, 0.7334184, 0.63311…


# Part 4: Disposable Personal Income

In [11]:
# Read the raw disposable-personal-income table and inspect its columns and types
disposable_income_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/disposable_personal_income.csv")
glimpse(disposable_income_raw)
# https://fred.stlouisfed.org/series/DSPI

# DSPI_PC1 arrives as text; coerce it to numeric and multiply by 1e9.
# Unparseable entries become NA (with a warning).
# NOTE(review): the column is named DSPI_PC1 and its raw values are small
# numbers (e.g. 5.13, 4.26), which looks like a percent-change series rather
# than a dollar level -- confirm that scaling by 1e9 is really intended.
disposable_income_raw$DisposablePersonalIncome <-
  as.numeric(as.character(disposable_income_raw$DSPI_PC1)) * 1e9

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
disposable_income_raw$DATE <- as.character(disposable_income_raw$DATE)
disposable_income_raw$time_as_date <- as.Date(disposable_income_raw$DATE, format = "%Y-%m-%d")
disposable_income_raw$time_as_numeric <- as.numeric(disposable_income_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
disposable_income_raw <- disposable_income_raw[
  with(
    disposable_income_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "DisposablePersonalIncome")
disposable_income_filtered <- disposable_income_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(disposable_income_filtered)
sum(is.na(disposable_income_filtered))

Rows: 260
Columns: 2
$ DATE     [3m[90m<chr>[39m[23m "1960-01-01", "1960-04-01", "1960-07-01", "1960-10-01", "1961…
$ DSPI_PC1 [3m[90m<chr>[39m[23m "5.13401", "4.26418", "4.34502", "3.47304", "3.31239", "3.646…


“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date             [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-…
$ time_as_numeric          [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 575…
$ DisposablePersonalIncome [3m[90m<dbl>[39m[23m 10729270000, 11779660000, 10982020000, 947008…


# Part 5: Cleaning 10 Year Treasury Yield

In [12]:
# Read the raw 10-year treasury yield table (series column: IRLTLT01USQ156N)
treasury_yield_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/10_year_treasury_yield.csv")

# Coerce the series to numeric and turn the percentage into a decimal fraction
treasury_yield_raw$Yield <-
  as.numeric(as.character(treasury_yield_raw$IRLTLT01USQ156N)) / 100

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
treasury_yield_raw$DATE <- as.character(treasury_yield_raw$DATE)
treasury_yield_raw$time_as_date <- as.Date(treasury_yield_raw$DATE, format = "%Y-%m-%d")
treasury_yield_raw$time_as_numeric <- as.numeric(treasury_yield_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
treasury_yield_raw <- treasury_yield_raw[
  with(
    treasury_yield_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "Yield")
treasury_yield_filtered <- treasury_yield_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(treasury_yield_filtered)
sum(is.na(treasury_yield_filtered))

Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ Yield           [3m[90m<dbl>[39m[23m 0.11943333, 0.13200000, 0.12866667, 0.11743333, 0.1158…


# Part 6: Clean Fed Funds Rate Data

In [13]:
# Read the raw federal funds rate table (series column: FEDFUNDS)
fed_rate_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/fed_funds_rate.csv")

# https://fred.stlouisfed.org/series/FEDFUNDS

# Coerce the series to numeric and turn the percentage into a decimal fraction.
# Unparseable entries become NA (with a warning).
fed_rate_raw$fed_rate <- as.numeric(as.character(fed_rate_raw$FEDFUNDS)) / 100

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
fed_rate_raw$DATE <- as.character(fed_rate_raw$DATE)
fed_rate_raw$time_as_date <- as.Date(fed_rate_raw$DATE, format = "%Y-%m-%d")
fed_rate_raw$time_as_numeric <- as.numeric(fed_rate_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
fed_rate_raw <- fed_rate_raw[
  with(
    fed_rate_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "fed_rate")
fed_rate_filtered <- fed_rate_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(fed_rate_filtered)
sum(is.na(fed_rate_filtered))

“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ fed_rate        [3m[90m<dbl>[39m[23m 0.09686667, 0.10556667, 0.11390000, 0.09266667, 0.0847…


# Part 7: Clean GDP Data

In [14]:
# https://fred.stlouisfed.org/series/GDP
GDP_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/GDP.csv")

# Coerce GDP to numeric in place and scale it by 1e9.
# NOTE(review): the 1e9 factor assumes the raw series is in billions -- confirm.
GDP_raw$GDP <- as.numeric(as.character(GDP_raw$GDP)) * 1e9

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
GDP_raw$DATE <- as.character(GDP_raw$DATE)
GDP_raw$time_as_date <- as.Date(GDP_raw$DATE, format = "%Y-%m-%d")
GDP_raw$time_as_numeric <- as.numeric(GDP_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
GDP_raw <- GDP_raw[
  with(
    GDP_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "GDP")
GDP_filtered <- GDP_raw[columns_to_keep]

# Period-over-period growth: (current - previous) / previous.
# The first row has no previous value inside the filtered window, so it is NA.
GDP_filtered <- mutate(
  GDP_filtered,
  GDP_growth_rate = (GDP - lag(GDP)) / lag(GDP)
)

# Sanity check: structure and total number of NA cells
glimpse(GDP_filtered)
sum(is.na(GDP_filtered))

Rows: 162
Columns: 4
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ GDP             [3m[90m<dbl>[39m[23m 3.908054e+12, 4.009601e+12, 4.084250e+12, 4.148551e+12…
$ GDP_growth_rate [3m[90m<dbl>[39m[23m NA, 0.025984032, 0.018617563, 0.015743649, 0.019673616…


# Part 8: Clean Loan Data

In [15]:
# https://fred.stlouisfed.org/series/TOTCI
loan_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/Loans.csv")


# Coerce TOTCI to numeric and scale it by 1e9.
# Unparseable entries become NA (with a warning).
# NOTE(review): the 1e9 factor assumes the raw series is in billions -- confirm.
loan_raw$loan <- as.numeric(as.character(loan_raw$TOTCI)) * 1e9

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
loan_raw$DATE <- as.character(loan_raw$DATE)
loan_raw$time_as_date <- as.Date(loan_raw$DATE, format = "%Y-%m-%d")
loan_raw$time_as_numeric <- as.numeric(loan_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
loan_raw <- loan_raw[
  with(
    loan_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "loan")
loan_filtered <- loan_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(loan_filtered)
sum(is.na(loan_filtered))

“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ loan            [3m[90m<dbl>[39m[23m 422715769231, 444453800000, 458430284615, 469857400000…


# Part 9: Clean Money Supply Data

In [16]:
# Source: https://fred.stlouisfed.org/series/MABMM301USM189S#0
money_supply_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/money_supply.csv")

# Coerce the raw series to numeric under a readable column name (no rescaling).
# Unparseable entries become NA (with a warning).
money_supply_raw$money_supply <-
  as.numeric(as.character(money_supply_raw$MABMM301USM189S))

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
money_supply_raw$DATE <- as.character(money_supply_raw$DATE)
money_supply_raw$time_as_date <- as.Date(money_supply_raw$DATE, format = "%Y-%m-%d")
money_supply_raw$time_as_numeric <- as.numeric(money_supply_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
money_supply_raw <- money_supply_raw[
  with(
    money_supply_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "money_supply")
money_supply_filtered <- money_supply_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(money_supply_filtered)
sum(is.na(money_supply_filtered))

“NAs introduced by coercion”


Rows: 160
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ money_supply    [3m[90m<dbl>[39m[23m 2.157200e+12, 2.203633e+12, 2.232767e+12, 2.282233e+12…


# Part 10: Clean Reserves of Depository

In [17]:
# Source: https://fred.stlouisfed.org/series/TOTRESNS#0
depository_reserve_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/reserves_deposit.csv")

# Coerce TOTRESNS to numeric and scale it by 1e9.
# Unparseable entries become NA (with a warning).
# NOTE(review): the 1e9 factor assumes the raw series is in billions -- confirm.
depository_reserve_raw$deposit_reserve <-
  as.numeric(as.character(depository_reserve_raw$TOTRESNS)) * 1e9

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
depository_reserve_raw$DATE <- as.character(depository_reserve_raw$DATE)
depository_reserve_raw$time_as_date <- as.Date(depository_reserve_raw$DATE, format = "%Y-%m-%d")
depository_reserve_raw$time_as_numeric <- as.numeric(depository_reserve_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
depository_reserve_raw <- depository_reserve_raw[
  with(
    depository_reserve_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "deposit_reserve")
depository_reserve_filtered <- depository_reserve_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(depository_reserve_filtered)
sum(is.na(depository_reserve_filtered))

“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ deposit_reserve [3m[90m<dbl>[39m[23m 37533333333, 37033333333, 37600000000, 39466666667, 40…


# Part 11: Consumer Sentiment Index

In [18]:
# Source: https://fred.stlouisfed.org/series/UMCSENT#0
consumer_sentiment_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/consumer_sentiment_index.csv")

# Coerce UMCSENT to numeric under a readable column name (no rescaling).
# Unparseable entries become NA (with a warning).
consumer_sentiment_raw$consumer_sentiment <-
  as.numeric(as.character(consumer_sentiment_raw$UMCSENT))

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
consumer_sentiment_raw$DATE <- as.character(consumer_sentiment_raw$DATE)
consumer_sentiment_raw$time_as_date <- as.Date(consumer_sentiment_raw$DATE, format = "%Y-%m-%d")
consumer_sentiment_raw$time_as_numeric <- as.numeric(consumer_sentiment_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
consumer_sentiment_raw <- consumer_sentiment_raw[
  with(
    consumer_sentiment_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "consumer_sentiment")
consumer_sentiment_filtered <- consumer_sentiment_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(consumer_sentiment_filtered)
sum(is.na(consumer_sentiment_filtered))

“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date       [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 19…
$ time_as_numeric    [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 584…
$ consumer_sentiment [3m[90m<dbl>[39m[23m 99.50000, 96.56667, 98.86667, 94.96667, 94.46667, 9…


# Part 12: Clean Homeownership Rate

In [19]:
# Source: https://fred.stlouisfed.org/series/RHORUSQ156N#0
homeownership_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/homeownership_rate.csv")

# Coerce RHORUSQ156N to numeric and turn the percentage into a decimal fraction
homeownership_raw$homeownership_rate <-
  as.numeric(as.character(homeownership_raw$RHORUSQ156N)) / 100

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
homeownership_raw$DATE <- as.character(homeownership_raw$DATE)
homeownership_raw$time_as_date <- as.Date(homeownership_raw$DATE, format = "%Y-%m-%d")
homeownership_raw$time_as_numeric <- as.numeric(homeownership_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
homeownership_raw <- homeownership_raw[
  with(
    homeownership_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "homeownership_rate")
homeownership_filtered <- homeownership_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(homeownership_filtered)
sum(is.na(homeownership_filtered))

Rows: 162
Columns: 3
$ time_as_date       [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 19…
$ time_as_numeric    [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 584…
$ homeownership_rate [3m[90m<dbl>[39m[23m 0.646, 0.646, 0.646, 0.641, 0.641, 0.641, 0.639, 0.…


# Part 13: Clean CBOE Volatility Index VIX

In [20]:
# Read the raw VIX table (series column: VIXCLS)
vix_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/VIX.csv")

# Coerce VIXCLS to numeric under a readable column name (no rescaling).
# Unparseable entries become NA (with a warning).
vix_raw$vix_index <- as.numeric(as.character(vix_raw$VIXCLS))

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
vix_raw$DATE <- as.character(vix_raw$DATE)
vix_raw$time_as_date <- as.Date(vix_raw$DATE, format = "%Y-%m-%d")
vix_raw$time_as_numeric <- as.numeric(vix_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
vix_raw <- vix_raw[
  with(
    vix_raw,
    time_as_date >= as.Date("1984-01-01") & time_as_date <= as.Date("2024-04-01")
  ),
]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "vix_index")
vix_filtered <- vix_raw[columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(vix_filtered)
sum(is.na(vix_filtered))

“NAs introduced by coercion”


Rows: 138
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1990-01-01, 1990-04-01, 1990-07-01, 1990-10-01, 1991-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 7305, 7395, 7486, 7578, 7670, 7760, 7851, 7943, 8035, …
$ vix_index       [3m[90m<dbl>[39m[23m 22.17460, 18.72206, 25.19556, 26.11328, 22.51583, 17.1…


# Part 14: NASDAQ Composite Index

In [21]:
# Source: https://fred.stlouisfed.org/series/NASDAQCOM#0
# Read the raw NASDAQ Composite table (series column: NASDAQCOM).
# The path is a single-line literal: stray newlines or trailing characters
# inside the string make read.csv() look for a file that does not exist.
NASDAQ_raw <- read.csv("/workspaces/DUKE_IDS789_Final_Project/10_Datasets/NASDAQ.csv")

# Coerce NASDAQCOM to numeric under a readable column name (no rescaling).
# Unparseable entries become NA (with a warning).
NASDAQ_raw$NASDAQ_index <- as.numeric(as.character(NASDAQ_raw$NASDAQCOM))

# Parse DATE (ISO year-month-day text) into a Date, and keep a numeric copy
# (days since 1970-01-01) for models that need a numeric time axis
NASDAQ_raw$DATE <- as.character(NASDAQ_raw$DATE)
NASDAQ_raw$time_as_date <- as.Date(NASDAQ_raw$DATE, format = "%Y-%m-%d")
NASDAQ_raw$time_as_numeric <- as.numeric(NASDAQ_raw$time_as_date)

# Restrict to the study window 1984-01-01 .. 2024-04-01 (both ends inclusive)
NASDAQ_raw <- NASDAQ_raw[NASDAQ_raw$time_as_date >= as.Date("1984-01-01") & NASDAQ_raw$time_as_date <= as.Date("2024-04-01"), ]

# Retain only the two time keys and the cleaned series
columns_to_keep <- c("time_as_date", "time_as_numeric", "NASDAQ_index")
NASDAQ_filtered <- NASDAQ_raw[, columns_to_keep]

# Sanity check: structure and total number of NA cells
glimpse(NASDAQ_filtered)
sum(is.na(NASDAQ_filtered))

“NAs introduced by coercion”


Rows: 162
Columns: 3
$ time_as_date    [3m[90m<date>[39m[23m 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01, 1985-…
$ time_as_numeric [3m[90m<dbl>[39m[23m 5113, 5204, 5295, 5387, 5479, 5569, 5660, 5752, 5844, …
$ NASDAQ_index    [3m[90m<dbl>[39m[23m 262.4971, 242.9894, 244.8656, 245.2678, 275.1484, 286.…


# Part 15: Group all data into one dataframe

In [22]:
# Tables to combine into the regressors data set; all of them share the
# time_as_date / time_as_numeric key columns.
# NOTE(review): only these seven tables are merged; the other *_filtered
# tables built in this notebook (loan, money supply, reserves, consumer
# sentiment, homeownership, VIX, NASDAQ) are not included -- confirm intended.
datasets_to_merge <- list(
  pop_filtered,
  unemployment_rate_filtered,
  CPI_filtered,
  disposable_income_filtered,
  treasury_yield_filtered,
  fed_rate_filtered,
  GDP_filtered
)

# Fold the list left to right with a full outer join on both time keys, so a
# date present in any table is kept (missing series values become NA).
regressors_data <- Reduce(
  function(combined, next_table) {
    merge(
      x = combined,
      y = next_table,
      by = c("time_as_date", "time_as_numeric"),
      all = TRUE
    )
  },
  datasets_to_merge
)

In [23]:
# Persist the merged regressors table (path is relative to the working
# directory) so other notebooks can load it with readRDS()
saveRDS(object = regressors_data, file = "regressors_data.rds")