In [None]:
install.packages(c("dplyr", "glmnet", "zoo", "lubridate"))


In [56]:
library(dplyr)
library(glmnet)
library(zoo)
library(lubridate)

In [112]:
clean_oil_data = function(filename) {
  read.csv2(filename) %>% 
    filter(!is.na(Date)) %>% 
    # put Date into dateformat
    mutate(Date = as.Date(Date, "%d.%m.%Y")) %>% 
    # put all other variables into numer format
    mutate(across(-Date, ~ as.numeric(as.character(.))))
}

files = list(
  "Industry_D.csv", 
  "Industry_M.csv", 
  "Industry_W.csv", 
  "Macro_M.csv", 
  "StockPrices_M.csv"
)

cleaned_data = lapply(files, clean_oil_data)

industry_d = cleaned_data[[1]]
industry_m = cleaned_data[[2]] %>% 
# these variables already appear inside industry_d
  select(-any_of(c("CL1", "CL2", "Brent", "CRKS321C.Index", "Baltic.Dry.Index", "datadate")))
industry_w = cleaned_data[[3]]
macro_m = cleaned_data[[4]]
stockprices_m = cleaned_data[[5]]

head(industry_d)
head(industry_m)

Unnamed: 0_level_0,Date,CL1,CL2,Brent,CRKS321C.Index,BDIY.Index
Unnamed: 0_level_1,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2024-07-30,74.73,73.88,78.63,21.883,1762
2,2024-07-29,75.81,74.8,79.78,22.171,1797
3,2024-07-26,77.16,76.0,81.13,21.071,1808
4,2024-07-25,78.28,77.11,82.37,20.832,1834
5,2024-07-24,77.59,76.58,81.71,19.776,1864
6,2024-07-23,76.96,75.92,81.01,21.118,1869


Unnamed: 0_level_0,Date,Daily.Production,Inventories,Rig.Count,Commercial.Long,Commercial.Short,Total.Open.Interest,X,X.1,X.2,X.3
Unnamed: 0_level_1,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2024-06-28,13200,821134,581,1600085,1631509,3231594,,,,
2,2024-05-31,13100,826109,600,1750160,1785108,3535268,,,,
3,2024-04-30,13100,827161,613,1708427,1751709,3460136,,,,
4,2024-03-29,13100,815058,621,1687838,1725350,3413188,,,,
5,2024-02-29,13300,807417,626,1583286,1606750,3190036,,,,
6,2024-01-31,13000,779314,621,1673204,1695615,3368819,,,,


In [115]:
data_compile = industry_d %>% 
  merge(industry_w, by = "Date", all = TRUE) %>% 
  merge(industry_m, by = "Date", all = TRUE) %>% 
  merge(macro_m, by = "Date", all = TRUE) %>% 
  merge(stockprices_m, by = "Date", all = TRUE) %>% 
  arrange(Date)

head(data_compile)
print(paste("Rows after merge:", nrow(data_compile)))

Unnamed: 0_level_0,Date,CL1,CL2,Brent,CRKS321C.Index,BDIY.Index,Weekly.Prod.Crude,Weekly.Rig.Count,Weekly.change.in.Crude.Stock,Weekly.Commercial.Long,⋯,DXY.returns,VIX,Gasoline.All.Grades,Dow.Jones.US.Oil.Gas.Index,ExxonMobil,ConocoPhilips,Chevron,BP,Shell,TotalEnergies
Unnamed: 0_level_1,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1990-01-02,22.89,22.41,21.95,,,,,,,⋯,,,,,,,,,,
2,1990-01-03,23.68,22.97,23.48,,,,,,,⋯,,,,,,,,,,
3,1990-01-04,23.41,22.53,26.78,,,,,,,⋯,,,,,,,,,,
4,1990-01-05,23.08,22.03,27.4,,,7512.0,,,,⋯,,,,,,,,,,
5,1990-01-08,21.62,21.03,24.98,,,,,,,⋯,,,,,,,,,,
6,1990-01-09,22.07,21.23,24.75,,,,,,,⋯,,,,,,,,,,


[1] "Rows after merge: 133833"


In [119]:
data_compile_filled = data_compile %>%
    arrange(Date) %>%
    mutate(across(-Date, ~ zoo::na.locf(., na.rm = FALSE))) %>% 
    mutate(across(-Date, ~ zoo::na.locf(., fromLast = TRUE, na.rm = FALSE))) %>% 
    arrange(desc(Date))

head(data_compile_filled)

Unnamed: 0_level_0,Date,CL1,CL2,Brent,CRKS321C.Index,BDIY.Index,Weekly.Prod.Crude,Weekly.Rig.Count,Weekly.change.in.Crude.Stock,Weekly.Commercial.Long,⋯,DXY.returns,VIX,Gasoline.All.Grades,Dow.Jones.US.Oil.Gas.Index,ExxonMobil,ConocoPhilips,Chevron,BP,Shell,TotalEnergies
Unnamed: 0_level_1,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2024-07-30,74.73,73.88,78.63,21.883,1762,13300,482,-3741,654310,⋯,0.01135205,12.44,3.557,760.4,115.12,114.38,156.42,36.1,72.18,66.68
2,2024-07-29,75.81,74.8,79.78,22.171,1797,13300,482,-3741,654310,⋯,0.01135205,12.44,3.557,760.4,115.12,114.38,156.42,36.1,72.18,66.68
3,2024-07-26,77.16,76.0,81.13,21.071,1808,13300,482,-3741,654310,⋯,0.01135205,12.44,3.557,760.4,115.12,114.38,156.42,36.1,72.18,66.68
4,2024-07-25,78.28,77.11,82.37,20.832,1834,13300,482,-3741,654310,⋯,0.01135205,12.44,3.557,760.4,115.12,114.38,156.42,36.1,72.18,66.68
5,2024-07-24,77.59,76.58,81.71,19.776,1864,13300,482,-3741,654310,⋯,0.01135205,12.44,3.557,760.4,115.12,114.38,156.42,36.1,72.18,66.68
6,2024-07-23,76.96,75.92,81.01,21.118,1869,13300,482,-3741,654310,⋯,0.01135205,12.44,3.557,760.4,115.12,114.38,156.42,36.1,72.18,66.68
