In [1]:
library(tidyverse)
library(repr)
library(dplyr)
library(tidymodels)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.0     [32m✔[39m [34mrsample     [39m 1.0.0
[32m✔[39m [34mdials       [39m 1.0.0     [32m✔[39m [34mtune        [39m 1.0.0
[32m✔[39m [34minfer       [39m 1.0.2     [32m✔[39m [34mworkflows   [39m 1.0.0
[32m✔

In [2]:
G20_economic_data <- read_csv("https://raw.githubusercontent.com/mparhar1/DSCI-100-Group-48-Project/main/WEOApri23G20.csv")
colnames(G20_economic_data) <- make.names(colnames(G20_economic_data), unique = TRUE)

[1mRows: [22m[34m836[39m [1mColumns: [22m[34m59[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (46): ISO, WEO Subject Code, Country, Subject Descriptor, Subject Notes,...
[32mdbl[39m  (2): WEO Country Code, Estimates Start After

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [4]:
tidy_G20_economic_data <- G20_economic_data |>
    select(Country,
           Subject.Descriptor,
           Units,
           X2010:X2023
          ) |>
    filter(Subject.Descriptor == "Gross domestic product per capita, current prices" & Units == "U.S. dollars" |
           Subject.Descriptor == "Volume of imports of goods and services" |
           Subject.Descriptor == "Volume of exports of goods and services" | 
           Subject.Descriptor == "Inflation, average consumer prices" & Units == "Percent change" |
           Subject.Descriptor == "Unemployment rate" 
          ) |>
    filter(Country != "Argentina" & Country != "Saudi Arabia") |>
    mutate(X2014 = as.numeric(gsub(",","", X2014)),
           X2015 = as.numeric(gsub(",","", X2015)),
           X2016 = as.numeric(gsub(",","", X2016)),
           X2022 = as.numeric(gsub(",","", X2022)),
           X2023 = as.numeric(gsub(",","", X2023)),
          ) |>
    pivot_longer(cols = X2010:X2023,
                 names_to = "Year",
                 values_to = "Metric"
                ) |>
    group_by(Country, Year, Subject.Descriptor) |>
    summarise(Metric = mean(Metric, na.rm = TRUE)) |>
    pivot_wider(names_from = Subject.Descriptor,
                values_from = Metric
               )

colnames(tidy_G20_economic_data) <- make.names(colnames(tidy_G20_economic_data), unique = TRUE)



[1m[22m`summarise()` has grouped output by 'Country', 'Year'. You can override using
the `.groups` argument.


In [6]:
set.seed(100)
gdp_split <- initial_split(tidy_G20_economic_data, prop = 0.70, strata = Gross.domestic.product.per.capita..current.prices)
gdp_training <- training(gdp_split)
gdp_testing <- testing(gdp_split)


In [9]:

lm_spec <- linear_reg() |>
        set_engine("lm") |>
        set_mode("regression")

lm_recipe <- recipe(Gross.domestic.product.per.capita..current.prices ~ Inflation..average.consumer.prices + Unemployment.rate + Volume.of.exports.of.goods.and.services + Volume.of.imports.of.goods.and.services, data = gdp_training)

lm_fit <- workflow() |>
        add_recipe(lm_recipe) |>
        add_model(lm_spec) |>
        fit(data = gdp_training)

lm_fit

══ Workflow [trained] ══════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
                            (Intercept)  
                                38632.1  
     Inflation..average.consumer.prices  
                                 -581.3  
                      Unemployment.rate  
                                 -836.3  
Volume.of.exports.of.goods.and.services  
                                 -284.9  
Volume.of.imports.of.goods.and.services  
                                  169.0  


In [12]:
lm_test_results <- lm_fit |>
                predict(gdp_testing) |>
                bind_cols(gdp_testing) |>
                metrics(truth = Gross.domestic.product.per.capita..current.prices, estimate = .pred)

lm_rmspe <- lm_test_results |>
            filter(.metric == "rmse") |>
            select(.estimate) |>
            pull()
lm_rmspe

In [35]:
australia <- filter(tidy_G20_economic_data, Country == "Australia" & Year <= "X2019")
australia

Country,Year,Gross.domestic.product.per.capita..current.prices,Inflation..average.consumer.prices,Unemployment.rate,Volume.of.exports.of.goods.and.services,Volume.of.imports.of.goods.and.services
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Australia,X2010,56579.03,2.863,5.208,5.709,16.142
Australia,X2011,67270.36,3.356,5.083,-0.144,10.557
Australia,X2012,68450.31,1.686,5.233,5.42,6.183
Australia,X2013,65200.37,2.45,5.658,5.89,-2.284
Australia,X2014,61615.05,2.513,6.058,7.01,-1.41
Australia,X2015,51412.25,1.485,6.05,6.319,2.265
Australia,X2016,51813.64,1.277,5.708,6.673,0.131
Australia,X2017,55797.38,1.995,5.583,3.432,7.775
Australia,X2018,56341.94,1.933,5.292,5.112,4.275
Australia,X2019,54266.75,1.588,5.192,3.226,-1.033


In [36]:
set.seed(100)
aus_split <- initial_split(australia, prop = 0.70, strata = Gross.domestic.product.per.capita..current.prices)
aus_training <- training(aus_split)
aus_testing <- testing(aus_split)


“The number of observations in each quantile is below the recommended threshold of 20.
[36m•[39m Stratification will use 0 breaks instead.”
“Too little data to stratify.
[36m•[39m Resampling will be unstratified.”


In [37]:
lm_aus <- linear_reg() |>
        set_engine("lm") |>
        set_mode("regression")

lm_recipe_aus <- recipe(Gross.domestic.product.per.capita..current.prices ~ Inflation..average.consumer.prices + Unemployment.rate + Volume.of.exports.of.goods.and.services + Volume.of.imports.of.goods.and.services, data = aus_training)

lm_fit_aus <- workflow() |>
        add_recipe(lm_recipe) |>
        add_model(lm_spec) |>
        fit(data = aus_training)

lm_fit_aus

══ Workflow [trained] ══════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
                            (Intercept)  
                                93651.3  
     Inflation..average.consumer.prices  
                                 6173.1  
                      Unemployment.rate  
                                -8766.8  
Volume.of.exports.of.goods.and.services  
                                  458.2  
Volume.of.imports.of.goods.and.services  
                                 -401.7  


In [38]:
lm_test_results_aus <- lm_fit_aus |>
                predict(aus_testing) |>
                bind_cols(aus_testing) |>
                metrics(truth = Gross.domestic.product.per.capita..current.prices, estimate = .pred)

lm_rmspe_aus <- lm_test_results_aus |>
            filter(.metric == "rmse") |>
            select(.estimate) |>
            pull()
lm_rmspe_aus

In [39]:
canada <- filter(tidy_G20_economic_data, Country == "Canada" & Year <= "X2019") 
canada

Country,Year,Gross.domestic.product.per.capita..current.prices,Inflation..average.consumer.prices,Unemployment.rate,Volume.of.exports.of.goods.and.services,Volume.of.imports.of.goods.and.services
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Canada,X2010,47627.35,1.777,8.142,6.678,13.788
Canada,X2011,52285.94,2.912,7.617,4.828,5.594
Canada,X2012,52744.0,1.516,7.408,2.815,3.709
Canada,X2013,52708.61,0.938,7.175,2.461,2.077
Canada,X2014,51020.84,1.907,7.033,6.323,2.531
Canada,X2015,43626.47,1.125,6.95,3.419,0.751
Canada,X2016,42382.64,1.429,7.033,1.407,0.051
Canada,X2017,45191.99,1.597,6.408,1.445,4.627
Canada,X2018,46625.86,2.268,5.85,3.825,3.309
Canada,X2019,46449.96,1.949,5.7,2.704,0.353


In [40]:
set.seed(100)
can_split <- initial_split(canada, prop = 0.70, strata = Gross.domestic.product.per.capita..current.prices)
can_training <- training(can_split)
can_testing <- testing(can_split)

“The number of observations in each quantile is below the recommended threshold of 20.
[36m•[39m Stratification will use 0 breaks instead.”
“Too little data to stratify.
[36m•[39m Resampling will be unstratified.”


In [41]:
lm_can <- linear_reg() |>
        set_engine("lm") |>
        set_mode("regression")

lm_recipe_can <- recipe(Gross.domestic.product.per.capita..current.prices ~ Inflation..average.consumer.prices + Unemployment.rate + Volume.of.exports.of.goods.and.services + Volume.of.imports.of.goods.and.services, data = can_training)

lm_fit_can <- workflow() |>
        add_recipe(lm_recipe) |>
        add_model(lm_spec) |>
        fit(data = can_training)

lm_fit_can

══ Workflow [trained] ══════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
                            (Intercept)  
                                27479.5  
     Inflation..average.consumer.prices  
                                 3778.4  
                      Unemployment.rate  
                                 1694.6  
Volume.of.exports.of.goods.and.services  
                                  634.2  
Volume.of.imports.of.goods.and.services  
                                 -234.6  


In [42]:
lm_test_results_can <- lm_fit_can |>
                predict(can_testing) |>
                bind_cols(can_testing) |>
                metrics(truth = Gross.domestic.product.per.capita..current.prices, estimate = .pred)

lm_rmspe_can <- lm_test_results_can |>
            filter(.metric == "rmse") |>
            select(.estimate) |>
            pull()
lm_rmspe_can

In [44]:
china <- filter(tidy_G20_economic_data, Country == "China" & Year <= "X2019")
china

Country,Year,Gross.domestic.product.per.capita..current.prices,Inflation..average.consumer.prices,Unemployment.rate,Volume.of.exports.of.goods.and.services,Volume.of.imports.of.goods.and.services
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
China,X2010,4499.8,3.176,4.14,28.461,23.094
China,X2011,5553.24,5.534,4.09,10.964,13.422
China,X2012,6282.71,2.609,4.09,5.882,6.591
China,X2013,7039.57,2.568,4.05,8.76,10.647
China,X2014,7645.88,2.051,4.09,4.295,7.774
China,X2015,8034.29,1.542,4.05,-2.156,-0.438
China,X2016,8063.45,2.121,4.02,0.679,4.387
China,X2017,8760.26,1.521,3.9,7.95,7.291
China,X2018,9848.95,1.928,3.8,3.991,7.263
China,X2019,10170.06,2.902,3.62,0.445,-3.662


In [45]:
set.seed(100)
china_split <- initial_split(china, prop = 0.70, strata = Gross.domestic.product.per.capita..current.prices)
china_training <- training(china_split)
china_testing <- testing(china_split)

“The number of observations in each quantile is below the recommended threshold of 20.
[36m•[39m Stratification will use 0 breaks instead.”
“Too little data to stratify.
[36m•[39m Resampling will be unstratified.”


In [46]:
lm_china <- linear_reg() |>
        set_engine("lm") |>
        set_mode("regression")

lm_recipe_china <- recipe(Gross.domestic.product.per.capita..current.prices ~ Inflation..average.consumer.prices + Unemployment.rate + Volume.of.exports.of.goods.and.services + Volume.of.imports.of.goods.and.services, data = china_training)

lm_fit_china <- workflow() |>
        add_recipe(lm_recipe) |>
        add_model(lm_spec) |>
        fit(data = china_training)

lm_fit_china

══ Workflow [trained] ══════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
                            (Intercept)  
                                48737.7  
     Inflation..average.consumer.prices  
                                 -512.0  
                      Unemployment.rate  
                               -10003.7  
Volume.of.exports.of.goods.and.services  
                                 -216.4  
Volume.of.imports.of.goods.and.services  
                                  213.2  


In [47]:
lm_test_results_china <- lm_fit_china |>
                predict(can_testing) |>
                bind_cols(can_testing) |>
                metrics(truth = Gross.domestic.product.per.capita..current.prices, estimate = .pred)

lm_rmspe_china <- lm_test_results_china |>
            filter(.metric == "rmse") |>
            select(.estimate) |>
            pull()
lm_rmspe_china