In [1]:
library(tidyverse)
library(repr)
library(dplyr)
library(tidymodels)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.0     [32m✔[39m [34mrsample     [39m 1.0.0
[32m✔[39m [34mdials       [39m 1.0.0     [32m✔[39m [34mtune        [39m 1.0.0
[32m✔[39m [34minfer       [39m 1.0.2     [32m✔[39m [34mworkflows   [39m 1.0.0
[32m✔

In [2]:
# Load the WEO April 2023 G20 extract straight from the project repository.
G20_economic_data <- read_csv(
    "https://raw.githubusercontent.com/mparhar1/DSCI-100-Group-48-Project/main/WEOApri23G20.csv"
)

# Make the headers syntactically valid R names: spaces become dots and
# names that start with a digit (the year columns) gain an "X" prefix.
names(G20_economic_data) <- make.names(names(G20_economic_data), unique = TRUE)

[1mRows: [22m[34m836[39m [1mColumns: [22m[34m59[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (46): ISO, WEO Subject Code, Country, Subject Descriptor, Subject Notes,...
[32mdbl[39m  (2): WEO Country Code, Estimates Start After

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [4]:
# Build one row per (Country, Year) with one column per economic indicator.
#
# Steps:
#   1. keep the identifying columns and the 2010-2023 year columns;
#   2. keep only the five indicators of interest (GDP per capita in U.S.
#      dollars, import/export volume, inflation as percent change,
#      unemployment rate);
#   3. drop Argentina and Saudi Arabia;
#   4. turn year columns that were read as text (thousands separators such as
#      "12,345.6") into numbers;
#   5. reshape to long form, average any duplicate rows per
#      country/year/indicator, then spread the indicators into columns.
tidy_G20_economic_data <- G20_economic_data |>
    select(Country,
           Subject.Descriptor,
           Units,
           X2010:X2023
          ) |>
    # Each `&` pair is parenthesised so the intent does not depend on `&`
    # binding tighter than `|`.
    filter((Subject.Descriptor == "Gross domestic product per capita, current prices" &
                Units == "U.S. dollars") |
           Subject.Descriptor == "Volume of imports of goods and services" |
           Subject.Descriptor == "Volume of exports of goods and services" |
           (Subject.Descriptor == "Inflation, average consumer prices" &
                Units == "Percent change") |
           Subject.Descriptor == "Unemployment rate"
          ) |>
    filter(Country != "Argentina" & Country != "Saudi Arabia") |>
    # Convert every year column that is still character, rather than a
    # hand-picked list, so pivot_longer() never has to combine text and numbers.
    mutate(across(X2010:X2023 & where(is.character),
                  \(x) as.numeric(gsub(",", "", x))
                 )
          ) |>
    pivot_longer(cols = X2010:X2023,
                 names_to = "Year",
                 values_to = "Metric"
                ) |>
    group_by(Country, Year, Subject.Descriptor) |>
    # `.groups = "drop"` returns an ungrouped tibble; otherwise the result stays
    # grouped by Country and Year and the grouping leaks into later steps.
    summarise(Metric = mean(Metric, na.rm = TRUE), .groups = "drop") |>
    pivot_wider(names_from = Subject.Descriptor,
                values_from = Metric
               )

# Indicator names contain spaces and commas; make them valid R identifiers.
colnames(tidy_G20_economic_data) <- make.names(colnames(tidy_G20_economic_data), unique = TRUE)



[1m[22m`summarise()` has grouped output by 'Country', 'Year'. You can override using
the `.groups` argument.


In [6]:
# Fix the RNG seed so the train/test partition is reproducible.
set.seed(100)

# 70/30 split, stratified on the response (GDP per capita).
gdp_split <- initial_split(
    tidy_G20_economic_data,
    prop = 0.70,
    strata = Gross.domestic.product.per.capita..current.prices
)
gdp_training <- gdp_split |> training()
gdp_testing <- gdp_split |> testing()


In [9]:

# Ordinary least-squares regression specification.
lm_spec <- linear_reg() |>
    set_engine("lm") |>
    set_mode("regression")

# GDP per capita modelled on inflation, unemployment and trade volumes.
lm_recipe <- recipe(
    Gross.domestic.product.per.capita..current.prices ~
        Inflation..average.consumer.prices +
        Unemployment.rate +
        Volume.of.exports.of.goods.and.services +
        Volume.of.imports.of.goods.and.services,
    data = gdp_training
)

# Bundle the recipe and the model, then fit on the training set.
lm_workflow <- workflow() |>
    add_recipe(lm_recipe) |>
    add_model(lm_spec)
lm_fit <- fit(lm_workflow, data = gdp_training)

lm_fit

══ Workflow [trained] ══════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
                            (Intercept)  
                                38632.1  
     Inflation..average.consumer.prices  
                                 -581.3  
                      Unemployment.rate  
                                 -836.3  
Volume.of.exports.of.goods.and.services  
                                 -284.9  
Volume.of.imports.of.goods.and.services  
                                  169.0  


In [12]:
# Predict on the held-out set, attach the true values, and compute the
# regression metrics.
lm_test_results <- predict(lm_fit, gdp_testing) |>
    bind_cols(gdp_testing) |>
    metrics(
        truth = Gross.domestic.product.per.capita..current.prices,
        estimate = .pred
    )

# RMSE on the test set, i.e. the root mean squared prediction error.
lm_rmspe <- lm_test_results |>
    filter(.metric == "rmse") |>
    pull(.estimate)
lm_rmspe

In [20]:
# Keep only the Australian rows of the tidy data set.
australia <- tidy_G20_economic_data |>
    filter(Country == "Australia")
australia

Country,Year,Gross.domestic.product.per.capita..current.prices,Inflation..average.consumer.prices,Unemployment.rate,Volume.of.exports.of.goods.and.services,Volume.of.imports.of.goods.and.services
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Australia,X2010,56579.03,2.863,5.208,5.709,16.142
Australia,X2011,67270.36,3.356,5.083,-0.144,10.557
Australia,X2012,68450.31,1.686,5.233,5.42,6.183
Australia,X2013,65200.37,2.45,5.658,5.89,-2.284
Australia,X2014,61615.05,2.513,6.058,7.01,-1.41
Australia,X2015,51412.25,1.485,6.05,6.319,2.265
Australia,X2016,51813.64,1.277,5.708,6.673,0.131
Australia,X2017,55797.38,1.995,5.583,3.432,7.775
Australia,X2018,56341.94,1.933,5.292,5.112,4.275
Australia,X2019,54266.75,1.588,5.192,3.226,-1.033


In [21]:
# Fix the RNG seed so the Australia-only partition is reproducible.
set.seed(100)

# 70/30 split of the Australian rows. Stratification on GDP per capita is
# requested, but with this few rows rsample warns that there is too little
# data to stratify and resamples unstratified instead.
aus_split <- initial_split(
    australia,
    prop = 0.70,
    strata = Gross.domestic.product.per.capita..current.prices
)
aus_training <- aus_split |> training()
aus_testing <- aus_split |> testing()


“The number of observations in each quantile is below the recommended threshold of 20.
[36m•[39m Stratification will use 0 breaks instead.”
“Too little data to stratify.
[36m•[39m Resampling will be unstratified.”


In [22]:
# Linear-regression specification for the Australia-only model.
lm_aus <- linear_reg() |>
        set_engine("lm") |>
        set_mode("regression")

# Same predictors as the all-countries model, but the recipe is built from the
# Australian training rows.
lm_recipe_aus <- recipe(Gross.domestic.product.per.capita..current.prices ~ Inflation..average.consumer.prices + Unemployment.rate + Volume.of.exports.of.goods.and.services + Volume.of.imports.of.goods.and.services, data = aus_training)

# Use the Australia-specific recipe and specification defined in this cell;
# wiring in `lm_recipe`/`lm_spec` from the all-countries model would leave
# `lm_recipe_aus` and `lm_aus` unused and tie this cell to that earlier state.
lm_fit_aus <- workflow() |>
        add_recipe(lm_recipe_aus) |>
        add_model(lm_aus) |>
        fit(data = aus_training)

lm_fit_aus

══ Workflow [trained] ══════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
                            (Intercept)  
                               136702.1  
     Inflation..average.consumer.prices  
                                -4594.7  
                      Unemployment.rate  
                               -13520.2  
Volume.of.exports.of.goods.and.services  
                                 -161.7  
Volume.of.imports.of.goods.and.services  
                                  980.9  


In [23]:
# Predict on the Australian hold-out rows, attach the true values, and compute
# the regression metrics.
lm_test_results_aus <- predict(lm_fit_aus, aus_testing) |>
    bind_cols(aus_testing) |>
    metrics(
        truth = Gross.domestic.product.per.capita..current.prices,
        estimate = .pred
    )

# RMSE on the Australian test rows.
lm_rmspe_aus <- lm_test_results_aus |>
    filter(.metric == "rmse") |>
    pull(.estimate)
lm_rmspe_aus

In [None]:
# Keep only the Canadian rows of the tidy data set and display them.
# NOTE(review): assumes the Country column spells it "Canada" - confirm
# against the source data.
canada <- filter(tidy_G20_economic_data, Country == "Canada")
canada