In [1]:
library(glmnet)

Loading required package: Matrix

Loaded glmnet 4.1-1

Learn more about sjPlot with 'browseVignettes("sjPlot")'.



In [2]:
# Load the cleaned country-level data set used by every model below.
df_train = read.csv('../input/country-metrics/country_info_clean.csv')

# Columns to standardise (zero mean, unit variance), grouped by the model
# family that consumes them further down.
country_metric_cols = c('num_learners', 'happiness', 'gdp_bill', 'population', 'per_capita')
hofstede_cols = c('pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr')
cols_to_scale = c(country_metric_cols, hofstede_cols)

# Fail early with a readable message if the CSV schema has changed.
missing_cols = setdiff(cols_to_scale, names(df_train))
if (length(missing_cols) > 0) {
    stop("Columns missing from input data: ", paste(missing_cols, collapse = ", "))
}

for (col in cols_to_scale) {
    # scale() returns a one-column matrix carrying "scaled:center" /
    # "scaled:scale" attributes; as.numeric() stores a plain numeric vector
    # so the data frame does not end up with matrix-valued columns.
    df_train[[col]] = as.numeric(scale(df_train[[col]]))
}

# Feature Selection Using LASSO Regression

1. Determine best lambda using cross-validation.

In [None]:
# Response vector and predictor matrix for the LASSO fit.
# data.matrix() converts the selected data-frame columns to a numeric matrix.
y = df_train$auc_score
x = data.matrix(df_train[, c('num_learners', 'happiness', 'gdp_bill', 'population', 'per_capita')])

# cv.glmnet() assigns observations to folds at random, so the selected lambda
# changes from run to run unless the RNG seed is fixed first.
set.seed(42)
cv_model = cv.glmnet(x, y, alpha = 1)  # alpha = 1 -> pure L1 (LASSO) penalty

# Penalty value with the lowest mean cross-validated error.
best_lambda = cv_model$lambda.min

In [None]:
# Draw the cross-validation curve of the cv.glmnet fit, then display the
# lambda value that was stored as best_lambda (cv_model$lambda.min).
plot(cv_model)
best_lambda

2. Determine the best set of features using the best lambda.

In [None]:
# Refit the LASSO (alpha = 1) on the same x / y, this time at the single
# penalty value picked by cross-validation.
best_model <- glmnet(
    x = x,
    y = y,
    alpha = 1,
    lambda = best_lambda
)

In [None]:
# Display the coefficients of the model fitted at best_lambda.
# NOTE(review): predictors shrunk to exactly zero are presumably the ones
# treated as "not selected" - confirm against the displayed output.
coef(best_model)

3. Compute the training data's R-squared

In [None]:
# In-sample predictions from the fitted LASSO model at the chosen penalty.
y_predicted <- predict(best_model, newx = x, s = best_lambda)

# Total sum of squares: variation of the response around its own mean.
deviation_from_mean <- y - mean(y)
sst <- sum(deviation_from_mean^2)

# Residual sum of squares: variation left unexplained by the predictions.
prediction_error <- y_predicted - y
sse <- sum(prediction_error^2)

# R-squared on the training data: share of variation explained.
rsq <- 1 - sse / sst
rsq

# Feature Selection Using Stepwise Forward Selection

1. Define base (intercept-only) model and full model with all predictors.

In [3]:
# Lower bound of the stepwise search: intercept-only model.
base.mod <- lm(formula = auc_score ~ 1, data = df_train)

# Upper bound of the stepwise search: all five country-level metrics.
all.mod <- lm(
    formula = auc_score ~ num_learners + happiness + gdp_bill + population + per_capita,
    data = df_train
)

2. Perform the stepwise selection algorithm; the `direction` argument of `step()` controls the search strategy:
* direction = 'both': both forward and backward stepwise
* direction = 'forward': forward stepwise
* direction = 'backward': backward stepwise

In [4]:
# Forward search that starts from the intercept-only model and may add terms
# up to the full model; trace = 0 suppresses the per-step printout.
stepMod <- step(
    object = base.mod,
    scope = list(lower = base.mod, upper = all.mod),
    direction = "forward",
    steps = 1000,
    trace = 0
)

3. Get the short-list of selected features.

In [5]:
# Names of the coefficients kept by the stepwise search.
# coef() is the accessor for a fitted model's coefficients; it replaces
# stepMod[[1]], which depended on the coefficients being the first element
# of the lm object.
shortlist = names(coef(stepMod))
shortlist = shortlist[shortlist != "(Intercept)"] # remove intercept
shortlist

4. Fit a linear model using only the selected features to determine R-squared and coefficients.

In [6]:
# Ordinary least squares on the two country-level predictors.
# NOTE(review): the predictors are hard-coded; presumably they mirror the
# contents of `shortlist` - confirm after re-running the selection step.
lm_auc <- lm(
    formula = auc_score ~ happiness + population,
    data = df_train
)

# Coefficient table and fit statistics, followed by the ANOVA table.
summary(object = lm_auc)
anova(object = lm_auc)


Call:
lm(formula = auc_score ~ happiness + population, data = df_train)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.078404 -0.005569  0.003967  0.012044  0.026034 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.971076   0.002148 451.996  < 2e-16 ***
happiness   0.011555   0.002242   5.154 1.87e-06 ***
population  0.007016   0.002242   3.130  0.00246 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.01934 on 78 degrees of freedom
Multiple R-squared:  0.2772,	Adjusted R-squared:  0.2587 
F-statistic: 14.96 on 2 and 78 DF,  p-value: 3.169e-06


Unnamed: 0_level_0,Df,Sum Sq,Mean Sq,F value,Pr(>F)
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>
happiness,1,0.00752351,0.0075235101,20.123271,2.466721e-05
population,1,0.003662324,0.0036623238,9.795685,0.002461091
Residuals,78,0.029161948,0.0003738711,,


# Repeat for a model regressed on Hofstede's cultural dimension indices

In [7]:
# Forward stepwise selection again, this time with the six Hofstede index
# columns as candidate predictors. This overwrites base.mod, all.mod,
# stepMod and shortlist from the earlier cells.
base.mod = lm(auc_score ~ 1 , data = df_train)
all.mod = lm(auc_score ~ pdi + idv + mas + uai + ltowvs + ivr, data = df_train)
stepMod = step(base.mod, scope = list(lower = base.mod, upper = all.mod), direction = "forward", trace = 0, steps = 1000)

# coef() is the accessor for a fitted model's coefficients; it replaces
# stepMod[[1]], which depended on the coefficients being the first element
# of the lm object.
shortlist = names(coef(stepMod))
shortlist = shortlist[shortlist != "(Intercept)"] # remove intercept
shortlist

In [8]:
# Ordinary least squares on the two Hofstede predictors; this replaces the
# lm_auc object fitted earlier on the country-level metrics.
# NOTE(review): the predictors are hard-coded; presumably they mirror the
# contents of `shortlist` - confirm after re-running the selection step.
lm_auc <- lm(
    formula = auc_score ~ ltowvs + idv,
    data = df_train
)

# Coefficient table and fit statistics.
summary(object = lm_auc)


Call:
lm(formula = auc_score ~ ltowvs + idv, data = df_train)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.091247 -0.002789  0.005938  0.013456  0.022373 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.971076   0.002279 426.127  < 2e-16 ***
ltowvs      0.006832   0.002389   2.859  0.00544 ** 
idv         0.005238   0.002389   2.192  0.03135 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.02051 on 78 degrees of freedom
Multiple R-squared:  0.1868,	Adjusted R-squared:  0.166 
F-statistic:  8.96 on 2 and 78 DF,  p-value: 0.0003143


In [9]:
# ANOVA table for the current lm_auc (the ltowvs + idv fit from the
# preceding cell); terms are listed in the order they appear in the formula.
anova(lm_auc)

Unnamed: 0_level_0,Df,Sum Sq,Mean Sq,F value,Pr(>F)
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>
ltowvs,1,0.005516123,0.0055161225,13.113554,0.0005194313
idv,1,0.00202152,0.0020215196,4.805787,0.0313461956
Residuals,78,0.03281014,0.0004206428,,
