In [None]:
# Clean the environment so objects left over from earlier runs cannot leak in.
rm(list = ls())

# Set the working directory to the folder that contains this script.
library(rstudioapi)

# getActiveDocumentContext() needs a running RStudio session and raises an
# error anywhere else (Rscript, notebook kernel, plain console). Outside
# RStudio the current working directory is kept, so it must already contain
# the two Excel files read below.
current_path <- if (rstudioapi::isAvailable()) {
  getActiveDocumentContext()$path
} else {
  ""
}
# An empty path (no RStudio, or presumably an unsaved document) would make
# setwd() fail, so the directory is only changed when a path is known.
if (nzchar(current_path)) {
  setwd(dirname(current_path))
}
print(getwd())

# UPLOAD DATASET
library(readxl)
Russell_3000_Fundamentals_Enlarged_With_README <- read_excel(
  "Russell_3000_Fundamentals_Enlarged_With_README.xlsx"
)
Dataset1 <- Russell_3000_Fundamentals_Enlarged_With_README
View(Dataset1)

# UPLOAD TARGET VALUES
target <- read_excel("target.xlsx")
# Keep every column except the first one of the target sheet
# (presumably an identifier column -- TODO confirm against the file).
# NOTE(review): the name `t` shadows base::t() as a variable; calls such as
# t(x) still resolve to the function, but the name is easy to misread.
t <- target[, -1]
View(t)

# MERGE THE TWO DATASETS
# cbind() pairs rows purely by position, so both sheets must have the same
# number of rows (and, unverifiable here, the same row order).
stopifnot(nrow(Dataset1) == nrow(t))
Dataset <- cbind(Dataset1, t)
View(Dataset)

# Eliminate the first five columns: qualitative variables that are not
# useful for prediction purposes.
Dataset <- Dataset[-c(1, 2, 3, 4, 5)]
# Give the fifth remaining column a clean, formula-friendly name.
colnames(Dataset)[5] <- "RETURN_ON_EQUITY"

View(Dataset)
str(Dataset)
summary(Dataset)
dim(Dataset)
# Counts of FALSE/TRUE cells from is.na(); the author reports no missing values.
table(is.na(Dataset))

# Standardise every column (centre and scale); scale() returns a matrix, so
# it is converted back to a data frame. The target column is standardised
# too, since the commented-out line below was not used to restore it.
# NOTE(review): the scaling statistics are computed on the full data set,
# i.e. before the train/test split made later in the script.
Dataset_std <- as.data.frame(scale(Dataset))
# Dataset_std=cbind(Dataset_std,t)
View(Dataset_std)

summary(Dataset_std)


In [None]:
# CORRELATIONS BETWEEN VARIABLES
library(ggcorrplot)
library(tidyverse)

# Pairwise correlation matrix of all standardised columns.
cml <- cor(as.matrix(Dataset_std))

# Heat map of the matrix with the coefficient printed in every cell and a
# centred, bold title.
ggcorrplot(cml, lab = TRUE) +
  ggtitle("Correlations") +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold")
  )

In [None]:
## CREATE TRAINING AND TEST SET
# Fixed seed so the random split below is reproducible.
set.seed(199)

# Draw 70% of the row numbers without replacement; a fractional size is
# truncated by sample().
index <- sample(seq_len(nrow(Dataset_std)), size = 0.7 * nrow(Dataset_std))

# Sampled rows form the training data, all remaining rows the test data.
train <- Dataset_std[index, ]
test <- Dataset_std[-index, ]
View(train)
dim(train) # 1745 rows, as recorded by the author
dim(test)  # 748 rows, as recorded by the author

In [None]:
### LASSO REGRESSION
library(glmnet)
library(boot)

# Response vector and predictor matrix built from the training data.
y <- train$EPS_12M_FORWARD
# model.matrix() prepends an "(Intercept)" column of ones. glmnet fits its
# own intercept, so that column is dropped; otherwise coef() shows a second,
# always-zero "(Intercept)" row. drop = FALSE keeps X a matrix.
X <- model.matrix(EPS_12M_FORWARD ~ ., train)[, -1, drop = FALSE]
View(X)

# Regularization and cross-validation to choose lambda (alpha = 1 is the
# lasso penalty; cv.glmnet uses 10 folds by default).
lasso.cv <- cv.glmnet(X, y, alpha = 1)
plot(lasso.cv)

# Lambda with the smallest mean cross-validated error, and that error.
# The position is looked up by nearest value rather than with `==`, which
# always yields exactly one index.
lambda <- lasso.cv$lambda.min
mse10cv.lasso <- lasso.cv$cvm[which.min(abs(lasso.cv$lambda - lambda))]

# Refit using the best value for lambda chosen by CV
mod.lasso <- glmnet(X, y, alpha = 1, lambda = lambda)
coef(mod.lasso)



In [None]:
# ELASTIC NET
library(caret)

# Tune alpha and lambda jointly with 10-fold cross-validation.
# caret::train is spelled out because `train` is also the name of the
# training data frame passed as `data`.
elnet.cv <- caret::train(
  EPS_12M_FORWARD ~ .,
  data = train,
  method = "glmnet",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 10)
)

# One row per (alpha, lambda) combination with its resampled metrics.
res.elnet <- elnet.cv$results
res.elnet

# Row with the smallest cross-validated RMSE. Columns are addressed by name
# so the selection does not depend on the column order of the results table.
best <- which.min(res.elnet[["RMSE"]])

# Best alpha and lambda
alpha <- res.elnet[["alpha"]][best]
lambda <- res.elnet[["lambda"]][best]
# Squared RMSE, used as the MSE figure for this model.
# NOTE(review): this squares a resampled RMSE summary, so it is not
# necessarily identical to a mean of per-fold MSEs -- keep that in mind when
# comparing it with mse10cv.lasso.
mse10cv.elnet <- res.elnet[["RMSE"]][best]^2

# Refit on the training matrix with the selected hyper-parameters.
mod.elnet <- glmnet(X, y, alpha = alpha, lambda = lambda)
coef(mod.elnet)


In [None]:
# BEST-SUBSET SELECTION
# NOTE(review): every selection model in this cell is fitted on the full
# standardised data set (Dataset_std), not on the training split.
library(olsrr)
model <- lm(EPS_12M_FORWARD ~ ., data = Dataset_std)
g <- ols_step_best_subset(model)
plot(g)

# STEPWISE FORWARD SELECTION
# First by p-value, then by AIC. `k` is overwritten by each selection
# result, so only the plots keep the earlier outputs.
model1 <- lm(EPS_12M_FORWARD ~ ., data = Dataset_std)
k <- ols_step_forward_p(model1)
plot(k)
# The AIC variant is run on `model`, which has the same formula and data as
# `model1`.
k <- ols_step_forward_aic(model)
plot(k)

## STEPWISE BACKWARD SELECTION (AIC)
model2 <- lm(EPS_12M_FORWARD ~ ., data = Dataset_std)
k <- ols_step_backward_aic(model2)
plot(k)

## RECREATE THE TRAIN AND TEST DATASET WITH ONLY 3 PREDICTORS
# Columns 2, 3, 5 and 6 are removed from both splits; the indices are
# hard-coded (presumably from the selection results above -- TODO confirm),
# and must stay identical in the two lines below.
train2 <- train[, -c(2, 3, 5, 6)]
View(train2)
test2 <- data.frame(test[, -c(2, 3, 5, 6)])
View(test2)

In [None]:
## MULTIPLE LINEAR REGRESSION
# Ordinary least squares on the reduced training set.
models <- lm(EPS_12M_FORWARD ~ ., data = train2)

# Model summary, stored in `h` and printed once.
h <- summary(models)
print(h)

# Per-coefficient t-tests: estimate, standard error, t value and p-value.
# This replaces t.test(models$coefficients), which only tested whether the
# mean of the coefficient vector is zero and says nothing about any single
# coefficient.
h$coefficients
anova(models)

# Estimated coefficients and fitted values on the training data
# (`fitted.values` written in full rather than relying on partial matching).
models$coefficients
models$fitted.values
View(cbind(train2$EPS_12M_FORWARD, models$fitted.values))

### calculate residuals
# Residuals computed by hand, plotted next to the ones stored by lm().
e <- train2$EPS_12M_FORWARD - models$fitted.values
plot(e)
plot(models$residuals)
# Sanity check: the two residual vectors should coincide, so the squared
# norm of their difference is expected to round to 0.
dife <- e - models$residuals
plot(dife)
norma <- sum(dife^2)
print(round(norma))

# Variance inflation factors to check for collinearity.
library(car)
vif(models)  ## 2.72, 2.74, 1.01 --> good results, no collinearity!


In [None]:
## EPS PREDICTION ON TEST SET
# Predict the (standardised) target for the test rows with the linear model.
# interval = "confidence" returns the fit together with lower and upper
# bounds for the mean response.
# NOTE(review): if bounds for individual observations are wanted,
# interval = "prediction" would be the option to use -- confirm intent.
my.predict <- predict(
  models,
  newdata = test2,
  se.fit = FALSE,
  interval = "confidence"
)
View(my.predict)
# Test data side by side with the predictions and their bounds.
View(cbind(test2, my.predict))