In [None]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

library(tidyverse) # metapackage with lots of helpful functions

## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# Datasets attached to this kernel are available under the "../input/" directory.
# The call below lists what is currently there.

list.files("../input")

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

In [None]:
library(lubridate)
library(dplyr)
library(RSQLite)
library(ggplot2)
library(reshape2)
library(knitr)
# List the attached datasets. list.files() returns the listing as a value, so it
# is displayed as the cell result and does not depend on an external `ls`
# binary (a bare system() call only returns the command's exit status).
list.files("../input")

In [None]:
# Input locations: the SQLite database plus two CSV files
path  <- "/kaggle/input/soccer/database.sqlite"
path2 <- "/kaggle/input/player/player_data_complete.csv"
path3 <- "/kaggle/input/matchn/match_nonull.csv"

# Connection used by every dbGetQuery() call in this notebook
db <- dbConnect(SQLite(), dbname = path)

# Show the names of the tables stored in the database
all_tables <- dbGetQuery(db, "SELECT name FROM sqlite_master WHERE type ='table'")
all_tables

Here we analyze the data for the English Premier League.

In [None]:
# Load the tables used in the analysis; match data is restricted to league_id 1729

# Player tables
Player            <- dbGetQuery(db, "SELECT * FROM Player")
Player_Attributes <- dbGetQuery(db, "SELECT * FROM Player_Attributes")

# Match data: once from the database, once from the CSV at path3
Match  <- dbGetQuery(db, "SELECT * FROM Match WHERE league_id = 1729")
Match2 <- read.csv(file = path3) %>%
  filter(league_id == '1729')

# Reference tables
League          <- dbGetQuery(db, "SELECT * FROM League")
Country         <- dbGetQuery(db, "SELECT * FROM Country")
Team            <- dbGetQuery(db, "SELECT * FROM Team")
Team_Attributes <- dbGetQuery(db, "SELECT * FROM Team_Attributes")


In [None]:
# Keep only the matches of the 2009/2010 season
Match <- Match %>%
  filter(season == '2009/2010')

# Number of matches in that season
nrow(Match)

In [None]:
#Player2 = read.csv(file = path2)
#Player2_Simple = select(Player2,-c(X,Unnamed..0,))

# 1. Data Cleaning 

Remove the columns that are mostly null and the columns that are not relevant for modelling.

In [None]:
#Simplify the Match Table
# Reduces the CSV-based Match2 table to the ten columns needed downstream
# (Match_Simple). All numeric indices below are column POSITIONS in Match2 /
# Match_filtered, so they silently depend on the column order of the CSV.
# NOTE(review): the meaning of each positional range cannot be confirmed from
# this notebook alone - verify against names(Match2) before changing the input.

# Show the available columns
names(Match2)
# Columns 34:60 of Match2 kept separately (presumably the betting odds - TODO confirm)
Match_Betting = Match2%>%select(34:60)
# Drop columns 12:55 of Match2 ...
Match_filtered = Match2[,-c(12:55)]
# ... then columns 42:71 of the already reduced table ...
Match_filtered = Match_filtered[,-c(42:71)]
#Match_betting = Match_filtered[,c(42:71)]
# ... then column 37 of what is left
Match_filtered = Match_filtered[,c(-37)]
# Remove five more columns by name
Match_filtered = select(Match_filtered,-c(GBD,GBA,BSH,BSD,BSA))
#Match_filtered = select(Match_filtered,-c(card,goal,shoton,shotoff,cross,corner,possession))
# Show what survived the filtering
names(Match_filtered)
# Final selection is by name, so only these ten columns reach the later cells
Match_Simple = Match_filtered[,c('match_api_id','country_id','league_id','season','stage','date','home_team_api_id','away_team_api_id','home_team_goal','away_team_goal')]
head(Match_Simple,5)

In [None]:
#Find NUll counts per column

#na_count <-sapply(Match_filtered, function(y) sum(length(which(is.na(y)))))
#na_count <- data.frame(na_count)
#na_count           

In [None]:
#Player3 = inner_join(Player2,Player,by=c('Name' = 'player_name'))
#head(Player3,10)
#names(Player3)
#Player3 = select(Player3,-c('X','Unnamed..0'))

The Match table holds the details of each match. Every match involves two sides: a home team and an away team.
Here we look up the home and the away team of every match and join them to the match data.

In [None]:
# Replace the numeric team ids by the teams' long names

# Home side: join on the home id, drop the id, keep the name as `home_team`
Match_Simple <- Match_Simple %>%
  inner_join(Team, by = c("home_team_api_id" = "team_api_id")) %>%
  select(-home_team_api_id) %>%
  rename(home_team = team_long_name)

# Away side: same steps, name kept as `away_team`
Match_Simple <- Match_Simple %>%
  inner_join(Team, by = c("away_team_api_id" = "team_api_id")) %>%
  select(-away_team_api_id) %>%
  rename(away_team = team_long_name)

head(Match_Simple, n = 5)

In [None]:
# Enrich Team_Attributes with the team names and derive a season label

Team_Attributes <- Team_Attributes %>%
  inner_join(Team, by = 'team_api_id') %>%
  # the attribute date becomes the season column
  rename(season = date) %>%
  # drop the duplicated id columns and other columns that are not used later
  select(-c(id.x, team_fifa_api_id.x, buildUpPlayDribbling,
            id.y, team_fifa_api_id.y, team_short_name)) %>%
  rename(team_name = team_long_name) %>%
  # season label "<year - 1>/<year>", where year is the first four characters of the date
  mutate(season = paste(as.numeric(substr(season, 1, 4)) - 1,
                        substr(season, 1, 4),
                        sep = "/"))

Feature-engineer the home-goal and away-goal percentages.

In [None]:
# Mean home and away goals per league, each expressed as a percentage of
# the mean total goals (home + away).

# Select the goal columns by name: the positional form Match_Simple[,7:8] breaks
# silently as soon as a column is added to or removed from Match_Simple.
goal_cols <- c("home_team_goal", "away_team_goal")
mean_goals = aggregate(Match_Simple[, goal_cols], list(Match_Simple$league_id), mean)
total_goals = mean_goals['home_team_goal'] + mean_goals['away_team_goal']
mean_goals['home_team_goal_percent'] = 1e2 * mean_goals['home_team_goal'] / total_goals
mean_goals['away_team_goal_percent'] = 1e2 * mean_goals['away_team_goal'] / total_goals

# Attach the league name (Group.1 holds the league_id used for grouping) and
# keep only the name and the two percentage columns
mean_goals_percent = inner_join(x=mean_goals,y=League,by=c('Group.1' = 'id'))%>%select(name,home_team_goal_percent,away_team_goal_percent)
mean_goals_percent

In [None]:
league_names = c('Belgium Jupiler League','England Premier League','France Ligue 1','Germany 1. Bundesliga','Italy Serie A','Netherlands Eredivisie','Poland Ekstraklasa','Portugal Liga ZON Sagres','Scotland Premier League','Spain LIGA BBVA','Switzerland Super League')

# Horizontal bars: bar length = away-goal percentage, fill = home-goal percentage
ggplot(mean_goals_percent,
       aes(x = name, y = away_team_goal_percent, fill = home_team_goal_percent)) +
  geom_col(position = position_dodge(width = 0.01)) +
  coord_flip()

# 2. Feature Engineering

In [None]:
# Build a long table with one row per (team, match): every match appears once
# from the home side's point of view and once from the away side's.

Match_Simple2 <- Match[, 1:11]

# Attach the team record to each match, once per side
home_matches <- merge(Team, Match_Simple2, by.x = "team_api_id", by.y = "home_team_api_id")
away_matches <- merge(Team, Match_Simple2, by.x = "team_api_id", by.y = "away_team_api_id")

home_matches <- subset(home_matches, select = -c(id.x, id.y))
away_matches <- subset(away_matches, select = -c(id.x, id.y))

# Rename columns 11:13 from the row team's perspective; scored/conceded are
# swapped between the two tables.
# NOTE(review): assumes positions 11:13 are the other team's id, the home goals
# and the away goals after the merge - confirm against names(home_matches).
names(home_matches)[11:13] <- c("opponent_team_id", "goals_scored", "goals_conceded")
names(away_matches)[11:13] <- c("opponent_team_id", "goals_conceded", "goals_scored")

home_matches <- cbind(home_matches, side = "home")
away_matches <- cbind(away_matches, side = "away")

# Home rows first, then away rows
allmatches <- rbind(home_matches, away_matches)
overall_matches <- allmatches

# Outcome from the row team's perspective; everything that is neither a win nor a loss stays "draw"
overall_matches$result <- "draw"
won  <- which(overall_matches$goals_scored > overall_matches$goals_conceded)
lost <- which(overall_matches$goals_scored < overall_matches$goals_conceded)
overall_matches$result[won]  <- "win"
overall_matches$result[lost] <- "loss"

overall_matches <- overall_matches %>%
  select(-c(team_fifa_api_id, team_short_name, country_id, league_id, stage, date, match_api_id)) %>%
  rename(home_team_id = team_api_id, home_team = team_long_name)

head(overall_matches, n = 5)

In [None]:
# Season statistics per team: match counts, goals, results and derived rates

ateam_stats <- overall_matches %>%
  group_by(home_team) %>%
  summarise(
    # number of matches (total / home / away)
    matches   = n(),
    h_matches = sum(side == "home"),
    a_matches = sum(side == "away"),
    # goals scored
    tot_scored  = sum(goals_scored),
    home_scored = sum(goals_scored[side == "home"]),
    away_scored = sum(goals_scored[side == "away"]),
    # goals conceded
    tot_conceded  = sum(goals_conceded),
    home_conceded = sum(goals_conceded[side == "home"]),
    away_conceded = sum(goals_conceded[side == "away"]),
    # results
    wins   = sum(result == "win"),
    losses = sum(result == "loss"),
    draws  = sum(result == "draw"),
    h_wins = sum(result == "win" & side == "home"),
    a_wins = sum(result == "win" & side == "away"),
    h_loss = sum(result == "loss" & side == "home"),
    a_loss = sum(result == "loss" & side == "away"),
    # goal distribution and rates
    mean_goals = mean(goals_scored),
    var_goals  = var(goals_scored),
    win_pct    = wins / matches,
    loss_pct   = losses / matches,
    hwin_pct   = h_wins / h_matches,
    awin_pct   = a_wins / a_matches,
    # 3 points per win, 1 per draw
    points = wins * 3 + draws * 1
  )

In [None]:
# First six rows of the per-team statistics
head(ateam_stats, n = 6)

In [None]:
# "Top teams" is simply every team sorted by win percentage (best first),
# capped at 30 rows.

top_teams <- ateam_stats %>% arrange(desc(win_pct))%>%head(30)
# Rank follows the sort order. seq_len(nrow(...)) works for any number of teams;
# the previous hard-coded seq.int(1,20,1) failed unless exactly 20 rows remained.
top_teams$rank <- seq_len(nrow(top_teams))

# Attach the team attributes recorded for the 2009/2010 season
Team_Attributes_temp = filter(Team_Attributes,season=='2009/2010')
top_teams = inner_join(top_teams,Team_Attributes_temp,by =c('home_team'='team_name'))

head(top_teams,10)
names(top_teams)

In [None]:
# Attach the season statistics twice: first for the row's own team (home_team_id),
# then for its opponent (opponent_team_id). Columns present on both sides of the
# second join receive the .x / .y suffixes used by the predictor lists below.
overall_matches <- overall_matches %>%
  inner_join(top_teams, by = c('home_team_id' = 'team_api_id')) %>%
  inner_join(top_teams, by = c('opponent_team_id' = 'team_api_id'))

In [None]:
# Predictor names are assembled from base names plus the .x / .y suffixes
# produced by the two joins with top_teams. Order: all .x names, then all .y names.

# Categorical team-attribute classes
class_attrs <- c('buildUpPlaySpeedClass', 'buildUpPlayDribblingClass', 'buildUpPlayPassingClass',
                 'buildUpPlayPositioningClass', 'chanceCreationPassingClass', 'chanceCreationCrossingClass',
                 'chanceCreationShootingClass', 'chanceCreationPositioningClass', 'defencePressureClass',
                 'defenceAggressionClass', 'defenceTeamWidthClass', 'defenceDefenderLineClass')

Non_Numeric_Predictors <- c('side', 'result',
                            paste0(class_attrs, '.x'),
                            paste0(class_attrs, '.y'))

# Numeric team attributes
tactic_attrs <- c('buildUpPlaySpeed', 'buildUpPlayPassing', 'chanceCreationPassing', 'chanceCreationCrossing',
                  'chanceCreationShooting', 'defencePressure', 'defenceAggression', 'defenceTeamWidth')

# Per-team season statistics computed in ateam_stats / top_teams
season_stats <- c('tot_scored', 'home_scored', 'away_scored', 'tot_conceded',
                  'home_conceded', 'away_conceded', 'wins', 'losses', 'draws',
                  'h_wins', 'a_wins', 'h_loss', 'a_loss', 'mean_goals',
                  'var_goals', 'win_pct', 'loss_pct', 'hwin_pct', 'awin_pct', 'points', 'rank')

Numeric_predictors <- c('goals_scored', 'goals_conceded',
                        paste0(tactic_attrs, '.x'),
                        paste0(tactic_attrs, '.y'),
                        paste0(season_stats, '.x'),
                        paste0(season_stats, '.y'))


In [None]:
# Force the column types expected by the models: numeric predictors become
# numbers, the remaining predictors become factors
overall_matches[, Numeric_predictors] <- lapply(overall_matches[, Numeric_predictors], as.numeric)
overall_matches[, Non_Numeric_Predictors] <- lapply(overall_matches[, Non_Numeric_Predictors], as.factor)

In [None]:
#find the correlation among variables
#options(repr.plot.width = 25, repr.plot.height = 12)
#library(psych)
#home_team_predictors = select(overall_matches,c(home_team_id,tot_scored.x,home_scored.x,away_scored.x,tot_conceded.x,home_conceded.x,away_conceded.x,wins.x,losses.x,draws.x,h_wins.x,a_wins.x,h_loss.x,a_loss.x,points.x,rank.x))
#home_team_predictors <- unique( home_team_predictors[,] )
#head(home_team_predictors)
#pairs.panels(home_team_predictors[,c(2:15)]) 

Remove the correlated columns 

In [None]:
#options(repr.plot.width = 12, repr.plot.height = 6)
#ggplot(top_teams,aes(x=mean_goals,y=var_goals))+geom_point()+
#geom_text(aes(label=home_team,vjust=1))+xlab("Average number of goals scored")+
#ylab("Variance in goals scored")+scale_x_continuous(breaks=seq(1,3.5,by=0.1))+scale_y_continuous(breaks=seq(1,4,by=0.1))

In [None]:
#Null counts per column

#na_count <-sapply(Team_Attributes, function(y) sum(length(which(is.na(y)))))
#na_count <- data.frame(na_count)
#na_count

In [None]:
# Share of won / lost matches per season and team.
# NOTE(review): 38 is a hard-coded divisor - presumably the number of league
# matches per team in a season; confirm before reusing for another league.

win_perc <- overall_matches %>%
  filter(result == "win") %>%
  group_by(season, home_team_id) %>%
  summarise(win_perc = n() / 38)

loss_perc <- overall_matches %>%
  filter(result == "loss") %>%
  group_by(season, home_team_id) %>%
  summarise(loss_perc = n() / 38)

In [None]:
# Preview both tables
head(win_perc, n = 5)
head(loss_perc, n = 5)

In [None]:
# Drop the duplicated name/season columns and the match counters left over from the joins
overall_matches <- overall_matches %>%
  select(-c(home_team.y, home_team, season.y,
            matches.x, h_matches.x, a_matches.x,
            matches.y, h_matches.y, a_matches.y))

# 3. Model creation and Prediction

In [None]:
#Modelling and Prediction
# 80/20 random split of overall_matches, followed by standardisation of the
# numeric predictors.

set.seed(10)
training_size <- floor(0.80 * nrow(overall_matches))
train_data <- sample(seq_len(nrow(overall_matches)), size = training_size)

train <- overall_matches[train_data,]
test <- overall_matches[-train_data,]

# Identifier columns, kept aside before they are removed from the model data
labels <- select(overall_matches, c(home_team_id, home_team.x,season.x,opponent_team_id))

train = select(train, -c(home_team_id, home_team.x,season.x,opponent_team_id))
test = select(test, -c(home_team_id, home_team.x,season.x,opponent_team_id))

# Standardise with the TRAINING mean and standard deviation only. Scaling the
# test set with its own statistics (as was done before) puts the two sets on
# different scales and lets test-set information shape the features.
train_scaled <- scale(train[,Numeric_predictors])
test[,Numeric_predictors] = scale(test[,Numeric_predictors],
                                  center = attr(train_scaled, "scaled:center"),
                                  scale = attr(train_scaled, "scaled:scale"))
train[,Numeric_predictors] = train_scaled

use best subset selection

a) creating a model by removing the numerically correlated columns

In [None]:
#Find numeric correlated columns
#The idea is to remove the columns that have a correlation of more than 0.9

library('caret')
temp1 = train[,Numeric_predictors]
temp2 = cor(temp1)
# Positions (within temp1) of the columns findCorrelation() suggests removing
temp3 = sort(findCorrelation(temp2, cutoff=0.9))
# Guard the empty case: temp1[, -integer(0)] would select NO columns instead of
# all of them. drop = FALSE keeps a data frame even if one column remains.
temp4 = if (length(temp3) > 0) temp1[, -temp3, drop = FALSE] else temp1


In [None]:
# Rebuild train/test as: retained numeric predictors first, then every column
# that is not a numeric predictor

numeric_cols  <- colnames(temp1)   # all numeric predictors
retained_cols <- colnames(temp4)   # numeric predictors that survived the correlation filter

train2 <- train[, setdiff(names(train), numeric_cols), drop = FALSE]
train3 <- train[, retained_cols, drop = FALSE]
train4 <- cbind(train3, train2)

test2 <- test[, setdiff(names(test), numeric_cols), drop = FALSE]
test3 <- test[, retained_cols, drop = FALSE]
test4 <- cbind(test3, test2)


In [None]:
# Numeric predictors shown in the correlation heatmap: the retained columns
# minus the per-match goals and the team-attribute scores listed here.
heatmap_excluded <- c('goals_scored','goals_conceded',
                      'buildUpPlaySpeed.x','buildUpPlayPassing.x','chanceCreationCrossing.x',
                      'chanceCreationShooting.x','defencePressure.x','defenceAggression.x',
                      'buildUpPlaySpeed.y','buildUpPlayPassing.y','chanceCreationCrossing.y',
                      'chanceCreationShooting.y','defencePressure.y','defenceAggression.y')
# setdiff() tolerates names the correlation filter has already removed from
# temp4; select(-c(...)) stops with an error in that case.
temp5 = temp4[, setdiff(names(temp4), heatmap_excluded), drop = FALSE]

In [None]:
library(pheatmap)
# Clustered heatmap of the pairwise correlations between the columns of temp5
pheatmap(mat = cor(temp5))


In [None]:
# Flag factor columns with a single level: they are constant and carry no information

utemp <- sapply(train4, is.factor)                      # TRUE for factor columns
m <- train4[, utemp]                                    # factor columns only
n <- sapply(m, function(x) length(levels(x))) == 1      # TRUE when only one level exists
ifelse(n, "DROP", "NODROP")

In [None]:
# Remove these four class columns from both the training and the test set
train4 <- train4 %>%
  select(-c(buildUpPlayDribblingClass.x, defenceDefenderLineClass.x,
            buildUpPlayDribblingClass.y, defenceDefenderLineClass.y))
test4 <- test4 %>%
  select(-c(buildUpPlayDribblingClass.x, defenceDefenderLineClass.x,
            buildUpPlayDribblingClass.y, defenceDefenderLineClass.y))

In [None]:
#test of normality for some of the columns

# with() evaluates the expression inside train4 without attach(): attach() puts
# a COPY of the data frame on the search path that is never detached, goes
# stale when train4 changes later, and can mask global objects of the same name.
with(train4, shapiro.test(a_loss.x))
with(train4, hist(a_loss.x))

1. SVM Model

In [None]:
# Reduce the task to two classes: remove drawn matches and the now-empty factor levels
train4 <- train4 %>%
  filter(result != 'draw') %>%
  droplevels()

test4 <- test4 %>%
  filter(result != 'draw') %>%
  droplevels()


In [None]:
# Names of the numeric predictors removed by the correlation filter. temp3 is an
# unnamed vector of column positions in temp1, so names(temp3) was always NULL.
colnames(temp1)[temp3]

In [None]:
#SVM Model
# Linear SVM (caret method "svmLinear3") predicting win/loss from a hand-picked
# subset of the team statistics plus the side (home/away).

#library(caret)

# Columns that are currently NOT part of the model (kept for reference):
#buildUpPlaySpeed.x,buildUpPlayPassing.x,chanceCreationCrossing.x,chanceCreationShooting.x,defencePressure.x,defenceAggression.x,buildUpPlaySpeedClass.x,
#buildUpPlaySpeed.y,buildUpPlayPassing.y,chanceCreationCrossing.y,chanceCreationShooting.y,defencePressure.y,defenceAggression.y,buildUpPlaySpeedClass.y,
#buildUpPlayPositioningClass.x,chanceCreationPassingClass.x,chanceCreationCrossingClass.x,chanceCreationShootingClass.x,
#buildUpPlayPositioningClass.y,chanceCreationPassingClass.y,chanceCreationCrossingClass.y,chanceCreationShootingClass.y,
#chanceCreationPositioningClass.x,chanceCreationPositioningClass.y,defencePressureClass.x,defencePressureClass.y
#defenceAggressionClass.x,defenceAggressionClass.y,defenceTeamWidthClass.x,defenceTeamWidthClass.y

require(e1071)
#train_svm = select(train,c(result,goals_scored,goals_conceded,away_scored.x,home_conceded.x,away_conceded.x) original

# Training data: response plus predictors
train_svm = select(train4,c(result,away_scored.x,home_conceded.x,away_conceded.x,draws.x,h_loss.x,var_goals.x,awin_pct.x,side,
                          away_scored.y,home_conceded.y,away_conceded.y,draws.y,h_loss.y,a_loss.y,var_goals.y))

# Test data: the same predictors, without the response
test_svm = select(test4,c(away_scored.x,home_conceded.x,away_conceded.x,draws.x,h_loss.x,var_goals.x,awin_pct.x,side,
                         away_scored.y,home_conceded.y,away_conceded.y,draws.y,h_loss.y,a_loss.y,var_goals.y))

svm_model <- train(result ~., data = train_svm, method = "svmLinear3")

svm_prediction <- predict(svm_model, newdata = test_svm)
# Compare against test4 (draws removed), the set the predictions were made for.
# The unfiltered `test` has a different number of rows, so cbind() recycled or
# misaligned the labels. data.frame() also keeps the factor labels, whereas
# cbind() reduced both columns to integer codes.
check_accuracy <- data.frame(prediction = svm_prediction, actual = test4$result)


In [None]:
# Confusion table for the SVM: rows are the predictions, columns the actual results in test4
table(svm_prediction,test4$result)
#getting 109/190 correct,which is roughly 57% accuracy, with 75-25 split
#getting 109/152 correct,which is roughly 61.8% accuracy, with 80-20 split
# NOTE(review): 109/152 is about 71.7%, not 61.8% - re-check the counts quoted above.

In [None]:
# Precision, recall and F1 of the SVM, treating "win" as the positive class
svm_actual <- test4$result
precision_svm <- posPredValue(svm_prediction, svm_actual, positive = "win")
recall_svm    <- sensitivity(svm_prediction, svm_actual, positive = "win")

# F1 = harmonic mean of precision and recall
F1_svm <- (2 * precision_svm * recall_svm) / (precision_svm + recall_svm)

precision_svm
recall_svm
F1_svm

In [None]:
# PCA on the numeric SVM predictors, used to visualise the data

# Column 1 is the response; the other 14 columns are the numeric predictors
train_pca_data <- select(train4,
                         c(result,
                           away_scored.x, home_conceded.x, away_conceded.x, draws.x,
                           h_loss.x, var_goals.x, awin_pct.x,
                           away_scored.y, home_conceded.y, away_conceded.y, draws.y,
                           h_loss.y, a_loss.y, var_goals.y))

# Centre and scale every predictor before extracting the components
train_pca <- prcomp(train_pca_data[, -1], center = TRUE, scale. = TRUE)
summary(train_pca)

In [None]:
#graph of the PCA
# Scree plot of the component variances with the "eigenvalue = 1" reference line.

# Plot every available component and say so in the title: the old call asked for
# 15 components (one more than the 14 predictors can yield) under a title that
# announced "the first 10 PCs".
n_pcs <- length(train_pca$sdev)
screeplot(train_pca, type = "l", npcs = n_pcs, main = "Screeplot of all principal components")
abline(h = 1, col="red", lty=5)
legend("topright", legend=c("Eigenvalue = 1"),
       col=c("red"), lty=5, cex=0.6)


In [None]:
# Cumulative share of the variance explained by the first k components
cumpro <- cumsum(train_pca$sdev^2 / sum(train_pca$sdev^2))

# Component at which the cut-off lines are drawn
cutoff_pc <- 6

# seq_along() plots exactly the components that exist; cumpro[0:15] silently
# dropped index 0 and added an NA for the non-existent 15th component.
plot(seq_along(cumpro), cumpro, xlab = "PC #", ylab = "Amount of explained variance", main = "Cumulative variance plot")
abline(v = cutoff_pc, col="blue", lty=5)
# Read the horizontal line from the data instead of hard-coding 0.88974,
# so it stays correct when the inputs change
abline(h = cumpro[cutoff_pc], col="blue", lty=5)
legend("topleft", legend=paste0("Cut-off @ PC", cutoff_pc),
       col=c("blue"), lty=5, cex=0.6)


In [None]:
options(repr.plot.width = 12, repr.plot.height = 8)
library("factoextra")

# Observations projected on the first two principal components, filled by match outcome
fviz_pca_ind(
  train_pca,
  # points
  geom.ind = "point",
  pointshape = 21,
  pointsize = 6,
  col.ind = "black",
  fill.ind = train_pca_data$result,
  palette = "jco",
  addEllipses = TRUE,
  # labels and legend
  label = "var",
  col.var = "black",
  repel = TRUE,
  legend.title = "Match Outcome",
  ggtheme = theme(legend.text = element_text(size = 20))
) +
  ggtitle("") +
  theme(plot.title = element_text(hjust = 1))

2. LDA model with PCA

In [None]:
# Data for the LDA model: the PCA input without drawn matches, split 80/20.
# NOTE(review): train_pca_data is derived from train4, so test_lda is a hold-out
# taken from the training data, not the test4 set used to evaluate the SVM.
model_data = train_pca_data
model_data = filter(model_data,result!='draw')
model_data <- droplevels(model_data)

#model_data = as.matrix(train_pca_data[,2:15])
#model_data_raw = cbind(model_data,as.numeric(train_pca_data$result)-1)

train_size = floor(0.80*nrow(model_data))

# Seed the generator BEFORE drawing the split. Previously the only seed was set
# in the following cell, i.e. after this sample() call, so the split depended on
# whatever random state earlier cells had left behind.
set.seed(1234)
train_raw = sample(nrow(model_data),size=train_size)
train_lda = model_data[train_raw,]
test_lda = model_data[-train_raw,]

In [None]:
library(caret)
library(lubridate)

set.seed(1234)

# Re-encode the outcome in model_data as a factor with syntactically valid level names
model_data$result <- as.factor(make.names(model_data$result))

# Linear discriminant analysis on the 14 numeric predictors, fitted on train_lda
lda_model <- train(
  result ~ away_scored.x + home_conceded.x + away_conceded.x + draws.x +
    h_loss.x + var_goals.x + awin_pct.x +
    away_scored.y + home_conceded.y + away_conceded.y + draws.y +
    h_loss.y + a_loss.y + var_goals.y,
  data = train_lda,
  method = "lda"
)

In [None]:
# Predict the held-out rows; column 1 of test_lda is the response, so it is left out
lda_prediction <- predict(lda_model, newdata = test_lda[, -1])
#prediction <- cbind(actual=test_lda$result, prediction)

# Confusion table: rows are the actual results, columns the predictions
table(test_lda$result, lda_prediction)
#LDA model has accuracy of only 51.9 % with 80-20 split

In [None]:
# Precision, recall and F1 of the LDA model, treating "win" as the positive class
lda_actual <- test_lda$result
precision_lda <- posPredValue(lda_prediction, lda_actual, positive = "win")
recall_lda    <- sensitivity(lda_prediction, lda_actual, positive = "win")

# F1 = harmonic mean of precision and recall
F1_lda <- (2 * precision_lda * recall_lda) / (precision_lda + recall_lda)

precision_lda
recall_lda
F1_lda

In [None]:
# ROC curves of the SVM and the LDA model in one plot.
# Both curves are built from hard class predictions (factor codes), so each
# consists of a single operating point joined to the corners.
options(repr.plot.width = 15, repr.plot.height = 8)
library(pROC)
rocobj1 <- roc(as.factor(test4$result), as.numeric(svm_prediction))
rocobj2 <- roc(test_lda$result, as.numeric(lda_prediction))
# legacy.axes = TRUE puts 1 - specificity on an increasing x axis, which is what
# the axis label claims; by default ggroc() draws specificity on a reversed axis.
# ggroc() distinguishes the curves by colour, so the legend title has to be set
# through `colour` (the previous `linetype` title had no effect).
# The former `op <- par(cex = 4)` was dropped: par() configures base graphics
# and `op` was never used to restore the settings.
ggroc(list(SVM = rocobj1, LDA = rocobj2), legacy.axes = TRUE, cex = 2)+
labs(x = "1 - Specificity",
       y = "Sensitivity",
       colour = "Model")+theme_grey(base_size = 22)

In [None]:
# Recompute both ROC objects and display their summaries
rocobj1 <- roc(response = as.factor(test4$result),
               predictor = as.numeric(svm_prediction))
rocobj2 <- roc(response = test_lda$result,
               predictor = as.numeric(lda_prediction))

rocobj1
rocobj2

In [None]:
#library('boot')
#?cv.glm
#linear model, with  k fold validation
#overall_matches2 = select(overall_matches,-c(home_team_id, home_team.x,season.x,opponent_team_id))
#k_fold_cv_error = cv.glm(overall_matches2 , glm_fit, K=5)
#k_fold_cv_error$delta
#sqrt(k_fold_cv_error$delta)[1]
#glm_s$coefficients

3. Random Forest Model

In [None]:
library(randomForest)

In [None]:
# Two candidate feature sets for the random forest.

# (1) the reduced predictor set plus `side`
rf_data_n <- select(overall_matches,
                    c(result,
                      away_scored.x, home_conceded.x, away_conceded.x,
                      h_loss.x, var_goals.x, awin_pct.x,
                      side, draws.x, draws.y,
                      away_scored.y, home_conceded.y, away_conceded.y,
                      h_loss.y, a_loss.y, var_goals.y))

# (2) every season statistic of both teams
rf_data_n2 <- select(overall_matches,
                     c(result,
                       tot_scored.x, home_scored.x, away_scored.x,
                       tot_conceded.x, home_conceded.x, away_conceded.x,
                       wins.x, losses.x, draws.x,
                       h_wins.x, a_wins.x, h_loss.x, a_loss.x,
                       mean_goals.x, var_goals.x,
                       win_pct.x, loss_pct.x, hwin_pct.x, awin_pct.x,
                       points.x, rank.x,
                       tot_scored.y, home_scored.y, away_scored.y,
                       tot_conceded.y, home_conceded.y, away_conceded.y,
                       wins.y, losses.y, draws.y,
                       h_wins.y, a_wins.y, h_loss.y, a_loss.y,
                       mean_goals.y, var_goals.y,
                       win_pct.y, loss_pct.y, hwin_pct.y, awin_pct.y,
                       points.y, rank.y))

In [None]:
# Remove drawn matches and the unused "draw" level from both feature sets
rf_data_nd <- rf_data_n %>%
  filter(result != 'draw') %>%
  droplevels()

rf_data_nd2 <- rf_data_n2 %>%
  filter(result != 'draw') %>%
  droplevels()


In [None]:
# 80/20 split of the full-statistics feature set for the random forest
set.seed(100)
# The row indices get their own name: storing them in `train` replaced the
# training data frame created earlier with an integer vector, which breaks any
# earlier cell that is re-run afterwards. floor() makes the sample size an
# explicit whole number.
rf_train_idx = sample(nrow(rf_data_nd2), floor(0.8*nrow(rf_data_nd2)), replace=FALSE)
train_rf = rf_data_nd2[rf_train_idx,]
test_rf = rf_data_nd2[-rf_train_idx,]


In [None]:
# Random forest: 500 trees, 3 candidate variables per split, importance measures recorded
rf_model <- randomForest(
  result ~ .,
  data = train_rf,
  ntree = 500,
  mtry = 3,
  importance = TRUE
)
rf_model


In [None]:
# Predict with the forest fitted in the previous cell. Fitting it a second time
# here (without a fresh seed) produced a different forest than the one whose
# summary was just printed, so the reported model and the evaluated model
# were not the same.
rf_predict <- predict(rf_model, test_rf, type = "class")
# Confusion table: rows are the predictions, columns the actual results
table(rf_predict, test_rf$result)
#64 percent accuracy

In [None]:
# Precision, recall and F1 of the random forest, treating "win" as the positive class
rf_actual <- test_rf$result
precision_rf <- posPredValue(rf_predict, rf_actual, positive = "win")
recall_rf    <- sensitivity(rf_predict, rf_actual, positive = "win")

# F1 = harmonic mean of precision and recall
F1_rf <- (2 * precision_rf * recall_rf) / (precision_rf + recall_rf)

precision_rf
recall_rf
F1_rf

In [None]:
# ROC curve of the random forest, built from the predicted probability of "win".
# Hard class labels (the previous as.numeric(rf_predict)) allow only one
# threshold, which collapses the curve to a single point.
rf_win_prob <- predict(rf_model, test_rf, type = "prob")[, "win"]
roc(as.factor(test_rf$result), rf_win_prob, plot=TRUE)

In [None]:
# Variable importance table stored in the fitted forest
rf_model[["importance"]]

In [None]:
#a=c()
#i=5
#for (i in 3:8) {
#  model3 <- randomForest(result ~ ., data = train_rf, ntree = 500, mtry = i, importance = TRUE)
#  predValid <- predict(model3, test_rf2, type = "class")
#  a[i-2] = mean(predValid == test_rf$result)
#}
 
#a
 
#plot(3:8,a)

4. LASSO Model

In [None]:
# Packages for the penalised regression models and the parallel backend
pkgs <- list("glmnet", "doParallel", "foreach", "pROC")
# require() only returns FALSE when a package is missing, so check the results
# and stop here rather than failing later with an unrelated error.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
loaded <- vapply(pkgs, require, logical(1), character.only = TRUE)
if (!all(loaded)) {
  stop("Missing package(s): ", paste(unlist(pkgs)[!loaded], collapse = ", "))
}
# Use four worker processes for foreach / cv.glmnet
registerDoParallel(cores = 4)

In [None]:
set.seed(2017)

# Response plus every season statistic of both teams
elastic_data <- select(overall_matches,
                       c(result,
                         tot_scored.x, home_scored.x, away_scored.x,
                         tot_conceded.x, home_conceded.x, away_conceded.x,
                         wins.x, losses.x, draws.x,
                         h_wins.x, a_wins.x, h_loss.x, a_loss.x,
                         mean_goals.x, var_goals.x,
                         win_pct.x, loss_pct.x, hwin_pct.x, awin_pct.x,
                         points.x, rank.x,
                         tot_scored.y, home_scored.y, away_scored.y,
                         tot_conceded.y, home_conceded.y, away_conceded.y,
                         wins.y, losses.y, draws.y,
                         h_wins.y, a_wins.y, h_loss.y, a_loss.y,
                         mean_goals.y, var_goals.y,
                         win_pct.y, loss_pct.y, hwin_pct.y, awin_pct.y,
                         points.y, rank.y))

# 80/20 split on all rows; draws are removed from each part afterwards
n <- nrow(elastic_data)
train_rows <- sample(seq(n), size = n * 0.8, replace = FALSE)

train <- elastic_data[train_rows, ] %>%
  filter(result != 'draw') %>%
  droplevels()

test <- elastic_data[-train_rows, ] %>%
  filter(result != 'draw') %>%
  droplevels()

In [None]:
# glmnet needs a numeric predictor matrix and a separate response:
# column 1 of train/test is the response, the remaining columns are the predictors
mdlY <- factor(as.character(train$result))
mdlX <- as.matrix(train[, -1])
newY <- factor(as.character(test$result))
newX <- as.matrix(test[, -1])

In [None]:
# LASSO (alpha = 1): choose lambda by 10-fold cross-validation on the binomial
# deviance, then refit at the one-standard-error lambda.
# Argument names are written out in full (`nfolds`, `parallel`); the previous
# `nfold` / `paralle` only worked through R's partial argument matching.
cv1 <- cv.glmnet(mdlX, mdlY, family = "binomial", nfolds = 10, type.measure = "deviance", parallel = TRUE, alpha = 1)
md1 <- glmnet(mdlX, mdlY, family = "binomial", lambda = cv1$lambda.1se, alpha = 1)
# Coefficients of the refitted model
coef(md1)

In [None]:
# Elastic net: grid search over the mixing parameter alpha, one
# cross-validation run per value, evaluated in parallel by foreach.
a <- seq(0.1, 0.9, 0.05)
search <- foreach(i = a, .combine = rbind, .packages = "glmnet") %dopar% {
  # The alpha values are already spread over the workers, so the cross-validation
  # inside each worker runs sequentially (no nested parallelism). Argument names
  # are written out in full instead of relying on partial matching.
  cv <- cv.glmnet(mdlX, mdlY, family = "binomial", nfolds = 10, type.measure = "deviance", parallel = FALSE, alpha = i)
  # Cross-validated error at the one-standard-error lambda for this alpha
  data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.1se], lambda.1se = cv$lambda.1se, alpha = i)
}
# which.min() returns exactly one row even when several alphas tie for the
# minimum; a multi-row cv3 would pass vectors for lambda and alpha to glmnet().
cv3 <- search[which.min(search$cvm), ]
md3 <- glmnet(mdlX, mdlY, family = "binomial", lambda = cv3$lambda.1se, alpha = cv3$alpha)
coef(md3)

In [None]:
# ROC curve of the elastic net, built from the predicted probabilities on the test set
roc(response = newY,
    predictor = as.numeric(predict(md3, newX, type = "response")),
    plot = TRUE)

In [None]:
# Actual outcomes and predicted classes for the test set
actual <- as.matrix(test['result'])
prediction <- predict(md3, newx = newX, type = "class")

# Confusion table: rows are the actual results, columns the predictions
table(actual, prediction)

In [None]:
# Precision, recall and F1 of the elastic net, treating "win" as the positive class.
# Both factors are built on the same level set. A strongly penalised model can
# predict a single class only; as.factor(prediction) would then carry one level
# and no longer line up with the two-level reference.
outcome_levels <- levels(as.factor(actual))
prediction_f <- factor(prediction, levels = outcome_levels)
actual_f <- factor(actual, levels = outcome_levels)

precision_elastic <- posPredValue(prediction_f, actual_f, positive="win")
recall_elastic <- sensitivity(prediction_f, actual_f, positive="win")

# F1 = harmonic mean of precision and recall
F1_elastic <- (2 * precision_elastic * recall_elastic) / (precision_elastic + recall_elastic)
precision_elastic
recall_elastic
F1_elastic

In [None]:
# ROC curves of the random forest and the elastic net in one plot
options(repr.plot.width = 15, repr.plot.height = 8)
library(pROC)
# Both curves use predicted probabilities so they are comparable; the forest was
# previously scored with hard class labels while the elastic net used probabilities.
rocobj1 <- roc(as.factor(test_rf$result), predict(rf_model, test_rf, type = "prob")[, "win"])
rocobj2 <- roc(actual, as.numeric(predict(md3, newX, type = "response")))
# legacy.axes = TRUE draws 1 - specificity on an increasing x axis, matching the
# axis label; the legend title is set through `colour`, the aesthetic ggroc()
# uses to distinguish the curves.
ggroc(list(RandomForest = rocobj1, ElasticNet = rocobj2), legacy.axes = TRUE, cex=2)+
labs(x = "1 - Specificity",
       y = "Sensitivity",
       colour = "Model")+theme_grey(base_size = 22)

In [None]:
# ROC summaries for the random forest and the elastic net.
# The forest is scored with its predicted probability of "win", the same kind of
# score used for the elastic net, instead of hard class labels.
rocobj1 <- roc(as.factor(test_rf$result), predict(rf_model, test_rf, type = "prob")[, "win"])
rocobj2 <- roc(actual, as.numeric(predict(md3, newX, type = "response")))
rocobj1
rocobj2

In [None]:
# Confusion table of the elastic net once more: rows are the actual results, columns the predictions
table(actual,prediction)