In [1]:
library(ISLR)
library(data.table)
library(boot) # for cv.glm (cross validation)

In [2]:
# Load the Hitters data set shipped with ISLR and work on a copy with the
# incomplete rows removed.
data(Hitters)
Hitters1 <- data.frame(Hitters)
Hitters1 <- na.omit(Hitters1) # remove rows containing NA's

# Model salary on the log scale; keep the transformed value as a new column.
Salary1 <- log(Hitters1$Salary)
Hitters1 <- data.frame(Hitters1, Salary1)

# Binary target: "Yes" when log-salary is at least 6.62 (6.62 is roughly
# log(750)), otherwise "No". cut() can be used instead when more than two
# categories are needed.
# The labels are turned into an explicit factor: since R 4.0 data.frame() no
# longer converts strings to factors, and glm(family = binomial) rejects a
# character response. Level order No < Yes makes "Yes" the modelled event.
HighSal <- factor(
  ifelse(Hitters1$Salary1 < 6.62, "No", "Yes"),
  levels = c("No", "Yes")
)
Hitters1 <- data.frame(Hitters1, HighSal)

In [3]:
# Quick sanity check of the prepared data frame.
head(Hitters1) # first rows, including the new Salary1 and HighSal columns
str(Hitters1)  # column types (integers, factors, numeric)
dim(Hitters1)  # number of rows and columns

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,...,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague,Salary1,HighSal
-Alan Ashby,315,81,7,24,38,39,14,3449,835,69,...,375,N,W,632,43,10,475.0,N,6.163315,No
-Alvin Davis,479,130,18,66,72,76,3,1624,457,63,...,263,A,W,880,82,14,480.0,A,6.173786,No
-Andre Dawson,496,141,20,65,78,37,11,5628,1575,225,...,354,N,E,200,11,3,500.0,N,6.214608,No
-Andres Galarraga,321,87,10,39,42,30,2,396,101,12,...,33,N,E,805,40,4,91.5,N,4.516339,No
-Alfredo Griffin,594,169,4,74,51,35,11,4408,1133,19,...,194,A,W,282,421,25,750.0,A,6.620073,Yes
-Al Newman,185,37,1,23,8,21,2,214,42,1,...,24,N,E,76,127,7,70.0,A,4.248495,No


'data.frame':	263 obs. of  22 variables:
 $ AtBat    : int  315 479 496 321 594 185 298 323 401 574 ...
 $ Hits     : int  81 130 141 87 169 37 73 81 92 159 ...
 $ HmRun    : int  7 18 20 10 4 1 0 6 17 21 ...
 $ Runs     : int  24 66 65 39 74 23 24 26 49 107 ...
 $ RBI      : int  38 72 78 42 51 8 24 32 66 75 ...
 $ Walks    : int  39 76 37 30 35 21 7 8 65 59 ...
 $ Years    : int  14 3 11 2 11 2 3 2 13 10 ...
 $ CAtBat   : int  3449 1624 5628 396 4408 214 509 341 5206 4631 ...
 $ CHits    : int  835 457 1575 101 1133 42 108 86 1332 1300 ...
 $ CHmRun   : int  69 63 225 12 19 1 0 6 253 90 ...
 $ CRuns    : int  321 224 828 48 501 30 41 32 784 702 ...
 $ CRBI     : int  414 266 838 46 336 9 37 34 890 504 ...
 $ CWalks   : int  375 263 354 33 194 24 12 8 866 488 ...
 $ League   : Factor w/ 2 levels "A","N": 2 1 2 2 1 2 1 2 1 1 ...
 $ Division : Factor w/ 2 levels "E","W": 2 2 1 1 2 1 2 2 1 1 ...
 $ PutOuts  : int  632 880 200 805 282 76 121 143 0 238 ...
 $ Assists  : int  43 82 11 40 42

## Model 1 - log.fit

In [4]:
# Model 1: logistic regression of HighSal on four hand-picked predictors.
# glm() fits a generalised linear model; family = binomial selects the
# logit link, i.e. logistic regression.

log.fit <- glm(
  HighSal ~ AtBat + Hits + Years + League,
  family = binomial,
  data = Hitters1
)
coef(log.fit) # point estimates only
summary(log.fit)$coefficients # with std. errors and p-values; only Years has p < 0.05

Unnamed: 0,Estimate,Std. Error,z value,Pr(>|z|)
(Intercept),-6.299755413,0.91024571,-6.9209394,4.486583e-12
AtBat,0.005128261,0.004053662,1.2650935,0.2058378
Hits,0.010777969,0.012420578,0.867751,0.3855306
Years,0.206383384,0.037488771,5.5052054,3.68738e-08
LeagueN,0.147374365,0.335663266,0.4390542,0.6606223


In [5]:
summary(log.fit)
# Null deviance: 304.77 - deviance of the intercept-only model (no predictors)
# Residual deviance: 227.83 - deviance left after adding the predictors; the drop from the null deviance measures the improvement
# AIC (Akaike Information Criterion): lower is better when comparing models fitted to the same data


Call:
glm(formula = HighSal ~ AtBat + Hits + Years + League, family = binomial, 
    data = Hitters1)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7690  -0.6759  -0.3605   0.5773   3.4084  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -6.299755   0.910246  -6.921 4.49e-12 ***
AtBat        0.005128   0.004054   1.265    0.206    
Hits         0.010778   0.012421   0.868    0.386    
Years        0.206383   0.037489   5.505 3.69e-08 ***
LeagueN      0.147374   0.335663   0.439    0.661    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 304.77  on 262  degrees of freedom
Residual deviance: 227.83  on 258  degrees of freedom
AIC: 237.83

Number of Fisher Scoring iterations: 5


## Model 2 - log.fit2

In [6]:
# Model 2: logistic regression on every column except the two salary columns
# (HighSal is derived from Salary1, which is derived from Salary).
log.fit2 = glm(HighSal~. -Salary1-Salary, data = Hitters1, family = binomial)

#coef(log.fit2) # Years is most impactful
#summary(log.fit2)$coef # Years is most impactful < 0.05

# Predictors should not be strongly correlated with each other (multicollinearity).
# VIF - Variance Inflation Factor - measures how correlated a predictor is with the others; a common rule of thumb treats values above 5 as highly correlated.
# Highly correlated predictors should not be used together.
# Use e.g. cor(Hitters1[1:4]) to inspect collinearity; it works on numeric columns only.

summary(log.fit2)


Call:
glm(formula = HighSal ~ . - Salary1 - Salary, family = binomial, 
    data = Hitters1)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9896  -0.5295  -0.2959   0.1643   3.1457  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -4.4811751  1.0976178  -4.083 4.45e-05 ***
AtBat        0.0008590  0.0059492   0.144   0.8852    
Hits         0.0130527  0.0231071   0.565   0.5722    
HmRun        0.0595967  0.0585420   1.018   0.3087    
Runs        -0.0279304  0.0267644  -1.044   0.2967    
RBI         -0.0137384  0.0247643  -0.555   0.5791    
Walks        0.0383097  0.0174461   2.196   0.0281 *  
Years       -0.2066964  0.1368856  -1.510   0.1310    
CAtBat       0.0000525  0.0012151   0.043   0.9655    
CHits        0.0045971  0.0064962   0.708   0.4792    
CHmRun       0.0010765  0.0141199   0.076   0.9392    
CRuns        0.0009699  0.0066257   0.146   0.8836    
CRBI        -0.0008377  0.0064291  -0.130   0.8963    
CWalks 

In [7]:
# Pairwise correlations of the first four (numeric) columns, selected by name.
cor(Hitters1[, c("AtBat", "Hits", "HmRun", "Runs")])

Unnamed: 0,AtBat,Hits,HmRun,Runs
AtBat,1.0,0.9639691,0.5551022,0.8998291
Hits,0.9639691,1.0,0.5306274,0.9106301
HmRun,0.5551022,0.5306274,1.0,0.6310759
Runs,0.8998291,0.9106301,0.6310759,1.0


## Model 3 - log.fit3

In [8]:
# Model 3: same structure as Model 1 but with the career totals
# (CAtBat, CHits) in place of the season figures.
log.fit3 <- glm(
  HighSal ~ CAtBat + CHits + Years + League,
  family = binomial,
  data = Hitters1
)
summary(log.fit3)$coefficients # coefficient table only
summary(log.fit3) # full summary incl. deviances and AIC

Unnamed: 0,Estimate,Std. Error,z value,Pr(>|z|)
(Intercept),-1.6415460918,0.3711770722,-4.4225417,9.75465e-06
CAtBat,-0.0006533893,0.0008642893,-0.7559845,0.4496585
CHits,0.0070315523,0.0028287554,2.4857406,0.01292821
Years,-0.392708325,0.1218074909,-3.224008,0.001264099
LeagueN,-0.2624681862,0.343247327,-0.764662,0.4444728



Call:
glm(formula = HighSal ~ CAtBat + CHits + Years + League, family = binomial, 
    data = Hitters1)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0789  -0.5774  -0.4353   0.2869   2.5103  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.6415461  0.3711771  -4.423 9.75e-06 ***
CAtBat      -0.0006534  0.0008643  -0.756  0.44966    
CHits        0.0070316  0.0028288   2.486  0.01293 *  
Years       -0.3927083  0.1218075  -3.224  0.00126 ** 
LeagueN     -0.2624682  0.3432473  -0.765  0.44447    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 304.77  on 262  degrees of freedom
Residual deviance: 219.93  on 258  degrees of freedom
AIC: 229.93

Number of Fisher Scoring iterations: 5


## Predicting using log.fit model

In [9]:
# Fitted probabilities from Model 1 on the response scale. No newdata is
# supplied, so these are predictions for the rows the model was fitted on.
log.probs <- predict(log.fit, type = "response")
log.probs

In [14]:
# Turn the fitted probabilities into class labels with a 0.5 cut-off.
# The vector is sized from log.probs rather than a hard-coded row count, so
# the cell keeps working if the data set changes.
log.pred <- rep("No", length(log.probs))
log.pred[log.probs > 0.5] <- "Yes"

# Confusion matrix (rows = predicted, columns = observed). Fixing the levels
# of the predicted labels keeps the table 2 x 2 even if only one class is
# ever predicted.
pred.tab <- table(
  predicted = factor(log.pred, levels = c("No", "Yes")),
  True = Hitters1$HighSal
)
pred.tab
#plot(pred.tab)

         True
predicted  No Yes
      No  174  40
      Yes  19  30

In [26]:
# Training-set accuracy: share of observations whose predicted label equals
# the observed one. Comparing the labels directly avoids positional indexing
# into the confusion table, which returns NA when the table is not 2 x 2.
accuracy <- mean(log.pred == Hitters1$HighSal)
accuracy # correct predictions
1 - accuracy # prediction error

#### Using Cross Validation

In [22]:
# 10-fold cross-validation of Model 1.
# cv.glm()'s default cost is the average squared error between the 0/1
# response and the fitted probability. To get an error rate comparable with
# the misclassification rate computed above, supply a cost that counts an
# observation as wrong when its fitted probability falls on the wrong side
# of 0.5 (r = observed 0/1 response, pi = fitted probability).
misclass_cost <- function(r, pi = 0) mean(abs(r - pi) > 0.5)

set.seed(1) # fold assignment is random; fix it for reproducibility
cv.log.fit <- cv.glm(Hitters1, log.fit, cost = misclass_cost, K = 10)
cv.log.fit$delta[1] # CV estimate of the misclassification rate

# KNN

In [31]:
library(class) # provides knn()
set.seed(1)
# Random 80/20 split: draw four fifths of the row indices (rounded down) for
# training; the negated indices select the remaining rows as the test set.
train <- sample(seq_len(nrow(Hitters1)), floor(4 * nrow(Hitters1) / 5))
test <- -train

In [32]:
# Predictor matrices for KNN, using the same four predictors as Model 1.
# data.matrix() converts the League factor to its integer codes; unname()
# drops the row/column names so the result is a plain matrix.
# NOTE(review): the predictors are not rescaled, so distances are driven
# mostly by the columns with the largest values (AtBat, Hits) - confirm this
# is intended.
knn_features <- c("AtBat", "Hits", "Years", "League")
x.train <- unname(data.matrix(Hitters1[train, knn_features]))
x.test <- unname(data.matrix(Hitters1[test, knn_features]))

# Targets: class label for classification, log-salary for regression.
y.train.cat <- Hitters1[["HighSal"]][train]
y.test.cat <- Hitters1[["HighSal"]][test]

y.train.reg <- Hitters1[["Salary1"]][train]
y.test.reg <- Hitters1[["Salary1"]][test]

## Categorical Data

In [33]:
# Classify each test row by majority vote among its 3 nearest training rows.
knn.pred.cat <- knn(train = x.train, test = x.test, cl = y.train.cat, k = 3)