# Lecture 05 regression hands-on

A minimal hands-on walkthrough of fitting simple linear, multiple linear, and logistic regression models in R.

In [1]:
library(dplyr)
library(magrittr)
library(ggplot2)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Directory holding the lecture data sets; the trailing slash lets file names
# be appended directly with paste0().
data_path <- "./data/"

# Tab-separated text file with a header row.
fungal_toxin_df <- read.table(
    file = paste0(data_path, "fungalToxin.txt"),
    sep = "\t",
    header = TRUE
)

# Comma-separated file read with read.csv()'s default settings.
agewould_train_df <- read.csv(
    file = paste0(data_path, "AgeWould_Train.csv")
)

In [3]:
# Preview the first six rows (head()'s default) to check column names and types.
head(fungal_toxin_df, n = 6L)

Unnamed: 0_level_0,rain,noon_temp,sunshine,wind_speed,toxin
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1.3,20.9,6.23,13.3,18.1
2,2.28,25.4,8.13,10.8,28.6
3,1.11,28.2,10.21,10.9,15.9
4,0.74,23.7,6.96,8.2,19.2
5,1.32,26.5,9.04,9.8,19.3
6,0.51,23.9,7.84,12.3,14.8


## Simple linear regression

In [4]:
# Least-squares fit of toxin on a single predictor, rain.
# `data = .` spells out where the piped data frame is placed in the lm() call.
simple_lm <- fungal_toxin_df %>%
    lm(toxin ~ rain, data = .)

# Coefficient table, residual quantiles and overall fit statistics.
summary(simple_lm)


Call:
lm(formula = toxin ~ rain, data = .)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9479 -1.1061 -0.3528  0.7596  3.6531 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   10.570      1.961   5.390 0.000654 ***
rain           6.726      1.356   4.961 0.001105 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.16 on 8 degrees of freedom
Multiple R-squared:  0.7547,	Adjusted R-squared:  0.724 
F-statistic: 24.61 on 1 and 8 DF,  p-value: 0.001105


## Multiple Linear Regression

In [5]:
# Least-squares fit of toxin on every other column of the data frame:
# the `.` on the right-hand side of the formula stands for all remaining
# columns, while `data = .` is the magrittr placeholder for the piped frame.
multi_lm <- fungal_toxin_df %>%
    lm(toxin ~ ., data = .)

# Coefficient table, per-observation residuals and overall fit statistics.
summary(multi_lm)


Call:
lm(formula = toxin ~ ., data = .)

Residuals:
      1       2       3       4       5       6       7       8       9      10 
-1.8818  2.0498 -0.6314  0.4787 -0.5805  1.2508 -0.1921 -0.1813 -1.1552  0.8429 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  31.6084     7.1051   4.449  0.00671 ** 
rain          7.0676     1.0031   7.046  0.00089 ***
noon_temp    -0.4201     0.2413  -1.741  0.14215    
sunshine     -0.2375     0.5086  -0.467  0.66018    
wind_speed   -0.7936     0.2977  -2.666  0.04458 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.574 on 5 degrees of freedom
Multiple R-squared:  0.9186,	Adjusted R-squared:  0.8535 
F-statistic: 14.11 on 4 and 5 DF,  p-value: 0.006232


In [6]:
# Interaction model: does the effect of rain on toxin differ between
# cooler and warmer days?
multi_lm_inter <- fungal_toxin_df %>%
    # Dichotomise noon temperature at its median: 1 = at or above, 0 = below.
    dplyr::mutate(
        temp_binary = ifelse(
            test = noon_temp >= median(noon_temp),
            yes = 1,
            no = 0
        )
    ) %>%
    # `a * b` expands to both main effects plus the a:b interaction term.
    lm(toxin ~ temp_binary * rain, data = .)

summary(multi_lm_inter)


Call:
lm(formula = toxin ~ temp_binary * rain, data = .)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.7151 -0.9304 -0.1032  0.8287  2.3493 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)        13.380      1.693   7.905 0.000217 ***
temp_binary        -9.731      3.176  -3.064 0.022123 *  
rain                4.690      1.292   3.629 0.010978 *  
temp_binary:rain    6.345      2.144   2.960 0.025287 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.556 on 6 degrees of freedom
Multiple R-squared:  0.9046,	Adjusted R-squared:  0.8569 
F-statistic: 18.96 on 3 and 6 DF,  p-value: 0.001831


## Logistic Regression

In [7]:
# Preview the first six rows (head()'s default) of the training data.
head(agewould_train_df, n = 6L)

Unnamed: 0_level_0,Age,Gender,Would_train
Unnamed: 0_level_1,<int>,<chr>,<int>
1,44,F,1
2,25,F,1
3,30,M,1
4,27,F,1
5,26,F,1
6,46,M,0


In [8]:
# Binomial GLM of Would_train on all remaining columns of the data frame
# (the formula's `.`); `data = .` is the magrittr placeholder for the piped
# frame.
logistic_lm <- agewould_train_df %>%
    glm(Would_train ~ ., family = "binomial", data = .)

# Coefficient table with z-tests, deviances and AIC.
summary(logistic_lm)


Call:
glm(formula = Would_train ~ ., family = "binomial", data = .)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  4.60567    0.62828   7.331 2.29e-13 ***
Age         -0.09203    0.01372  -6.706 2.00e-11 ***
GenderM     -0.01010    0.32196  -0.031    0.975    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 314.70  on 235  degrees of freedom
Residual deviance: 249.74  on 233  degrees of freedom
AIC: 255.74

Number of Fisher Scoring iterations: 4


In [9]:
# Side-by-side table of the fitted coefficients and their 95% confidence
# limits. The calls are passed to cbind() directly (not via named variables)
# so the estimate column stays unnamed, exactly as before.
cbind(
    coef(logistic_lm),
    confint(logistic_lm, level = 0.95)
)

Waiting for profiling to be done...



Unnamed: 0,Unnamed: 1,2.5 %,97.5 %
(Intercept),4.60567404,3.4330773,5.90528146
Age,-0.0920303,-0.120252,-0.06626529
GenderM,-0.01010163,-0.6368411,0.62924877
