# Importing the dataset

In [2]:
# Read the startup data from a CSV file in the working directory.
dataset <- read.csv(file = "50_Startups.csv")
# Optional column subset, left disabled:
# dataset <- dataset[, 2:3]
# dataset

# Taking care of missing data

In [2]:
#dataset$Age = ifelse(is.na(dataset$Age), ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)), dataset$Age)

# Encoding categorical data

In [3]:
# Recode State as a factor; the three listed levels are labelled 1, 2 and 3
# in the order given.
dataset$State <- factor(
  dataset$State,
  levels = c("New York", "California", "Florida"),
  labels = c(1, 2, 3)
)

# Splitting the dataset into the Training set and Test set

In [5]:
# options(repos='http://cran.rstudio.com/')
# install.packages('caTools')
library(caTools)

# Fix the RNG seed so the split below is reproducible.
set.seed(42)

# Logical mask over the rows: TRUE marks the rows assigned to the training
# set (about two thirds), FALSE marks the rows assigned to the test set.
split <- sample.split(dataset$Profit, SplitRatio = 2 / 3)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]
# dim(training_set)

# Feature Scaling

In [5]:
# training_set = scale(training_set)
# test_set = scale(test_set)

# Fitting multiple linear regression to the training set

In [9]:
# Regress Profit on every other column of the training set.
regressor <- lm(
  formula = Profit ~ .,
  data = training_set
)
# summary(regressor)


Call:
lm(formula = Profit ~ ., data = training_set)

Residuals:
   Min     1Q Median     3Q    Max 
-31725  -4162   1002   4588  15700 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      4.645e+04  1.081e+04   4.299   0.0002 ***
R.D.Spend        8.115e-01  5.986e-02  13.556 1.45e-13 ***
Administration  -2.515e-02  6.480e-02  -0.388   0.7010    
Marketing.Spend  3.097e-02  2.383e-02   1.299   0.2048    
State2           1.498e+03  4.731e+03   0.317   0.7539    
State3           2.265e+03  4.523e+03   0.501   0.6206    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9329 on 27 degrees of freedom
Multiple R-squared:  0.948,	Adjusted R-squared:  0.9384 
F-statistic: 98.48 on 5 and 27 DF,  p-value: < 2.2e-16


# Predicting the test set results

In [7]:
# Apply the fitted model to the held-out rows.
y_pred <- predict(object = regressor, newdata = test_set)
# y_pred

# Building optimal model using backward elimination

In [10]:
# Starting model for manual backward elimination: all four predictors,
# fitted on the whole dataset rather than on the training split.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = dataset
)
# summary(regressor)


Call:
lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
    State, data = dataset)

Residuals:
   Min     1Q Median     3Q    Max 
-33504  -4736     90   6672  17338 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      5.008e+04  6.953e+03   7.204 5.76e-09 ***
R.D.Spend        8.060e-01  4.641e-02  17.369  < 2e-16 ***
Administration  -2.700e-02  5.223e-02  -0.517    0.608    
Marketing.Spend  2.698e-02  1.714e-02   1.574    0.123    
State2           4.189e+01  3.256e+03   0.013    0.990    
State3           2.407e+02  3.339e+03   0.072    0.943    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9439 on 44 degrees of freedom
Multiple R-squared:  0.9508,	Adjusted R-squared:  0.9452 
F-statistic: 169.9 on 5 and 44 DF,  p-value: < 2.2e-16


# Automatic backward elimination

In [11]:
# Backward elimination for a linear model of `Profit`.
#
# Fits `Profit ~ .` on `x`, then repeatedly drops the predictor column whose
# term has the largest p-value, as long as that p-value exceeds `sl`, refitting
# after every removal. Stops as soon as no remaining term exceeds `sl` or no
# predictors are left.
#
# Arguments:
#   x   Data frame with a `Profit` column plus the candidate predictor columns
#       (in any column order; factor predictors are allowed).
#   sl  Single numeric significance level; a term is removed only while its
#       p-value is strictly greater than `sl`.
#
# Returns:
#   The `summary()` of the last fitted `lm` model, i.e. the model containing
#   exactly the predictors that survived elimination (intercept-only if none
#   survived).
backwardElimination <- function(x, sl) {
  stopifnot(
    is.data.frame(x),
    "Profit" %in% names(x),
    is.numeric(sl), length(sl) == 1L, !is.na(sl)
  )

  repeat {
    predictors <- names(x)[names(x) != "Profit"]

    if (length(predictors) == 0L) {
      # Nothing is left for `.` to expand to, so fit the intercept-only model
      # explicitly instead of relying on `Profit ~ .` with no predictors.
      regressor <- lm(formula = Profit ~ 1, data = x)
      break
    }

    regressor <- lm(formula = Profit ~ ., data = x)

    # Test whole terms rather than individual coefficients: a factor column
    # contributes several coefficient rows, so coefficient positions do not
    # line up with column positions. For a single-coefficient term the F-test
    # p-value equals the t-test p-value reported by summary().
    term_tests <- drop1(regressor, test = "F")
    # The first row of the drop1 table is the "<none>" baseline; the remaining
    # rows follow the term order, which for `Profit ~ .` is the column order
    # of the predictors.
    p_values <- term_tests[-1L, "Pr(>F)"]
    stopifnot(length(p_values) == length(predictors))

    # which.max() skips NA p-values (e.g. aliased terms) and returns a single
    # index even when several terms tie.
    worst <- which.max(p_values)
    if (length(worst) == 0L || p_values[worst] <= sl) {
      break
    }

    # Drop by name, and keep `x` a data frame even if one column remains.
    x <- x[, names(x) != predictors[worst], drop = FALSE]
  }

  summary(regressor)
}
  
  # Significance level passed to backwardElimination() as the removal threshold.
  SL <- 0.05
  # Keep columns 1 to 5 of the full dataset.
  dataset <- dataset[, 1:5]
  # NOTE(review): elimination is run on training_set, not on the dataset
  # prepared on the previous line - confirm which one is intended.
  backwardElimination(training_set, SL)


Call:
lm(formula = Profit ~ ., data = x)

Residuals:
   Min     1Q Median     3Q    Max 
-31994  -3750   -588   5113  19212 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 4.668e+04  3.333e+03   14.00 5.99e-15 ***
R.D.Spend   8.659e-01  3.874e-02   22.35  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9231 on 31 degrees of freedom
Multiple R-squared:  0.9416,	Adjusted R-squared:  0.9397 
F-statistic: 499.5 on 1 and 31 DF,  p-value: < 2.2e-16
