In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# sklearn package for linear regression
from sklearn.linear_model import LinearRegression

### Load the Dataset

In [2]:
data = pd.read_csv('../data/multiple_linear_regression.csv')
data.head()

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
0,1714,2.4,1
1,1664,2.52,3
2,1760,2.54,3
3,1685,2.74,3
4,1693,2.83,2


In [3]:
data.describe()

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
count,84.0,84.0,84.0
mean,1845.27381,3.330238,2.059524
std,104.530661,0.271617,0.855192
min,1634.0,2.4,1.0
25%,1772.0,3.19,1.0
50%,1846.0,3.38,2.0
75%,1934.0,3.5025,3.0
max,2050.0,3.81,3.0


In [4]:
# we know that SAT is a good indicator of GPA

# let's see if Rand variable is a good one too

# in ML, a "sample" is a single observation; the number of samples is the sample size

In [6]:
# declare our dependent and independent variables
x = data[['SAT', 'Rand 1,2,3']]
y = data['GPA']

In [8]:
# regression
reg = LinearRegression()
reg.fit(x,y)

# WE USED 2 LINES OF CODE TO GENERATE A ML MODEL! 

LinearRegression()

In [10]:
reg.coef_

#          SAT        RAND 1,2,3

array([ 0.00165354, -0.00826982])

In [11]:
reg.intercept_

0.29603261264909486

### Calculating the R-squared

In [15]:
# R-squared of the fitted model, evaluated on the same data it was trained on.
R2 = reg.score(x, y)
# The built-in round() works whether score() hands back a NumPy scalar or a
# plain Python float (a plain float has no .round() method).
round(R2, 3)  # regular R-squared value

0.407

The adjusted R-squared penalizes the use of extra variables.

$R^2_{adj} = 1 - (1 - R^2) \cdot \frac{n - 1}{n - p - 1}$

* n is the number of observations (84)
* p is the number of predictors (2)

In [23]:
def compute_adjusted_R2(x, R2):
    """Return the adjusted R-squared, rounded to three decimals.

    Implements ``1 - (1 - R2) * (n - 1) / (n - p - 1)``, where ``n`` is the
    number of rows of ``x`` (observations) and ``p`` is its number of columns
    (predictors).

    Parameters
    ----------
    x : 2-D object exposing ``.shape`` (e.g. DataFrame or ndarray)
        The predictor matrix; only its shape is read.
    R2 : float
        The ordinary R-squared. May be a NumPy scalar or a plain Python float.

    Returns
    -------
    numpy.float64
        The adjusted R-squared rounded to 3 decimal places.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``; the correction factor is undefined (or
        meaningless) without at least ``p + 2`` observations.
    """
    n = x.shape[0]
    p = x.shape[1]
    dof = n - p - 1
    if dof <= 0:
        raise ValueError(
            "adjusted R-squared needs n > p + 1 (got n={}, p={})".format(n, p)
        )
    c = (n - 1) / dof
    result = 1 - (1 - R2) * c
    # np.round accepts plain Python floats as well as NumPy scalars,
    # unlike the .round() method, which only NumPy scalars provide.
    return np.round(result, 3)

In [24]:
compute_adjusted_R2(x, R2)

0.392

### Example: Real Estate Data

In [50]:
data = pd.read_csv('../data/real_estate_price_size_year.csv')
data

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009
...,...,...,...
95,252460.400,549.80,2009
96,310522.592,1037.44,2009
97,383635.568,1504.75,2006
98,225145.248,648.29,2015


In [51]:
data.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


In [29]:
# dependent and independent variables
x = data[['size', 'year']]
y = data['price']

In [30]:
# regression
reg = LinearRegression()
reg.fit(x, y)

LinearRegression()

In [31]:
# coefficients
reg.coef_

array([ 227.70085401, 2916.78532684])

In [32]:
# intercept
reg.intercept_

-5772267.01746328

In [34]:
# normal R-squared value
R2 = reg.score(x, y)
R2.round(3)

0.776

In [35]:
# adjusted R-squared value
R2_adj = compute_adjusted_R2(x, R2)
R2_adj

0.772

### Compare the R-squared and the Adjusted R-squared

The R-squared value of 0.776 is only slightly greater than the adjusted R-squared value of 0.772. Therefore, we were not penalized much for the inclusion of the second independent variable.

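In the simple linear regression of price on size, the $R^2$ was 0.745, while the adjusted $R^2$ of the current multiple regression is 0.772. Adding the year feature therefore improved the model only modestly: 0.772 is only slightly greater than 0.745. (Strictly, adjusted $R^2$ should be compared with adjusted $R^2$; the simple model's adjusted value is slightly below its $R^2$ of 0.745, so the conclusion is unchanged.)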

### Predicted Values

Find the predicted price of an apartment that has a size of 750 sq.ft. from 2009.

In [36]:
# One-row frame for the apartment to price. The column labels are given as a
# flat list so they match the feature names the model was fitted on; a nested
# list would make pandas build MultiIndex columns instead.
new_data = pd.DataFrame(data=[[750, 2009]], columns=['size', 'year'])

In [37]:
new_data

Unnamed: 0,size,year
0,750,2009


In [42]:
# predict() returns one value per input row; take the single prediction
# and round it to cents before formatting.
predicted_price = reg.predict(new_data)[0].round(2)
print("Predicted value for the house: ${}".format(predicted_price))

Predicted value for the house: $258330.34


### Univariate P-values

Use `f_regression` from scikit-learn's `feature_selection` module to compute a univariate p-value for each feature.

In [43]:
from sklearn.feature_selection import f_regression

In [44]:
f_regression(x, y)

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [45]:
p_values = f_regression(x, y)[1].round(3)

In [46]:
p_values

array([0.   , 0.357])

In [48]:
# summary table
reg_summary = pd.DataFrame(data=x.columns.values, columns=['Features'])
reg_summary

Unnamed: 0,Features
0,size
1,year


In [49]:
reg_summary['Coefficients'] = reg.coef_
reg_summary['P-values'] = p_values
reg_summary

Unnamed: 0,Features,Coefficients,P-values
0,size,227.700854,0.0
1,year,2916.785327,0.357


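Observing the p-values, size appears to be a good predictor of the price, whereas the year feature has a p-value of 0.357, which is much greater than 0.05, so on its own the year does not appear to be a useful feature. On this basis we could remove the year feature from the model. Note, however, that `f_regression` tests each feature in isolation (a univariate test), so these p-values do not measure what year contributes once size is already in the model.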

### Feature Scaling

Continue working with the real estate data.

In [54]:
# scale the inputs 
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics from x and apply them in one step;
# the fitted scaler is kept so new observations can be scaled the same way.
scaler = StandardScaler()
xs = scaler.fit_transform(x)

In [55]:
# regression with scaled input
reg = LinearRegression()
reg.fit(xs, y)

LinearRegression()

In [59]:
# print a summary table
# Collect the intercept ("Bias") and the two coefficients of the model fitted
# on the scaled inputs into a single two-column table.
summary_table = pd.DataFrame({
    'Features': ['Bias', 'Size', 'Year'],
    'Weights': [reg.intercept_, reg.coef_[0], reg.coef_[1]],
})
summary_table

Unnamed: 0,Features,Weights
0,Bias,292289.47016
1,Size,67501.576142
2,Year,13724.397082


In [61]:
# R-squared
# R-squared of the model fitted on the scaled inputs
R2 = reg.score(xs, y)
# built-in round() works for both NumPy scalars and plain Python floats
round(R2, 3)

0.776

In [62]:
# Adjusted R-squared
R2_adj = compute_adjusted_R2(xs, R2)
R2_adj

0.772

In [63]:
# predicted values using scaled data
# The new observation must carry the same column labels the scaler was fitted
# on ('size', 'year'), given as a flat list so pandas builds ordinary columns
# rather than a MultiIndex.
new_data = pd.DataFrame(data=[[750, 2009]], columns=['size', 'year'])
new_scaled = scaler.transform(new_data)
new_scaled

array([[-0.34752816, -0.76509206]])

In [72]:
# Wrap the raw values in a DataFrame labelled with the fitted feature names,
# so the scaler receives input in the same form it was fitted on instead of
# an unlabelled nested list.
new_data = pd.DataFrame(data=[[750, 2009]], columns=['size', 'year'])
new_data_scaled = scaler.transform(new_data)
reg.predict(new_data_scaled)[0]

258330.34465994601

In [67]:
print('${}'.format(reg.predict(new_scaled)[0].round(2)))

$258330.34


In [68]:
p_values = f_regression(xs, y)[1]
p_values.round(3)

array([0.   , 0.357])

In [73]:
# summary table
reg_summary = pd.DataFrame(data=x.columns.values, columns=['Features'])
reg_summary['Coefficients'] = reg.coef_
reg_summary['P-values'] = p_values.round(3)
reg_summary

Unnamed: 0,Features,Coefficients,P-values
0,size,67501.576142,0.0
1,year,13724.397082,0.357


### Train-Test Split

In [74]:
from sklearn.model_selection import train_test_split

In [76]:
a = np.arange(1, 101)

In [77]:
b = np.arange(501, 601)

In [78]:
# split the data
train_test_split(a)

[array([ 22,   8,  17,  71,   3,  38,  37,  45,   9,  53,  13,  40,   7,
         30,  68,  31,   6,  27,  80,  97,  57,  28,  35,  99, 100,  87,
         96,  41,  42,  84,  34,  73,  10,  21,  43,  77,  74,  89,  94,
         54,  46,  64,  44,  18,  59,  33,  15,  61,  55,  90,  98,  36,
         47,  86,  93,  49,  25,  11,  23,  76,  14,   2,  51,  12,  63,
         88,  65,  70,  72,  60,  75,  20,  26,   1,  85]),
 array([58, 50, 92, 52, 95, 69,  5, 56, 32, 19, 83, 81, 78, 29, 91, 39,  4,
        62, 79, 82, 67, 24, 66, 16, 48])]

In [95]:
# Variants of the call, for reference:
#   train_test_split(a)                 -> default split (75 train / 25 test here)
#   train_test_split(a, test_size=0.2)  -> 80 / 20 split, different shuffle each run
#   shuffle=False                       -> keeps the original order (usually leave shuffling on)
# Only the seeded 80/20 split is executed, so the result is the same on every run;
# the earlier unseeded splits were overwritten immediately and had no effect.
a_train, a_test = train_test_split(a, test_size=0.2, random_state=42)

In [88]:
a_train.shape

(80,)

In [89]:
a_test.shape

(20,)

In [96]:
a_train

array([ 56,  89,  27,  43,  70,  16,  41,  97,  10,  73,  12,  48,  86,
        29,  94,   6,  67,  66,  36,  17,  50,  35,   8,  96,  28,  20,
        82,  26,  63,  14,  25,   4,  18,  39,   9,  79,   7,  65,  37,
        90,  57, 100,  55,  44,  51,  68,  47,  69,  62,  98,  80,  42,
        59,  49,  99,  58,  76,  33,  95,  60,  64,  85,  38,  30,   2,
        53,  22,   3,  24,  88,  92,  75,  87,  83,  21,  61,  72,  15,
        93,  52])

In [97]:
a_test

array([84, 54, 71, 46, 45, 40, 23, 81, 11,  1, 19, 31, 74, 34, 91,  5, 77,
       78, 13, 32])

In [98]:
# every time we split the data, we get different sets for training and testing
# so, R^2 can change by a few percentage points each time
# we would like to shuffle in the same way each time:
#   therefore, pass random_state, which acts as the seed for the shuffle

In [102]:
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.2, random_state=365)

In [103]:
b_train

array([525, 532, 599, 573, 591, 566, 503, 559, 594, 501, 508, 515, 590,
       554, 531, 520, 577, 582, 530, 535, 595, 542, 538, 507, 511, 550,
       521, 548, 502, 517, 510, 558, 568, 543, 541, 516, 588, 572, 579,
       600, 580, 539, 524, 586, 522, 523, 562, 576, 518, 547, 555, 526,
       560, 519, 571, 564, 551, 563, 565, 528, 512, 578, 513, 544, 575,
       587, 540, 504, 529, 549, 537, 557, 527, 574, 506, 545, 592, 534,
       553, 583])

In [104]:
b_test

array([509, 569, 581, 556, 533, 593, 584, 561, 546, 589, 585, 567, 597,
       505, 570, 536, 598, 596, 514, 552])