In [2]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression

import numpy as np

# LINEAR REGRESSION

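## SKLEARN `LinearRegression` SOLVES ORDINARY LEAST SQUARES DIRECTLY WITH A LEAST-SQUARES SOLVER, NOT WITH GRADIENT DESCENT

For comparison, the (batch) gradient descent update rule for the same least-squares objective is:

$ w_j = w_j - \alpha \frac{1}{n} \sum (Xw - y) x_j$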

### GRADIENT DESCENT IS SUSCEPTIBLE TO FEATURE SCALE. `LinearRegression` DOES NOT SCALE THE FEATURE MATRIX AUTOMATICALLY, SO THE FEATURES ARE SCALED EXPLICITLY WITH `scale` BELOW

In [3]:
# Load the Boston housing dataset; the features are standardised with `scale`
# and the target is used unchanged.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the environment pins an older release before re-running.
boston = load_boston()
y = boston.target
X = scale(boston.data)
print(boston.data.shape, boston.target.shape)

(506, 13) (506,)


In [4]:
# Fit a linear regression on the scaled features and print its score on the
# same data it was trained on.
regression = LinearRegression().fit(X, y)
print(regression.score(X, y))

0.7406426641094095


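# CALCULATE THE COEFFICIENT OF DETERMINATION
## AT MOST 1. BETWEEN 0 AND 1 FOR LEAST SQUARES WITH AN INTERCEPT SCORED ON ITS OWN TRAINING DATA; CAN BE NEGATIVE ON UNSEEN DATA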
$R^2 = 1 - \frac{\sum (Xw - y)^2}{\sum (\bar y - y)^2}$

### SHOULD BE THE SAME AS REGRESSION SCORE

In [5]:
# Recompute R^2 by hand: 1 - (residual sum of squares / total sum of squares).
predictions = regression.predict(X)
ss_residual = np.sum((y - predictions) ** 2)
ss_total = np.sum((np.mean(y) - y) ** 2)
R2 = 1 - ss_residual / ss_total
print(R2)

0.7406426641094095


### GOOD RESULT FOR LINEAR REGRESSION. VALUES OVER 0.90 ARE RARE AND MIGHT RESULT FROM DATA SNOOPING OR LEAKAGE.

In [6]:
# Pair every feature name with its fitted coefficient, rounded to 2 decimals.
coef_labels = [
    name + ':' + str(round(weight, 2))
    for name, weight in zip(boston.feature_names, regression.coef_)
]
print(coef_labels)

['CRIM:-0.93', 'ZN:1.08', 'INDUS:0.14', 'CHAS:0.68', 'NOX:-2.06', 'RM:2.67', 'AGE:0.02', 'DIS:-3.1', 'RAD:2.66', 'TAX:-2.08', 'PTRATIO:-2.06', 'B:0.85', 'LSTAT:-3.74']


### THE ZIP FUNCTION YIELDS TUPLES BY PAIRING THE FIRST ELEMENT OF THE FIRST ARRAY WITH THE FIRST ELEMENT OF THE SECOND, THEN THE SECOND ELEMENTS, AND SO ON.

# FOR QUALITATIVE MEASURES

In [7]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [8]:
# LabelEncoder alternative, kept disabled:
#lbl = LabelEncoder()
# One-hot encoder with default settings; applied to the colour column below.
enc = OneHotEncoder()

In [9]:
# Toy categorical feature stored as a single column. reshape(-1, 1) infers the
# row count, so the shape stays correct if colours are added or removed.
qualitative = np.array(
    ['red', 'red', 'green', 'blue', 'red', 'blue', 'blue', 'green']
).reshape(-1, 1)

In [10]:
#qualitative = ['red', 'red', 'green', 'blue', 'red', 'blue', 'blue', 'green']

In [11]:
#labels = lbl.fit_transform(qualitative).reshape(8,1)
#print(labels)

In [12]:
# Fit and transform once, then reuse the result; the original refitted the
# encoder for each print call.
encoded = enc.fit_transform(qualitative)
print(type(encoded))       # the encoder returns a sparse matrix here
print(encoded.toarray())   # dense copy, one column per colour, for display

<class 'scipy.sparse.csr.csr_matrix'>
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]


### IF DATA IS MISSING, FITTING THE MODEL MAY FAIL
### FILL (IMPUTE) MISSING VALUES WITH 0, WITH LINEAR INTERPOLATION, OR WITH THE MEAN OF THAT FEATURE
### OTHER SOLUTION: CREATE AN ADDITIONAL BINARY FEATURE THAT FLAGS WHERE THE ORIGINAL FEATURE WAS MISSING.

<br>

### LINEAR REGRESSION IS DISRUPTED BY OUTLIERS.
### THE MODEL TRIES TO MINIMISE THE SUM OF THE SQUARED ERRORS (RESIDUALS).
### OUTLIERS HAVE LARGE RESIDUALS, AND SQUARING MAKES THEM DOMINATE THE FIT!!

<br>

### A LINEAR MODEL HAS ONE COEFFICIENT FOR EACH FEATURE, SO EACH FEATURE CONTRIBUTES TO THE PREDICTION INDEPENDENTLY OF THE OTHERS. THIS IS PERFECT FOR EXPRESSING AN UNCORRELATED FEATURE SITUATION, BUT IT CANNOT EXPRESS FEATURES THAT INTERACT. AS AN EXAMPLE: HUMAN AGE AND HAIR COLOR ARE CORRELATED.

### USE POLYNOMIAL EXPANSION OF THE FEATURES, where you add a beta coefficient for each second-degree term (squares and pairwise products) of the features, going from:

$y = b_1x_1 + b_2x_2 + a$

to 

$ y = b_1x_1 + b_2x_2 + a + b_3 x_1^2 + b_4x_2^2 + b_5 x_1 x_2$



In [13]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

In [15]:
# Degree-2 polynomial expansion of the scaled feature matrix.
pf = PolynomialFeatures(degree=2).fit(X)
poly_X = pf.transform(X)

In [16]:
# Show (rows, columns) of the expanded feature matrix.
n_rows, n_columns = poly_X.shape
print((n_rows, n_columns))

(506, 105)


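### the degree-2 expansion turns the 13 features into 105 columns: 1 constant (bias) column, the 13 original features, their 13 squares, and the 78 pairwise products (so each pair of features $x_i, x_j$ contributes the terms $x_i^2$, $x_j^2$ and $x_i x_j$).
### each one of these new features will have its own beta coefficient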



In [17]:
# Hold out 33% of the expanded data for testing; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    poly_X,
    y,
    test_size=0.33,
    random_state=42,
)

In [23]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

In [54]:
# Ridge regression (L2-regularised least squares) on the polynomial features.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned version before re-running.
ridge_settings = dict(alpha=0.2, normalize=True, max_iter=1000)
reg_regression = Ridge(**ridge_settings)
reg_regression.fit(X_train, y_train)

# Score the fitted model on the held-out split.
test_predictions = reg_regression.predict(X_test)
print('R2: %0.3f' % r2_score(y_test, test_predictions))

R2: 0.824


## what's happening?
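### split the polynomial dataset into train and test, holding out 33 % for test.
### use ridge regression (least squares with an L2 penalty on the coefficients)
### NORMALIZE THE DATA, AS THE POLYNOMIAL EXPANSION FEATURES ARE SQUARED AND SO LIVE ON VERY DIFFERENT SCALES!!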
### train on train split data
### find score of the regression predict of the X_test against the true value: y_test

In [None]:
### USING POLYNOMIAL EXPANSION DECREASES BIAS (AT THE COST OF HIGHER VARIANCE, WHICH IS WHY THE RIDGE PENALTY IS USED ABOVE)