In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline


In [None]:
import sys
# Report "name==version" for every object in the notebook namespace that exposes a
# truthy __version__ attribute (typically the imported libraries).
print('\n'.join(
    f'{m.__name__}=={m.__version__}'
    for m in globals().values()
    if getattr(m, '__version__', None)
))


In [None]:
import sys
import matplotlib

# One "label value" line per component of the environment.
for label, version in (
    ('python:', sys.version),
    ('Numpy:', np.__version__),
    ('Pandas:', pd.__version__),
    ('Scikitlearn:', sklearn.__version__),
    ('Seaborn: ', sns.__version__),
    ('matplotlib:', matplotlib.__version__),
):
    print(label, version)


In [None]:
# Load the whitespace-delimited housing file (it has no header row) and name its 14 columns.
# sep=r'\s+' replaces the deprecated delim_whitespace=True.
Boston=pd.read_csv('Data/housing.data', sep=r'\s+', header=None)
Boston.columns=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX',
                'PTRATIO','B','LSTAT','MEDV']
# Summary statistics of every column.
Boston.describe()


<table style="width:60%">
<tr>
<th> Code :   </th>
<th> Description : </th>
</tr>

<tr>
<th> CRIM </th> 
<th> per capita crime rate per town  </th>
</tr>
<tr>
<th> ZN </th>
<th> proportion of residential land zoned for lots over 25,000 sq. ft. </th>
</tr>
<tr>
<th> INDUS </th>
<th> proportion of non-retail business acres per town </th>
</tr>
<tr>
<th> CHAS</th>
<th> Charles River dummy variable (=1 if tract bounds river) </th>
</tr>
<tr>
<th> NOX </th>
<th> nitric oxides concentration (parts per 10 million) </th>
</tr>
<tr>
<th> RM </th>
<th> Average N rooms  </th>
</tr>
<tr>
<th>  AGE  </th>
<th> proportion of owner-occupied units built prior to 1940 </th>
</tr>
<tr>
<th>  DIS  </th>
<th> weighted distances to 5 Boston employment centers </th>
</tr>

<tr>
<th>  RAD  </th>
<th> index of accessibility to radial highways </th>
</tr>
    
<tr>
<th>  TAX  </th>
<th> full value property tax rate per 10 000 $ </th>
</tr>
    
<tr>
<th>  PTRATIO </th>
<th> pupil teacher ratio by town </th>
</tr>
    
<tr>
<th> B </th>
<th> 1000(Bk-0.63)², where Bk is the proportion of Black residents by town </th>
</tr>
    
<tr>
<th> LSTAT </th>
<th> % lower status of the population </th>
</tr>
    
<tr>
<th>  MEDV  </th>
<th> median value of owner occupied homes in 1000 $ </th>
</tr>
</table>


# Exploratory data Analysis (EDA) #

In [None]:
# Scatter-plot matrix of every pair of columns, small panels so all 14 fit.
sns.pairplot(data=Boston, height=1.5)
plt.show()


In [None]:
# First subset of columns, plotted with larger panels for readability.
Features=['CRIM','ZN','INDUS','NOX','RM']
sns.pairplot(data=Boston.loc[:, Features], height=2.5)
plt.show()


In [None]:
# Second subset of columns, including the target MEDV.
Features2=['AGE','TAX','PTRATIO','B','LSTAT','MEDV']
sns.pairplot(data=Boston.loc[:, Features2], height=2.5)
plt.show()


# Correlation & Features selection #

In [None]:
# Show floats with three decimals from here on, then display the pairwise correlations.
pd.set_option('display.float_format', '{:,.3f}'.format)
Boston.corr()


In [None]:
# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(16, 10))
sns.heatmap(data=Boston.corr(), annot=True)
plt.show()


In [None]:

# Annotated heatmap restricted to four predictors and the target.
plt.figure(figsize=(12, 10))
sns.heatmap(data=Boston.loc[:, ['CRIM','ZN','INDUS','CHAS','MEDV']].corr(), annot=True)
plt.show()


# Linear regression with Scikit-Learn #

## Five steps for using the Scikit-learn estimator API, by Jacob T. VanderPlas: ##
<ol>
<li> Choose a class of model by importing the appropriate estimator class from Scikit-learn </li>
<li> Choose model hyperparameters by instantiating this class with desired values
<li> Arrange data into a features matrix and target vector
<li> Fit the model to your data by calling the fit() method of the model instance
<li> Apply the model to the new data:
<ul>
<li> for supervised learning: often we predict labels for unknown data using predict() method
<li> for unsupervised learning: often we transform or infer properties of the data using the transform() or predict() method
</ul>
</ol>

### MEDV vs RM Linear regression ###

In [None]:
from sklearn.linear_model import LinearRegression
# Feature matrix must be 2-D (n_samples, 1); the target stays 1-D.
X=Boston[['RM']].to_numpy()
Y=Boston['MEDV'].to_numpy()


In [None]:
# fit() returns the estimator itself, so construction and fitting can be chained.
model=LinearRegression().fit(X,Y)
print(f'linear model coeficient={model.coef_} and B={model.intercept_}')


In [None]:
# Scatter of RM against MEDV with the fitted regression line and its confidence band.
plt.figure(figsize=(10, 6))
sns.regplot(x=X, y=Y, color='orange')
plt.gca().set(
    xlabel='Average N of rooms',
    ylabel='owner occupied houses mediane value 1000$',
)
plt.show()


In [None]:
# Joint distribution of RM and MEDV with a regression fit and marginal histograms.
sns.jointplot(data=Boston, x='RM', y='MEDV', kind='reg', height=10, color='orange')
plt.grid()
plt.show()


In [None]:

# Predicted MEDV for a single sample with RM = 5 (input shaped (1, 1)).
model.predict(np.array([[5]]))


### LSTAT vs MEDV Linear regression ###

In [None]:

# Same single-feature regression as above, this time with LSTAT as the predictor.
X2=Boston[['LSTAT']].to_numpy()
Y2=Boston['MEDV'].to_numpy()
model2=LinearRegression().fit(X2,Y2)
print(f'linear, a={model2.coef_} and b={model2.intercept_}')


In [None]:
# Scatter of LSTAT (x) against MEDV (y) with the fitted regression line.
plt.figure(figsize=(10,6))
sns.regplot(x=X2,y=Y2,color='DeepSkyBlue')
# X2 holds LSTAT and Y2 holds MEDV, so the labels must follow that order
# (they were swapped before).
plt.xlabel('% lower status of the population')
plt.ylabel('Median value of owner occupied homes in 1000 $')
plt.grid()
plt.show()


In [None]:
# Optional: joint distribution of LSTAT and MEDV with a regression fit.
sns.jointplot(data=Boston, x='LSTAT', y='MEDV', kind='reg', height=10, color='lime')
plt.grid()
plt.show()


In [None]:
# Predicted MEDV for a single sample with LSTAT = 15 (input shaped (1, 1)).
model2.predict(np.array([[15]]))


# Robust Regression #

## Random Sample Consensus (RANSAC) ##
<ol>
<li> select <strong> min_samples </strong>random samples from original data and check whether the dataset is valid <strong>(is_data_valid)</strong> </li>
<li> fit a model to a random subset <strong>(base_estimator.fit)</strong> and check whether the estimated model is valid <strong> (is_model_valid)</strong> </li>
<li> Classify all data as inliers or outliers by calculating the residuals to the estimated model <strong>(base_estimator.predict(x)-y)</strong>; all data samples
with absolute residuals smaller than the <strong>residual_threshold </strong>are considered as inliers </li>
<li> save the fitted model as the best model if the number of inlier samples is maximal; in case the current estimated model has the same number of inliers,
    it is considered as the best model only if it has a better score</li>
</ol>

In [None]:
from sklearn import linear_model
from sklearn import datasets
# Uses X (RM) and Y (MEDV) prepared in the earlier single-feature regression cell.
coef=True
# Robustly fit a linear model with the RANSAC algorithm. RANSAC draws random subsets,
# so a fixed random_state is needed for the inlier set and coefficients to be
# reproducible from one run to the next.
ransac = linear_model.RANSACRegressor(random_state=0)
ransac.fit(X,Y)
# Boolean masks selecting the samples RANSAC kept (inliers) and rejected (outliers).
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Compare estimated coefficients
print(f'Estimated coefficients (coef:{coef} , linear regression: {model.coef_},RANSAC= {ransac.estimator_.coef_}x+{ransac.estimator_.intercept_})')


In [None]:
# Evaluate both fitted lines on an integer grid of room counts and overlay them on the data.
line_x=np.arange(3,10,1)
line_y_ransac=ransac.predict(line_x[:, np.newaxis])
line_y=model.predict(line_x[:, np.newaxis])

sns.set(style='darkgrid', context='notebook')
plt.figure(figsize=(12,8))
# Inliers and outliers as classified by RANSAC.
for mask, colour, marker, label in (
    (inlier_mask, 'blue', 'o', 'Inliers'),
    (outlier_mask, 'violet', 's', 'Outliers'),
):
    plt.scatter(X[mask], Y[mask], c=colour, marker=marker, label=label)
plt.plot(line_x, line_y, color='grey',linewidth=2, label="Linear regressor")
plt.plot(line_x,line_y_ransac,color="red", linewidth=2,label="RANSAC regressor")
plt.xlabel('Average N rooms')
plt.ylabel('median value of owner occupied homes in 1000 $ ')
plt.legend(loc='lower right')
plt.show()


**linear regression Assumptions:** https://www.statisticssolutions.com/free-resources/directory-of-statistical-analyses/assumptions-of-linear-regression/

# Evaluate Regression Model Performance #

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Every column but the last is a feature; MEDV is the target.
X=Boston.iloc[:, :-1].to_numpy()
Y=Boston['MEDV'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test,y_train, y_test=train_test_split(X, Y, test_size=0.2, random_state=0)
lr=LinearRegression().fit(x_train,y_train)
y_train_pred, y_test_pred = lr.predict(x_train), lr.predict(x_test)


### Method1: Residual Analysis ###

In [None]:
# Residuals (prediction minus truth) against predicted value, for both splits.
plt.figure(figsize=(12,8))
for predicted, actual, colour, marker, label in (
    (y_train_pred, y_train, 'blue', 's', 'Training'),
    (y_test_pred, y_test, 'orange', 'o', 'Test'),
):
    plt.scatter(predicted, predicted-actual, c=colour, marker=marker, label=label)
plt.xlabel('Predicted values')
plt.ylabel('Residuals ')
plt.legend(loc='upper right')
# Zero-residual reference line spanning the visible x range.
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='k')
plt.xlim([-10,50])
plt.show()


### Method2: Mean Squared Error (MSE) ###
<ul>
<li>The average value of the sum-of-squared-errors cost function </li>
<li>Useful for comparing different regression models </li>
<li>For tuning parameters via a grid search and cross-validation </li>
</ul>

## $$MSE=\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2$$ ##

In [None]:
from sklearn.metrics import mean_squared_error

# A notebook cell only displays its last expression, so the train score used to be
# computed and silently dropped. Print both so they can be compared.
print(f'MSE train:{mean_squared_error(y_train,y_train_pred)} , MSE Test:{mean_squared_error(y_test,y_test_pred)}')


### Method3: Coefficient of determination R² ###
### $$ R^2=1-\frac{SSE}{SST}$$ ###
<ul>
<li>SSE: Sum of squared errors </li>
<li>SST: Total sum of squares </li>

</ul>


In [None]:
from sklearn.metrics import r2_score

# A notebook cell only displays its last expression, so the train score used to be
# computed and silently dropped. Print both so they can be compared.
print(f'R² train:{r2_score(y_train,y_train_pred)} , R² Test:{r2_score(y_test,y_test_pred)}')


# The perfect model #

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Synthetic data: y = 3x + standard normal noise, with x uniform on [0, 10).
# Both x and the noise are drawn from the same seeded generator; the noise was
# previously drawn from the unseeded global np.random state, so the data changed
# on every run.
generate_random=np.random.RandomState(0)
x=10*generate_random.rand(1000)
y=3*x+generate_random.randn(1000)
plt.figure(figsize=(10,8))
plt.scatter(x,y, marker='^', c='indigo')
plt.show()


In [None]:
# 70/30 split of the synthetic data; the 1-D feature is reshaped to a column for scikit-learn.
x_train, x_test,y_train, y_test=train_test_split(x,y, test_size=0.3, random_state=0)
model=LinearRegression().fit(x_train[:, np.newaxis],y_train)
y_train_pred=model.predict(x_train[:, np.newaxis])
y_test_pred=model.predict(x_test[:, np.newaxis])


In [None]:
# Residual analysis: prediction minus truth against predicted value, for both splits.

plt.figure(figsize=(12,8))
for predicted, actual, colour, marker, label in (
    (y_train_pred, y_train, 'blue', 's', 'Training'),
    (y_test_pred, y_test, 'orange', 'o', 'Test'),
):
    plt.scatter(predicted, predicted-actual, c=colour, marker=marker, label=label)
plt.xlabel('Predicted values')
plt.ylabel('Residuals ')
plt.legend(loc='upper right')
# Zero-residual reference line.
plt.hlines(y=0, xmin=3, xmax=33, lw=2, color='r')
plt.xlim([-5,35])
plt.ylim([-25,15])
plt.show()


In [None]:
# Mean squared error on the training and test splits.

from sklearn.metrics import mean_squared_error

print('MSE train:{} , MSE Test:{}'.format(
    mean_squared_error(y_train,y_train_pred),
    mean_squared_error(y_test,y_test_pred),
))


In [None]:
# Coefficient of determination on the training and test splits.
from sklearn.metrics import r2_score

# The values printed under the "R²" label were previously mean_squared_error results.
print(f'R² train:{r2_score(y_train,y_train_pred)} , R² Test:{r2_score(y_test,y_test_pred)}')


# Multiple Regression #

## $$ y=\beta_0+\beta_1 x_1+\beta_2 x_2+\dots $$ ##
 

In [None]:

# Reload the full table. The path matches the other load cells of this notebook, and
# sep=r'\s+' replaces the deprecated delim_whitespace=True.
Housing=pd.read_csv('Data/housing.data', sep=r'\s+', header=None)
Housing.columns=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX',
                'PTRATIO','B','LSTAT','MEDV']
# NOTE: from here on `Boston` holds only the 13 predictor columns (no MEDV).
Boston=Housing.iloc[:,0:13]

X=Boston
# Target kept as a single-column DataFrame (the old slice 13:15 ran past the last column).
Y=Housing[['MEDV']]
print(f'x= \n {X} and y=\n {Y}')


# Statsmodels #

In [None]:
import statsmodels.api as sm

# sm.OLS is given exactly the columns passed to it, so add an explicit constant
# column for the intercept.
X_constant=sm.add_constant(X)
model=sm.OLS(Y,X_constant)
lr=model.fit()
# Reading the table: compare the P>|t| column with the chosen significance level
# (commonly 0.05). The [0.025 0.975] columns are the bounds of each coefficient's
# 95% confidence interval, not a p-value threshold.
lr.summary()


## Summary: ##
<ul>
<li> <b>coefficient:</b> for the constant term it is the value of the intercept. For each variable, it measures how a change in that variable affects the dependent variable: it is the ‘m’ in ‘y = mx + b’. A one-unit change in the independent variable changes the dependent variable by the value of that variable’s coefficient, the other variables being held constant. </li>
<li><b>std error:</b> is an estimate of the standard deviation of the coefficient, a measurement of the amount of variation in the coefficient throughout its data points. A low std error compared to a high coefficient produces a high t statistic, which signifies a high significance for your coefficient.</li>
<li><b>P>|t|</b> is one of the most important statistics in the summary. It uses the t statistic to produce the p-value: the probability of obtaining a coefficient at least this far from zero purely by chance if the variable truly had no effect on the dependent variable. For example, a p-value of 0.378 for a variable means that a coefficient this large would appear about 37.8% of the time even if that variable had no effect, so the result is not statistically significant. A common alpha is 0.05; variables whose p-value is above it are usually not considered significant.</li>
<li><b>[0.025 and 0.975]</b> are the lower and upper bounds of the 95% confidence interval of each coefficient (roughly the estimate plus or minus two standard errors). If the interval contains zero, the coefficient is not significantly different from zero at the 5% level.</li> </ul>
The p-value is the smallest test size that would cause an observation of t=0.1 to lead to a rejection of the null hypothesis

### Residual Tests: ###
<ul>
<li><b>Omnibus:</b> a combined statistic test for skewness and kurtosis</li>
<li><b>prob(Omnibus):</b> P-value of Omnibus test</li>
<li><b>Skewness:</b> a measure of symmetry of residuals around the mean.Zero if symmetrical. A positive value indicates a long tail to the right, a negative value indicates a long tail to the left</li>
<li><b>Kurtosis:</b> A measure of the shape of the distribution of the residuals. The value reported by statsmodels is the raw (non-excess) kurtosis, for which a normal distribution measures 3. A lower value points to a flatter-than-normal distribution with lighter tails, a higher one to a more peaked distribution with heavier tails</li>
<li><b>Durbin-Watson:</b> A test for presence of correlation among the residuals, this is important for time series modelling</li>
<li><b>Jarque-Bera:</b> it is a combined statistical test of Skewness and Kurtosis</li>
<li><b>Prob(JB): </b>p_value of Jarque-Bera</li>
<li><b>Cond.No: </b>it is a test for multicollinearity. A value > 30 indicates unstable results.</li>
    
</ul>


In [None]:
import statsmodels.formula.api as smf

# Name the response by its column in `Housing` rather than through the notebook-global
# `Y`, so the formula depends only on the frame passed as `data`.
form_lr=smf.ols(formula='MEDV ~ CRIM+ZN+INDUS+CHAS+NOX+RM+AGE+DIS+RAD+TAX+PTRATIO+B+LSTAT',data=Housing)
mlr=form_lr.fit()
mlr.summary()


In [None]:
# Reduced model: explain MEDV from CRIM and B only.
# The response is named by its column in `Housing` rather than through the
# notebook-global `Y`, so the formula depends only on the frame passed as `data`.
model_ex=smf.ols(formula='MEDV ~ CRIM+B',data=Housing)
mlr_ex=model_ex.fit()
mlr_ex.summary()


# Correlation Matrix #

In [None]:

# Reload the table; sep=r'\s+' replaces the deprecated delim_whitespace=True.
Housing=pd.read_csv('Data/housing.data', sep=r'\s+', header=None)
Housing.columns=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX',
                'PTRATIO','B','LSTAT','MEDV']
# `Boston` / `X` hold the 13 predictors; `Y` is the MEDV target as a one-column frame
# (the old slice 13:15 ran past the last column).
Boston=Housing.iloc[:,0:13]
X=Boston
Y=Housing[['MEDV']]


In [None]:
# Two-decimal float display, then the predictor-only correlation matrix.
pd.set_option('display.float_format', '{:,.2f}'.format)
corr_matrix=Boston.corr()
corr_matrix


In [None]:
# Zero out every correlation whose magnitude is below 0.6, keeping only strong pairs.
corr_matrix=corr_matrix.mask(corr_matrix.abs() < 0.6, 0)
corr_matrix

In [None]:
# Ten colours from the 'tab20b' palette, previewed as a swatch strip.
palette = sns.color_palette(palette='tab20b', n_colors=10)
sns.palplot(palette)

# Heatmap of the thresholded correlation matrix, coloured with that palette.
plt.figure(figsize=(18, 9))
sns.heatmap(data=corr_matrix, annot=True, cmap=palette)
plt.show()


# Detecting colinearity with Eigenvectors #

In [None]:
# A correlation matrix is symmetric, so use eigh: it returns real eigenvalues sorted
# in ascending order, with the matching eigenvectors as columns.
eigenvalues, eigenvectors = np.linalg.eigh(Boston.corr())
display(pd.Series(eigenvalues, name='eigenvalue'))

# Absolute loadings of the eigenvector attached to the smallest eigenvalue (column 0).
# The features with the largest loadings are the ones involved in the near-collinearity.
loadings = pd.Series(np.abs(eigenvectors[:, 0]), index=Boston.columns, name='abs loading')
loadings = loadings.sort_values(ascending=False)
display(loadings)

# The three features that dominate that eigenvector.
print(*loadings.index[:3])


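Small eigenvalues (close to zero) indicate the presence of collinearity; the features with the largest loadings in the corresponding eigenvector are the ones involved.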

# Revising Feature importance and Extractions #

In [None]:
# Distribution of the TAX column (default 10 bins).
plt.hist(Boston.loc[:, 'TAX'])


# Standardise variable to identify Key Features #

In [None]:
from sklearn.linear_model import LinearRegression

pd.options.display.float_format='{:,.4f}'.format
model=LinearRegression()
model.fit(X,Y)
# Y is a single-column frame, so coef_ comes back 2-D (1, n_features). Flatten it to
# one scalar per feature; otherwise the column holds 1-element arrays instead of numbers.
mc=np.ravel(model.coef_)
bc=list(Boston.columns)
bcc=np.transpose(bc)
result=pd.DataFrame({'Name':bc,'Coefficient':mc}).set_index('Name')
# Rank features by coefficient magnitude (the sign is dropped).
co=result.abs().sort_values(by='Coefficient', ascending=False)
co


### 2nd method ###

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Standardise the features before the regression so coefficient magnitudes are comparable.
# A fresh LinearRegression is used so the previously fitted `model` is not refit in place.
scaler=StandardScaler()
standard_coefficient_linear_reg=make_pipeline(scaler,LinearRegression())
standard_coefficient_linear_reg.fit(X,Y)
# steps[1][1] is the fitted regression; flatten its 2-D coef_ to one scalar per feature.
scl=np.ravel(standard_coefficient_linear_reg.steps[1][1].coef_)
result=pd.DataFrame({'Name':list(Boston.columns),'Coefficient':scl}).set_index('Name')
# Rank features by coefficient magnitude (the sign is dropped).
co=result.abs().sort_values(by='Coefficient', ascending=False)
co


## Use R² to identify key features ##
<ul>
<li>Compare R² of the model with R² without a feature</li>
<li>significant change in R² means the importance of the feature</li>
</ul>

In [None]:
from sklearn.metrics import r2_score
import statsmodels.formula.api as smf

# Benchmark: all 13 predictors. `Y` is not a column of `Boston`; the formula picks it
# up from the notebook namespace.
benchmark_terms=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
linear_reg=smf.ols(formula='Y ~ ' + '+'.join(benchmark_terms),data=Boston)
Benchmark=linear_reg.fit()
r2_score(Y,Benchmark.predict(Boston))


## Without LSTAT ##

In [None]:
# Same model with LSTAT left out; compare this R² with the benchmark.
terms_without_lstat=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B']
linear_reg=smf.ols(formula='Y ~ ' + '+'.join(terms_without_lstat),data=Boston)
lr_lstat=linear_reg.fit()
r2_score(Y,lr_lstat.predict(Boston))


## Without AGE ##

In [None]:
# Same model with AGE left out; compare this R² with the benchmark.
terms_without_age=['CRIM','ZN','INDUS','CHAS','NOX','RM','DIS','RAD','TAX','PTRATIO','B','LSTAT']
linear_reg=smf.ols(formula='Y ~ ' + '+'.join(terms_without_age),data=Boston)
lr_AGE=linear_reg.fit()
r2_score(Y,lr_AGE.predict(Boston))


# Regularized Regression #
<ul>
<li>Ridge regression </li>
<li>Least absolute shrinkage and selection operator(LASSO) </li>
<li>Elastic net </li>
</ul>

## Ridge Regression ##

<p>Ridge regression is a model tuning method that is used to analyse any data that suffers from multicollinearity. 
This method performs <b>L2</b> regularization. When multicollinearity occurs, the least-squares estimates remain unbiased, 
but their variances are large, which results in predicted values being far away from the actual values. 

The cost function for ridge regression: </p>

## $$ Min(||X( \omega)-Y||_2^2 + \lambda||\omega||_2^2) $$ ##

<p>Lambda is the penalty term. λ given here is denoted by an alpha parameter in the ridge function. So, by changing the values of alpha, we are controlling the penalty term. The higher the values of alpha, the bigger is the penalty and therefore the magnitude of coefficients is reduced.
<ul>
<li>   It shrinks the parameters. Therefore, it is used to prevent multicollinearity </li> 
<li>    It reduces the model complexity by coefficient shrinkage </li> 
<li> Check out the free course on regression analysis. </li> 
</ul>
<b>Ridge Regression Models </b> 

For any type of regression machine learning model, the usual regression equation forms the base which is written as: </p>

<div>
$$ Y = X\beta+e $$
    </div>
<p>
Where Y is the dependent variable, X represents the independent variables, β is the vector of regression coefficients to be estimated, and e represents the errors, or residuals. 
Once we add the lambda function to this equation, the variance that is not evaluated by the general model is considered. After the data is ready and identified to be part of L2 regularization, there are steps that one can undertake.</p>
<b>Standardization </b>
<p>
In ridge regression, the first step is to standardize the variables (both dependent and independent) by subtracting their means and dividing by their standard deviations. This causes a challenge in notation since we must somehow indicate whether the variables in a particular formula are standardized or not. As far as standardization is concerned, all ridge regression calculations are based on standardized variables. When the final regression coefficients are displayed, they are adjusted back into their original scale. However, the ridge trace is on a standardized scale.

Also Read: Support Vector Regression in Machine Learning </p>
<b>Bias and variance trade-off</b>

Bias and variance trade-off is generally complicated when it comes to building ridge regression models on an actual dataset. However, following the general trend which one needs to remember is:

    The bias increases as λ increases.
    The variance decreases as λ increases.

<b>Assumptions of Ridge Regressions</b>

The assumptions of ridge regression are the same as that of linear regression: linearity, constant variance, and independence. However, as ridge regression does not provide confidence limits, the distribution of errors to be normal need not be assumed.
Now, let’s take an example of a linear regression problem and see how ridge regression if implemented, helps us to reduce the error.

We shall consider a data set on Food restaurants trying to find the best combination of food items to improve their sales in a particular region. 
<p>
if lambda is zero then you can imagine we get back OLS. However, if lambda is very large then it will add too much weight and it will lead to under-fitting. Having said that it’s important how lambda is chosen. This technique works very well to avoid over-fitting issue.</p>

## LASSO ##

<p>Is a linear model that estimates sparse coefficients. Called L1 regularization.
Mathematically, it consists of a linear model trained with an $\ell_1$ prior as regularizer. The objective function to minimize is:</p>

## $$ Min_\omega \frac{1}{2n_{samples}}||X( \omega)-Y||_2^2 + \lambda||\omega||_1 $$ ##
<p> The LASSO estimate thus solves the least-squares problem with the penalty $\lambda||\omega||_1$ added, where $\lambda$ is a constant and $||\omega||_1$ is the $\ell_1$-norm of the parameter vector.</p>
<p>
The key difference between these techniques is that Lasso shrinks the less important feature’s coefficient to zero thus, removing some feature altogether. So, this works well for feature selection in case we have a huge number of features.</p>


## ELASTIC Net ##

<p>A linear regression model trained with L1 and L2 prior as regularizer.
This combination allows for learning a sparse model where few of the weights are non 0 like LASSO, while still maintaining the regularization properties of ridge.
<b>ELASTIC NET</b> is useful when there are multiple features which are correlated with one another.<b>LASSO</b> is likely to pick one of these randomly, while <b>ELASTIC NET</b> is likely to pick both.
A practical advantage of trading-off between LASSO and Ridge is it allows <b>ELASTIC NET</b> to inherit some of Ridge's stability under rotation.
the objective function to minimize is in this case:
</p>

## $$ Min_\omega \frac{1}{2n_{samples}}||X( \omega)-Y||_2^2 + \lambda p||\omega||_1 + \frac{\lambda(1-p)}{2} ||\omega||_2^2$$ ##


# Outliers Impact #

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline


## Linear regression ##

In [None]:
from sklearn.linear_model import LinearRegression

# Synthetic line y = 0.5 x + noise. Note that `rng` here is the array of x values,
# not a random generator.
np.random.seed(42)
n_samples=100
rng=10*np.random.randn(n_samples)
y_gen=0.5*rng+2*np.random.randn(n_samples)
lr=LinearRegression()
# fit() returns the estimator, so `model` and `lr` are the same fitted object.
model=lr.fit(rng[:, np.newaxis], y_gen)
model_pred=lr.predict(rng[:, np.newaxis])
plt.figure(figsize=(10,8))
plt.scatter(rng, y_gen, color='b')
plt.plot(rng,model_pred, color='r' )
print(lr.coef_)


In [None]:
# Inject two extreme outliers at the largest and smallest x, then refit plain least squares.
idx=np.argmax(rng)
y_gen[idx]=200
idx=np.argmin(rng)
y_gen[idx]=-200
plt.figure(figsize=(10,8))
plt.scatter(rng, y_gen, color='green')
o_lr=LinearRegression()
o_lr.fit(rng[:, np.newaxis], y_gen)
o_model_predict=o_lr.predict(rng[:, np.newaxis])
plt.scatter(rng, y_gen, color='grey')
plt.plot(rng,o_model_predict, color='indigo' )
print(o_lr.coef_)


## Ridge Regression ##

In [None]:
from sklearn.linear_model import Ridge
from sklearn import preprocessing

# Ridge (L2 penalty, alpha=0.5) fitted on the data containing the injected outliers.
ridge_mod=Ridge(alpha=0.5, fit_intercept=True)
ridge_mod.fit(rng[:, np.newaxis], y_gen)
ridge_mod_pred=ridge_mod.predict(rng[:, np.newaxis])

plt.figure(figsize=(10,8))
plt.scatter(rng, y_gen, color='red')
plt.plot(rng,ridge_mod_pred, color='blue' )
ridge_mod.coef_


In [None]:
# Ridge with cross-validated alpha on a feature scaled to unit variance (not centred).
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeCV

ridge_modN = make_pipeline(StandardScaler(with_mean=False),RidgeCV())
ridge_modN.fit(rng[:, np.newaxis], y_gen)
ridge_modN_pred=ridge_modN.predict(rng[:, np.newaxis])

plt.figure(figsize=(10,8))
plt.scatter(rng, y_gen, color='red')
plt.plot(rng,ridge_modN_pred, color='blue' )

# make_pipeline names each step after its lower-cased class name.
ridge_modN['ridgecv'].coef_


## LASSO ##

In [None]:
from sklearn.linear_model import Lasso

# Lasso (L1 penalty, alpha=0.5) fitted on the data containing the injected outliers.
Lasso_mod=Lasso(alpha=0.5, fit_intercept=True)
Lasso_mod.fit(rng[:, np.newaxis], y_gen)
Lasso_mod_pred=Lasso_mod.predict(rng[:, np.newaxis])

plt.figure(figsize=(10,8))
plt.scatter(rng, y_gen, color='green')
plt.plot(rng,Lasso_mod_pred, color='blue' )
Lasso_mod.coef_


In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Lasso on a feature scaled to unit variance, mirroring the scaled Ridge cell above.
# The previous version called an undefined `pipeline`, chained an already-fitted model
# with the Lasso class itself, and plotted names that were never assigned.
lasso = Lasso(alpha=0.1)
Lasso_modN = make_pipeline(StandardScaler(with_mean=False), lasso)
Lasso_modN.fit(rng.reshape(-1,1), y_gen)
Lasso_modN_pred = Lasso_modN.predict(rng.reshape(-1,1))

plt.figure(figsize=(10,8))
plt.scatter(rng, y_gen, color='red')
plt.plot(rng,Lasso_modN_pred, color='blue' )

# make_pipeline names each step after its lower-cased class name.
Lasso_modN['lasso'].coef_


## Elastic Net regression ##

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net (mixed L1/L2 penalty, alpha=0.5) on the data containing the injected outliers.
en_model=ElasticNet(alpha=0.5, fit_intercept=True)
en_model.fit(rng[:, np.newaxis], y_gen)
en_model_pred=en_model.predict(rng[:, np.newaxis])

plt.figure(figsize=(10,8))
plt.scatter(rng, y_gen, color='orange')
plt.plot(rng,en_model_pred, color='blue' )
en_model.coef_


### When to use Ridge, Lasso or Elasticnet? ###
<ul>
<li><b>Ridge Regression</b> can't zero out coefficient, you either end up including all the coefficients in the model, or none of them.</li>
<li><b>LASSO</b> does both parameters shrinkage and variable selection automatically.</li>
<li>if some of your covariates are highly correlated, you may want to look at the <b>ElasticNet</b> instead of <b>LASSO</b>.</li>
</ul>

# Polynomial Regression #

In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
import sys
import matplotlib

# One "label value" line per component of the environment.
for label, version in (
    ('python:', sys.version),
    ('Numpy:', np.__version__),
    ('Pandas:', pd.__version__),
    ('Scikitlearn:', sklearn.__version__),
    ('Seaborn: ', sns.__version__),
    ('matplotlib:', matplotlib.__version__),
):
    print(label, version)


## $$ y= x^3+100+\epsilon $$ ##

In [None]:
# Synthetic cubic data: y = x^3 + 100 + Gaussian noise (standard deviation 100).
np.random.seed(42)
n_samples=100
X=np.linspace(0,10,100)
# `rng` is the noise vector here, not a generator.
rng=100*np.random.randn(n_samples)
Y=(X**3+100)+rng
plt.figure(figsize=(10,8))
plt.scatter(X,Y,c='#7f7f7f' )
plt.grid()
plt.show()


### Linear Regression ###

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Straight-line fit to the cubic data, as a baseline.
lr=LinearRegression().fit(X[:, np.newaxis],Y)
lr_pred=lr.predict(X[:, np.newaxis])
plt.figure(figsize=(10,8))
plt.scatter(X,Y,c='#139fe8' )
plt.plot(X,lr_pred, c='#8a010d' )
plt.grid()
plt.show()
print(f'model Coef={lr.coef_}, R² ={r2_score(Y,lr_pred)}')


### Polynomial Regression ###

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial features (1, x, x^2) followed by ordinary least squares.
poly_reg=PolynomialFeatures(degree=2)
X_poly=poly_reg.fit_transform(X[:, np.newaxis])
lr2=LinearRegression()
# The target is passed as a column, so coef_ and the predictions are 2-D.
lr2.fit(X_poly,Y[:, np.newaxis])
Y_pred=lr2.predict(X_poly)

plt.figure(figsize=(10,8))
plt.scatter(X,Y,c='#14047b' )
plt.plot(X,Y_pred, c='red' )
plt.grid()
plt.show()
print(f'model Coef={lr2.coef_}, R² ={r2_score(Y,Y_pred)}')


### Example: Boston dataset ###

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Reload the full table. The path matches the other load cells of this notebook, and
# sep=r'\s+' replaces the deprecated delim_whitespace=True.
Boston=pd.read_csv('Data/housing.data', sep=r'\s+', header=None)
Boston.columns=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX',
                'PTRATIO','B','LSTAT','MEDV']

pd.options.display.float_format='{:,.3f}'.format
# pairplot's `size` argument was renamed to `height` (as used everywhere else in this
# notebook); the old name is rejected by current seaborn.
sns.pairplot(Boston, height=1.5)
Boston.corr()


In [None]:
# DIS as predictor and NOX as response for the polynomial example.
X_boston=Boston['DIS'].to_numpy()
Y_boston=Boston['NOX'].to_numpy()
plt.figure(figsize=(12,6))
plt.scatter(x=X_boston,y=Y_boston)
plt.grid()
plt.show()


### linear Regression ###

In [None]:
# Straight-line fit of NOX on DIS, as a baseline for the polynomial fits.
lin=LinearRegression().fit(X_boston[:, np.newaxis],Y_boston)
lin_pred=lin.predict(X_boston[:, np.newaxis])
plt.figure(figsize=(10,8))
plt.scatter(X_boston,Y_boston,c='darkslateblue' )
plt.plot(X_boston,lin_pred, c='fuchsia' )
plt.grid()
plt.show()
print(f'model Coef={lin.coef_}, R² ={r2_score(Y_boston,lin_pred):.3f}')


In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Quadratic fit of NOX on DIS: degree-2 features followed by ordinary least squares.
poly=PolynomialFeatures(degree=2)
X_poly=poly.fit_transform(X_boston.reshape(-1,1))
poly2=LinearRegression()
poly2.fit(X_poly,Y_boston.reshape(-1,1))
# Dense, evenly spaced grid covering the full DIS range (including its maximum), so the
# fitted curve is drawn smoothly instead of through a handful of unit-spaced points.
X_fit=np.linspace(X_boston.min(), X_boston.max(), 200)[:,np.newaxis]
# The transformer is already fitted; only transform the grid, do not refit on it.
Y_pred=poly2.predict(poly.transform(X_fit))

plt.figure(figsize=(12,6))
plt.scatter(X_boston,Y_boston, c='forestgreen')
plt.plot(X_fit,Y_pred,c='lightsalmon', linewidth=3)
plt.grid()
plt.show()
print(f'model Coef={poly2.coef_}, R² ={r2_score(Y_boston,poly2.predict(X_poly)):.3f}')


### Cubic Regression ###

In [None]:
# Cubic fit of NOX on DIS: degree-3 features followed by ordinary least squares.
poly_3=PolynomialFeatures(degree=3)
X_poly=poly_3.fit_transform(X_boston.reshape(-1,1))
poly3=LinearRegression()
poly3.fit(X_poly,Y_boston.reshape(-1,1))
# Dense, evenly spaced grid covering the full DIS range (including its maximum), so the
# fitted curve is drawn smoothly instead of through a handful of unit-spaced points.
X_fit=np.linspace(X_boston.min(), X_boston.max(), 200)[:,np.newaxis]
# The transformer is already fitted; only transform the grid, do not refit on it.
Y_pred3=poly3.predict(poly_3.transform(X_fit))

plt.figure(figsize=(12,6))
plt.scatter(X_boston,Y_boston, c='forestgreen')
plt.plot(X_fit,Y_pred3,c='lightsalmon', linewidth=3)
plt.grid()
plt.show()
print(f'model Coef={poly3.coef_}, R² ={r2_score(Y_boston,poly3.predict(X_poly)):.3f}')


# Non linear relationships #

In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline


In [None]:

# Reload the table; sep=r'\s+' replaces the deprecated delim_whitespace=True.
Data=pd.read_csv('Data/housing.data', sep=r'\s+', header=None)
# Rebuild the frame from the raw array so every column shares one numeric dtype,
# then attach the 14 column names (13 predictors plus the MEDV target).
Boston=pd.DataFrame(data=np.array(Data),index=None, columns=('CRIM','ZN','INDUS','CHAS',
                                                             'NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV'), )

Boston


### Decision Tree ###

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-5 regression tree predicting MEDV from LSTAT alone.
Y=Boston['MEDV']
X=Boston['LSTAT'].values
tree=DecisionTreeRegressor(max_depth=5)
tree.fit(X.reshape(-1,1),Y)
# Sort by LSTAT so the piecewise-constant prediction is drawn left to right.
sort_idx=X.flatten().argsort()
plt.figure(figsize=(10,8))
# sort_idx holds positions, so index the Series positionally with .iloc.
plt.scatter(X[sort_idx],Y.iloc[sort_idx], c='b')
plt.plot(X[sort_idx],tree.predict(X[sort_idx].reshape(-1,1)), color='r', linewidth=2)
# The predictor column is LSTAT (the label previously read 'STAT').
plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.show()


## Random Forest ##

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor


In [None]:
X=Boston
Y=Boston['MEDV']
X_train, X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,train_size=0.7, random_state=42) 
#Criterion: 'poisson', 'squared_error', 'friedman_mse', 'absolute_error'
forest=RandomForestRegressor(n_estimators=500, criterion='friedman_mse', random_state=42, n_jobs=-1)
forest.fit(X_train, Y_train)
Y_train_pred=forest.predict(X_train)
Y_test_pred=forest.predict(X_test)
print(f'MSE Train={mean_squared_error(Y_train,Y_train_pred):.4f} , MSE Test={mean_squared_error(Y_test,Y_test_pred):.4f}')
print(f'R² Train={r2_score(Y_train,Y_train_pred):.4f} , R² Test={r2_score(Y_test,Y_test_pred):.4f}')


## AdaBoost ##

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

ada=AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=500, random_state=42)
ada.fit(X_train,Y_train)
Y_train_predict=ada.predict(X_train)
Y_test_predict=ada.predict(X_test)
print(f'MSE Train={mean_squared_error(Y_train,Y_train_pred):.4f} , MSE Test={mean_squared_error(Y_test,Y_test_pred):.4f}')
print(f'R² Train={r2_score(Y_train,Y_train_pred):.4f} , R² Test={r2_score(Y_test,Y_test_pred):.4f}')


### Feature importance ###

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor

X=Boston
Y=Boston['MEDV']
X_train, X_test,Y_train,Y_test=train_test_split(X,Y, test_size=0.3, random_state=42) 
tree=DecisionTreeRegressor(max_depth=5)
tree.fit(X,Y) #reshape(-1,1)
#sort_idx=X.flatten().argsort()

Y_train_pred=tree.predict(X_train)
Y_test_pred=tree.predict(X_test)
print(f'MSE Train={mean_squared_error(Y_train,Y_train_pred):.4f} ,MSE Test={mean_squared_error(Y_test,Y_test_pred):.4f}')
print(f'R² Train={r2_score(Y_train,Y_train_pred):.4f} , R² Test={r2_score(Y_test,Y_test_pred):.4f}')

result= pd.DataFrame(tree.feature_importances_, Boston.columns)
result.columns=['Features']
rt=result.sort_values(by='Features', ascending=False)
rt.plot(kind='bar', figsize=(10,5), edgecolor='black')



In [None]:
# Random Forest
forest.feature_importances_
result= pd.DataFrame(forest.feature_importances_, Boston.columns)
result.columns=['Features']
rt=result.sort_values(by='Features', ascending=False)
rt.plot(kind='bar', figsize=(10,8), edgecolor='black')


In [None]:
# Ada
ada.feature_importances_
result= pd.DataFrame(ada.feature_importances_, Boston.columns)
result.columns=['Features']
rt=result.sort_values(by='Features', ascending=False)
rt.plot(kind='bar', figsize=(10,4), edgecolor='black')


# Data Preprocessing #

In [None]:
# Reload the raw housing table (whitespace-separated, no header row) and rebuild
# `Boston` with named columns for the preprocessing section.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
Data=pd.read_csv('Data/housing.data',sep=r'\s+', header=None)
Boston=pd.DataFrame(data=np.array(Data),index=None,
                    columns=('CRIM','ZN','INDUS','CHAS','NOX',
                             'RM','AGE','DIS','RAD','TAX','PTRATIO',
                             'B','LSTAT','MEDV'), )#.iloc[:,:-1]

#sns.pairplot(Boston, height=1.5)
#plt.savefig('CorrMatrix.png')
# Notebook-wide display option: floats rendered with 3 decimals and thousands separators.
pd.options.display.float_format='{:,.3f}'.format
# Pairwise correlation matrix of all columns (kept for the commented lookup below).
co=Boston.corr()
Boston
#co['MEDV'].sort_values(ascending=False)


In [None]:
# Single-feature regression data as plain 1-D NumPy arrays:
# LSTAT is the predictor, MEDV the response.
X=Boston['LSTAT'].to_numpy()
Y=Boston['MEDV'].to_numpy(copy=True)
fig, ax = plt.subplots(figsize=(10,5))
ax.scatter(X,Y)
plt.show()
X.shape


### Without Preprocessing ###

<p>Gradient descent is a widely used machine learning algorithm. It tells us how we can do better in predictive modeling <br>with an iterative approach. We will see how we can use the gradient descent algorithm to get better predicting results in linear regression.</p>
<b> cost function in linear regression: </b><br>
 $$\sum_{i=1}^n\left(y_i-(\beta_1 x_i+\beta_0)\right)^2$$ 
<p>A cost function is a measure of how wrong the model is in terms of its ability to estimate the relationship between x and y. <br>
The size of each step is set by the learning rate, a constant (generally denoted as alpha ‘α’). The learning rate determines how far we move downhill at each step, so that convergence is reached in as few steps as possible. At every step, the algorithm calculates the slope (gradient) of the cost function at the current point. Convergence is attained when the slope is equal to zero.</p>
Gradient descent steps:
<ol>
<li>Adding a column of ones to x vector: 
  <b>x=np.c_[np.ones(x_RM.shape[0]),x_RM, x_LSTAT]</b></li>
<li>Guess/Random θ: $$ \theta \in \mathbb{R}^{(n+1)\times 1} \;\rightarrow\; \theta_0, \theta_1, \dots, \theta_n  $$  </li> <br>
we have x and θ_0, θ_1...θ_n  , we can do linear regression and predict the error 
<li>Predict the y values : </li>
 $$ pred^{(i)}=\theta_0 \cdot 1 + \sum_{j=1}^n \theta_j x_j^{(i)}   $$  
<li>Calculate the error: calculate MSE 
     $$MSE=\frac{1}{n}\sum_{i=1}^n (y_i-\hat{y}_i)^2$$  </li>
<li>Calculate the cost function :</li>
  $$  cost=\frac{1}{2m} \sum_{i=1}^m (error)^2 $$ 
<li>Update θ :</li>
    $$ \theta := \theta - \alpha \frac{1}{m} \sum_{i=1}^m \left(\hat{y}^{(i)}-y^{(i)}\right) x^{(i)} $$ 
<li>Repeat till the change in the cost function is negligible </li>
</ol>


In [None]:
#Gradient Descent (batch) on the raw, unscaled feature.
# NOTE(review): with unscaled inputs this step size is expected to be too large,
# so the cost may grow instead of shrink - compare with the standardized run.

alpha=0.0001                                      # learning rate
X_col=np.asarray(X,dtype=float).reshape(len(X),-1) # (n_samples, n_features); X itself is left untouched
w_=np.zeros(1+X_col.shape[1])                     # w_[0] = intercept, w_[1:] = one weight per feature
cost_=[]
n_=100
for i in range (n_):
    y_pred=np.dot(X_col,w_[1:])+w_[0]
    errors=Y-y_pred
    # step = learning rate * gradient (the rate multiplies the gradient, it is not added to it)
    w_[1:]+=alpha*X_col.T.dot(errors)
    w_[0]+=alpha*errors.sum()
    cost=(errors**2).sum()/2.0
    cost_.append(cost)

plt.figure(figsize=(10,5))
plt.plot(range(1, n_+1),cost_)
# x-axis is the iteration number, y-axis the (halved) sum of squared errors
plt.xlabel('Epoch')
plt.ylabel('SSE')
plt.show()


### With preprocessing ###

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize predictor and response separately (zero mean, unit variance),
# then run the same batch gradient descent on the scaled data.
sc_x=StandardScaler()
sc_y=StandardScaler()
X_std=sc_x.fit_transform(X.reshape(-1,1))
Y_std=sc_y.fit_transform(Y.reshape(-1,1)).flatten()

#Gradient Descent:
alpha=0.0001                    # learning rate
w_=np.zeros(1+X_std.shape[1])   # w_[0] = intercept, w_[1:] = one weight per feature
cost_=[]
n_=100
for i in range (n_):
    # use every column of X_std; a hard-coded slice would silently drop features
    y_pred=np.dot(X_std,w_[1:])+w_[0]
    errors=Y_std-y_pred
    w_[1:]+=alpha*X_std.T.dot(errors)
    w_[0]+=alpha*errors.sum()
    cost=(errors**2).sum()/2.0
    cost_.append(cost)

plt.figure(figsize=(10,5))
plt.plot(range(1,n_+1),cost_)
# x-axis is the iteration number, y-axis the (halved) sum of squared errors
plt.xlabel('Epoch')
plt.ylabel('SSE')
plt.show()


In [None]:
# Histogram of the raw feature on a fixed, symmetric x-range.
fig, ax = plt.subplots(figsize=(10,5))
ax.hist(X)
ax.set_xlim(-40,40)


In [None]:
# Same histogram after standardization; the x-range shrinks to a few units.

fig, ax = plt.subplots(figsize=(10,5))
ax.hist(X_std)
ax.set_xlim(-4,4)


Data pre-processing:
<ul>
<li>Standardisation / mean removal : mean =0 and variance= 1</li>
<li>Normalization</li>
<li>Binarization</li>
</ul>
<b>Assumptions:</b>
    <ul>
<li>Implicit / Explicit assumptions of ML algorithms: The features follow a normal distribution</li>
<li>Most methods are based on linear assumptions</li>
<li>Most ML requires data to be standard normally distributed. Gaussian with 0 mean and unit variance</li>
</ul>


## Standardization /Mean removal / Variance Scaling ##

$$ X'= \frac{(X-\bar{X})}{\sigma} $$

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Split first, then standardize the training features column by column.
# The target MEDV is kept out of the feature matrix.
X=Boston.drop(columns='MEDV')
Y=Boston['MEDV']
X_train, X_test,Y_train,Y_test=train_test_split(X,Y,
                                                test_size=0.3, random_state=42)
X_scaled=preprocessing.scale(X_train)
# Per-column mean of the scaled training data.
X_scaled.mean(axis=0)
#X_scaled.std(axis=0)


Training data is scaled, we must do likewise with Test data. However the assumption is that mean and variance are the same between Test and Train!

In [None]:
#StandardScaler
# Learn per-column mean/scale on the training split only; the fitted scaler is
# reused later for the test split.
scaler=preprocessing.StandardScaler().fit(X_train)
#scaler.scale_
# Keep the transformed array instead of discarding it.
X_train_scaled=scaler.transform(X_train)
scaler.mean_


In [None]:

# One histogram series per column of the training split.
fig, ax = plt.subplots(figsize=(10,5))
ax.hist(X_train)
plt.show()


In [None]:
# Apply the scaler fitted on the training split to the test split and plot the
# result. The transform returns a new array, so it must be kept and plotted;
# X_test itself is left unchanged.
X_test_scaled=scaler.transform(X_test)
plt.figure(figsize=(10,5))
plt.hist(X_test_scaled)
plt.show()
scaler.mean_


## Min-Max Scaling Feature to a range ##

In [None]:
# Toy 3x3 matrix used to learn the min-max scaling parameters.
X_train=np.array([
    [1,-1,2],
    [2,0,0],
    [0,1,-1],
])
minmax_scaler=preprocessing.MinMaxScaler().fit(X_train)
X_train_minmax=minmax_scaler.transform(X_train)
X_train_minmax


In [None]:
# New rows are rescaled with the scaler already fitted on the toy training matrix.
X_test=np.array([
    [-3,-1,0],
    [2,1.5,4],
])
X_test_minmax=minmax_scaler.transform(X_test)
X_test_minmax


## Scaling Sparse data ##
centering sparse data would destroy the sparseness structure in the data, and thus, rarely is a sensible thing to do.
However, it can make sense to scale sparse inputs, especially when features are on a different scales.
MaxAbsScaler and maxabs_scale were especially designed for scaling sparse data.

## Scaling Vs Whitening ##
It is sometimes not enough to center and scale the features independently, since a downstream model can further make some assumptions about the linear independence of the features.
To address this issue you can use sklearn.decomposition.PCA with whiten=True (the former sklearn.decomposition.RandomizedPCA is now PCA with svd_solver='randomized') to further remove the linear correlation across features.

# Normalization #

Normalization is the process of scaling individual samples to have unit norm.
This process can be useful if you plan to use a quadratic form such as the dot-product or any other kernel to quantify the similarity of any pair of samples.
$$ x'= \frac{x}{\lVert x \rVert_p}, \quad p \in \{1, 2\} $$
<p>
This assumption is the base of the vector space model often used in text classification and clustering contexts.
there are 2 types of Normalization:
<ol>
<li> <b>L1 Normalization:</b> least absolute deviations ensure the sum of absolute values is 1 in each row. </li>
<li> <b>L2 Normalization:</b> least squares, ensure that the sum of squares is 1. </li>
</ol>
</p>   

In [None]:
from sklearn import preprocessing

# Rescale every row of the toy matrix to unit L2 norm.
X=[
    [1,-1,2],
    [2,0,0],
    [0,1,-1],
]
X_norm=preprocessing.normalize(X, norm='l2')
X_norm


The preprocessing module further provides a utility class "Normalizer" that implements the same operation using the "Transformer" API.

In [None]:
# Transformer-API version of the row normalization above; transform is called
# directly, without a prior fit on X.
normalizer=preprocessing.Normalizer(norm='l2')
normalizer.transform(X)

# Binarization #
$$ f(x) = 0,1 $$
Feature binarization is the process of thresholding numerical features to get boolean values. This can be useful for downstream probabilistic estimators that make the assumption that the input data is distributed according to a multivariate Bernoulli distribution.
It is also common among the text processing community to use binary feature values(probably to simplify the probabilistic reasoning) even if normalized counts (term frequencies) or TF-IDF valued features often perform slightly better in practice.

In [None]:
# Binarize the toy matrix with the default cut-off of 0.
X=[
    [1,-1,2],
    [2,0,0],
    [0,1,-1],
]
binarizer=preprocessing.Binarizer(threshold=0.0)
binarizer.transform(X)


In [None]:
# Modifying the threshold: values greater than the threshold become 1,
# values less than or equal to it become 0.
binarizer= preprocessing.Binarizer(threshold=-0.5)
binarizer.transform(X)


### categorical features ###

In [None]:
# Encode country names as integer codes and print the learned "label: code" table.
source=['Australia', 'Singapore', 'New Zealand', 'Hong Kong']
label_enc=preprocessing.LabelEncoder().fit(source)
src=label_enc.transform(source)
print('\n'.join(f'{name}:\t{code}' for code,name in enumerate(label_enc.classes_)))
# Encode a second list with the already fitted encoder.
test_data=['Hong Kong', 'Singapore', 'Australia', 'New Zealand']
result=label_enc.transform(test_data)
result


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the integer codes; they are first reshaped into a single column.
one_hot_enc=OneHotEncoder(categories='auto', sparse_output=False)
src=src.reshape(-1,1)
one_hot=one_hot_enc.fit(src).transform(src)
one_hot


In [None]:
# Recover a label from a one-hot row: argmax gives the integer code and
# inverse_transform maps it back to the string.
# Row 0 is decoded first, then overwritten by row 2; only the latter is displayed.
code_row0=one_hot[0].argmax()
invert_res=label_enc.inverse_transform([code_row0])
code_row2=one_hot[2].argmax()
invert_res=label_enc.inverse_transform([code_row2])
invert_res


# Variance Bias Trade off #

Every estimator has its advantages and its drawbacks. Its generalization error can be decomposed in terms of Bias, variance and noise.
The bias of an estimator is its average error for different training sets. The variance of an estimator indicates how sensitive it is to varying training sets. Noise is a property of the datasets. 
Bias and variance are inherent properties of estimators and we usually have to select learning algorithms and hyperparameters so that both bias and variance are as low as possible.
Another way to reduce the variance of a model is to use more training data. However, you should only collect more training data if the true function is too complex to be approximated by an estimator with a lower variance.


## Validation Curve ##
<ul>
<li>for identifying over and under fitting.</li>
<li>for plotting training and validation scores VS model parameters.</li>
</ul> 

 ### a/ For ridge regression ###

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.linear_model import Ridge

np.random.seed(0)
# Forward slash keeps the path portable and avoids the invalid '\I' escape;
# comma separator and an inferred header row are read_csv's defaults.
iris=pd.read_csv('Data/Iris.csv')
iris


 ### b/ Plotting Validation Curve ###

Training scores and validation scores of a Support Vector Machine for different values of the kernel parameter gamma. For very low values of gamma, you can see that both the training score and the validation score are low: this is called underfitting.<br>
Medium values of gamma will result in high values for both scores.<br>
If gamma is too high, the classifier will overfit, which means that the training score is good but the validation score is poor.


 ### c/ Learning Curve ###

<ul>
<li> Shows the validation and training score of an estimator for varying numbers of training samples.</li>
<li> A tool to find out how much we benefit from adding more training data and whether the estimator suffers more from a variance error or a bias error.</li>
<li> if both the validation score and the training score converge to a value that is too low with increasing size of the training set, we will not benefit much from more training data.</li>
</ul>
### The high bias learning curve (fig1) ###
<ul>
<li> Low training and test accuracy /score , underfits the training data
<li> The actual performance (accuracy or R²) level is far from the desired level of performance 
</ul>
address by:
<ul>
<li> Increase the number of parameters, adding or creating new feature. </li>
<li> Decrease the regularisation. </li>
</ul>
### The high variance learning curve (fig2) ###
<ul>
<li> The training and test level do not converge or converge slowly . </li>
<li> The training and test level are still very far apart, this is an example of overfitting  . </li>
</ul>
address by:
<ul>
<li> Collect more training data </li>
<li> Reduce model complexity </li>
<li> Increase regularisation </li>
</ul>
### An example (fig3)of good variance bias trade off learning curve. ###
<ul>
<li> The actual level of performance (accuracy or R²) achieved is close to desired level of performance.</li>
<li> The training and test (validation curve) are tight and converge to similar level.</li>
</ul>
<img src="Img_1.jpg" alt="variance bias trade off" width="600"/>

in summary, we look for :
<ul>
<li> <b>Bias:</b> Evaluate via score /accuracy level. </li>
<li> <b> Variance: </b>Evaluate via the convergence speed and distance between training score/accuracy level and test score/accuracy level. </li>
</ul>


www.statology.org Explanation


What is the Bias-Variance Tradeoff in Machine Learning?

To evaluate the performance of a model on a dataset, we need to measure how well the model predictions match the observed data.

For regression models, the most commonly used metric is the mean squared error (MSE), which is calculated as:

$$ MSE = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - f(x_i)\right)^2 $$

where:

    n: Total number of observations
    yi: The response value of the ith observation
    f(xi): The predicted response value of the ith observation

The closer the model predictions are to the observations, the smaller the MSE will be.

However, we only care about test MSE – the MSE when our model is applied to unseen data. This is because we only care about how the model will perform on unseen data, not existing data.

For example, it’s nice if a model that predicts stock market prices has a low MSE on historical data, but we really want to be able to use the model to accurately forecast future data.

It turns out that the test MSE can always be decomposed into two parts:

(1) The variance: Refers to the amount by which our function f would change if we estimated it using a different training set.

(2) The bias: Refers to the error that is introduced by approximating a real-life problem, which may be extremely complicated, by a much simpler model.

Written in mathematical terms:

$$ \text{Test MSE} = Var(\hat{f}(x_0)) + [Bias(\hat{f}(x_0))]^2 + Var(\varepsilon) $$

$$ \text{Test MSE} = \text{Variance} + \text{Bias}^2 + \text{Irreducible error} $$

The third term, the irreducible error, is the error that cannot be reduced by any model simply because there always exists some noise in the relationship between the set of explanatory variables and the response variable.

Models that have high bias tend to have low variance. For example, linear regression models tend to have high bias (assumes a simple linear relationship between explanatory variables and response variable) and low variance (model estimates won’t change much from one sample to the next).

However, models that have low bias tend to have high variance. For example, complex non-linear models tend to have low bias (does not assume a certain relationship between explanatory variables and response variable) with high variance (model estimates can change a lot from one training sample to the next).
The Bias-Variance Tradeoff

The bias-variance tradeoff refers to the tradeoff that takes place when we choose to lower bias which typically increases variance, or lower variance which typically increases bias.

The following chart offers a way to visualize this tradeoff:

<img src='Img_3.png' alt='error with model complexity' width=800 />

The total error decreases as the complexity of a model increases but only up to a certain point. Past a certain point, variance begins to increase and total error also begins to increase.

In practice, we only care about minimizing the total error of a model, not necessarily minimizing the variance or bias. It turns out that the way to minimize the total error is to strike the right balance between variance and bias.

In other words, we want a model that is complex enough to capture the true relationship between the explanatory variables and the response variable, but not overly complex such that it finds patterns that don’t really exist.

When a model is too complex, it overfits the data. This happens because it works too hard to find patterns in the training data that are just caused by random chance. This type of model is likely to perform poorly on unseen data.

But when a model is too simple, it underfits the data. This happens because it assumes the true relationship between the explanatory variables and the response variable is more simple than it actually is.

The way to pick optimal models in machine learning is to strike the balance between bias and variance such that we can minimize the test error of the model on future unseen data.

In practice, the most common way to minimize test MSE is to use cross-validation.


### d/ Validation Curve ###
By looking at the curve, we can determine if the model is underfitting, overfitting or just-right for some range of hyperparameter values of max_depth. Note that, in the graph, the accuracy score of the train set is marked as the “Training Score” and the accuracy score of the test set is marked as the “Cross-Validation Score”.
<ul>
<li><b>Underfitting: </b>Accuracy scores of both train and test sets are low. This indicates that the model is too simple or has been regularized too much. At the max_depth values of 1 and 2, the random forests model is underfitting.</li>
<li><b>Overfitting: </b>The training accuracy score is very high and the accuracy score of the test set is low. The model fits very well for the training data, but it fails to generalize to new input data. For max_depth values of 4, 5, …, 10, the model is highly overfitted.</li>
<li><b>Just-right: </b>No overfitting or underfitting. At the max_depth value of 3, the model is just right. The model fits the training data very well and it is also generalizable to new input data. That’s what we want!</li>
</ul>
Be careful: When you use an evaluation metric such as MSE, the overfitting condition happens when the training MSE is very low (not high) and the MSE of the test set is high (not low). This is because here we consider an error (Mean Squared Error).

Be careful: Here, you got the optimal max_depth hyperparameter value of 3. Keep in mind that this is what we got when we consider only the max_depth hyperparameter. When we consider several hyperparameters at a time as in Grid Search or Randomized Search, the optimal max_depth hyperparameter value will not be 3.


In [None]:

def plot_learning_curve(estimator, title, X_train,y_train,ylim=None,cv=None, n_jobs=1, 
                       train_sizes=np.linspace(0.1,1,5)):
    """Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Unfitted model; it is cloned and refitted for every size / fold.
    title : str
        Title of the figure.
    X_train, y_train : array-like
        Training features and target handed to ``learning_curve``.
    ylim : tuple (ymin, ymax) or None
        Optional limits for the score axis.
    cv : int, cross-validation splitter or None
        Cross-validation strategy, forwarded unchanged to ``learning_curve``.
    n_jobs : int
        Number of parallel jobs, forwarded unchanged.
    train_sizes : array-like
        Training-set sizes to evaluate, forwarded unchanged.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the two curves and their +/- one std bands.
    """
    # Local import: the cell that defines this function imports nothing itself.
    from sklearn.model_selection import learning_curve

    sizes, train_scores, val_scores = learning_curve(
        estimator, X_train, y_train, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Aggregate over the folds (axis 1) for every training-set size.
    train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
    val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

    fig, ax = plt.subplots(figsize=(10,5))
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel('Training examples')
    ax.set_ylabel('Score')
    ax.grid(True)

    # Shaded bands show +/- one standard deviation across folds.
    ax.fill_between(sizes, train_mean-train_std, train_mean+train_std, alpha=0.1, color='r')
    ax.fill_between(sizes, val_mean-val_std, val_mean+val_std, alpha=0.1, color='g')
    ax.plot(sizes, train_mean, 'o-', color='r', label='Training score')
    ax.plot(sizes, val_mean, 'o-', color='g', label='Cross-validation score')
    ax.legend(loc='best')
    return ax


# Cross validation #

<ul><li>hold out cross validation</li>
<li>k-fold cross validation</li></ul>
A test set should still be held out for final validation, but the validation set is no longer needed when doing CV.
In the basic approach called k-fold CV, the training set is split into k smaller sets. The following procedure is followed for each of the k "folds".
<ul><li>A model is trained using k-1 of the folds as training data.</li>
<li>The resulting model is validated on the remaining part of the data (it is used as a test set to compute a performance measure such as accuracy)</li></ul>
The performance measure reported by k-fold CV is then the average of the values computed in the loop.

## Holdout method ##
<ul>
<li>Split initial dataset into a separate training and test dataset </li>
<li>Training dataset - model training</li>
<li>Test dataset - estimate its generalisation performance</li>
</ul>
A variation is to split the training set to 2: training set and validation set
<ul>
<li><b>Training set : </b>for fitting different models </li>
<li><b>Validation set: </b>for tuning and comparing different parameter settings to further improve the performance for making predictions on unseen data.And finally for model selection. </li>
</ul>
This process is called model selection, we want to select the optimal values of tuning parameters (hyperparameters)

## k-fold Cross Validation ##
<ul>
<li>Randomly split the training dataset into k folds without replacement.
<li>k-1 folds are used for the model training.
<li>The one fold is used for performance validation.
</ul>
This procedure is repeated k times.
Final outcomes: k models and performance estimates 
<ul>
<li>Calculate the average performance of the models based on the different, independent folds to obtain a performance estimate that is less sensitive to the sub partitioning of the training data compared to the holdout method.</li>
<li>k fold CV is used for model tuning. Finding the optimal hyperparameter values that yield a satisfying generalization performance.</li>
<li>Once we have found satisfactory hyperparameter values, we can retrain the model on the complete training set and obtain a final performance estimate using the independent test set.
The rationale behind fitting a model to the whole training dataset after k fold CV is that providing more training samples to a learning algorithm usually results in a more accurate and robust model.</li>
<li>common k is 10 </li>
<li>For relatively small training set, increase the number of folds </li>
</ul>

## Stratified k-fold Cross Validation ##
<ul>
<li>variation of K fold</li>
<li>Can yield better bias and variance estimates, especially in cases of unequal class proportions</li>
</ul>


# Cross Validation Illustration #

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm

# Hold-out evaluation: the first 13 columns are the predictors, MEDV the response.
Boston_df=Boston.iloc[:,:13]
target=Boston['MEDV']

# 60/40 split with a fixed seed.
X_train,X_test,Y_train,Y_test=train_test_split(
    Boston_df,target,test_size=0.4,random_state=0)
print(f'X_train,X_test,Y_train, Y_test shapes:{X_train.shape}, {X_test.shape},{Y_train.shape},{Y_test.shape}')

# Linear-kernel SVR fitted on the training part and scored on the held-out part.
regression=svm.SVR(kernel='linear', C=1)
regression.fit(X_train, Y_train)
regression.score(X_test, Y_test)


## Computing Cross-Validated metrics ##

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation with the estimator's default scorer
# (R² for scikit-learn regressors).
regression=svm.SVR(kernel='linear', C=1)
scores=cross_val_score(regression, Boston_df, target, cv=5) #cv is k folds

# Report mean +/- standard deviation (not the variance) across the folds.
print(f'Accuracy : {scores.mean():.3f} +/- {scores.std():.3f}')


In [None]:
# use MSE
# The scorer returns the NEGATIVE mean squared error (higher is better),
# so flip the sign before reporting it as an error.
scores=cross_val_score(regression, Boston_df, target, cv=5,
                       scoring='neg_mean_squared_error')
mse_scores=-scores

print(f'MSE : {mse_scores.mean():.3f} +/- {mse_scores.std():.3f}')


### K-fold ###

it divides all the samples in k group of samples called folds (if k=n, this is equivalent to the leave one out strategy), <br>of equal sizes(if possible). The prediction function is learned using k-1 folds, and the fold left out is used for test.<br>
This is an example of 2 fold CV on a dataset with 4 samples:

In [None]:
from sklearn.model_selection import KFold

# 2-fold split of four samples: print the train / test index arrays of each fold.
X=['a', 'b', 'c', 'd']
kf=KFold(n_splits=2)
for train_idx, test_idx in kf.split(X):
    print(f'{train_idx} {test_idx}')


### Stratified K-fold ###

In [None]:
from sklearn.model_selection import StratifiedKFold

# 3-fold stratified split of ten samples with a 4 / 6 class imbalance:
# print the train / test index arrays of each fold.
X=np.ones(10)
Y=[0]*4+[1]*6
skf=StratifiedKFold(n_splits=3)
for train_idx, test_idx in skf.split(X,Y):
    print(f'{train_idx} {test_idx}')


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.pipeline import make_pipeline

# Chain standardization, a 2-component PCA and a linear SVR into one estimator,
# fit it on the training split and score it on the held-out split.
pipe_svm=make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    svm.SVR(kernel='linear', C=1),
)
pipe_svm.fit(X_train,Y_train)
y_pred=pipe_svm.predict(X_test)
print(f'Test Accuracy= {pipe_svm.score(X_test,Y_test):.3f}')


In [None]:
#2nd method
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the whole pipeline on the training split:
# print the per-fold scores, then their mean and standard deviation.
scores=cross_val_score(
    estimator=pipe_svm,
    X=X_train,
    y=Y_train,
    cv=10,
    n_jobs=1,
)
print(f'CV Accuracy= {scores}')
print(f'CV accuracy= {scores.mean():.3f} +/- {scores.std():.3f}')
