# Exercise: Learning Curve #

In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline
import sys
import matplotlib

# Report the interpreter and library versions this notebook was run with.
# Labels are kept exactly as printed before (note the extra space after 'Seaborn:').
_lib_versions = (
    ('python:', sys.version),
    ('Numpy:', np.__version__),
    ('Pandas:', pd.__version__),
    ('Scikitlearn:', sklearn.__version__),
    ('Seaborn: ', sns.__version__),
    ('matplotlib:', matplotlib.__version__),
)
for _label, _ver in _lib_versions:
    print(_label, _ver)


In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

# Load the housing data: a header-less, whitespace-delimited text file.
# `sep=r'\s+'` replaces `delim_whitespace=True`, which is deprecated in
# recent pandas releases and emits a FutureWarning.
Boston = pd.read_csv('Data/housing.data', sep=r'\s+', header=None)

# The file carries no header row, so the column names are assigned by hand.
Boston.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
                  'PTRATIO', 'B', 'LSTAT', 'MEDV']

# Only the last expression of a cell is displayed; the bare `Boston` that used
# to precede this line was a no-op and has been dropped.
Boston.describe()


In [None]:
# Pairwise relationship grid over every column of the Boston frame.
# The returned grid object is kept so the cell does not echo its repr.
_pair_grid = sns.pairplot(data=Boston, height=1.2)
plt.show()


In [None]:
# Show floats with three decimals in pandas output from here on.
pd.options.display.float_format = '{:,.3f}'.format

# Compute the correlation matrix once and reuse it. The previous stand-alone
# `Boston.corr()` was discarded (it was not the last expression of the cell),
# so the matrix was being computed twice.
boston_corr = Boston.corr()

# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(16, 10))
sns.heatmap(boston_corr, annot=True)
plt.show()


# RM Regression #

In [None]:
# Single-feature view: RM as predictor, MEDV as target.
# X and Y are reused by the following cells of this section.
X = np.array(Boston['RM'])
Y = np.array(Boston['MEDV'])

# The scatter needs a label: calling plt.legend() with no labelled artist
# only produces a warning and an empty legend.
plt.scatter(X, Y, color='blue', marker='o', linewidth=2, label='values')
plt.xlabel('RM')
plt.ylabel('MEDV')
plt.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.linear_model import Ridge


# Ridge regression of MEDV on RM, fitted on the full data set.
# scikit-learn expects a 2-D feature matrix, so reshape the 1-D RM vector once.
rm_as_column = X.reshape(-1, 1)

ridge_mod = Ridge(alpha=0.5, fit_intercept=True)
ridge_mod.fit(rm_as_column, Y)
ridge_mod_pred = ridge_mod.predict(rm_as_column)

# Observations (red) with the fitted Ridge line (blue).
plt.figure(figsize=(10, 8))
plt.scatter(X, Y, color='red', label='values')
plt.plot(X, ridge_mod_pred, color='blue', label='Ridge')
plt.xlabel('RM')
plt.ylabel('MEDV')
plt.legend(loc='lower right')
plt.show()

# Fitted equation: slope (printed as an array) and intercept.
print('y={}x+{:.3f}'.format(ridge_mod.coef_, ridge_mod.intercept_))


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. A fixed random_state makes the
# split (and every number derived from it) reproducible on Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0, shuffle=True)

# Train: fit on the training split only.
ridge_train = Ridge(alpha=0.5, fit_intercept=True)
ridge_train.fit(X_train.reshape(-1, 1), Y_train)
y_train_pred = ridge_train.predict(X_train.reshape(-1, 1))

# Test: score the model that was fitted on the TRAINING split against the
# held-out data. Previously a second Ridge was fitted on the test split and
# then scored on that same split, so the "test" MSE was really another
# training error and said nothing about generalisation.
# `ridge_test` is kept as an alias because the plotting cell reads
# `ridge_test.coef_` / `ridge_test.intercept_`.
ridge_test = ridge_train
y_test_pred = ridge_train.predict(X_test.reshape(-1, 1))

# Peek at the first training values instead of dumping the whole array.
X_train[:10]

In [None]:
def _show_ridge_split(x_vals, y_vals, y_fit, model, split_label, eq_name, fig_size):
    """Scatter one data split with its Ridge line, then print the model equation.

    x_vals / y_vals : the RM and MEDV values of the split.
    y_fit           : predictions for x_vals, drawn as the blue line.
    model           : fitted estimator whose coef_ / intercept_ are printed.
    split_label     : legend entry for the scatter ('Train' or 'Test').
    eq_name         : left-hand side used in the printed equation.
    fig_size        : (width, height) passed to plt.figure.
    """
    plt.figure(figsize=fig_size)
    plt.scatter(x_vals, y_vals, color='red', label=split_label)
    plt.plot(x_vals, y_fit, color='blue', label='Ridge')
    plt.xlabel('RM')
    plt.ylabel('MEDV')
    plt.legend(loc='lower right')
    plt.show()
    print(f'{eq_name}={model.coef_}x+{model.intercept_}')


# Training split: large figure, equation, then MSE followed by a blank line.
_show_ridge_split(X_train, Y_train, y_train_pred, ridge_train, 'Train', 'ytrain', (10, 7))
print(f'MSE= {mean_squared_error(Y_train,y_train_pred):.3f}\n')

# Test split: smaller figure, equation, then MSE.
_show_ridge_split(X_test, Y_test, y_test_pred, ridge_test, 'Test', 'ytest', (6, 4))
print(f'MSE= {mean_squared_error(Y_test,y_test_pred):.3f}')


In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import learning_curve

# Two-decimal float display for pandas output.
pd.options.display.float_format = '{:,.2f}'.format

# Learning curve for a default Ridge model on three features, scored with
# negated MSE over 5 CV folds at the absolute training-set sizes listed below.
features = ['RM', 'PTRATIO', 'LSTAT']
x, y = Boston[features], Y
ridge_curve_sizes = [10, 50, 100, 150, 200, 250, 350, 400]

train_sizes, train_scores, validation_scores = learning_curve(
    estimator=Ridge(),
    X=x,
    y=y,
    train_sizes=ridge_curve_sizes,
    cv=5,
    scoring='neg_mean_squared_error',
)

# One row per training size, one column per CV fold.
print('Training scores:\n\n', train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# The scorer returns negated MSE, so flip the sign to plot plain MSE.
# Average across the CV folds (axis 1) to get one value per training size.
train_scores_mean = -train_scores.mean(axis = 1)
validation_scores_mean = -validation_scores.mean(axis = 1)

# Reference level: training MSE of the single-feature Ridge fitted on the
# train split in the earlier cell (Y_train / y_train_pred).
MSE_p = mean_squared_error(Y_train, y_train_pred)

plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.plot([0, 100, 500], [MSE_p, MSE_p, MSE_p], label='Desired', color='r')
plt.ylabel('MSE', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for a Ridge Regression model', fontsize = 18, y = 1.03)
plt.legend()
plt.ylim(0, 80)
# Explicit show(): without it the cell's last expression (the ylim tuple)
# is echoed as text above the figure.
plt.show()


### The high bias learning curve (fig1) ###
<ul>
<li> Low training and test accuracy/score: the model underfits the training data. </li>
<li> The actual performance (accuracy or R²) is far from the desired level of performance. </li>
</ul>
Address it by:
<ul>
<li> Increasing the number of parameters, e.g. by adding or creating new features. </li>
<li> Decreasing the regularisation. </li>
</ul>


# PTRATIO regression # 

In [None]:
# Single-feature view: PTRATIO as predictor, MEDV as target.
X = np.array(Boston['PTRATIO'])
Y = np.array(Boston['MEDV'])

# Label the scatter so that plt.legend() has an entry to show; without a
# labelled artist it only emits a warning and draws an empty legend.
plt.scatter(X, Y, color='blue', marker='o', linewidth=1, label='values')
plt.xlabel('PTRATIO')
plt.ylabel('MEDV')
plt.legend(loc='lower right')
plt.show()

# Summary statistics of the predictor (displayed as the cell output).
Boston['PTRATIO'].describe()


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Shallow regression tree (depth 2) of MEDV on PTRATIO, fitted on all rows.
Y = Boston['MEDV']
X = Boston['PTRATIO'].values
tree = DecisionTreeRegressor(max_depth=2)
tree.fit(X.reshape(-1, 1), Y)

# Sort by PTRATIO so the prediction is drawn as a left-to-right step line.
sort_idx = X.argsort()

plt.figure(figsize=(10, 8))
# argsort returns POSITIONS, so index the Series with .iloc. Plain Y[sort_idx]
# is label-based and only matches positions while the index is the default
# 0..n-1 range.
plt.scatter(X[sort_idx], Y.iloc[sort_idx], c='b')
plt.plot(X[sort_idx], tree.predict(X[sort_idx].reshape(-1, 1)), color='r', linewidth=2)
plt.xlabel('PTRATIO')
plt.ylabel('MEDV')
# Render the figure before printing, as the other cells of this notebook do.
plt.show()

# In-sample predictions and their MSE (same data the tree was fitted on).
y_pred = tree.predict(X.reshape(-1, 1))
print(f'MSE= {mean_squared_error(Y,y_pred):.3f}')

In [None]:
# Visualising the Decision Tree Regression Results

# Observed values (blue) and the tree's in-sample predictions (green).
plt.scatter(X, Y, color='blue')
plt.scatter(X, y_pred, color='green')

# Dense grid over the observed PTRATIO range (step 0.01, upper bound excluded),
# shaped as a single-column matrix for predict().
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)
plt.plot(X_grid, tree.predict(X_grid), color='red')

plt.title('Decision Tree Regression')
plt.xlabel('PTRATIO')
plt.ylabel('MEDV')
plt.show()

in_sample_mse = mean_squared_error(Y, y_pred)
print(f'MSE= {in_sample_mse:.3f}')


In [None]:
from sklearn.model_selection import learning_curve

# Three-decimal float display for pandas output.
pd.options.display.float_format = '{:,.3f}'.format

# Learning curve of an unconstrained DecisionTreeRegressor on PTRATIO alone,
# scored with negated MSE over 5 CV folds.
features = ['PTRATIO']
x, y = Boston[features], Y
tree_curve_sizes = [10, 50, 100, 150, 200, 250, 350, 400]

train_sizes, train_scores, validation_scores = learning_curve(
    estimator=DecisionTreeRegressor(),
    X=x,
    y=y,
    train_sizes=tree_curve_sizes,
    cv=5,
    scoring='neg_mean_squared_error',
)

# One row per training size, one column per CV fold.
print('Training scores:\n\n', train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# The scorer returns negated MSE, so flip the sign to plot plain MSE.
# Average across the CV folds (axis 1) to get one value per training size.
train_scores_mean = -train_scores.mean(axis = 1)
validation_scores_mean = -validation_scores.mean(axis = 1)

# Reference level: in-sample MSE of the depth-2 tree fitted earlier (Y / y_pred).
MSE_p = mean_squared_error(Y, y_pred)

plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.plot([0, 100, 500], [MSE_p, MSE_p, MSE_p], label='Desired', color='r')
plt.ylabel('MSE', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for Decision Tree model', fontsize = 18, y = 1.03)
plt.legend()
plt.ylim(0, 200)
# Explicit show(): without it the cell's last expression (the ylim tuple)
# is echoed as text above the figure.
plt.show()


# LSTAT Regression #

In [None]:
# Single-feature view: LSTAT as predictor, MEDV as target.
# X and Y are reused by the following cells of this section.
X = np.array(Boston['LSTAT'])
Y = np.array(Boston['MEDV'])

# Label the scatter so that plt.legend() has an entry to show; without a
# labelled artist it only emits a warning and draws an empty legend.
plt.scatter(X, Y, color='blue', marker='o', linewidth=2, label='values')
plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Degree-3 polynomial regression of MEDV on LSTAT: expand the single feature
# into polynomial terms, then fit an ordinary linear regression on them.
poly_reg = PolynomialFeatures(degree=3)
X_poly = poly_reg.fit_transform(X.reshape(-1, 1))

# The target is passed as a column vector, so coef_ and Y_pred are 2-D.
lr_r = LinearRegression().fit(X_poly, Y.reshape(-1, 1))
Y_pred = lr_r.predict(X_poly)

# Observations against the fitted values (both drawn as scatter points).
plt.figure(figsize=(10, 8))
plt.scatter(X, Y, label='Distribution', c='#14047b')
plt.scatter(X, Y_pred, label='Regression', c='red')
plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.legend(loc='lower right')
plt.show()

# In-sample goodness of fit.
fit_r2 = r2_score(Y, Y_pred)
fit_mse = mean_squared_error(Y, Y_pred)
print(f'model Coef={lr_r.coef_},\n R² ={fit_r2:.3f},\n MSE={fit_mse:.3f}')


In [None]:
from sklearn.model_selection import learning_curve
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

pd.options.display.float_format='{:,.3f}'.format
features=['LSTAT']
x=Boston[features]
y=Y

# The plot of this curve is titled "Polynomial model" and is compared with the
# R² of the degree-3 polynomial fit, but the estimator used to be a plain
# RidgeCV on raw LSTAT (i.e. a linear model). PolynomialFeatures is a
# transformer, not an estimator, so it has to be chained in a pipeline so that
# the expansion is re-fitted inside every CV split.
model = make_pipeline(PolynomialFeatures(degree=3), RidgeCV())

train_sizes, train_scores, validation_scores = learning_curve(
    model,x,y,train_sizes=[10,50,100,150,200,250,350,400],cv = 5,
    scoring = 'r2')

# One row per training size, one column per CV fold.
print('Training scores:\n\n',train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# Scores here are R² (higher is better), so no sign flip is needed.
# Average across the CV folds (axis 1) to get one value per training size.
train_scores_mean = train_scores.mean(axis = 1)
validation_scores_mean = validation_scores.mean(axis = 1)

# Reference level: in-sample R² of the polynomial fit (Y / Y_pred).
# NOTE: despite its name this variable holds an R² value here, not an MSE;
# the name is kept unchanged so that other cells reading it keep working.
MSE_p = r2_score(Y, Y_pred)

# The curves are R² scores, not errors, so they are labelled as scores.
plt.plot(train_sizes, train_scores_mean, label = 'Training score')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation score')
plt.plot([0, 100, 500], [MSE_p, MSE_p, MSE_p], label='Desired', color='r')
plt.ylabel('R²', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for Polynomial model', fontsize = 18, y = 1.03)
plt.legend(loc='lower right')
plt.ylim(-0.25, 1)
plt.show()


### The high bias learning curve (fig1) ###
<ul>
<li> Low training and test accuracy/score: the model underfits the training data. </li>
<li> The actual performance (accuracy or R²) is far from the desired level of performance. </li>
</ul>
Address it by:
<ul>
<li> Increasing the number of parameters, e.g. by adding or creating new features. </li>
<li> Decreasing the regularisation. </li>
</ul>


# Validation Curve #

In [None]:
# Work with the features that show a high correlation.
# NOTE(review): the selection below is manual - confirm it against the
# correlation heatmap.

# Reload the raw data so this section starts from an unmodified frame.
# `sep=r'\s+'` replaces `delim_whitespace=True`, which is deprecated in
# recent pandas releases and emits a FutureWarning.
Boston = pd.read_csv('Data/housing.data', sep=r'\s+', header=None)
Boston.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
                  'PTRATIO', 'B', 'LSTAT', 'MEDV']

# Feature matrix: everything except the discarded predictors and the target.
Boston_df = Boston.drop(columns=['ZN', 'CHAS', 'AGE', 'DIS', 'RAD', 'MEDV'])
# Target vector as a NumPy array.
target = Boston['MEDV'].values
Boston_df


In [None]:
#Learning curve for Boston_df
from sklearn.model_selection import learning_curve
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import ShuffleSplit

# Three-decimal float display for pandas output.
pd.options.display.float_format = '{:,.3f}'.format

# Learning curve of RidgeCV on the reduced feature set. Cross-validation uses
# 100 random 70/30 splits with a fixed seed; train_sizes is left at the
# learning_curve default.
x, y = Boston_df, target
model = RidgeCV()
cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=0)

train_sizes, train_scores, validation_scores = learning_curve(
    estimator=model,
    X=x,
    y=y,
    cv=cv,
    scoring='neg_mean_squared_error',
)

# One row per training size, one column per CV split.
print('Training scores:\n\n', train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# The scorer returns negated MSE; negate and average across the CV splits
# (axis 1) to get one mean MSE per training size.
train_scores_mean = np.mean(-train_scores, axis=1)
validation_scores_mean = np.mean(-validation_scores, axis=1)

# Draw both curves against the training-set size.
for _curve, _curve_label in (
    (train_scores_mean, 'Training error'),
    (validation_scores_mean, 'Validation error'),
):
    plt.plot(train_sizes, _curve, label=_curve_label)

plt.title('Learning curves With Ridge regression', fontsize=18, y=1.03)
plt.xlabel('Training set size', fontsize=14)
plt.ylabel('MSE', fontsize=14)
plt.legend(loc='lower right')
plt.ylim(20, 35)
plt.show()


In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import validation_curve

# Seed NumPy's global RNG so the shuffle below is reproducible.
np.random.seed(0)
X = Boston_df
Y = target

# Shuffle the rows once before cross-validating.
# Bug fix: `np.arange(Y.all())` evaluated to `np.arange(True)` == array([0]),
# so only one index existed and the shuffle was never applied. Build one index
# per sample instead, and use .iloc because X is a DataFrame (X[indices] would
# be read as a column selection).
indices = np.arange(Y.shape[0])
np.random.shuffle(indices)
X, Y = X.iloc[indices], Y[indices]

param_name = 'alpha'
# Ridge requires a non-negative regularisation strength; the former first
# value (-5) was invalid. Start from a near-zero alpha instead.
param_range = [1e-5, 100, 150, 200, 250, 300, 350, 400, 500, 600]

estim = Ridge()
# Scores come back as negated MSE: one row per alpha, one column per CV fold.
train_scores, valid_scores = validation_curve(estim, X, Y, param_name=param_name,
                                              param_range=param_range,
                                              scoring="neg_mean_squared_error")
print(f'Train Scores= \n{train_scores}\n\n Validation Scores= \n{valid_scores}\n')


In [None]:
# validation_curve was scored with negated MSE: flip the sign, then take the
# mean and the spread across the CV folds (axis 1) for each alpha.
train_mean = np.mean(-train_scores, axis=1)
test_mean = np.mean(-valid_scores, axis=1)

train_std = np.std(-train_scores, axis=1)
test_std = np.std(-valid_scores, axis=1)

# Plot the mean MSE against the regularisation strength. The quantity is an
# error (lower is better) and the tuned parameter is Ridge's `alpha`, so the
# labels say so instead of "Accuracy" / "Parameter C".
plt.figure(figsize=(10,6))
plt.plot(param_range, train_mean,
         marker='o', markersize=5,
         color='blue', label='Training MSE')
plt.plot(param_range, test_mean,
         marker='o', markersize=5,
         color='green', label='Validation MSE')
# Shade +/- one standard deviation across folds; the std values were computed
# before but never used.
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std,
                 alpha=0.15, color='blue')
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std,
                 alpha=0.15, color='green')
plt.title('Validation Curve of Boston_df / Ridge Regression')
plt.legend(loc='best')
plt.xlabel('alpha')
plt.ylabel('MSE')
plt.show()
print('overfitting model')

In [None]:
#Yellowbrick method

from yellowbrick.model_selection import ValidationCurve

# Estimators need a 2-D feature matrix: select RM with double brackets so the
# result has shape (n_samples, 1) rather than a flat vector.
Xyell = Boston[['RM']].values
Yyell = Boston['MEDV'].values

# The visualizer CLASS takes (estimator, param_name, param_range, ...); the
# data is supplied later through fit(). Passing X and y positionally here (as
# the validation_curve quick method expects) bound them to param_name and
# param_range. param_name / param_range are the values defined in the
# scikit-learn validation-curve cell.
viz = ValidationCurve(Ridge(), param_name=param_name,
                      param_range=param_range, cv=10,
                      scoring="neg_mean_squared_error")

# Fit and show the visualizer
viz.fit(Xyell, Yyell)
viz.show()
