In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

__How to make synthetic datasets using Scikit Learn__

__1. Synthetic dataset for simple regression__

In [2]:
from sklearn.datasets import make_regression
plt.figure()
plt.title("Scatter plot of 100 samples, with 1 feature which is informative and bias(intercept) of 100\
 and noise of 10 i.e comparatively less noisy than noise=100.")
X, y = make_regression(n_samples = 100, n_features = 1, n_informative = 1, bias = 100, noise = 10, random_state=0)
plt.scatter(X, y, marker='o', s = 20)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7f848a1de208>

__2. Synthetic dataset for more complex regression__

In [3]:
from sklearn.datasets import make_friedman1
plt.figure()
plt.title("Scatter plot of 100 samples with 7 features, plotting for the last feature(7 th) only")
X, y = make_friedman1(n_samples = 100, n_features = 7, random_state = 0)
plt.scatter(X[:,6], y, marker = 'o', s = 20)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7f8488133048>

__3. Synthetic dataset for binary classification__

In [4]:
from sklearn.datasets import make_classification
plt.figure()
plt.title('Scatter plot for binary classification problem with two informative features')
X, y = make_classification(n_samples = 100, n_features = 2, n_redundant = 0, n_informative = 2,
                          n_clusters_per_class = 1, flip_y = 0.1, class_sep = 0.5, random_state = 0)
plt.scatter(X[:,0], X[:,1], c = y, marker='o', s=50)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7f84880fa588>

__4. Synthetic dataset for more complex classiication, where classes that are not linearly separable__

In [5]:
from sklearn.datasets import make_blobs
plt.figure()
plt.title("For complex classification problems, with 8 clusters")
X, y = make_blobs(n_samples = 100, n_features = 2, centers = 8, cluster_std = 1.3, random_state = 0)
plt.scatter(X[:,0], X[:,1], c = y, marker = 'o', s = 30)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7f8488053e80>

*Making it binary class*

In [6]:
y = y % 2
plt.figure()
plt.title("Binary classification with linearly non separable dataset")
plt.scatter(X[:,0], X[:,1], c = y, marker = 'o', s = 30)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7f848801fc50>

## Regularization in Linear models
________________________________________

__First creating a synthetic dataset for testing various models__


In [21]:
X, y = make_regression(n_samples = 200, n_features = 5, n_informative = 4, bias = 200, noise=70, random_state=10)
plt.figure()
plt.title("Scatter plot for out synthetic dataset")
plt.xlabel("X")
plt.ylabel("y")
plt.scatter(X[:,0], X[:,1], c='b', s = 15, label='y')
#Test train split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

<IPython.core.display.Javascript object>

__First of all using a linear regression model without regularization to test the score__

In [22]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
print("The accuracy score for the model on test data is : {}".format(model.score(X_test, y_test)))
print("The coeffecients of the regressor are : {}".format(model.coef_))
print("The intercept is : {}".format(model.intercept_))

The accuracy score for the model on test data is : 0.6483464055379076
The coeffecients of the regressor are : [ 82.98276627  -7.21401231  91.55860394  53.29839926  37.81177456]
The intercept is : 202.7863979272523


__Let us use KNeighborsRegressor__

In [23]:
from sklearn.neighbors import KNeighborsRegressor
neighbors = range(1,70)
scores = []

for n in neighbors :
    model = KNeighborsRegressor(n_neighbors = n)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

plt.figure()
plt.xlabel('No.of neighbors considered')
plt.ylabel("Value of score")
plt.plot(neighbors, scores, c='g')
plt.show()

<IPython.core.display.Javascript object>

*Here we see that score is maximum for n = 3*

In [24]:
model = KNeighborsRegressor(n_neighbors = 3)
model.fit(X_train, y_train)
print("The accuracy score for the model on test data is : {}".format(model.score(X_test, y_test)))

The accuracy score for the model on test data is : 0.5647401100975618


### What is regularization ?

*In regularization we add a extra term to the sum of the sqares of the residuals(difference between the predicted and actual value). The reason for this is to reduce complexity of the model by keeping the weights(coefficients) under control by incorporating its sum of squares in the parameter for learning.*

![Ridge Regression](ridgeregression.png)

![Lasso Regression](lasso.png)


__Let us use ridge regression on our dataset__

In [25]:
from sklearn.linear_model import Ridge

# Ridge regressor has a alpha parameter. We will look for the best alpha value for our dataset
scores = []
for a in range(1, 50) :
    model = Ridge(alpha=a).fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

plt.figure()
plt.xlabel('Value of alpha')
plt.ylabel("Value of score")
plt.plot(range(1, 50), scores, c='g')
plt.show()

<IPython.core.display.Javascript object>

*Here we see that for alpha = 14 the value of score is maximum.*

In [26]:
model = Ridge(alpha=14)
model.fit(X_train, y_train)
print("The accuracy score for the model on test data is : {}".format(model.score(X_test, y_test)))
print("The coeffecients of the regressor are : {}".format(model.coef_))
print("The intercept is : {}".format(model.intercept_))

The accuracy score for the model on test data is : 0.6616985746516575
The coeffecients of the regressor are : [ 72.52252936  -5.59384106  81.94363469  47.82127269  33.96030018]
The intercept is : 204.18626831708332


*This is not much better than our linear model whose score was 0.64, hence we'll use normalization.*

### What is normalization ?

*The value included is the sum of squares of the weights, but all the feature values may not be on the same scale hence this could result in some features contributing more while others less. Hence normalization is used to bring all the features on the same scale.
It is beneficial not only in ridge regression but also for other models like Kneighbors.*

__Normalizing out features__

In [27]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
# we can also merge the fit and transform methods by calling scaler.fit_transform(X_train)
X_test_transformed = scaler.transform(X_test)

*To important things to remember-*
1. Both test and train features are transformed using the same scaler here MinMaxScaler
2. Don't fit the scaler using the test features as it could lead to Data leakage.

__Now using ridge regression on the normalized data__

In [29]:
model = Ridge(alpha=14)
model.fit(X_train_transformed, y_train)
print("The accuracy score for the model on test data is : {}".format(model.score(X_test_transformed, y_test)))
print("The coeffecients of the regressor are : {}".format(model.coef_))
print("The intercept is : {}".format(model.intercept_))

The accuracy score for the model on test data is : 0.30316954171699095
The coeffecients of the regressor are : [ 101.74967074    2.54793332  108.34010807   67.52778903   49.03146255]
The intercept is : 39.64859130755195


*Here we see that accuracy has infact decreased. So lets try a different value for alpha.*

In [31]:
scores = []
for a in range(1, 50) :
    model = Ridge(alpha=a).fit(X_train_transformed, y_train)
    scores.append(model.score(X_test_transformed, y_test))

plt.figure()
plt.xlabel('Value of alpha')
plt.ylabel("Value of score")
plt.plot(range(1, 50), scores, c='g')
plt.show()

<IPython.core.display.Javascript object>

*Here we see that model performs best for alpha = 1, which needs further investigation*

__Using Lasso regression on the normalized data__

In [36]:
from sklearn.linear_model import Lasso
scores = []
for a in range(1, 50) :
    model = Lasso(alpha=a).fit(X_train_transformed, y_train)
    scores.append(model.score(X_test_transformed, y_test))

plt.figure()
plt.xlabel('Value of alpha')
plt.ylabel("Value of score")
plt.plot(range(1, 50), scores, c='g')
plt.show()

<IPython.core.display.Javascript object>

In [37]:
# this gives the highest score so far , for alpha = 1.
model = Lasso(alpha=1).fit(X_train_transformed, y_train)
print("The accuracy score for the model on test data is : {}".format(model.score(X_test_transformed, y_test)))
print("The coeffecients of the regressor are : {}".format(model.coef_))
print("The intercept is : {}".format(model.intercept_))

The accuracy score for the model on test data is : 0.6788244453243963
The coeffecients of the regressor are : [ 284.0290698    -6.10824159  494.28389131  257.03399477  157.14439668]
The intercept is : -420.9150024941496


__Trying other models with our normalized data__

__1. Linear Regressor__

In [32]:
model = LinearRegression()
model.fit(X_train_transformed, y_train)
print("The accuracy score for the model on test data is : {}".format(model.score(X_test_transformed, y_test)))
print("The coeffecients of the regressor are : {}".format(model.coef_))
print("The intercept is : {}".format(model.intercept_))

The accuracy score for the model on test data is : 0.6483464055379071
The coeffecients of the regressor are : [ 303.07555376  -36.55525512  537.17238991  292.92328675  185.26006587]
The intercept is : -473.7385855541838


__2. KNeighbors Regressor__

In [34]:
neighbors = range(1,70)
scores = []

for n in neighbors :
    model = KNeighborsRegressor(n_neighbors = n)
    model.fit(X_train_transformed, y_train)
    scores.append(model.score(X_test_transformed, y_test))

plt.figure()
plt.xlabel('No.of neighbors considered')
plt.ylabel("Value of score")
plt.plot(neighbors, scores, c='g')
plt.show()

# at n_neighbors = 4 it gives a score of 0.56

<IPython.core.display.Javascript object>