In [None]:


%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Binning / Discretization 

In [None]:
from sklearn.linear_model import LinearRegression   #We will compare the linear model with decision model on this data
from sklearn.tree import DecisionTreeRegressor

In [None]:
import mglearn

# Synthetic one-feature "wave" regression data set with 120 samples.
X, y = mglearn.datasets.make_wave(n_samples=120)

# 1000 evenly spaced points on [-3, 3), reshaped into a single-column 2-D array
# (n_samples, n_features) so they can be passed to the estimators' predict().
new_data = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

<b> How does Linear Regression model the relationship between X and y? </b>

In [None]:
# seaborn (0.12+) accepts the data vectors only as keyword arguments and
# expects 1-D inputs, so pass the single feature column of X explicitly.
sns.regplot(x=X[:, 0], y=y)

In [None]:
# Fit ordinary least squares on the raw feature, then draw its predictions
# over the dense grid of new points.
lr_mod = LinearRegression()
lr_mod.fit(X, y)
y_predicted = lr_mod.predict(new_data)
plt.plot(new_data, y_predicted)

In [None]:
# Fit a depth-limited regression tree on the raw feature, then draw its
# (piecewise-constant) predictions over the dense grid of new points.
dt_mod = DecisionTreeRegressor(max_depth=4)
dt_mod.fit(X, y)
y_predicted = dt_mod.predict(new_data)
plt.plot(new_data, y_predicted)

In [None]:
# Compare model performance: R^2 on the training data, printed for the
# linear model first and the decision tree second.
for fitted_model in (lr_mod, dt_mod):
    print(fitted_model.score(X, y))

In [None]:
# Plot both models' predictions, together with the training data, in one figure.

# Decision tree constrained to at least 3 samples per leaf. Predict with the
# tree fitted here (Dtreg); the earlier dt_mod is a different model.
Dtreg = DecisionTreeRegressor(min_samples_leaf=3).fit(X, y)
predicted = Dtreg.predict(new_data)
plt.plot(new_data, predicted, label="Decision Tree")

# Linear regression on the same raw feature.
linreg = LinearRegression().fit(X, y)
linpredicted = linreg.predict(new_data)
plt.plot(new_data, linpredicted, label="Linear Regression")

# Training points as black circles.
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression Output")
plt.xlabel("Input Features")
plt.legend(loc='best')

### Binning of Input predictor

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Ten equal-width bins spanning the observed range of the feature.
kb = KBinsDiscretizer(n_bins=10, strategy='uniform').fit(X)

print("bin edges \n", kb.bin_edges_)



In [None]:
# With the bins defined, transform each data point in X into its one-hot bin
# encoding. No `encode` was given above, so the result is a sparse matrix
# (hence the .toarray() calls in the following cells).

X_binned = kb.transform(X)
X_binned

In [None]:
# Densify the sparse result and show the one-hot row of the first sample.
X_binned.toarray()[0,:]

In [None]:
# Show the first sample's one-hot bin row next to its raw feature value.
print(X_binned.toarray()[0,:])
print(pd.DataFrame(X).head(1))   # First data point -0.752759 is stored in bin 4

In [None]:
# Re-do the binning with encode='onehot-dense' so that transform returns a
# dense NumPy array instead of a sparse matrix. A given input belongs to
# exactly one bin, so each row has a single 1 and zeros elsewhere.

kb = KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense').fit(X)
X_binned = kb.transform(X)

In [None]:
# Check the container type returned by transform with encode='onehot-dense'.
type(X_binned)

### In class assignment

Q1: Train a Linear Regression model with the binned inputs.

Q2: Use the trained model to predict on new data (new data should be binned as well)

Q3: Plot the model predictions

Q4: Does the model with binned features perform better than the model with raw features?

# Adding slope to the linear model in a bin

In [None]:
# The new data must be binned with the already-fitted discretizer before it can
# be combined; without this definition the notebook fails on a fresh
# Restart & Run All because new_data_binned is never created.
new_data_binned = kb.transform(new_data)

# Put the raw feature in front of the one-hot bin columns, for both the
# training data and the new data, then preview the first ten rows.
X_combined = np.hstack([X, X_binned])
new_data_combined = np.hstack([new_data, new_data_binned])
X_combined[0:10]

In [None]:
print('X_Combined shape :', X_combined.shape)  # the 10 one-hot columns plus the raw feature column give 11 columns

### Discussion question

What would be the impact of adding the original raw variable to the binned variables? What would the plot of the predictions look like?

In [None]:
# Fit on the raw feature plus the bin indicators, then overlay the predictions
# for the new data on the training points.
linreg = LinearRegression()
linreg.fit(X_combined, y)
predicted = linreg.predict(new_data_combined)
plt.plot(new_data, predicted)
plt.plot(X[:, 0], y, 'o')

In [None]:
# The linear regression model has now learnt a slope, but the slope is the same in all the bins (blue line).

In [None]:
# X_combined stacks the raw feature first, so coef_[0] is the shared slope and
# the remaining entries are the per-bin offsets.
print("coefficients for each bin \n", linreg.coef_ ) # shared slope followed by one coefficient per bin
print("")
print("intercept\n", linreg.intercept_)

In [None]:
# R^2 of the combined-feature model, evaluated on the training data.
print(linreg.score(X_combined, y))


### Adding interaction terms

In [None]:
# Multiplying the one-hot rows by the raw value leaves a non-zero entry only in
# each sample's own bin; stacked next to the indicators this gives every bin
# its own offset column and its own slope column.
binned_times_raw = X_binned * X
X_product = np.hstack([X_binned, binned_times_raw])

print(X_product.shape)

# Same construction for the new data.
new_data_product = np.hstack([new_data_binned, new_data_binned * new_data])



In [None]:
# First row: the bin indicators followed by the indicator-times-raw-value columns.
X_product[0]

In [None]:
# Worked example for the second sample: its one-hot row, its raw value, and
# their product (the raw value broadcast across the one-hot row).
print(X_binned[1])
print(X[1])
print(X_binned[1] * X[1])

In [None]:
# Fit on the indicator + interaction features, then overlay the predictions
# for the new data on the training points.
linreg = LinearRegression()
linreg.fit(X_product, y)
predicted = linreg.predict(new_data_product)
plt.plot(new_data, predicted)
plt.plot(X[:, 0], y, 'o')

In [None]:
# Each bin has its own offset and its own coefficient.

In [None]:
# Each bin has its own offset and coefficient: X_product stacks the bin
# indicators first and the indicator-times-raw-value columns second, so coef_
# lists the per-bin offsets followed by the per-bin slopes.

print("coefficients for each bin \n", linreg.coef_ ) # per-bin offsets, then per-bin slopes
print("")
print("intercept\n", linreg.intercept_)

In [None]:
# R^2 of the interaction-feature model, evaluated on the training data.
print(linreg.score(X_product, y))


# Polynomial feature transformation 

In [None]:
from sklearn.preprocessing import PolynomialFeatures


In [None]:
# Polynomial expansion up to degree 5; include_bias=False omits the constant
# column (LinearRegression fits its own intercept).
poly = PolynomialFeatures(degree=5, include_bias=False)

In [None]:
# Fit the transformer on the training feature.
poly.fit(X)

In [None]:
# Expand both the training data and the new data with the same fitted transformer.
X_poly = poly.transform(X)

new_data_poly = poly.transform(new_data)

In [None]:
# Shape after the polynomial expansion.
X_poly.shape


In [None]:
# Shape of the original data, for comparison with X_poly.
X.shape

In [None]:
# Same shape as above, printed with a label.
print("X_poly shape: ", X_poly.shape)

In [None]:
# Compare the first record before and after the polynomial expansion.

print("Records in X \n", X[0])
print("Records in X_poly \n",X_poly[0])

In [None]:
# With degree=5 and include_bias=False the columns are x, x^2, x^3, x^4, x^5.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the names as an array of strings.

print("X_Poly feature names : " , poly.get_feature_names_out())

In [None]:
# Fit a linear model on the polynomial features, then overlay the predictions
# for the new data on the training points.
linreg = LinearRegression()
linreg.fit(X_poly, y)
predicted = linreg.predict(new_data_poly)
plt.plot(new_data, predicted)

plt.plot(X[:, 0], y, 'o')

In [None]:
# R^2 of the polynomial-feature model, evaluated on the training data.
print(linreg.score(X_poly, y))


### Assignment

Q: What happens to the fit as we increase the polynomial degree?