# Lab 6: Introducing Regression

Objectives:
- To gain hands-on experience with regression models using an example dataset.
- To better understand the concept of regression through its applications.

In [None]:
# Mount Google Drive when running on Colab. The `google.colab` package only
# exists inside the Colab runtime, so anywhere else the mount is skipped and
# the rest of the notebook can still be run top to bottom.
try:
    from google.colab import drive
except ImportError:
    print('Not running on Colab - skipping Google Drive mount.')
else:
    drive.mount('/content/drive')

## Linear Regression

### *Simple* Linear Regression

#### Example #1

The following example is from this [reference](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html).

This type of linear regression deals with **one** independent variable.

This example uses only *one feature* of the diabetes dataset (the column at index 2, body mass index) so that the data points can be shown in a two-dimensional plot.


In [None]:
# Code source: Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Fetch the bundled diabetes dataset, then unpack the feature matrix and the
# regression target from the returned container.
diabetes = datasets.load_diabetes()
diabetes_X, diabetes_y = diabetes.data, diabetes.target

Determine the shape of data.

In [None]:
# Report the dimensions of the feature matrix and of the target vector.
x_shape, y_shape = diabetes_X.shape, diabetes_y.shape
print('Shape of X: %s' % str(x_shape))
print('Shape of y: %s' % str(y_shape))

In [None]:
# Preview the first ten rows of the feature matrix.
diabetes_X[0:10]

In [None]:
# Preview the first ten target values.
diabetes_y[0:10]

Use only one feature.

In [None]:
# Keep a single predictor so the fit can be drawn in a 2-D plot: column 2 of
# the diabetes feature matrix (the body-mass-index feature).
# The column is taken from a freshly loaded copy rather than from the current
# `diabetes_X`, so re-running this cell yields the same (n_samples, 1) array
# instead of raising IndexError on the already-reduced matrix.
FEATURE_INDEX = 2
diabetes_X = datasets.load_diabetes(return_X_y=True)[0][:, np.newaxis, FEATURE_INDEX]

Plot all data points.

In [None]:
# Scatter every (feature, target) pair before any model is fitted.
plt.scatter(x=diabetes_X, y=diabetes_y, color="black")

plt.xlabel("x")
plt.ylabel("y")
plt.title("Scatter Plot of All Data")

# Called without arguments these only query the current ticks, so the
# default tick locations and labels stay in place.
plt.xticks()
plt.yticks()

plt.show()

Split data into training set and test set.

In [None]:
# Hold out the last 20 samples for testing; everything before them is used
# for training. Features and targets are cut at the same position so that
# their rows stay aligned.
test_size = 20

diabetes_X_train, diabetes_X_test = diabetes_X[:-test_size], diabetes_X[-test_size:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-test_size], diabetes_y[-test_size:]

Create a linear regression model.

In [None]:
# Fit ordinary least squares on the training split. `fit` returns the
# estimator itself, so construction and training can be chained.
regr = linear_model.LinearRegression().fit(diabetes_X_train, diabetes_y_train)

# Predict the targets of the held-out test samples.
diabetes_y_pred = regr.predict(diabetes_X_test)

Results.

In [None]:
# Score the predictions against the held-out targets.
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)  # 1 is perfect prediction

# Fitted coefficient(s), followed by the two test-set metrics.
print("Coefficients: \n", regr.coef_)
print("Mean squared error: %.2f" % test_mse)
print("Coefficient of determination: %.2f" % test_r2)

In [None]:
# Held-out observations as black dots, model predictions as a blue line.
plt.scatter(x=diabetes_X_test, y=diabetes_y_test, color="black")
plt.plot(diabetes_X_test, diabetes_y_pred, linewidth=3, color="blue")

plt.xlabel("x")
plt.ylabel("y")
plt.title("Test Data and Linear Regression Model")

# Called without arguments these only query the current ticks, so the
# default tick locations and labels stay in place.
plt.xticks()
plt.yticks()

plt.show()

#### Example #2

The following example is from this [reference](https://www.w3schools.com/python/python_ml_multiple_regression.asp).

First load the data.

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame, print its (rows, columns) shape and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data_MLR.csv')
print(df.shape)
df.head(n=5)

We'll pick only one independent variable and one dependent variable.

Let's first pick ```Volume``` to see how well it could capture variance of the data.
- Independent variable (x) =  Volume
- Dependent variable (y) = CO2

In [None]:
# Predictor as an (n_samples, 1) column, target as a flat 1-D array.
X_2 = df['Volume'].to_numpy().reshape(-1, 1)
y_2 = df['CO2'].to_numpy()

Plot the data.

In [None]:
# Reset to matplotlib's defaults first, then layer the ggplot style on top.
for style_name in ('default', 'ggplot'):
    plt.style.use(style_name)

fig, ax = plt.subplots(figsize=(8, 4))

# Raw observations: Volume on the x axis, CO2 on the y axis.
ax.scatter(X_2, y_2, alpha=0.7, facecolor='grey', edgecolor='k')
ax.set_xlabel('Volume', fontsize=14)
ax.set_ylabel('CO2', fontsize=14)
ax.set_title('Scatter Plot of All Data')

Create a linear regression model.

In [None]:
# Fit a straight line to (Volume, CO2); `fit` returns the estimator itself.
regr = linear_model.LinearRegression().fit(X_2, y_2)

# Fitted CO2 value for every observed Volume (used later to draw the line).
model = regr.predict(X_2)

Plot it out to see how well the model (using ```Volume```) fits the data.

In [None]:
# Reset to matplotlib's defaults first, then layer the ggplot style on top.
for style_name in ('default', 'ggplot'):
    plt.style.use(style_name)

fig, ax = plt.subplots(figsize=(8, 4))

# Fitted line first, then the raw observations it was fitted to.
ax.plot(X_2, model, label='Regression model', color='k')
ax.scatter(X_2, y_2, alpha=0.7, facecolor='grey', edgecolor='k')

ax.set_xlabel('Volume', fontsize=14)
ax.set_ylabel('CO2', fontsize=14)
ax.set_title('Linear Regression Model')

### Multiple Linear Regression

We'll use the previous dataset.

In [None]:
# Read the CSV again so this section starts from an unmodified DataFrame,
# then print its (rows, columns) shape and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data_MLR.csv')
print(df.shape)
df.head(n=5)

Now select *two variables* of interest.

- Independent variables (x) include ```Weight``` and ```Volume```.

- Dependent variable (y) is ```CO2```.

In [None]:
# Feature matrix with one column per predictor, in the order (Weight, Volume).
X_m = df.loc[:, ['Weight', 'Volume']].to_numpy().reshape(-1, 2)
y_m = df['CO2']

# Coordinates for the 3-D plots: one axis per predictor, CO2 on the third.
x, y = X_m[:, 0], X_m[:, 1]
z = y_m

Print values of each feature to double check the correctness.

In [None]:
# Column 0 of X_m: heading line first, then the values on the next line.
print('Values of feature #1: Weight', x, sep='\n')

In [None]:
# Column 1 of X_m: heading line first, then the values on the next line.
print('Values of feature #2: Volume', y, sep='\n')

In [None]:
# CO2 is the value being predicted (the target), not a third input feature,
# so the heading labels it as the target.
print('Values of target: CO2')
print(z)

Let's visualize it in 3D.

In [None]:
plt.style.use('default')

# Square canvas holding a single 3-D axes.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(projection='3d')

# Markers only (no connecting line): one dot per data row.
ax.plot(x, y, z, linestyle='none', marker='o', color='k', alpha=0.5, zorder=15)

for set_label, text in ((ax.set_xlabel, 'Weight'),
                        (ax.set_ylabel, 'Volume'),
                        (ax.set_zlabel, 'CO2')):
    set_label(text, fontsize=12)

ax.locator_params(axis='x', nbins=10)


Visualize from a different angle.

In [None]:
plt.style.use('default')

# Same 3-D scatter as before, drawn on a fresh figure.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(projection='3d')

marker_style = dict(linestyle='none', marker='o', color='k', alpha=0.5, zorder=15)
ax.plot(x, y, z, **marker_style)

label_size = 12
ax.set_xlabel('Weight', fontsize=label_size)
ax.set_ylabel('Volume', fontsize=label_size)
ax.set_zlabel('CO2', fontsize=label_size)
ax.locator_params(axis='x', nbins=10)

# Camera position: elevation 4 degrees, azimuth 114 degrees.
ax.view_init(elev=4, azim=114)

In [None]:
plt.style.use('default')

# Same 3-D scatter once more, this time viewed from above.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(projection='3d')

marker_style = dict(linestyle='none', marker='o', color='k', alpha=0.5, zorder=15)
ax.plot(x, y, z, **marker_style)

label_size = 12
ax.set_xlabel('Weight', fontsize=label_size)
ax.set_ylabel('Volume', fontsize=label_size)
ax.set_zlabel('CO2', fontsize=label_size)
ax.locator_params(axis='x', nbins=10)

# Camera position: elevation 60 degrees, azimuth 165 degrees.
ax.view_init(elev=60, azim=165)

Create a linear regression model.

In [None]:
# Least-squares fit of CO2 on (Weight, Volume). `fit` returns the estimator
# itself, so `model` and `regr` refer to the same fitted object.
regr = linear_model.LinearRegression()
regr.fit(X_m, y_m)
model = regr

Now, let's try predicting value of CO2, given some weight and volume.

In [None]:
# Predict CO2 for a single input row, given in the same column order the
# model was trained on: [Weight, Volume].
predictedCO2 = regr.predict([[3300, 1300]])

print(predictedCO2)

## Polynomial Regression


This example is from [reference](https://medium.com/@shuv.sdr/polynomial-regression-in-python-58198fb0973f).

In [None]:
# Read the salary table and preview its first 5 rows.
df_sal = pd.read_csv(filepath_or_buffer='Position_Salaries.csv')
df_sal.head(n=5)

Plot Salary against Level to see if there is any relationship between the two features.

In [None]:
# Relationship between Salary and Level
level, salary = df_sal['Level'], df_sal['Salary']
plt.scatter(level, salary, color='red')

plt.xlabel('Level')
plt.ylabel('Salary')
plt.title('Salary vs Level')

# Called without arguments these only query the current ticks, so the
# default tick locations and labels stay in place.
plt.xticks()
plt.yticks()

plt.show()

Preparing the data.

In [None]:
# Every column between the first and the last forms the feature matrix (kept
# 2-D by the slice); the last column is the target vector.
X = df_sal.iloc[:, 1:-1].to_numpy()  # independent
y = df_sal.iloc[:, -1].to_numpy()    # dependent

Train *polynomial regression model* on the whole dataset.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Expand X into polynomial terms up to degree 4, then fit an ordinary linear
# regression on the expanded features.
pr = PolynomialFeatures(degree=4)
X_poly = pr.fit(X).transform(X)

lr_2 = LinearRegression()
lr_2.fit(X=X_poly, y=y)


Train *linear regression model* on the whole dataset.

In [None]:
# Baseline for comparison: a straight-line fit on the untransformed feature.
lr = LinearRegression()
lr.fit(X=X, y=y)

Let's predict the results from both models and compare them.

In [None]:
# In-sample predictions of both models on the data they were fitted to.
y_pred_poly = lr_2.predict(X_poly)  # Polynomial Regression
y_pred_lr = lr.predict(X)           # Linear Regression

Output from linear regression model.

In [None]:
# Visualize real data with linear regression.
# Each artist carries its own label, so a legend entry cannot be paired with
# the wrong artist. A bare list of labels is matched to the artists purely
# by order, which is easy to get wrong when a scatter and a line share axes.
plt.scatter(X, y, color='black', label='X/y')
plt.plot(X, lr.predict(X), color='red', label='X/y_pred_lr')
plt.title('Real data (Linear Regression)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.legend(title='Salary/Level', loc='best', facecolor='white')
plt.grid()
plt.show()

Output from polynomial regression model.

In [None]:
# Visualize real data with polynomial regression.
# Evaluate the model on a dense grid spanning the observed levels (both ends
# included) so the fitted curve is drawn smoothly rather than as straight
# segments between the observed points.
X_grid = np.linspace(X.min(), X.max(), 200).reshape(-1, 1)

# Each artist carries its own label so the legend entries cannot be swapped.
plt.scatter(X, y, color='black', label='X/y')
# The grid goes through the same feature expansion the model was fitted on.
plt.plot(X_grid, lr_2.predict(pr.transform(X_grid)), color='blue', label='X/y_pred_poly')
plt.title('Real data (Polynomial Regression)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.legend(title='Salary/Level', loc='best', facecolor='white')
plt.grid()
plt.show()
