# Linear Regression - Cement Case Study

### Importing Packages

In [None]:
%matplotlib inline
import time
import random
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import statistics
import numpy as np
from scipy import stats
import seaborn as sns

In [None]:
# Read the Excel workbook at data/concrete_data.xls into a DataFrame.
data = pd.read_excel('data/concrete_data.xls')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

##### Features Exploration

In [None]:
# Summarise the frame's columns (names, non-null counts, dtypes).
data.info()

In [None]:
# Descriptive statistics for the frame's columns.
data.describe()

##### Renaming the columns - Ex Use Case

In [None]:
# Replace the workbook's original headers with short snake_case names.
# The last column is the response; the eight before it are the inputs.
# ('fly_ash' was previously misspelled as 'flay_ash'.)
data.columns = [
    'cement_component',
    'furnace_slag',
    'fly_ash',
    'water_component',
    'superplasticizer',
    'coarse_aggregate',
    'fine_aggregate',
    'age',
    'concrete_strength',
]

In [None]:
### Univariate analysis: distribution of the cement component (30 bins)
data['cement_component'].plot.hist(bins=30, title='cement_component')

In [None]:
# Distribution of the age column (30 bins).
data['age'].plot.hist(bins=30)

In [None]:
### Removing zeros
# Treat every 0 as missing and drop each row that contains at least one,
# leaving `data` itself untouched. `np.nan` is used because the `np.NaN`
# alias no longer exists in NumPy 2.0.
# NOTE(review): this also drops rows where a 0 is a legitimate measurement
# (e.g. an ingredient that was not used) - confirm that is intended.
data1 = data.replace(0, np.nan).dropna()
data1

In [None]:
## Scatter plot of each explanatory variable against the response
# One panel per input column (all columns except the last) on a 3x3 grid.
plt.figure(figsize=(15, 10.5))
for plot_count, feature in enumerate(data.columns[:-1], start=1):
    plt.subplot(3, 3, plot_count)
    plt.scatter(data[feature], data['concrete_strength'])
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')
plt.show()

In [None]:
# Widen the console output and print three decimals. The fully qualified
# 'display.precision' key is required: on newer pandas the bare 'precision'
# pattern also matches the Styler precision option and raises OptionError.
pd.set_option('display.width', 100)
pd.set_option('display.precision', 3)

# Pairwise Pearson correlation between all columns.
correlations = data.corr(method='pearson')
print(correlations)

In [None]:
# Correlation of every column with the response on the full data, ascending.
test = data.corr()
test['concrete_strength'].sort_values()

In [None]:
# Same ranking, computed on the zero-free frame `data1`.
test = data1.corr()
test['concrete_strength'].sort_values()

In [None]:
# Show 10 randomly drawn rows of the zero-free frame (unseeded, so the rows differ per run).
data1.sample(10)

In [None]:
## Pair plot (kind='reg') across all columns, drawn from the zero-free frame
## `data1`; the column list is taken from `data`, which has the same columns.
sns.pairplot(data1, vars=data.columns, kind='reg')
plt.show()

In [None]:
## Split the frame into inputs (every column but the last) and target (the last column).
## Both remain pandas objects; nothing is converted to arrays here.
X = data.iloc[:, :-1]
y = data.iloc[:,-1]

#----------------------------
## Alternative: take inputs and target from the zero-free frame `data1` instead
# X = data1.iloc[:, :-1]
# y = data1.iloc[:,-1]

In [None]:
# (rows, columns) of the input frame.
X.shape

In [None]:
# Shape of the target selection.
y.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out part of the data for evaluation. The seed is pinned (as in the
# later Boston split) so the partition, and every metric computed from it,
# is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Shapes of the four resulting pieces, in the order they were returned.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split, then predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Report the fitted intercept and per-feature coefficients.
print('Intercept: %f' % lr.intercept_)
print('Coefficients: %s' % str(lr.coef_))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Names of the input columns: every column except the last.
features = data.columns[:-1]
features

In [None]:
# Print the input column names and the R2 score of the held-out predictions.
print ('Features: %s'%str(features))
print ('R2 score: %f'%r2_score(y_test, y_pred))

In [None]:
# Error metrics for the held-out predictions. The second value is the square
# root of the mean squared error, so it is labelled as RMSE rather than MSE.
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))

## Overfitting and Underfitting

![](asset/tnn3.png)

#### Underfitting vs. Overfitting in Classification Example
![](asset/tnn4.png)

![](asset/tnn5.png)

![](asset/tnn6.png)
###### But when the model encounters the test data
![](asset/tnn7.png)




### What is the best architecture?


![](asset/tnn9.png)

![](asset/tnn8.jpg)

![title](asset/fitting.png)



In [None]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

try:
    from sklearn.datasets import load_boston
except ImportError:
    # scikit-learn 1.2 removed load_boston; the raw source file is parsed instead.
    load_boston = None

# Original distribution of the dataset, used only when load_boston is unavailable.
_BOSTON_SOURCE_URL = "http://lib.stat.cmu.edu/datasets/boston"


def _load_boston_arrays():
    """Return the Boston housing ``(data, target)`` arrays."""
    if load_boston is not None:
        boston = load_boston()
        return boston.data, boston.target

    # Fallback taken from scikit-learn's removal notice: each record spans two
    # physical lines in the source file, so even and odd rows are stitched
    # back together; the third value of every odd row is the target.
    raw_df = pd.read_csv(_BOSTON_SOURCE_URL, sep=r"\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]
    return data, target


def load_extended_boston():
    """Return the Boston housing data with degree-2 polynomial features.

    The raw features are min-max scaled and then expanded with
    ``PolynomialFeatures(degree=2, include_bias=False)``. The scaler is fit
    on the complete dataset inside this function, i.e. before the caller
    performs any train/test split.

    Returns:
        tuple: ``(X, target)`` - the expanded feature matrix and the target
        values.
    """
    data, target = _load_boston_arrays()
    X = MinMaxScaler().fit_transform(data)
    X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
    return X, target

In [None]:
# Rebind X/y to the expanded Boston data and make a seeded split.
X, y = load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Plain linear regression on the expanded features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Compare the score on the data the model saw with the held-out score.
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

### Tradeoff between model complexity against training and test accuracy

![title](asset/tradeoff.png)

### Regularization - Explicitly restricting the model to avoid overfitting

![title](asset/RidgeandLasso.jpg)

### Ridge regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with its default settings on the current training split.
ridge = Ridge()
ridge.fit(X_train, y_train)

print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

In [None]:
# Ridge regression with alpha=2, scored on both splits.
# NOTE(review): the name `ridge10` suggests alpha=10, but alpha=2 is what is
# passed - confirm which one was intended.
ridge10 = Ridge(alpha=2).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test)))

### Lasso Regression 

In [None]:
from sklearn.linear_model import Lasso
# Lasso with its default settings; besides the two scores, count how many
# coefficients are non-zero (i.e. how many features the model actually uses).
lasso = Lasso().fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))

In [None]:
# Lasso with alpha=0.01 and an iteration cap of 10000, reported the same way:
# both scores plus the number of non-zero coefficients.
lasso001 = Lasso(alpha=0.01, max_iter=10000).fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso001.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso001.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso001.coef_ != 0)))

## GridSearch CV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values of `alpha` and `max_iter` to be tried in combination.
param_grid = dict(
    alpha=[0.01, 0.1, 1, 10, 100],
    max_iter=[2000, 5000, 10000, 15000, 20000],
)
print("Parameter grid:\n{}".format(param_grid))

In [None]:
# Grid search over `param_grid` for a Lasso model (nothing is fitted yet).
grid_search = GridSearchCV(Lasso(), param_grid)

In [None]:
# Run the search on the training split only; the test split stays unseen.
grid_search.fit(X_train, y_train)

In [None]:
# Report the search's score on the held-out split and the parameter
# combination it selected.
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(grid_search.best_params_))