In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so a plain
# import raises ImportError on current releases. When that happens, define a
# drop-in `load_boston()` that rebuilds the same kind of object from the original
# StatLib file, following the recipe given in scikit-learn's deprecation notice.
# NOTE: the fallback needs network access to lib.stat.cmu.edu.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    def load_boston():
        """Return the Boston housing data as a Bunch.

        The returned object exposes the fields this notebook reads:
        ``data`` (feature matrix), ``target`` (prices), ``feature_names``
        and ``DESCR``.
        """
        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        # The file has 22 lines of preamble; each record is then split over two
        # physical lines (11 values, followed by 2 features + the target).
        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]
        feature_names = np.array(
            ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
             "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
        )
        descr = (
            "Boston house prices dataset "
            f"({data.shape[0]} samples, {data.shape[1]} features), "
            f"rebuilt from {data_url} because sklearn.datasets.load_boston "
            "is not available in this scikit-learn version."
        )
        return Bunch(data=data, target=target,
                     feature_names=feature_names, DESCR=descr)

In [None]:
# Load the Boston housing data. Despite the `_df` suffix this is not a DataFrame:
# it is the dict-like container returned by load_boston(), read below through
# .keys(), .data, .feature_names, .target and .DESCR.
boston_df = load_boston()

In [None]:
# List the fields available on the loaded container.
boston_df.keys()

In [None]:
# Display the whole container as-is; expect a long output.
boston_df


In [None]:
# Dataset description
print(boston_df.DESCR)

# Prepare the Dataset

In [None]:
# First attempt: wrap the raw feature matrix without column labels, so pandas
# assigns default integer column names (visible in the head() output below).
dataset = pd.DataFrame(boston_df.data)

In [None]:
dataset.head()

In [None]:
# Rebuild the frame, this time labelling the columns with the dataset's
# feature names. This replaces the unlabelled frame created above.
dataset = pd.DataFrame(boston_df.data, columns = boston_df.feature_names)

In [None]:
dataset.head()

In [None]:
# Append the target as a new last column named 'Price'. Later cells rely on
# 'Price' being the final column when separating features from target.
dataset['Price'] = boston_df.target

In [None]:
# Column dtypes and non-null counts.
dataset.info()

In [None]:
# Summary statistics for every column.
dataset.describe()

In [None]:
# Check for missing values.
# This shows the full boolean mask (one cell per value); the per-column totals
# in the next cell are the easier summary to read.
dataset.isnull()

In [None]:
# Number of missing values per column.
dataset.isnull().sum()

In [None]:
# Exploratory data analysis

# Correlation

# We want to check two things:
# 1. Multicollinearity, which happens when two independent variables are highly correlated (> 96%).
#    In that case we want to drop one of the variables and keep the other.
# 2. Whether there is correlation between the dependent variable (Price) and any independent variable.

dataset.corr()

In [None]:
# Same correlation matrix, with rows ordered by their correlation with Price
# (strongest positive first).
dataset.corr().sort_values(by='Price', ascending=False)

In [None]:
# Pairwise scatter plots of every column against every other column.
import seaborn as sns
sns.pairplot(dataset)

In [None]:
# The pair plot above can also be reproduced one panel at a time with
# matplotlib's scatter function; here: crime rate against price.
crime_rate = dataset['CRIM']
house_price = dataset['Price']
plt.scatter(crime_rate, house_price)
plt.xlabel("Crime Rate")
plt.ylabel("Price")





In [None]:
# Create regression plots: each cell below plots one feature of `dataset`
# against 'Price' using seaborn's regplot.
# See the dataset description printed earlier for what each feature means.

sns.regplot(x= "RM", y="Price", data = dataset)

In [None]:
# Same plot for the LSTAT feature.
sns.regplot(x= "LSTAT", y="Price", data = dataset)

In [None]:
# Same plot for the CHAS feature.
sns.regplot(x= "CHAS", y="Price", data = dataset)

In [None]:
# Same plot for the PTRATIO feature.
sns.regplot(x= "PTRATIO", y="Price", data = dataset)

In [None]:
## Independant and dependant features

X= dataset.iloc[:,:-1]
y= dataset.iloc[:,-1]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
## Train / test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)
X_train, X_test, y_train, y_test = split_parts

# random_state seeds the pseudo-random shuffle that decides which rows go to the training set and which to the test set.
# In addition to that, it is important to remember that the random_state value can have a significant
# effect on the measured quality of your model (by quality I essentially mean its accuracy when predicting).
# For instance, if you take a certain dataset and train a regression model with it
# without specifying the random_state value, you may get a different accuracy result
# for your trained model on the test data every time you run it. Some people therefore search for
# the random_state value that gives the most accurate model, and then reuse that number
# to reproduce the model on another occasion, such as another research experiment.
# To do so, it is possible to split and train the model in a for-loop, assigning different numbers to the random_state parameter.
# Caution: choosing the seed by its test score is a form of tuning on the test set, so the reported score will be
# optimistic; cross-validation gives a more reliable estimate of performance.
# https://stackoverflow.com/questions/28064634/random-state-pseudo-random-number-in-scikit-learn


In [None]:
X_train

In [None]:
X_test

In [None]:
 ## Standardizing the dataset
    
from sklearn.preprocessing import StandardScaler
# One scaler instance is shared by the training, test and "new data" cells.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features and scale them in one step.
# NOTE: X_train is overwritten with the scaled array, so this cell is not
# idempotent: re-running it would fit the scaler on already-scaled data.
X_train = scaler.fit_transform(X_train)

In [None]:
# https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe
# Only transform() here (no fit): the test set is scaled with the statistics
# learned from the training set. X_test is overwritten as well, so re-running
# this cell would scale the test data a second time.
X_test = scaler.transform(X_test)

In [None]:
# Model training

from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with default settings.
regression=LinearRegression()

In [None]:
# Fit on the scaled training features and the training prices.
regression.fit(X_train,y_train)

In [None]:
## Print the coefficients and the intercept

# One coefficient per feature column of the (scaled) training data.
print (regression.coef_)

In [None]:
print(regression.intercept_)

In [None]:
## Which settings was the estimator created with?
# get_params() returns the estimator's constructor parameters (its
# configuration), not the coefficients learned by fit().

regression.get_params()

In [None]:
## Prediction with test data

# Predict prices for the scaled test features.
reg_pred= regression.predict(X_test)

In [None]:
reg_pred

In [None]:
# Scatter plot of actual test prices (x-axis) against predicted prices (y-axis).
# Points close to a straight diagonal line indicate good predictions.

plt.scatter(y_test, reg_pred)

In [None]:
# Residual = actual price minus predicted price, one value per test row.
residuals = y_test - reg_pred

In [None]:
residuals

In [None]:
# Kernel density estimate of the residuals.
sns.displot(residuals,kind="kde")

# If the curve looks roughly normal and centred near zero, that suggests the
# model is working reasonably well (judge from the plot; not checked in code).

In [None]:
# Residuals plotted against the predicted values.
plt.scatter(reg_pred, residuals)

# Ideally the points are scattered evenly with no visible pattern
# (judge from the plot; not checked in code).

In [None]:
# Error metrics on the test set: MAE, MSE and RMSE (printed in that order).

from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, reg_pred)
mse = mean_squared_error(y_test, reg_pred)
rmse = np.sqrt(mse)  # root mean square error

for metric_value in (mae, mse, rmse):
    print(metric_value)

In [None]:
# R square and adjusted R square

from sklearn.metrics import r2_score

score = r2_score(y_test, reg_pred)



In [None]:
score

# That's considered a good score.

In [None]:
# How to improve the model:

# change the random state used for the train/test split before training the model
# choose a different sklearn model. We used Linear regression here.

In [None]:
# new data prediction

In [None]:
boston_df.data[0].reshape(1,-1)

In [None]:
#Transformation of new data
scaler.transform(boston_df.data[0].reshape(1,-1))

In [None]:
regression.predict(scaler.transform(boston_df.data[0].reshape(1,-1)))

In [None]:
# Pickling the model files for deployment

In [None]:
import pickle

In [None]:
# Write the fitted model to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails; a bare open() leaves that to
# garbage collection.
# NOTE: only the regression model is saved here. Predictions also need the
# fitted `scaler`, so a deployment must persist or recreate it as well.
with open('regmodel.pkl','wb') as model_file:
    pickle.dump(regression, model_file)

In [None]:
# how to load it

pickled_model = pickle.load(open('regmodel.pkl','rb'))

In [None]:
pickled_model.predict(scaler.transform(boston_df.data[0].reshape(1,-1)))