In [14]:
#import necessary libraries
import numpy as np
import pandas as pd

In [15]:
# import scikit learn dataset already loaded in scikit-learn
from sklearn.datasets import load_boston
boston_dataset = load_boston()

In [16]:
# use built in methods to explore and understand the data
print(boston_dataset['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [17]:
# print the features of the dataset
print(boston_dataset['feature_names'])

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [18]:
# store data into dataframe
df_boston = pd.DataFrame(boston_dataset.data)

In [19]:
# set features as columns on the dataframe
df_boston.columns = boston_dataset.feature_names

In [20]:
# view first 5 observation
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [21]:
# print dataset matrix (observation and features matrix)
df_boston.shape

(506, 13)

In [22]:
# print dataset target or response shape
boston_dataset.target.shape

(506,)

In [23]:
# view target or response
boston_dataset['target']

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

## Machine Learning - Linear Regression

In [24]:
# import required libraries
import numpy as np
import pandas as pd

In [25]:
# import boston dataset
from sklearn.datasets import load_boston
boston_dataset = load_boston()

In [26]:
# create pandas dataset and store the data
df_boston = pd.DataFrame(boston_dataset.data)
df_boston.columns = boston_dataset.feature_names

In [27]:
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [28]:
# append price, target, as a new column to the dataset
df_boston['Price'] = boston_dataset.target

In [29]:
# print top 5 observations
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [30]:
# assign features on X-axis
X_features = boston_dataset.data

In [31]:
# assign target on Y-axis
Y_target = boston_dataset.target

In [32]:
# import linear model - the estimator
from sklearn.linear_model import LinearRegression
lineReg = LinearRegression()

In [33]:
# fit data into the estimator
lineReg.fit(X_features,Y_target)

LinearRegression()

In [34]:
# print the intercept
print('The estimated intercept %.2f '%lineReg.intercept_)

The estimated intercept 36.46 


In [35]:
# print the coefficient
print('The coefficient is %d' %len(lineReg.coef_))

The coefficient is 13


In [36]:
# train model split the whole dataset into train and test datasets
# from sklearn import cross_validation
# https://stackoverflow.com/questions/53978901/importerror-cannot-import-name-cross-validation-from-sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_features,Y_target)

In [37]:
# print the dataset shape
print(boston_dataset.data.shape)

(506, 13)


In [38]:
# print shapes of training and testing data sets
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(379, 13) (127, 13) (379,) (127,)


In [39]:
# fit the training sets into the model
lineReg.fit(X_train, Y_train)

LinearRegression()

In [40]:
# The mean square error or residual sum of squares
print('MSE value is %.2f ' %np.mean(lineReg.predict(X_test)-Y_test) **2)

MSE value is 0.01 


In [41]:
# calculate variance
print('Variance score is %.2f' %lineReg.score(X_test, Y_test))

Variance score is 0.59


## Supervised learning models : Logistic Regression

In [42]:
# import necessary modules
import numpy as np
import pandas as pd

In [43]:
# import sklearn load dataset
from sklearn.datasets import load_iris
iris_dataset = load_iris()

In [44]:
# display the dataset
type(iris_dataset)

sklearn.utils.Bunch

In [45]:
# view information using dataset built in method DESCR(describe)
print(iris_dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [46]:
# View features
print(iris_dataset.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [47]:
# View target
print(iris_dataset.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [48]:
# Find number of observations
print(iris_dataset.data.shape)

(150, 4)


In [49]:
# Assign features data to x-axis
X_feature = iris_dataset.data

In [50]:
# Assign target data to y-axis
Y_target = iris_dataset.target

In [51]:
# View the shape of both axis
print(X_feature.shape)
print(Y_target.shape)

(150, 4)
(150,)


### KNN model importing from sklearn

In [52]:
# First use KNN classifier method - import it from sklearn
from sklearn.neighbors import KNeighborsClassifier

In [53]:
# instantiate the knn estimator - object used to instantiate the class of a learning model is called an estimator
knn = KNeighborsClassifier(n_neighbors=1)

In [54]:
# print the knn
print(knn)

KNeighborsClassifier(n_neighbors=1)


In [55]:
# fit data into knn model(estimator)
knn.fit(X_feature, Y_target)

KNeighborsClassifier(n_neighbors=1)

In [56]:
# create object with new values for prediction
X_new = [[3,5,4,1],[5,3,4,2]]

In [57]:
# PRedict the outcome for the new observation using knn classifier
knn.predict(X_new)

array([1, 1])

In [58]:
# Use logistic regression estimator
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()

In [59]:
# fit data into the Logistic regression estimator
logReg.fit(X_feature, Y_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [60]:
# predict the outcome using Logistic Regression estimator 
logReg.predict(X_new)

array([0, 1])

## Unsupervised Learning Models : Clustering

### KMeans Clustering 

In [61]:
# import required libraries
import numpy as np

# import KMeans class from sklearn.cluster
from sklearn.cluster import KMeans

# import make_blobs dataset from sklearn.cluster
from sklearn.datasets import make_blobs

In [62]:
# Define number of smaples
n_samples = 300

# Define random state value to initialize the center
random_state = 20

# define number of feature as 5
X,y = make_blobs(n_samples=n_samples, n_features=5, random_state=None)

# define number of cluster to be formed as 3 and in random state and fit features into the model 
predict_y = KMeans(n_clusters=3, random_state=random_state).fit_predict(X)

# print the estimator prediction
predict_y

array([2, 2, 2, 0, 1, 1, 2, 1, 1, 0, 2, 2, 1, 2, 2, 1, 0, 1, 1, 2, 1, 0,
       2, 1, 2, 1, 0, 0, 2, 0, 1, 0, 1, 0, 2, 0, 2, 0, 0, 1, 2, 0, 2, 0,
       1, 0, 1, 2, 2, 2, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 2, 2,
       1, 0, 1, 1, 2, 2, 2, 0, 0, 2, 1, 1, 1, 0, 2, 1, 1, 2, 0, 0, 1, 2,
       1, 0, 1, 0, 2, 0, 1, 0, 2, 1, 0, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 2, 0, 1, 1, 2, 1,
       1, 2, 1, 1, 1, 1, 1, 0, 0, 2, 0, 1, 1, 2, 2, 0, 1, 2, 1, 2, 2, 0,
       0, 0, 2, 2, 1, 0, 2, 1, 0, 2, 0, 0, 1, 2, 0, 1, 2, 1, 2, 1, 1, 0,
       1, 0, 2, 1, 0, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 0, 0, 2, 1, 1, 0, 0,
       1, 2, 2, 1, 0, 1, 2, 1, 0, 0, 2, 2, 2, 1, 0, 0, 2, 0, 1, 0, 2, 1,
       1, 0, 2, 2, 1, 1, 0, 1, 2, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1,
       0, 1, 2, 0, 0, 0, 1, 1, 2, 1, 2, 2, 0, 1, 1, 0, 1, 0, 2, 1, 2, 2,
       1, 2, 1, 2, 2, 0, 2, 0, 0, 0, 2, 1, 2, 2, 2, 2, 0, 1, 2, 2, 2, 0,
       2, 2, 1, 2, 0, 2, 0, 1, 0, 2, 2, 2, 1, 2])

### Unsupervised Learning Models : Dimensionality Reduction 
> Helps cut down dimentions without losing any data from dataset

**Techniques used for dimensionality reduction :**
- Drop data columns with missing values
- Drop data columns with low variance
- Drop data columns with high corelations
- Apply statistical functions - PCA(Principal Component Analysis)

## PCA implementation

In [63]:
# import required library PCA
from sklearn.decomposition import PCA

# import the dataset
from sklearn.datasets import make_blobs

In [64]:
# Define sample and random state
n_sample = 20
random_state = 20

In [65]:
# Generate the dataset with 10 features (dimension)
X,y = make_blobs(n_samples=n_sample, n_features=10, random_state=None)

In [66]:
# View the shape of the dataset
X.shape

(20, 10)

In [67]:
# Define the PCA estimator with number of reduced components 
pca = PCA(n_components=3)

In [68]:
# Fit the data into the PCA estimator
pca.fit(X)
print(pca.explained_variance_ratio_)

[0.62415177 0.34714539 0.00864788]


In [69]:
# Print the first PCA component 
first_pca = pca.components_[0]
print(first_pca)

[ 0.25905567 -0.25550125  0.27209367  0.54339901  0.19191334  0.59048811
 -0.0723899   0.31370582 -0.04007061  0.08676168]


In [70]:
# Transform the fitted data using transform method
pca_reduced= pca.transform(X)

In [71]:
# View the reduced shape (lower dimension)
pca_reduced.shape

(20, 3)

## Pipeline - Build pipeline using scikit-learn

#### Import the required libraries and models(estimators)

In [72]:
# import pipeline class
from sklearn.pipeline import Pipeline

# import linear estimator 
from sklearn.linear_model import LinearRegression

# import pca estimator for dimensionality reduction
from sklearn.decomposition import PCA

#### Chain the estimators together

In [73]:
estimator = [('dim_reduction',PCA()), ('linear_model',LinearRegression())]

#### Put the chain of estimators in a pipeline object

In [74]:
pipeline_estimator = Pipeline(estimator)

#### Check the chain of estimators

In [75]:
pipeline_estimator

Pipeline(steps=[('dim_reduction', PCA()), ('linear_model', LinearRegression())])

#### View the first step

In [76]:
pipeline_estimator.steps[0]

('dim_reduction', PCA())

#### View second step

In [77]:
pipeline_estimator.steps[1]

('linear_model', LinearRegression())

#### View all the steps in pipeline

In [78]:
pipeline_estimator.steps

[('dim_reduction', PCA()), ('linear_model', LinearRegression())]

### Model Persistence and Evaluation

In [81]:
# Import required libraries and dataset
from sklearn.datasets import load_iris
iris_dataset = load_iris()

In [82]:
# View feature names of the dataset
iris_dataset.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [84]:
# View target of the dataset
iris_dataset.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [85]:
# Define features and target objects
X_feature = iris_dataset.data
Y_target = iris_dataset.target

In [86]:
# Create object with new values for prediction
X_new = [[3,5,4,1],[5,3,4,2]]

In [88]:
# Use logistical regression estimator
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()

In [89]:
# Fit data into the logistic regression estimator
logReg.fit(X_feature, Y_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [90]:
# Predict the outcome using logistic regression estimator
logReg.predict(X_new)

array([0, 1])

In [95]:
# import library for model persistance
import pickle as pkl

In [96]:
# Use dumps method to persist the model
persist_model = pkl.dumps(logReg)
persist_model

b'\x80\x04\x95\x04\x03\x00\x00\x00\x00\x00\x00\x8c\x1esklearn.linear_model._logistic\x94\x8c\x12LogisticRegression\x94\x93\x94)\x81\x94}\x94(\x8c\x07penalty\x94\x8c\x02l2\x94\x8c\x04dual\x94\x89\x8c\x03tol\x94G?\x1a6\xe2\xeb\x1cC-\x8c\x01C\x94G?\xf0\x00\x00\x00\x00\x00\x00\x8c\rfit_intercept\x94\x88\x8c\x11intercept_scaling\x94K\x01\x8c\x0cclass_weight\x94N\x8c\x0crandom_state\x94N\x8c\x06solver\x94\x8c\x05lbfgs\x94\x8c\x08max_iter\x94Kd\x8c\x0bmulti_class\x94\x8c\x04auto\x94\x8c\x07verbose\x94K\x00\x8c\nwarm_start\x94\x89\x8c\x06n_jobs\x94N\x8c\x08l1_ratio\x94N\x8c\x0en_features_in_\x94K\x04\x8c\x08classes_\x94\x8c\x15numpy.core.multiarray\x94\x8c\x0c_reconstruct\x94\x93\x94\x8c\x05numpy\x94\x8c\x07ndarray\x94\x93\x94K\x00\x85\x94C\x01b\x94\x87\x94R\x94(K\x01K\x03\x85\x94h\x1c\x8c\x05dtype\x94\x93\x94\x8c\x02i4\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01<\x94NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89C\x0c\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x94t\x94b\x8c\x07n_iter_\x9

In [98]:
# Use joblib.dump to persist the model to a file
# from sklearn.externals import joblib
import joblib
joblib.dump(logReg, 'regresfilename.pkl')

['regresfilename.pkl']

In [100]:
# Create new estimator from the saved model
new_logreg_estimator = joblib.load('regresfilename.pkl')

In [101]:
# View the new estimator
new_logreg_estimator

LogisticRegression()

In [102]:
# Validate and use new estimator to predict
new_logreg_estimator.predict(X_new)

array([0, 1])

### Metric functions to evaluate accuracy of your model's predictions

1. Classification : metrics.accuracy_score | metrics.average_precision_score
2. Clustering : metrics.adjusted_rand_score
3. Regression : metrics.mean_absolute_error | metrics.mean_squared_error | metrics.median_absolute_error