In [40]:
import pandas as pd
import numpy as np
import sklearn.datasets as datasets
import sklearn.preprocessing as preprocessing
from sklearn import impute
## OR ##
#from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# scikit Learn basic functions
**Utilities**
* 1.0 Data Gathering
* 2.0 Preprocessing
* 3.0 Model Selection
* 4.0 Classification
* 5.0 Regression
* 6.0 Clustering
* 7.0 Dimensionality Reduction

## 1.0 Data Gathering

**1.1 Gathering data from web using pandas**

In [41]:
cancer_set = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',                          
                        header = None)
print(cancer_set.shape)
# print(cancer_set.head())
# print(cancer_set.info())
# print(cancer_set.describe())
cancer_features = cancer_set.iloc[:,2:]  #iloc[all rows,columns from 2 to all]

print(cancer_features.shape)
print(type(cancer_features))

cancer_features = cancer_features.values #converting data frame to numpy array
print(cancer_features.shape)
print(type(cancer_features))

cancer_features_names = ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 
                         'mean compactness', 'mean concavity','mean concave points', 'mean symmetry',
                         'mean fractal dimension','radius error','texture error','perimeter error',
                         'area error', 'smoothness error','compactness error','concavity error',
                         'concave points error','symmetry error','fractal dimension error','worst radius',
                         'worst texture', 'worst perimeter', 'worst area','worst smoothness', 'worst compactness',
                         'worst concavity','worst concave points','worst symmetry','worst fractal dimension']

cancer_target = cancer_set.iloc[:, 1] #melign and benign column extract

# Replacing 'M' with 0 and 'B' with 1
cancer_target = cancer_target.replace(['M', 'B'], [0, 1])

# Converting to numpy array
cancer_target = cancer_target.values #convert panda to numpy

print(type(cancer_target))
print(cancer_target.shape)

(569, 32)
(569, 30)
<class 'pandas.core.frame.DataFrame'>
(569, 30)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(569,)


**1.2 Gathering data from scikit-learn**

In [42]:
# import sklearn.datasets as datasets

breast_cancer = datasets.load_breast_cancer()

print(breast_cancer.data.shape)
print(breast_cancer.target.shape)

# this is fully processed data

(569, 30)
(569,)


## 2.0 Preprocessing
- 2.1 Standardization mean removal  
- 2.2 Scaling  
- 2.3 Normalization  
- 2.4 Binarization  
- 2.5 One Hot Encoding  
- 2.6 Label Encoding  
- 2.7 Imputation  

### 2.1 Standardization  
Standardization or Mean Removal is the process of transforming each feature vector into a normal distribution with **mean 0 and variance 1**.

In [43]:
# import sklearn.preprocessing as preprocessing
standardizer = preprocessing.StandardScaler()
standardizer = standardizer.fit(breast_cancer.data)  # this takes only numbers not string
breast_cancer_standardized = standardizer.transform(breast_cancer.data)

print('Mean of each feature after Standardization :\n\n')
print(breast_cancer_standardized.mean(axis=0))
print('\nStd. of each feature after Standardization :\n\n')
print(breast_cancer_standardized.std(axis=0))

Mean of each feature after Standardization :


[-3.16286735e-15 -6.53060890e-15 -7.07889127e-16 -8.79983452e-16
  6.13217737e-15 -1.12036918e-15 -4.42138027e-16  9.73249991e-16
 -1.97167024e-15 -1.45363120e-15 -9.07641468e-16 -8.85349205e-16
  1.77367396e-15 -8.29155139e-16 -7.54180940e-16 -3.92187747e-16
  7.91789988e-16 -2.73946068e-16 -3.10823423e-16 -3.36676596e-16
 -2.33322442e-15  1.76367415e-15 -1.19802625e-15  5.04966114e-16
 -5.21317026e-15 -2.17478837e-15  6.85645643e-16 -1.41265636e-16
 -2.28956670e-15  2.57517109e-15]

Std. of each feature after Standardization :


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


### 2.2 Scaling
*Scaling transforms existing data values to lie between a minimum and maximum value.*

**MinMaxScaler** transforms data to range **0 and 1**.  
**MaxAbsScaler** transforms data to range **-1 and 1**.

In [44]:
# mimmax scaler to fit data in range 1 to 10

# min max scaler for default range 0 to 1
# min_max_scaler = preprocessing.MinMaxScaler().fit(breast_cancer.data)
# breast_cancer_minmaxscaled = min_max_scaler.transform(breast_cancer.data)
# feature range changed to 0 to 10
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 10)).fit(breast_cancer.data)
breast_cancer_minmaxscaled10 = min_max_scaler.transform(breast_cancer.data)

max_abs_scaler = preprocessing.MaxAbsScaler().fit(breast_cancer.data)
breast_cancer_maxabsscaled = max_abs_scaler.transform(breast_cancer.data)

### 2.3 Normalization
*Normalization scales each sample to have a unit norm.*  
Normalization can be achieved with **'l1', 'l2', and 'max'** norms.  
'l1' norm makes the **sum of absolute values of each row as 1**, and 'l2' norm makes the **sum of squares of each row as 1**.  
'l1' norm is *insensitive to outliers*.  
By default l2 norm is considered. Hence, *removing outliers is recommended before applying l2 norm.*  

In [45]:
normalizer = preprocessing.Normalizer(norm='l1').fit(breast_cancer.data)
breast_cancer_normalized = normalizer.transform(breast_cancer.data)

### 2.4 Binarization
*Binarization is the process of transforming data points to 0 or 1 based on a given threshold.*  

Any value **above the threshold** is transformed to **1**, and any value **below the threshold** is transformed to **0**.  
***By default, a threshold of 0 is used.***

In [46]:
binarizer = preprocessing.Binarizer(threshold=3.0).fit(breast_cancer.data)
breast_cancer_binarized = binarizer.transform(breast_cancer.data)
print(breast_cancer_binarized[:5,:5])

[[1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]]


### 2.5 OneHotEncoder
*OneHotEncoder converts categorical integer values into one-hot vectors.*  
In an one-hot vector, **every category** is transformed into a **binary attribute having only 0 and 1 values.**  

In [47]:
# An example creating two binary attributes for the categorical integers 1 and 2
onehotencoder = preprocessing.OneHotEncoder()
onehotencoder = onehotencoder.fit([[1], [1], [1], [2], [2], [1]])
# Transforming category values 1 and 2 to one-hot vectors
print(onehotencoder.transform([[1]]).toarray())
print(onehotencoder.transform([[2]]).toarray())

[[1. 0.]]
[[0. 1.]]


### 2.6 Label Encoding
*Label Encoding is a step in which, in which categorical features are represented as categorical integers.*

In [48]:
# An example of transforming categorical values ["benign","malignant"]into[0, 1] is shown below.

labels = ['malignant', 'benign', 'malignant', 'benign']
labelencoder = preprocessing.LabelEncoder()
labelencoder = labelencoder.fit(labels)

bc_labelencoded = labelencoder.transform(breast_cancer.target_names)

### 2.7 Imputation
*Imputation replaces missing values with either median, mean, or the most common value of the column or row in which the missing values exist.*

In [49]:
# Below example replaces missing values, represented by np.nan, with the mean of respective column (axis 0).
# 

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

imputer = imputer.fit(breast_cancer.data)
breast_cancer_imputed = imputer.transform(breast_cancer.data)

**In fresco play model selction and other topics are not covered. It now covers some ML techniques**
- Nearest Neighbour technique
- Decision Tree technique
- Esemble Method
- Support Vector Machine
- Clustering technique

## Nearest Neighbour technique
*Nearest neighbors method is used to determine a predefined number of data points that are closer to a sample point and predict its label.*

- **sklearn.neighbors** provides utilities for unsupervised and supervised neighbors-based learning methods.
- scikit-learn implements two different nearest neighbors classifiers:
  - *KNeighborsClassifier*   
  classifies based on k nearest neighbors of every query point, where k is an integer value specified by the user
  - *RadiusNeighborsClassifier*   
  classifies based on the number of neighbors present in a fixed radius r of every training point.

In [60]:
# from sklearn.model_selection import train_test_split

# from sklearn.neighbors import KNeighborsClassifier

cancer = datasets.load_breast_cancer()  # Loading the data set
X_train, X_test, Y_train, Y_test = train_test_split(cancer.data, cancer.target,
                                                    stratify=cancer.target,random_state=42)
knn_classifier = KNeighborsClassifier()   
knn_classifier = knn_classifier.fit(X_train, Y_train)

print('Accuracy of Train Data :', knn_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', knn_classifier.score(X_test,Y_test))

Accuracy of Train Data : 0.9460093896713615
Accuracy of Test Data : 0.9300699300699301


## Decision Trees
*Decision Trees is another Supervised Learning method used for **Classification** **and Regression**.*

- Decision Trees learn simple decision rules from training data and build a Model.
- **DecisionTreeClassifier** and **DecisionTreeRegressor** are the two utilities from **sklearn.tree**, which can be used for classification and regression respectively.

### Advantages
- Decision Trees are easy to understand.
- They often do not require any preprocessing.
- Decision Trees can learn from both numerical and categorical data.

### Disadvantages
- Decision trees sometimes become complex, which do not generalize well and leads to overfitting.
    - Overfitting can be addressed by placing the least number of samples needed at a leaf node or placing the highest depth of the tree.

- A small variation in data can result in a completely different tree.
    - This problem can be addressed by using decision trees within an ensemble.

In [51]:
# from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()   
dt_classifier = dt_classifier.fit(X_train, Y_train) 

print('Accuracy of Train Data :', dt_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', dt_classifier.score(X_test,Y_test))

# This is overfitting- Train data accurancy is 1.0

Accuracy of Train Data : 1.0
Accuracy of Test Data : 0.9090909090909091


Let's finetune the Model, by changing max depth value to 2

In [52]:
dt_classifier = DecisionTreeClassifier(max_depth=2)   
dt_classifier = dt_classifier.fit(X_train, Y_train) 

print('Accuracy of Train Data :', dt_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', dt_classifier.score(X_test,Y_test))

Accuracy of Train Data : 0.9460093896713615
Accuracy of Test Data : 0.9230769230769231


### Ensemble Methods
*Ensemble methods combine predictions of other learning algorithms, to improve the generalization.*

Ensemble methods are two types:
- **Averaging Methods**: They build several base estimators independently and finally average their predictions.
    - E.g.: ***Bagging Methods, Forests of randomised trees***
- **Boosting Methods**: They build base estimators sequentially and try to reduce the bias of the combined estimator.
    - E.g.: ***Adaboost, Gradient Tree Boosting***

#### 1. Bagging Methods
**Bagging Methods** draw random subsets of the original dataset, build an estimator and aggregate individual results to form a final one.

- **BaggingClassifier** and **BaggingRegressor** are the utilities from **sklearn.ensemble** to deal with Bagging.

#### 2. Randomized Trees
**sklearn.ensemble** offers two types of algorithms based on randomized trees: **Random Forest** and **Extra randomness** algorithms.

- **RandomForestClassifier** and **RandomForestRegressor** classes are used to deal with random forests.
    - In random forests, each estimator is built from a sample drawn with replacement from the training set.
- **ExtraTreesClassifier** and **ExtraTreesRegressor** classes are used to deal with extremely randomized forests.
    - In extremely randomized forests, more randomness is introduced, which further reduces the variance of the model.

#### 3. Boosting Methods
**Boosting Methods** combine several weak models to create a improvised ensemble.

- **sklearn.ensemble** also provides the following boosting algorithms:
    - AdaBoostClassifier
    - GradientBoostingClassifier

In [53]:
# Demo of Random Forest Classifier.
# max depth can be changed to see effect on accuracy.

# from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(max_depth=4,)
rf_classifier = rf_classifier.fit(X_train, Y_train) 

print('Accuracy of Train Data :', rf_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', rf_classifier.score(X_test,Y_test))

Accuracy of Train Data : 0.9906103286384976
Accuracy of Test Data : 0.958041958041958


### SVM
**Support Vector Machines** (SVMs) separates data points based on **decision planes**, which separates objects belonging to **different classes** in a **higher dimensional space**.  

SVM algorithm uses the best suitable kernel, which is capable of separating data points into **two or more classes**.
Commonly used kernels are:  
- linear
- polynomial
- rbf
- sigmoid

### Support Vector Regression
**scikit-learn provides** the following three utilities for performing Support Vector Regression.
- SVR
- NuSVR
- LinearSVR

### Advantages of SVMs
- SVM can distinguish the classes in a higher dimensional space.
- SVM algorithms are memory efficient.
- SVMs are versatile, and a different kernel can be used by a decision function.

### Disadvantages of SVMs
- SVMs do not perform well on high dimensional data with many samples.
- SVMs work better only with Preprocessed data.
- They are harder to visualize.

In [63]:
# Demo of Support Vector Classification
# The shown model overfits the training data.

# from sklearn.svm import SVC

svm_classifier = SVC()
svm_classifier = svm_classifier.fit(X_train, Y_train)

print('Accuracy of Train Data :', svm_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', svm_classifier.score(X_test,Y_test))

Accuracy of Train Data : 0.9178403755868545
Accuracy of Test Data : 0.9230769230769231


In [74]:
# Let's finetune the model and improve accuracy using scaled data

cancer = datasets.load_breast_cancer()  # Loading the data set

standardizer = preprocessing.StandardScaler()
standardizer = standardizer.fit(cancer.data)
cancer_standardized = standardizer.transform(cancer.data)

X_train, X_test, Y_train, Y_test = train_test_split(
    cancer_standardized, 
    cancer.target,
    stratify=cancer.target,
    random_state=42)

svm_classifier = SVC()

svm_classifier = svm_classifier.fit(X_train, Y_train)
svm_classifier
print('Accuracy of Train Data :', svm_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', svm_classifier.score(X_test,Y_test))

Accuracy of Train Data : 0.9788732394366197
Accuracy of Test Data : 0.9790209790209791


**Viewing the Classification Report**

In [79]:
from sklearn import metrics

Y_pred = svm_classifier.predict(X_test)

print('Classification report : \n',
      metrics.classification_report(Y_test, Y_pred))

Classification report : 
               precision    recall  f1-score   support

           0       0.96      0.98      0.97        53
           1       0.99      0.98      0.98        90

    accuracy                           0.98       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143



# Unsupervised Learning
## Clustering
*Clustering is one of the unsupervised learning technique.*

- The technique is typically used to group data points into clusters based on a specific algorithm.
- Major clustering algorithms that can be implemented using scikit-learn are:

   - K-means Clustering
   - Agglomerative clustering
   - DBSCAN clustering
   - Mean-shift clustering
   - Affinity propagation
   - Spectral clustering
   
### K-Means Clustering
*In K-means Clustering entire data set is grouped into k clusters.*

- Steps involved are:
    - k centroids are chosen randomly.
    - The distance of each data point from k centroids is calculated. A data point is assigned to the nearest cluster.
    - Centroids of k clusters are recomputed.
    - The above steps are iterated till the number of data points a cluster reach convergence.
- **KMeans** from **sklearn.cluster** can be used for K-means clustering.

### Agglomerative Hierarchical Clustering
*Agglomerative Hierarchical Clustering is a bottom-up approach.*

- Steps involved are:
    - Each data point is treated as a single cluster at the beginning.
    - The distance between each cluster is computed, and the two nearest clusters are merged together.
    - The above step is iterated till a single cluster is formed.
    - **AgglomerativeClustering** from **sklearn.cluster** can be used for achieving this.
    - Merging of two clusters can be any of the following **linkage** type: **ward, complete or average**.
    
### Density Based Clustering
Now let's understand how density-based clustering is performed. DBSCAN from sklearn.cluster is used for this purpose.
video https://www.coursera.org/lecture/machine-learning-with-python/dbscan-B8ctK


### Mean Shift Clustering
*Mean Shift Clustering aims at discovering dense areas.*

Steps Involved:
- Identify blob areas with randomly guessed centroids.
- Calculate the centroid of each blob area and shift to a new one, if there is a difference.
- Repeat the above step till the centroids converge.
**make_blobs** from **sklearn.cluster** can be used to initialize the blob areas. **MeanShift** from **sklearn.cluster** can be used to perform Mean Shift clustering.

### Affinity Propagation
*Affinity Propagation generates clusters by passing messages between pairs of data points, until convergence.*

- **AffinityPropagation** class from **sklearn.cluster** can be used.
- The above class can be controlled with two major parameters:
    - **preference**: It controls the number of exemplars to be chosen by the algorithm.
    - **damping**: It controls numerical oscillations while updating messages.

### Spectral Clustering
*Spectral Clustering is ideal to cluster data that is connected, and may not be in a compact space.*

In general, the following steps are followed:
- Build an affinity matrix of data points.
- Embed data points in a lower dimensional space.
- Use a clustering method like k-means to partition the points on lower dimensional space.

**spectral_clustering** from **sklearn.cluster** can be used for achieving this.

In [81]:
# Demo of KMeans
# from sklearn.cluster import KMeans

kmeans_cluster = KMeans(n_clusters=2)
kmeans_cluster = kmeans_cluster.fit(X_train) 
kmeans_cluster.predict(X_test)

array([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

### Evaluating a Clustering algorithm
*A clustering algorithm is majorly evaluated using the following scores:*

**Homogeneity**: Evaluates if each cluster contains only members of a single class.  
**Completeness**: All members of a given class are assigned to the same cluster.  
**V-measure**: Harmonic mean of Homogeneity and Completeness.  
**Adjusted Rand index**: Measures similarity of two assignments.

In [82]:
# from sklearn import metrics

print(metrics.homogeneity_score(kmeans_cluster.predict(X_test), Y_test))
print(metrics.completeness_score(kmeans_cluster.predict(X_test), Y_test))
print(metrics.v_measure_score(kmeans_cluster.predict(X_test), Y_test))
print(metrics.adjusted_rand_score(kmeans_cluster.predict(X_test), Y_test))

0.5272254837937459
0.5140247069822967
0.5205414168879022
0.6422411377783216


In [84]:
iris = datasets.load_iris()
type(iris)

sklearn.utils.Bunch

In [87]:
%%timeit
2+3

10.8 ns ± 0.275 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)
