In [1]:
import numpy
from sklearn.model_selection import GridSearchCV,KFold,RandomizedSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_digits,load_diabetes
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

In [2]:
# Load the diabetes regression dataset that ships with scikit-learn
# (this is not a pandas object yet) and list the fields it exposes.
dataset = load_diabetes()
dataset.keys()

dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])

In [3]:
# Print the human-readable description bundled with the dataset
print(dataset['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Brad

In [4]:
# Check the feature matrix dimensions: (number of samples, number of features)
dataset['data'].shape

(442, 10)

In [5]:
import pandas as pd

# Wrap the feature matrix in a DataFrame, labelling the ten columns f-1 ... f-10.
df = pd.DataFrame(
    dataset['data'],
    columns=[f"f-{position}" for position in range(1, 11)],
)

In [6]:
# Preview the first rows of the feature table
df.head()

Unnamed: 0,f-1,f-2,f-3,f-4,f-5,f-6,f-7,f-8,f-9,f-10
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [7]:
# Dependent variable (the regression target): show the first ten values
dataset['target'][:10]

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310.])

In [8]:
# The DataFrame should have the same shape as the raw feature array
df.shape

(442, 10)

In [9]:
# Pull out the raw arrays used by the cross-validation cells that follow.
data, target = dataset['data'], dataset['target']
# NOTE: despite the variable name, this is a regression model, not a classifier.
classifier = LinearRegression()
# Per-fold scores get appended to this list by the manual K-fold loop.
score = list()

## KFold Cross Validation
#### Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.

The procedure has a single parameter called k that refers to the number of groups that a given data sample is to be split into. As such, the procedure is often called k-fold cross-validation. When a specific value for k is chosen, it may be used in place of k in the reference to the model, such as k=10 becoming 10-fold cross-validation.

In [10]:
# Splitter for 5-fold cross-validation. No shuffle or seed is passed,
# so the library defaults apply.
folds = KFold(n_splits=5)

In [11]:
# Manual 5-fold cross-validation of the linear model, scored with R^2.
# `score` is reset here so that re-running this cell does not keep appending
# to the results of a previous run, which would skew the reported mean.
score = []
for train_index, test_index in folds.split(data):
    # Fit on the training folds only, then evaluate on the held-out fold.
    classifier.fit(data[train_index], target[train_index])
    pred = classifier.predict(data[test_index])
    r2 = r2_score(target[test_index], pred)
    score.append(r2)
    print(r2)

# Average R^2 across the folds (displayed as the cell output).
np.mean(score)

0.42955642865857757
0.5225982811135659
0.4826783998252703
0.4265082749941945
0.550249225965861


0.4823181221114939

In [12]:
# The same 5-fold evaluation in a single call, on a fresh LinearRegression;
# the averaged score should agree with the manual loop.
cross_val_score(LinearRegression(), data, target, cv=5, n_jobs=-1).mean()

0.48231812211149394

### -->>[Read this article](https://www.analyticsvidhya.com/blog/2018/05/improve-model-performance-cross-validation-in-python-r/)<<--

## Grid Search CV

Exhaustive search over specified parameter values for an estimator.

In [14]:
# Dataset for Classification
dataset = load_digits()
dataset.keys()

dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])

In [15]:
# Print the description bundled with the digits dataset
print(dataset['DESCR'])

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each bloc

In [16]:
# Feature matrix dimensions: one row per image, 64 columns (8x8 pixels flattened)
dataset['data'].shape

(1797, 64)

In [17]:
# Class labels: the digit each image represents
dataset['target']

array([0, 1, 2, ..., 8, 9, 8])

In [18]:
# Keep only the first 50 samples so the grid search stays fast on a modest machine.
# This rebinds `data` and `target`, which previously held the diabetes arrays.
data = dataset['data'][:50]
target = dataset['target'][:50]

In [19]:
# Base estimator for the search.
# NOTE(review): no random_state is passed, so results may differ between runs — confirm
# whether a fixed seed is wanted.
clf = GradientBoostingClassifier()
# Score each candidate by plain classification accuracy.
scorer = make_scorer(accuracy_score)
# Hyperparameter grid: 3 * 2 * 3 * 3 = 54 combinations in total.
parameters = dict(
    learning_rate=[0.001, 0.01, 0.1],
    n_estimators=[100, 200],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 3, 4],
)

In [20]:
# Cross-validate every combination in `parameters` with 5 folds each;
# n_jobs=-1 requests all available CPU cores.
gridsrch = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
)
gridsrch.fit(data, target)

# Keep the winning estimator and display its configuration.
bestclf = gridsrch.best_estimator_
bestclf



GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=4,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [21]:
# Re-score the selected estimator with 5-fold cross-validation.
# Caveat: these are the same 50 samples the grid search used to pick the
# hyperparameters, so the figure is likely optimistic.
cv_score = cross_val_score(bestclf, data, target, cv=5)
print("Cross validation score ->", cv_score)
print("Avg ->", cv_score.mean())



Cross validation score -> [0.46153846 0.83333333 0.6        0.77777778 0.83333333]
Avg -> 0.7011965811965812


- **Have you heard of `RandomizedSearchCV`? If not, look it up now: instead of trying every combination, it samples a fixed number of parameter settings.**

# Any Questions?