In [1]:
%matplotlib inline
import numpy as np
import sklearn
from sklearn import linear_model, metrics, datasets
import pandas as pd
from matplotlib import pyplot as plt
from mylibs import transform as tf
from mylibs import resample as rs
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold



### 01. Crie os seguintes arquivos com extensão .py e implemente os métodos definidos para cada um deles:
* transform.py
    - standardize
    - normalize
* resample.py
    - split_k_fold(n_elem, n_splits=3, shuffle=True, seed=0)
    - n_elem - número total de elementos.
    - n_split - número de folds. Mínimo: 2.
    - shuffle - aleatoriza a ordem dos dados (True) ou não (False).
    - seed - determina uma semente para geração de números aleatórios ou não (None).
    - Retorno: 2 arrays (idx_train e idx_test), cada um com n_splits elementos: 
    - um com os índices de treino. Exemplo para n_splits=3, teremos idx_train[0], idx_train[1] e idx_train[2].
    - um com os índices de teste. Exemplo para n_splits=3, teremos idx_test[0], idx_test[1] e idx_test[2].

In [2]:
np.random.seed(0)
x = np.random.rand(20) # 20 valores
x = (x * 100).round(2) # valores até 100
x = np.resize(x, (20, 1))

In [3]:
tf.normalize(x)

array([[0.56025437],
       [0.73661897],
       [0.61748808],
       [0.55612083],
       [0.42766296],
       [0.66316905],
       [0.44239534],
       [0.92379438],
       [1.        ],
       [0.38494966],
       [0.81770005],
       [0.53916269],
       [0.58060413],
       [0.95961844],
       [0.05384208],
       [0.0709062 ],
       [0.        ],
       [0.86104928],
       [0.80339163],
       [0.90068892]])

In [4]:
tf.standardize(x)

array([[-0.11870903],
       [ 0.48434953],
       [ 0.07699507],
       [-0.13284322],
       [-0.5720902 ],
       [ 0.23319593],
       [-0.52171451],
       [ 1.12437442],
       [ 1.38495081],
       [-0.71814345],
       [ 0.761597  ],
       [-0.19082962],
       [-0.04912535],
       [ 1.24687069],
       [-1.85032791],
       [-1.79197909],
       [-2.03443473],
       [ 0.90982474],
       [ 0.71267098],
       [ 1.04536795]])

In [5]:
test, train = rs.slipt_k_fold(20, 5, True, 2)

In [6]:
train

[array([ 2,  8,  6, 19, 15,  4, 16, 12,  9,  1,  0,  7, 14, 17,  3, 18]),
 array([13, 11, 10,  5, 15,  4, 16, 12,  9,  1,  0,  7, 14, 17,  3, 18]),
 array([13, 11, 10,  5,  2,  8,  6, 19,  9,  1,  0,  7, 14, 17,  3, 18]),
 array([13, 11, 10,  5,  2,  8,  6, 19, 15,  4, 16, 12, 14, 17,  3, 18]),
 array([13, 11, 10,  5,  2,  8,  6, 19, 15,  4, 16, 12,  9,  1,  0,  7])]

In [7]:
test

[array([13, 11, 10,  5]),
 array([ 2,  8,  6, 19]),
 array([15,  4, 16, 12]),
 array([9, 1, 0, 7]),
 array([14, 17,  3, 18])]

### Carregando Dataset

In [8]:
data = pd.read_csv('winequality-red.csv', delimiter=';')
data.shape

(1599, 12)

In [9]:
data.sample(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
407,12.0,0.39,0.66,3.0,0.093,12.0,30.0,0.9996,3.18,0.63,10.8,7
1220,10.9,0.32,0.52,1.8,0.132,17.0,44.0,0.99734,3.28,0.77,11.5,6
1200,7.7,0.57,0.21,1.5,0.069,4.0,9.0,0.99458,3.16,0.54,9.8,6
308,10.3,0.43,0.44,2.4,0.214,5.0,12.0,0.9994,3.19,0.63,9.5,6
1328,6.5,0.52,0.11,1.8,0.073,13.0,38.0,0.9955,3.34,0.52,9.3,5
1346,6.1,0.59,0.01,2.1,0.056,5.0,13.0,0.99472,3.52,0.56,11.4,5
199,6.9,1.09,0.06,2.1,0.061,12.0,31.0,0.9948,3.51,0.43,11.4,4
1236,7.8,0.55,0.0,1.7,0.07,7.0,17.0,0.99659,3.26,0.64,9.4,6
114,7.8,0.56,0.19,1.8,0.104,12.0,47.0,0.9964,3.19,0.93,9.5,5
161,7.6,0.68,0.02,1.3,0.072,9.0,20.0,0.9965,3.17,1.08,9.2,4


In [10]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [11]:
test, train = rs.slipt_k_fold(data.shape[0], 5, True, 3)

In [12]:
test

[array([1455,  788,  355,  338,  433,   68,  820, 1198,  107, 1340, 1194,
         981,  101, 1034,  549, 1197,  739, 1172,  555, 1289,  735,  675,
        1486, 1107,  451, 1278, 1432, 1208,  240,  373,  643, 1053, 1512,
         691,  402,  463, 1196, 1226,  960,  218,  529,  187,  508, 1293,
         312,  322,  877,  143,  450,   15,  626,  939,  560, 1073, 1490,
         580,  905,  772,  332, 1087, 1480,  780,  315,  607, 1371,  747,
        1582,  147,  177,   32,  115,  192, 1125, 1064,  188,  158, 1082,
         969,  121,  904, 1532,  155,  599, 1162, 1538,  480,  609,  587,
         982,  499,  845,  424, 1373,  807, 1530, 1424,   75, 1300,  149,
         230,  171, 1231,  934, 1007,   27,  900,  259, 1276,  511, 1155,
         682,  742,  929, 1355, 1377, 1063, 1047,  584,  764,  270, 1014,
         746,  198,  832, 1001,  319, 1126,  874,   91,  427,  776,   16,
        1405, 1446,  543,  848, 1408, 1586,  627, 1274, 1042, 1429, 1351,
         784, 1334,  754,  290,  592, 

In [13]:
train

[array([1084,  741,  520, ...,  975,  888,  951]),
 array([1455,  788,  355, ...,  975,  888,  951]),
 array([1455,  788,  355, ...,  975,  888,  951]),
 array([1455,  788,  355, ...,  975,  888,  951]),
 array([1455,  788,  355, ..., 1502,  266, 1569])]

In [14]:
n_train = int(round(len(y) * 0.75))
X_train = X[:n_train,:]
y_train = y[:n_train]
X_test = X[n_train:,:]
y_test = y[n_train:]

NameError: name 'y' is not defined

In [15]:
# read in the iris data
iris = load_iris()

# create X (features) and y (response)
X = iris.data
y = iris.target

In [16]:
# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

### Steps for K-fold cross-validation
1. Split the dataset into K equal partitions (or "folds").
* Use fold 1 as the testing set and the union of the other folds as the training set.
* Calculate testing accuracy.
* Repeat steps 2 and 3 K times, using a different fold as the testing set each time.
* Use the average testing accuracy as the estimate of out-of-sample accuracy.

In [18]:
# simulate splitting a dataset of 25 observations into 5 folds
kf = KFold(25, n_folds=5, shuffle=False)

# print the contents of each training and testing set
print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
for iteration, data in enumerate(kf, start=1):
    print('{:^9} {} {:}'.format(iteration, data[0], data[1]))

Iteration                   Training set observations                   Testing set observations
    1     [ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] [0 1 2 3 4]
    2     [ 0  1  2  3  4 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] [5 6 7 8 9]
    3     [ 0  1  2  3  4  5  6  7  8  9 15 16 17 18 19 20 21 22 23 24] [10 11 12 13 14]
    4     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 20 21 22 23 24] [15 16 17 18 19]
    5     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [20 21 22 23 24]


* Dataset contains 25 observations (numbered 0 through 24)
* 5-fold cross-validation, thus it runs for 5 iterations
* For each iteration, every observation is either in the training set or the testing set, but not both
* Every observation is in the testing set exactly once

### Comparing cross-validation to train/test split
Advantages of cross-validation:
* More accurate estimate of out-of-sample accuracy
* More "efficient" use of data (every observation is used for both training and testing)

Advantages of train/test split:
* Runs K times faster than K-fold cross-validation
* Simpler to examine the detailed results of the testing process


### Cross-validation recommendations
1. K can be any number, but K=10 is generally recommended
2. For classification problems, stratified sampling is recommended for creating the folds
    - Each response class should be represented with equal proportions in each of the K folds
    - scikit-learn's cross_val_score function does this by default

In [19]:
# Load the Diabetes Housing dataset
columns = 'age sex bmi map tc ldl hdl tch ltg glu'.split() # Declare the columns names
diabetes = datasets.load_diabetes() # Call the diabetes dataset from sklearn
df = pd.DataFrame(diabetes.data, columns=columns) # load the dataset as a pandas data frame
y = diabetes.target # define the target variable (dependent variable) as y

In [20]:
# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(353, 10) (353,)
(89, 10) (89,)


In [21]:
# fit a model
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [22]:
predictions[0:5]

array([ 99.4834095 , 181.25429721, 106.61431616, 123.1084059 ,
       183.05038517])

In [23]:
print('Score:', model.score(X_test, y_test))

Score: 0.41617433039258167


In [24]:
#from sklearn.model_selection import KFold # import KFold
#X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) # create an array
#y = np.array([1, 2, 3, 4]) # Create another array
#kf = KFold(n_splits=2) # Define the split - into 2 folds 
#kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator
#print(kf)

In [25]:
#for train_index, test_index in kf.split(X):
 #   print('TRAIN:', train_index, 'TEST:', test_index)
  #  X_train, X_test = X[train_index], X[test_index]
   # y_train, y_test = y[train_index], y[test_index]

In [28]:
# Perform 6-fold cross validation
scores = cross_val_score(model, df, y, cv=6)
print('Cross-validated scores:', scores)

Cross-validated scores: [0.4554861  0.46138572 0.40094084 0.55220736 0.43942775 0.56923406]


In [29]:
predictions = cross_val_predict(model, df, y, cv=6)