In [1]:
%matplotlib inline
import numpy as np
import sklearn
from sklearn import linear_model, metrics
import pandas as pd
from matplotlib import pyplot as plt
from mylibs import transform as tf
from mylibs import resample as rs

### 01. Crie os seguintes arquivos com extensão .py e implemente os métodos definidos para cada um deles:
* transform.py
    - standardize
    - normalize
* resample.py
    - split_k_fold(n_elem, n_splits=3, shuffle=True, seed=0)
        - n_elem - número total de elementos.
        - n_splits - número de folds. Mínimo: 2.
        - shuffle - aleatoriza a ordem dos dados (True) ou não (False).
        - seed - semente para a geração de números aleatórios; use None para não fixar a semente.
        - Retorno: 2 arrays (idx_train e idx_test), cada um com n_splits elementos:
            - um com os índices de treino. Exemplo: para n_splits=3, teremos idx_train[0], idx_train[1] e idx_train[2].
            - um com os índices de teste. Exemplo: para n_splits=3, teremos idx_test[0], idx_test[1] e idx_test[2].

In [2]:
# Build a reproducible toy column vector used below to exercise the transform
# helpers: 20 pseudo-random values scaled to the 0-100 range, rounded to 2 decimals.
np.random.seed(0)  # legacy global seed, kept so the recorded outputs below still match
x = (np.random.rand(20) * 100).round(2)
# reshape(-1, 1) only re-views the existing values as a single column, whereas
# np.resize silently repeats or truncates data when the requested size differs.
x = x.reshape(-1, 1)

In [3]:
# Rescale x with the project's transform helper; in the output below the smallest
# value maps to 0 and the largest to 1 (min-max scaling).
tf.normalize(x)

array([[0.56025437],
       [0.73661897],
       [0.61748808],
       [0.55612083],
       [0.42766296],
       [0.66316905],
       [0.44239534],
       [0.92379438],
       [1.        ],
       [0.38494966],
       [0.81770005],
       [0.53916269],
       [0.58060413],
       [0.95961844],
       [0.05384208],
       [0.0709062 ],
       [0.        ],
       [0.86104928],
       [0.80339163],
       [0.90068892]])

In [4]:
# Standardize x with the project's transform helper. The output below looks like
# z-scores (centered on 0) — presumably (x - mean) / std; confirm in mylibs.transform.
tf.standardize(x)

array([[-0.11870903],
       [ 0.48434953],
       [ 0.07699507],
       [-0.13284322],
       [-0.5720902 ],
       [ 0.23319593],
       [-0.52171451],
       [ 1.12437442],
       [ 1.38495081],
       [-0.71814345],
       [ 0.761597  ],
       [-0.19082962],
       [-0.04912535],
       [ 1.24687069],
       [-1.85032791],
       [-1.79197909],
       [-2.03443473],
       [ 0.90982474],
       [ 0.71267098],
       [ 1.04536795]])

In [6]:
# Split the indices 0..19 into 5 shuffled folds. The arguments are positional;
# per the exercise text above they map to n_elem=20, n_splits=5, shuffle=True, seed=2.
# The first returned list is unpacked as the test indices and the second as the
# train indices (the fold sizes displayed in the next cells agree: 4 test vs. 16 train).
# NOTE(review): the exercise text names this function `split_k_fold`; the call uses
# `slipt_k_fold`, so the module presumably defines the misspelled name — confirm and
# rename both together.
test, train = rs.slipt_k_fold(20, 5, True, 2)

In [7]:
# Show the training indices of each fold (one array per fold).
train

[array([ 2,  8,  6, 19, 15,  4, 16, 12,  9,  1,  0,  7, 14, 17,  3, 18]),
 array([13, 11, 10,  5, 15,  4, 16, 12,  9,  1,  0,  7, 14, 17,  3, 18]),
 array([13, 11, 10,  5,  2,  8,  6, 19,  9,  1,  0,  7, 14, 17,  3, 18]),
 array([13, 11, 10,  5,  2,  8,  6, 19, 15,  4, 16, 12, 14, 17,  3, 18]),
 array([13, 11, 10,  5,  2,  8,  6, 19, 15,  4, 16, 12,  9,  1,  0,  7])]

In [8]:
# Show the held-out test indices of each fold (one array per fold).
test

[array([13, 11, 10,  5]),
 array([ 2,  8,  6, 19]),
 array([15,  4, 16, 12]),
 array([9, 1, 0, 7]),
 array([14, 17,  3, 18])]

### Carregando Dataset

In [19]:
# Load the red-wine quality dataset (semicolon-separated CSV), then display its
# (rows, columns) dimensions as the cell output.
data = pd.read_csv('winequality-red.csv', sep=';')
data.shape

(1599, 12)

In [20]:
# Preview 10 random rows. random_state pins the draw so re-running this cell (or
# running cells out of order) shows the same rows instead of depending on the
# current state of NumPy's global RNG.
data.sample(10, random_state=0)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1455,6.5,0.9,0.0,1.6,0.052,9.0,17.0,0.99467,3.5,0.63,10.9,6
788,10.0,0.56,0.24,2.2,0.079,19.0,58.0,0.9991,3.18,0.56,10.1,6
355,6.7,0.75,0.01,2.4,0.078,17.0,32.0,0.9955,3.55,0.61,12.8,6
338,12.4,0.49,0.58,3.0,0.103,28.0,99.0,1.0008,3.16,1.0,11.5,6
433,12.3,0.39,0.63,2.3,0.091,6.0,18.0,1.0004,3.16,0.49,9.5,5
68,9.3,0.32,0.57,2.0,0.074,27.0,65.0,0.9969,3.28,0.79,10.7,5
820,7.0,0.685,0.0,1.9,0.099,9.0,22.0,0.99606,3.34,0.6,9.7,5
1198,7.7,0.26,0.26,2.0,0.052,19.0,77.0,0.9951,3.15,0.79,10.9,6
107,6.2,0.63,0.31,1.7,0.088,15.0,64.0,0.9969,3.46,0.79,9.3,5
1340,7.5,0.51,0.02,1.7,0.084,13.0,31.0,0.99538,3.36,0.54,10.5,6


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [23]:
# Build 5 shuffled folds over every dataset row. Positional arguments: element
# count, number of folds, shuffle flag, seed (per the exercise signature).
# The first returned list is unpacked as test indices, the second as train indices.
test, train = rs.slipt_k_fold(len(data), 5, True, 3)

In [24]:
# Show the test indices of each fold.
# NOTE(review): this prints every index of all folds; consider displaying only
# the fold sizes or a short slice to keep the output readable.
test

[array([ 147,  937,  877, 1000,   73,   31,  266, 1403,  263, 1491,  293,
         112, 1246,  592, 1417,    4, 1503,  221,  837, 1083, 1402,  157,
          69,  730,  692,  175,  593, 1018,  984, 1116,  554, 1264, 1381,
         290,   83, 1077, 1270, 1493,  137,  288,  199, 1373, 1107,  745,
         720,   16,  869, 1568,   70,   85,  677,  597, 1237, 1087,  475,
        1511,  719,   61,  515, 1521,  393,  790,  204,  216, 1565, 1286,
         487, 1327,  615, 1454, 1518,  229,  402, 1295, 1598, 1314,  317,
           3,  371, 1117, 1588, 1437,   46,  471,  154,  746,  635,  565,
        1513,  809, 1380,  897,   37,   50, 1567,  268,  304,  743, 1597,
         477, 1158,  409,  416, 1371, 1249, 1370,  170,  103, 1410, 1031,
        1426,  969, 1223,  250, 1465,  751,  168,  752,  197, 1307,  328,
         562,  500,  322,  632,  895, 1422, 1569, 1065,   65,  321, 1173,
         669,  350,  948, 1481, 1068,  464,  608, 1318, 1128, 1475, 1386,
         981,  148, 1305,  675,  264, 

In [33]:
# Show the train indices of each fold (NumPy abbreviates the long arrays with "...").
train

[array([ 606,  362,  533, ...,  968,  952, 1273]),
 array([ 147,  937,  877, ...,  968,  952, 1273]),
 array([ 147,  937,  877, ...,  968,  952, 1273]),
 array([ 147,  937,  877, ...,  968,  952, 1273]),
 array([ 147,  937,  877, ...,  305, 1350,  196])]