### Lidando com dados faltantes

In [11]:
import pandas as pd
from io import StringIO
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
csv_data = \
            '''A, B, C, D
            1.0, 2.0, 3.0, 4.0
            5.0, 6.0,, 8.0
            10.0, 11.0, 12.0'''
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [3]:
df.isnull().sum()

A     0
 B    0
 C    1
 D    1
dtype: int64

In [4]:
# Eliminação
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [5]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [6]:
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [7]:
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [8]:
df.dropna(subset=['C'])

KeyError: ['C']

### Imputando valores faltantes

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(df.values)
imputed_data = imputer.transform(df.values)
imputed_data

In [None]:
# fillna
df.fillna(df.mean())

### Lidando com dados categóricos

In [9]:
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class2'],
    ['red', 'L', 13.5, 'class1'],
    ['blue', 'XL', 15.3, 'class2']
])

df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [13]:
X = df[['color', 'size', 'price']].values
ohe = OneHotEncoder()
ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [18]:
X

array([['green', 'M', 10.1],
       ['red', 'L', 13.5],
       ['blue', 'XL', 15.3]], dtype=object)

In [21]:
X = df[['color', 'size', 'price']].values

C_transf = ColumnTransformer([
    ('onehot', OneHotEncoder(), [0, 1]),
    ('nothing', 'passthrough', [2])
])

C_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  0. ,  1. ,  0. , 10.1],
       [ 0. ,  0. ,  1. ,  1. ,  0. ,  0. , 13.5],
       [ 1. ,  0. ,  0. ,  0. ,  0. ,  1. , 15.3]])

In [22]:
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,0,1,0,0,1,0
1,13.5,0,0,1,1,0,0
2,15.3,1,0,0,0,0,1


In [23]:
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)

Unnamed: 0,price,color_green,color_red,size_M,size_XL
0,10.1,1,0,1,0
1,13.5,0,1,0,0
2,15.3,0,0,0,1
