In [87]:
import pandas as pd
import numpy as np

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [122]:
# Toy housing records: one dict per listing with its price, room count and
# neighborhood name.
data = [
    dict(price=price, rooms=rooms, neighborhood=neighborhood)
    for price, rooms, neighborhood in (
        (850000, 4, 'Queen Anne'),
        (700000, 3, 'Fremont'),
        (650000, 3, 'Wallingford'),
        (600000, 2, 'Fremont'),
    )
]

In [123]:
# Build a DataFrame with one row per record in `data`.
df = pd.DataFrame(data)

In [103]:
def one_dummies(df, cols):
    """
    One-hot encode the given columns and drop the original (raw) columns.

    The input frame is not modified; a new DataFrame is returned in which the
    untouched columns come first, followed by the indicator columns of each
    encoded column in the order given by ``cols``.

    @param df pandas DataFrame
    @param cols a list of columns to encode (a single column name is also
                accepted; an empty list returns a copy of ``df``)
    @return a DataFrame with one-hot encoding
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(cols, str):
        cols = [cols]
    cols = list(cols)

    # Indicator columns are named "<column>_<value>".
    dummies = [
        pd.get_dummies(df[col], prefix=col, drop_first=False)
        for col in cols
    ]

    # Drop *every* encoded column (not just the last one) and concatenate
    # once instead of growing the frame inside the loop.
    return pd.concat([df.drop(cols, axis=1)] + dummies, axis=1)

In [66]:
# Names of the categorical columns to one-hot encode.
columns = ['neighborhood']

In [104]:
# One-hot encode the columns named in `columns`. The result is bound to a new
# name, so `df` itself is not reassigned by this cell.
df2 = one_dummies(df,columns)
print(df2)

    price  rooms  neighborhood_0  neighborhood_1  neighborhood_2
0  850000      4               0               1               0
1  700000      3               1               0               0
2  650000      3               0               0               1
3  600000      2               1               0               0


In [124]:
# Integer-encode the neighborhood labels into a new frame (`df3`); the
# original `df` keeps its string labels, which the second print confirms.
label_encoder = LabelEncoder()
df3 = df.assign(neighborhood=label_encoder.fit_transform(df['neighborhood']))
print(df3)
print(df)


   neighborhood   price  rooms
0             1  850000      4
1             0  700000      3
2             2  650000      3
3             0  600000      2
  neighborhood   price  rooms
0   Queen Anne  850000      4
1      Fremont  700000      3
2  Wallingford  650000      3
3      Fremont  600000      2


# Text Features 

In [79]:
from sklearn.feature_extraction.text import CountVectorizer

In [80]:
# Three short documents used to demonstrate word-count features.
sample = [
    'problem of evil',
    'evil queen',
    'horizon problem',
]

In [81]:
# Fit a CountVectorizer on the documents in `sample` and transform them into
# a matrix of token counts (one row per document).
vec = CountVectorizer()
X = vec.fit_transform(sample)
X  # bare last expression so the notebook displays the result

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [82]:
# Densify the count matrix and label each column with its vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on releases that lack it.
pd.DataFrame(
    X.toarray(),
    columns=(
        vec.get_feature_names_out()
        if hasattr(vec, 'get_feature_names_out')
        else vec.get_feature_names()
    ),
)

Unnamed: 0,evil,horizon,of,problem,queen
0,1,0,1,1,0
1,1,0,0,0,1
2,0,1,0,1,0


### If the text is stored in a DataFrame

In [132]:
# Wrap the documents in a single-column DataFrame.
# NOTE(review): this rebinds `sample` from the list defined earlier to a
# DataFrame, so earlier cells re-run after this one will see a DataFrame.
sample = pd.DataFrame(sample)
sample

Unnamed: 0,0
0,problem of evil
1,evil queen
2,horizon problem


In [142]:
# Words passed to the vectorizer as stop words.
stop_list = ['the']

# CountVectorizer converts a collection of text documents to a matrix of token counts.
vec = CountVectorizer(stop_words=stop_list,lowercase=True)


# Vectorize the first column of `sample` (selected by position with iloc).
X = vec.fit_transform(sample.iloc[:,0])
X

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [143]:
# Densify the count matrix and label each column with its vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on releases that lack it.
pd.DataFrame(
    X.toarray(),
    columns=(
        vec.get_feature_names_out()
        if hasattr(vec, 'get_feature_names_out')
        else vec.get_feature_names()
    ),
)

Unnamed: 0,evil,horizon,of,problem,queen
0,1,0,1,1,0
1,1,0,0,0,1
2,0,1,0,1,0


# Imputation of Missing Data

In [88]:
from numpy import nan

# 5 x 3 numeric matrix with two missing entries (row 0 / col 0 and
# row 3 / col 1) marked as NaN.
X = np.array([
    [nan, 0, 3],
    [3, 7, 9],
    [3, 5, 2],
    [4, nan, 6],
    [8, 8, 1],
])

# One value per row of X (presumably the targets; not used by the
# imputation cell below).
y = np.array([14, 16, -1, 8, -5])

In [130]:
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22.
# `SimpleImputer` replaces it and always imputes column by column (the old
# `axis=0` behaviour), so it takes no `axis` argument.
from sklearn.impute import SimpleImputer

# strategy: 'mean', 'median', 'most_frequent' (or 'constant' with fill_value)
imp = SimpleImputer(strategy='mean')
X2 = imp.fit_transform(X)
print(imp.statistics_)  # per-column fill values learned from X
print('\n',X2)

[4.5 5.  4.2]

 [[4.5 0.  3. ]
 [3.  7.  9. ]
 [3.  5.  2. ]
 [4.  5.  6. ]
 [8.  8.  1. ]]
