In [87]:
import pandas as pd
import numpy as np

In [145]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [122]:
# Toy housing records: sale price, number of rooms and neighborhood name.
data = [
    dict(price=850000, rooms=4, neighborhood='Queen Anne'),
    dict(price=700000, rooms=3, neighborhood='Fremont'),
    dict(price=650000, rooms=3, neighborhood='Wallingford'),
    dict(price=600000, rooms=2, neighborhood='Fremont'),
]

In [123]:
# Load the list of record dicts into a DataFrame (one row per record).
df = pd.DataFrame(data)

In [103]:
def one_dummies(df, cols):
    """
    One-hot encode the given columns of a DataFrame.

    Every column named in ``cols`` is removed and replaced by its indicator
    columns (named ``<column>_<value>``), appended after the remaining
    original columns in the order the columns appear in ``cols``.
    The input DataFrame is not modified.

    @param df pandas DataFrame
    @param cols a list of columns to encode (a single column name is also accepted)
    @return a new DataFrame with one-hot encoding
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(cols, str):
        cols = [cols]
    cols = list(cols)

    # Nothing to encode: return a copy so the caller never aliases the input.
    if not cols:
        return df.copy()

    dummies = [
        pd.get_dummies(df[each], prefix=each, drop_first=False)
        for each in cols
    ]
    # Drop *all* encoded source columns, not only the last one iterated over.
    return pd.concat([df.drop(cols, axis=1)] + dummies, axis=1)

In [66]:
# Categorical columns to one-hot encode.
columns = ['neighborhood']

In [104]:
# One-hot encode the columns listed in `columns` and show the resulting frame.
df2 = one_dummies(df,columns)
print(df2)

    price  rooms  neighborhood_0  neighborhood_1  neighborhood_2
0  850000      4               0               1               0
1  700000      3               1               0               0
2  650000      3               0               0               1
3  600000      2               1               0               0


In [124]:
# Integer-encode the neighborhood names on a new frame; `df` itself keeps
# its original string labels, as the second print shows.
label_encoder = LabelEncoder()
df3 = df.assign(neighborhood=label_encoder.fit_transform(df['neighborhood']))
print(df3)
print(df)


   neighborhood   price  rooms
0             1  850000      4
1             0  700000      3
2             2  650000      3
3             0  600000      2
  neighborhood   price  rooms
0   Queen Anne  850000      4
1      Fremont  700000      3
2  Wallingford  650000      3
3      Fremont  600000      2


In [171]:
# Number of rows per neighborhood, using the original string labels.
df["neighborhood"].value_counts()

Fremont        2
Queen Anne     1
Wallingford    1
Name: neighborhood, dtype: int64

In [172]:
# Same count on the label-encoded frame: the categories are now integer codes.
df3["neighborhood"].value_counts()

0    2
2    1
1    1
Name: neighborhood, dtype: int64

# Text Features 

In [79]:
from sklearn.feature_extraction.text import CountVectorizer

In [80]:
# Three tiny documents used to demonstrate token counting.
sample = [
    'problem of evil',
    'evil queen',
    'horizon problem',
]

In [81]:
# Tokens listed here are excluded from the vocabulary.
stop_list = ['the']

# CountVectorizer converts a collection of text documents to a matrix of token counts.
vec = CountVectorizer(
    lowercase=True,
    stop_words=stop_list,
)

X = vec.fit_transform(sample)
X

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [82]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has it.
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)
# Dense document-term table: one row per document, one column per token.
pd.DataFrame(X.toarray(), columns=feature_names)

Unnamed: 0,evil,horizon,of,problem,queen
0,1,0,1,1,0
1,1,0,0,0,1
2,0,1,0,1,0


### If the text is stored in a DataFrame column

In [132]:
# Wrap the documents in a single-column DataFrame (the column is labelled 0).
sample = pd.DataFrame(sample)
sample

Unnamed: 0,0
0,problem of evil
1,evil queen
2,horizon problem


In [142]:
# Tokens listed here are excluded from the vocabulary.
stop_list = ['the']

# CountVectorizer converts a collection of text documents to a matrix of token counts.
vec = CountVectorizer(
    lowercase=True,
    stop_words=stop_list,
)

# The vectorizer expects an iterable of strings, so pass the first column
# of the DataFrame rather than the DataFrame itself.
X = vec.fit_transform(sample.iloc[:, 0])
X

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [143]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has it.
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)
# Dense document-term table: one row per document, one column per token.
pd.DataFrame(X.toarray(), columns=feature_names)

Unnamed: 0,evil,horizon,of,problem,queen
0,1,0,1,1,0
1,1,0,0,0,1
2,0,1,0,1,0


# Imputation of Missing Data

In [88]:
from numpy import nan

# Feature matrix containing two missing entries (NaN), plus a target vector.
X = np.array([
    [nan, 0,   3],
    [3,   7,   9],
    [3,   5,   2],
    [4,   nan, 6],
    [8,   8,   1],
])
y = np.array([14, 16, -1, 8, -5])

In [130]:
# Mean imputation, column by column. Other strategies: 'median', 'most_frequent'.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22; `sklearn.impute.SimpleImputer` is its replacement (always column-wise).
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(strategy='mean')
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer
    imp = Imputer(strategy='mean', axis=0)  # axis 0 = column, 1 = row
X2 = imp.fit_transform(X)
print(imp.statistics_)
print('\n',X2)

[4.5 5.  4.2]

 [[4.5 0.  3. ]
 [3.  7.  9. ]
 [3.  5.  2. ]
 [4.  5.  6. ]
 [8.  8.  1. ]]


# Standardization 

#### To standardize a variable, compute its mean and standard deviation; then, for each observed value, subtract the mean and divide by the standard deviation: $z = (x - \mu) / \sigma$.

In [163]:
# Small 3x3 training matrix used by the scaling examples.
X_train = np.array([
    [1., -1.,  2.],
    [2.,  0.,  0.],
    [0.,  1., -1.],
])

# Learn the per-feature statistics first, then apply the standardization.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
scaler.transform(X_train)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [164]:
# Statistics learned by the fitted StandardScaler.
print(scaler.mean_)  # per-feature mean
print(scaler.var_)  # per-feature variance
print(scaler.n_samples_seen_)  # number of samples seen during fit

[1.         0.         0.33333333]
[0.66666667 0.66666667 1.55555556]
3


## MinMax Scaler (Scale to range [0,1])

In [165]:
# Rescale every feature to the range [0, 1]. `feature_range` is documented as a
# (min, max) tuple; recent scikit-learn releases validate the parameter type
# and reject a list, so pass a tuple.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax


array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

## Robust Scaler 

In [166]:
# RobustScaler centers on the median and scales by the interquartile range,
# which makes it less sensitive to outliers than mean/std scaling.
transformer = preprocessing.RobustScaler().fit(X_train)
# The scaler is already fitted above, so only transform here; calling
# fit_transform would refit on the same data for no benefit.
X_train_robust = transformer.transform(X_train)
X_train_robust

array([[ 0.        , -1.        ,  1.33333333],
       [ 1.        ,  0.        ,  0.        ],
       [-1.        ,  1.        , -0.66666667]])

##  Normalizer

In [169]:
# Normalizer works row-wise: every sample is rescaled to unit norm.
normalizer = preprocessing.Normalizer()
normalizer.fit(X_train)
X_normalized = normalizer.transform(X_train)
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

# Discretizing

In [173]:
# Quick look at the first rows of the housing DataFrame.
df.head()

Unnamed: 0,neighborhood,price,rooms
0,Queen Anne,850000,4
1,Fremont,700000,3
2,Wallingford,650000,3
3,Fremont,600000,2


In [200]:
# Five evenly spaced bin edges between 60,000 and 90,000 (i.e. four buckets).
custom_bucket_array = np.linspace(start=60000, stop=90000, num=5)
custom_bucket_array

array([60000., 67500., 75000., 82500., 90000.])

In [201]:
# Same toy records as before, but with float prices that fall inside the bucket edges.
data = [
    dict(price=85000.5, rooms=4, neighborhood='Queen Anne'),
    dict(price=70000.4, rooms=3, neighborhood='Fremont'),
    dict(price=65000.2, rooms=3, neighborhood='Wallingford'),
    dict(price=60000.6, rooms=2, neighborhood='Fremont'),
]

In [202]:
# Build a DataFrame from the float-priced records.
df4 = pd.DataFrame(data)

In [203]:
print(df4)
# Assign every price to its bucket; intervals are open on the left and
# closed on the right.
pd.cut(df4['price'], bins=custom_bucket_array)

  neighborhood    price  rooms
0   Queen Anne  85000.5      4
1      Fremont  70000.4      3
2  Wallingford  65000.2      3
3      Fremont  60000.6      2


0    (82500.0, 90000.0]
1    (67500.0, 75000.0]
2    (60000.0, 67500.0]
3    (60000.0, 67500.0]
Name: price, dtype: category
Categories (4, interval[float64]): [(60000.0, 67500.0] < (67500.0, 75000.0] < (75000.0, 82500.0] < (82500.0, 90000.0]]