# Scikit-learn Pre-processing

In [1]:
import sklearn
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn import set_config

#Sets output of transform and fit_transform to pandas dataframe output.
set_config(transform_output = "pandas")

TypeError: set_config() got an unexpected keyword argument 'transform_output'

## Scaling

In [None]:
tiny_data = np.array([[ 1., -1.,  2.],
                     [ 2.,  0.,  0.],
                     [ 0.,  1., -1.]])

scaler = StandardScaler().fit(tiny_data)
scaler

StandardScaler()

In [None]:
scaler.mean_

array([1.        , 0.        , 0.33333333])

In [None]:
scaler.scale_

array([0.81649658, 0.81649658, 1.24721913])

In [None]:
X_scaled = scaler.transform(tiny_data)

In [None]:
X_scaled.mean()

4.9343245538895844e-17

In [None]:
X_scaled.std()

1.0

## Ordinal Encoding

In [None]:
# example of a ordinal encoding
from numpy import asarray

In [None]:
# define data
data = asarray([['data'], ['wrangling'], ['rocks']])
print(data)

[['data']
 ['wrangling']
 ['rocks']]


In [None]:
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
encoder.fit_transform(data)

array([[0.],
       [2.],
       [1.]])

## One Hot Encoding

In [None]:
# define one hot encoding
encoder = OneHotEncoder(sparse_output=False)
# transform data
encoder.fit_transform(data)

TypeError: __init__() got an unexpected keyword argument 'sparse_output'

## Imputing missing values

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True, parser='auto')

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

TypeError: fetch_openml() got an unexpected keyword argument 'parser'

In [None]:
X.head()

NameError: name 'X' is not defined

In [None]:
y_train.head()

NameError: name 'y_train' is not defined

In [None]:
X.info()

NameError: name 'X' is not defined

In [None]:
missing = X_test.isnull().sum()
missing = missing[missing > 0].sort_values(ascending = False)

NameError: name 'X_test' is not defined

In [None]:
missing

NameError: name 'missing' is not defined

In [None]:
simple_imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
simple_imputed = simple_imp.fit_transform(X_test[['age', 'body']])

In [None]:
#Replace the age and body columns in the original X_Test dataframe
#with the imputed values
X_test[['age', 'body']] = simple_imputed

#Repeat the above code to get number of NA values- note how the 'age' and 
#'body' columns disappear
missing = X_test.isnull().sum()
missing = missing[missing > 0].sort_values(ascending = False)
missing

cabin        253
boat         208
home.dest    141
embarked       1
dtype: int64

## Putting it all together

In [None]:
X_train.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
221,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C,,234.0,"Providence, RI"
1245,3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S,,,
1274,3,"Vander Planke, Mr. Julius",male,31.0,3,0,345763,18.0,,S,,,
1136,3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S,,,
1305,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,


In [None]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Here we use `StandardScaler` for continuous variables; 
# then we impute for missing data (check the documentation for the imputation method)
# We use `OneHotEncoder` for categorical variables
# NOTE: we are using a subset of the features (not all the columns)

ct = make_column_transformer((make_pipeline(SimpleImputer(),
                                            StandardScaler()), ["age", "fare"]),
                             (OneHotEncoder(sparse_output=False), ["embarked", "sex", "pclass"]), 
                             verbose_feature_names_out=False)

# Note: click on pipeline elements to see more details
clf = make_pipeline(ct, LogisticRegression())
clf

In [None]:
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.7828746177370031

In [None]:
# Let's remove the last step in the pipeline (which is LogisticRegression()) & transform the X_test data
clf[:-1].transform(X_test)

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_nan,sex_female,sex_male,pclass_1,pclass_2,pclass_3
697,0.043169,-0.488170,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
213,0.123980,1.602394,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
493,0.123980,0.078208,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1056,-1.411443,-0.436627,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
268,-0.441702,0.537753,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
383,-0.684137,-0.431465,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1268,0.131981,-0.471433,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
190,-0.684137,0.896629,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
835,0.131981,-0.500410,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
