## Preprocessing with Pipeline

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [2]:
data = pd.read_csv("../datasets/StudentScore.xls")

In [3]:
target = 'math score'
X = data.drop(target, axis=1)
y = data[target]
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
29,female,group D,master's degree,standard,none,70,75
535,female,group C,bachelor's degree,free/reduced,completed,83,83
695,female,group D,some college,free/reduced,none,89,86
557,male,group C,master's degree,free/reduced,none,67,66
836,male,group E,high school,standard,none,64,57


### Preprocessing pipeline for numerial features
SimpleImputer: solve missing values

In [15]:
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
result = num_transformer.fit_transform(X_train[["reading score", "writing score"]])

In [None]:
for i, j in zip(X_train[["reading score", "writing score"]].values, result):
    print("Before: {}, After: {}".format(i,j))

Before: [70 75], After: [0.03079054 0.43405338]
Before: [83 83], After: [0.9302895  0.96470125]
Before: [89 86], After: [1.34544287 1.1636942 ]
Before: [67 66], After: [-0.17678614 -0.16292548]
Before: [64 57], After: [-0.38436283 -0.75990434]
Before: [24 15], After: [-3.15205194 -3.54580567]
Before: [100 100], After: [2.10655738 2.09232798]
Before: [77 80], After: [0.51513614 0.7657083 ]
Before: [64 66], After: [-0.38436283 -0.16292548]
Before: [58 52], After: [-0.79951619 -1.09155926]
Before: [64 61], After: [-0.38436283 -0.4945804 ]
Before: [62 64], After: [-0.52274728 -0.29558745]
Before: [64 60], After: [-0.38436283 -0.56091139]
Before: [69 76], After: [-0.03840169  0.50038436]
Before: [67 67], After: [-0.17678614 -0.0965945 ]
Before: [37 35], After: [-2.25255298 -2.21918599]
Before: [63 63], After: [-0.45355505 -0.36191843]
Before: [60 53], After: [-0.66113174 -1.02522827]
Before: [84 91], After: [0.99948173 1.49534913]
Before: [87 79], After: [1.20705842 0.69937731]
Before: [67 

### Preprocessing pipeline for ordinal features
- OrdinalEncoder: for ordinal data type
- Features with 2 values (boolean) can be encoded by Ordinal or Nominal encoders

In [26]:
education_levels = ["master's degree", "bachelor's degree",  "associate's degree", 'some college',
        'high school', 'some high school']
gender_values = X_train['gender'].unique()
lunch_values = X_train['lunch'].unique()
test_prep_values = X_train['test preparation course'].unique()

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=[education_levels, gender_values, lunch_values, test_prep_values]))
])
result = ordinal_transformer.fit_transform(X_train[["parental level of education",'gender', 'lunch', 'test preparation course']])
for i, j in zip(X_train[["parental level of education", 'gender', 'lunch', 'test preparation course']].values, result):
    print("Before: {}, After: {}".format(i,j))

Before: ["master's degree" 'female' 'standard' 'none'], After: [0. 0. 0. 0.]
Before: ["bachelor's degree" 'female' 'free/reduced' 'completed'], After: [1. 0. 1. 1.]
Before: ['some college' 'female' 'free/reduced' 'none'], After: [3. 0. 1. 0.]
Before: ["master's degree" 'male' 'free/reduced' 'none'], After: [0. 1. 1. 0.]
Before: ['high school' 'male' 'standard' 'none'], After: [4. 1. 0. 0.]
Before: ['high school' 'male' 'free/reduced' 'none'], After: [4. 1. 1. 0.]
Before: ["bachelor's degree" 'female' 'standard' 'completed'], After: [1. 0. 0. 1.]
Before: ["associate's degree" 'female' 'standard' 'completed'], After: [2. 0. 0. 1.]
Before: ['high school' 'male' 'standard' 'completed'], After: [4. 1. 0. 1.]
Before: ['some high school' 'female' 'free/reduced' 'none'], After: [5. 0. 1. 0.]
Before: ['some high school' 'male' 'standard' 'none'], After: [5. 1. 0. 0.]
Before: ['some college' 'female' 'free/reduced' 'none'], After: [3. 0. 1. 0.]
Before: ['some high school' 'male' 'standard' 'comp

### Nominal features
OnehotEncoder

In [None]:
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False))
])
result = nominal_transformer.fit_transform(X_train[["race/ethnicity"]])
for i, j in zip(X_train[["race/ethnicity"]].values, result):
    print("Before: {}, After: {}".format(i,j))

Before: ['group D'], After: [0. 0. 0. 1. 0.]
Before: ['group C'], After: [0. 0. 1. 0. 0.]
Before: ['group D'], After: [0. 0. 0. 1. 0.]
Before: ['group C'], After: [0. 0. 1. 0. 0.]
Before: ['group E'], After: [0. 0. 0. 0. 1.]
Before: ['group B'], After: [0. 1. 0. 0. 0.]
Before: ['group C'], After: [0. 0. 1. 0. 0.]
Before: ['group C'], After: [0. 0. 1. 0. 0.]
Before: ['group D'], After: [0. 0. 0. 1. 0.]
Before: ['group C'], After: [0. 0. 1. 0. 0.]
Before: ['group B'], After: [0. 1. 0. 0. 0.]
Before: ['group C'], After: [0. 0. 1. 0. 0.]
Before: ['group E'], After: [0. 0. 0. 0. 1.]
Before: ['group C'], After: [0. 0. 1. 0. 0.]
Before: ['group B'], After: [0. 1. 0. 0. 0.]
Before: ['group C'], After: [0. 0. 1. 0. 0.]
Before: ['group D'], After: [0. 0. 0. 1. 0.]
Before: ['group C'], After: [0. 0. 1. 0. 0.]
Before: ['group D'], After: [0. 0. 0. 1. 0.]
Before: ['group D'], After: [0. 0. 0. 1. 0.]
Before: ['group A'], After: [1. 0. 0. 0. 0.]
Before: ['group E'], After: [0. 0. 0. 0. 1.]
Before: ['

### Features with only two values (boolean)
Can use Ordinal or Nominal to encode

In [None]:
lunch