# Combining transformers for different feature types in a pipeline

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load the student score table. The file name ends in .xls, but it is read
# with read_csv, i.e. parsed as delimited text.
data = pd.read_csv(filepath_or_buffer="../datasets/StudentScore.xls")

In [3]:
# Separate the regression target from the predictor columns.
target = 'math score'
y = data[target]
X = data.drop(columns=target)
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
X_train.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
29,female,group D,master's degree,standard,none,70,75
535,female,group C,bachelor's degree,free/reduced,completed,83,83
695,female,group D,some college,free/reduced,none,89,86
557,male,group C,master's degree,free/reduced,none,67,66
836,male,group E,high school,standard,none,64,57


In [6]:
# Numeric transformer
# Missing values are filled with a descriptive statistic (the median here;
# mean and mode are the other usual choices), then StandardScaler is applied.
num_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [7]:
# Ordinal and boolean features
# Education levels are listed from highest to lowest, so "master's degree" is
# encoded as 0 and 'some high school' as 5.
education_levels = ["master's degree", "bachelor's degree",  "associate's degree", 'some college',
        'high school', 'some high school']

# Two-valued features: categories are taken from the training split in order of
# first appearance. dropna() keeps NaN out of the category lists -- missing
# values are filled by the imputer step below, so NaN must not be registered
# as a category of its own.
gender_values = X_train['gender'].dropna().unique()
lunch_values = X_train['lunch'].dropna().unique()
test_prep_values = X_train['test preparation course'].dropna().unique()

# The order of the category lists must match the order of the columns this
# transformer is applied to: education, gender, lunch, test preparation course.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=[education_levels, gender_values, lunch_values, test_prep_values]))
])

In [9]:
# Nominal features: fill missing values with the most frequent category, then
# one-hot encode into a dense (non-sparse) output.
nominal_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(sparse_output=False)),
    ]
)

### Combine many transformers
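`ColumnTransformer()` applies each transformer to its own subset of columns and concatenates the results into a single feature matrix.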

In [12]:
# Complete preprocessing pipeline for all features: each entry is
# (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer(
    [
        ("num_features", num_transformer, ["reading score", "writing score"]),
        (
            "ordinal_features",
            ordinal_transformer,
            ["parental level of education", "gender", "lunch", "test preparation course"],
        ),
        ("nominal_features", nominal_transformer, ["race/ethnicity"]),
    ]
)

### Full pipeline: preprocessing plus model

In [None]:
# Using Linear Regression: the preprocessor and the estimator are chained in a
# single pipeline.
reg = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", LinearRegression()),
    ]
)

In [16]:
# Fit the preprocessing steps and the linear model on the training split.
reg.fit(X=X_train, y=y_train)

### Evaluate Regression model

In [18]:
# Predict the target for the held-out test rows.
y_pred = reg.predict(X=X_test)

In [25]:
# Mean squared error of the predictions on the test split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

28.821056563832887

In [27]:
# Mean absolute error of the predictions on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

4.181966418321512

#### r2 score (most common)

In [30]:
# The closer the value is to 1, the better the model.
r2_score(y_true=y_test, y_pred=y_pred)

0.8815597679452446

In [None]:
# Using RandomForest
# The forest is seeded with the same seed as the train/test split so that the
# metrics printed below are reproducible across kernel restarts; without a
# seed, each run yields different numbers and the comparison with
# LinearRegression is not repeatable.
rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])
rf.fit(X_train, y_train)
# NOTE: y_pred is reassigned here and from now on holds the random-forest
# predictions, not the linear-regression ones.
y_pred = rf.predict(X_test)
# Printed in order: mean squared error, mean absolute error, R^2.
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

38.01917537528345
4.73265
0.843760066741281


### So sánh kết quả 2 model LinearRegression và RandomForestRegressor
LinearRegression tốt hơn trong trường hợp này, vì:
- Target có hệ số tương quan cao với các features, nên mô hình tuyến tính phát huy được sức mạnh.