# Combining transformers for types of features in a pipeline

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lazypredict.Supervised import LazyRegressor

In [2]:
data = pd.read_csv("../datasets/StudentScore.xls")

In [3]:
# Split into features and target
target = 'math score'
X = data.drop(target, axis=1)
y = data[target]
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [4]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
29,female,group D,master's degree,standard,none,70,75
535,female,group C,bachelor's degree,free/reduced,completed,83,83
695,female,group D,some college,free/reduced,none,89,86
557,male,group C,master's degree,free/reduced,none,67,66
836,male,group E,high school,standard,none,64,57


In [5]:
# Numeric transformer
# Handle missing values using descriptive statis tic (mean, meadian, mode, ...)
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

In [6]:
# Ordinal and boolean features
education_levels = ["master's degree", "bachelor's degree",  "associate's degree", 'some college',
        'high school', 'some high school']
gender_values = X_train['gender'].unique()
lunch_values = X_train['lunch'].unique()
test_prep_values = X_train['test preparation course'].unique()

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=[education_levels, gender_values, lunch_values, test_prep_values]))
])

In [7]:
# Nominal features
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False))
])

### Combine many transformers
ColumnTransformer()

In [8]:
# Complete preprocessing pipeline for all features
preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, ["reading score", "writing score"]),
    ("ordinal_features", ordinal_transformer,["parental level of education", "gender", "lunch", "test preparation course"]),
    ("nominal_features", nominal_transformer, ["race/ethnicity"])
])

### Bigger pipeline includes preprocessing & model initializing

In [9]:
# Using Linear Regression
reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('reg_model', LinearRegression())
])

In [10]:
reg.fit(X_train, y_train)

### Evaluate Regression model

In [11]:
y_pred = reg.predict(X_test)

In [12]:
mean_squared_error(y_test, y_pred)

28.821056563832887

In [13]:
mean_absolute_error(y_test,y_pred)

4.181966418321512

#### r2 score (most common)

In [14]:
# Giá trị càng gần 1 mô hình càng tốt
r2_score(y_test,y_pred)

0.8815597679452446

In [15]:
# Using RandomForest
rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf_model', RandomForestRegressor())
])
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

36.719414531249996
4.6673374999999995
0.8491014384443639


### So sánh kết quả 2 model LinearRegression và RandomForestRegressor
LinearRegression tốt hơn trong trường hợp này, vì:
- Dữ liệu có target có hệ số tương quan cao với các features, nên các mô hình tuyến tính phát huy được sức mạnh

## Hyperparameter Tuning (find the optimal parameters)

In [None]:
# GridSearch
# Tìm các tham số tốt nhất cho mô hình RandomForestRegressor (rf_model) ở trên
param_grid = {
    "rf_model__n_estimators": [50, 100, 200],
    "rf_model__criterion": ["squared_error", "absolute_error", "poisson"],
    "rf_model__max_depth": [None, 2, 5, 10],
    "preprocessor__num_features__imputer__strategy":["mean", "median "]
}
# Tìm ra tham số tốt nhất đối với các bộ validation
grid_search = GridSearchCV(rf, param_grid, cv=4, scoring="r2", n_jobs=6, verbose=1)
grid_search.fit(X_train, y_train)
print("Best params: {}".format(grid_search.best_params_))
y_pred = grid_search.predict(X_test)
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

Fitting 4 folds for each of 72 candidates, totalling 288 fits


144 fits failed out of a total of 288.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ADMIN\anaconda3\envs\cs685hw\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ADMIN\anaconda3\envs\cs685hw\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\ADMIN\anaconda3\envs\cs685hw\lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\ADMIN\anaconda3\envs\cs685hw\lib\site-packages\sklearn\pipeline.py",

Best params: {'preprocessor__num_features__imputer__strategy': 'mean', 'rf_model__criterion': 'absolute_error', 'rf_model__max_depth': 10, 'rf_model__n_estimators': 200}
35.298550625
4.5499
0.8549404836022593


In [None]:
# RandomizedSearchCV
# Tìm các tham số tốt nhất cho mô hình RandomForestRegressor (rf_model) ở trên
param_grid = {
    "rf_model__n_estimators": [50, 100, 200],
    "rf_model__criterion": ["squared_error", "absolute_error", "poisson"],
    "rf_model__max_depth": [None, 2, 5, 10],
    "preprocessor__num_features__imputer__strategy":["mean", "median "]
}
# Tìm ra tham số tốt nhất đối với các bộ validation
# n_iter: thử ngẫu nhiên một số bộ tham số => nhanh, nhưng không đảm bảo tìm được optimal
grid_search = RandomizedSearchCV(rf, param_distributions=param_grid, cv=5, scoring="r2", n_jobs=6, verbose=1, n_iter=30)
grid_search.fit(X_train, y_train)
print("Best params: {}".format(grid_search.best_params_))
y_pred = grid_search.predict(X_test)
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

Fitting 5 folds for each of 30 candidates, totalling 150 fits


70 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ADMIN\anaconda3\envs\cs685hw\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ADMIN\anaconda3\envs\cs685hw\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\ADMIN\anaconda3\envs\cs685hw\lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\ADMIN\anaconda3\envs\cs685hw\lib\site-packages\sklearn\pipeline.py", 

Best params: {'rf_model__n_estimators': 200, 'rf_model__max_depth': 10, 'rf_model__criterion': 'absolute_error', 'preprocessor__num_features__imputer__strategy': 'mean'}
34.96120346875
4.5333875
0.8563268129125942


### LazyPredict

In [None]:
# Tự động sử dụng những transformer phù hợp cho từng kiểu dữ liệu
lazy_reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy_reg.fit(X_train, X_test, y_train, y_test)

  0%|          | 0/42 [00:00<?, ?it/s]



In [21]:
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LarsCV,0.88,0.88,5.37,0.08
HuberRegressor,0.88,0.88,5.37,0.06
SGDRegressor,0.88,0.88,5.38,0.03
LassoCV,0.88,0.88,5.38,0.17
KernelRidge,0.88,0.88,5.38,0.36
LinearSVR,0.88,0.88,5.39,0.05
ElasticNetCV,0.88,0.88,5.39,0.1
BayesianRidge,0.88,0.88,5.39,0.07
RidgeCV,0.88,0.88,5.39,0.04
Ridge,0.88,0.88,5.39,0.02


### Use LazyPredict with self-defined Pipeline

In [22]:
reg = Pipeline(steps=[
    ("preprocessor", preprocessor)
    # ("model", RandomForestRegressor())
])
X_train = reg.fit_transform(X_train)
X_test = reg.fit_transform(X_test)

lazy_reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy_reg.fit(X_train, X_test, y_train, y_test)
models

  0%|          | 0/42 [00:00<?, ?it/s]



Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LinearSVR,0.86,0.87,5.66,0.01
SGDRegressor,0.85,0.86,5.78,0.01
BayesianRidge,0.85,0.86,5.79,0.02
LassoCV,0.85,0.86,5.79,0.16
Ridge,0.85,0.86,5.79,0.01
RidgeCV,0.85,0.86,5.79,0.03
LarsCV,0.85,0.86,5.79,0.03
Lars,0.85,0.86,5.79,0.02
LassoLarsIC,0.85,0.86,5.79,0.01
LinearRegression,0.85,0.86,5.79,0.01


### Cách triển khai khác
- Chạy Lazy lấy top 10 models
- Tìm các bộ tham số tốt nhất với 10 models đó
Trong thực tế, thử càng nhiều càng tốt