# Notes on Machine Learning Models - Part 3 - Best Practices

အပိုင်း ၃ ပိုင်းရှိတဲ့ ဒီ notes တွေမှာ အဓိကအားဖြင့် Machine Learning Model တွေကို အသုံးပြုရာမှာ သတိပြုရမဲ့ အောက်ပါ အကြောင်းအရာများကို ပြောပြပေးသွားမှာ ဖြစ်ပါတယ်။ 

* Feature Engineering 
* Model Validation and
* Coding Best Practices

ဒီအပိုင်းက တတိယနဲ့ နောက်ဆုံးအပိုင်း Best Practices ဖြစ်ပါတယ်။ 

Data Science နဲ့ ပတ်သက်ရင် (ခုနောက်ပိုင်း prompt engineering လိုမျိုးတွေ၊ AutoML တွေ သုံးလာကြပြီ ဖြစ်ပေမဲ့) လက်ရှိမှာ ကုဒ်တွေ ရေးနေရတုန်းပဲ ဖြစ်ပါတယ်။ 

ဒါကြောင့် ဒီအပိုင်းမှာ ... 

* `sklearn.pipe` နဲ့ တခြား `sklearn` utilities တွေ အကြောင်း
* version control system (VCS) အကြောင်းနဲ့ 
* တခြား coding နဲ့ ဆက်နွယ်နေတဲ့ ဆောင်ရန်/ရှောင်ရန် အကြောင်းတွေ ပြောမယ်။

## `sklearn` Like a Boss

In [1]:
from sklearn import preprocessing as sk_pp
from sklearn import datasets as sk_ds

import numpy as np
import pandas as pd

In [2]:
df_X, ds_y = sk_ds.fetch_openml(name="credit-g", as_frame=True, return_X_y=True)
df_X.head()

  " {version}.".format(name=name, version=res[0]["version"])


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes


In [3]:
df_X_category = df_X.select_dtypes(include=["category"])
df_X_category.head()

Unnamed: 0,checking_status,credit_history,purpose,savings_status,employment,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,foreign_worker
0,<0,critical/other existing credit,radio/tv,no known savings,>=7,male single,none,real estate,none,own,skilled,yes,yes
1,0<=X<200,existing paid,radio/tv,<100,1<=X<4,female div/dep/mar,none,real estate,none,own,skilled,none,yes
2,no checking,critical/other existing credit,education,<100,4<=X<7,male single,none,real estate,none,own,unskilled resident,none,yes
3,<0,existing paid,furniture/equipment,<100,4<=X<7,male single,guarantor,life insurance,none,for free,skilled,none,yes
4,<0,delayed previously,new car,<100,1<=X<4,male single,none,no known property,none,for free,skilled,none,yes


In [4]:
df_X_number = df_X.select_dtypes(include=["number"])
df_X_number.head()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents
0,6.0,1169.0,4.0,4.0,67.0,2.0,1.0
1,48.0,5951.0,2.0,2.0,22.0,1.0,1.0
2,12.0,2096.0,2.0,3.0,49.0,1.0,2.0
3,42.0,7882.0,2.0,4.0,45.0,1.0,2.0
4,24.0,4870.0,3.0,4.0,53.0,2.0,2.0


In [5]:
col_to_unique_values = {c: [df_X[c].unique()] for c in df_X.columns}
df_col_to_unique_values = pd.DataFrame(data=col_to_unique_values, index=["unique_values"]).T
df_col_to_unique_values.loc[:, "unique_count"] = df_col_to_unique_values.apply(lambda x : df_X[x.name].nunique(), axis=1)
df_col_to_unique_values

Unnamed: 0,unique_values,unique_count
checking_status,"['<0', '0<=X<200', 'no checking', '>=200'] Cat...",4
duration,"[6.0, 48.0, 12.0, 42.0, 24.0, 36.0, 30.0, 15.0...",33
credit_history,"['critical/other existing credit', 'existing p...",5
purpose,"['radio/tv', 'education', 'furniture/equipment...",10
credit_amount,"[1169.0, 5951.0, 2096.0, 7882.0, 4870.0, 9055....",921
savings_status,"['no known savings', '<100', '500<=X<1000', '>...",5
employment,"['>=7', '1<=X<4', '4<=X<7', 'unemployed', '<1'...",5
installment_commitment,"[4.0, 2.0, 3.0, 1.0]",4
personal_status,"['male single', 'female div/dep/mar', 'male di...",4
other_parties,"['none', 'guarantor', 'co applicant'] Categori...",3


In [6]:
from sklearn import model_selection as sk_ms

df_X_tr, df_X_ts, ds_y_tr, ds_y_ts = sk_ms.train_test_split(df_X, ds_y, test_size=0.2, shuffle=True, random_state=42)

df_feat_tr = pd.DataFrame(data=None, index=df_X_tr.index)
df_feat_ts = pd.DataFrame(data=None, index=df_X_ts.index)

In [7]:
ordinal_columns = ["credit_history", "savings_status"]
oe = sk_pp.OrdinalEncoder(
    # အောက်က categories parameter မှာ array-like of array-like (list of list) ထည့်ပေးရတာ သတိပြုပါ။
    categories=[
        ['no credits/all paid', 'all paid', 'existing paid', 'delayed previously', 'critical/other existing credit'],
        ['no known savings', '<100', '100<=X<500', '500<=X<1000', '>=1000']
    ], 
    # handle_unknown က သိပ်အရေးကြီးတယ်။ ဒါမပါသွားရင် production ကျမှ ပြဿနာ တက်တတ်တယ်။ default is "error"
    handle_unknown="use_encoded_value", 
    unknown_value=np.nan
)

df_feat_tr.loc[:, ["oe_{}".format(c) for c in ordinal_columns]] = oe.fit_transform(df_X_tr[ordinal_columns])
# see ? you can never ever fit with whole dataset; 
df_feat_ts.loc[:, ["oe_{}".format(c) for c in ordinal_columns]] = oe.transform(df_X_ts[ordinal_columns])

df_feat_tr.head()

Unnamed: 0,oe_credit_history,oe_savings_status
29,3.0,1.0
535,4.0,1.0
695,2.0,3.0
557,0.0,0.0
836,2.0,0.0


In [8]:
norminal_columns = [c for c in df_X_category.columns if c not in ordinal_columns]

ohe = sk_pp.OneHotEncoder(sparse=False, handle_unknown="ignore")
ohe.fit(df_X_tr[norminal_columns])

norminal_features = ohe.get_feature_names_out()
df_feat_tr.loc[:, norminal_features] = ohe.transform(df_X_tr[norminal_columns])
df_feat_ts.loc[:, norminal_features] = ohe.transform(df_X_ts[norminal_columns])
df_feat_tr.head()

Unnamed: 0,oe_credit_history,oe_savings_status,checking_status_0<=X<200,checking_status_<0,checking_status_>=200,checking_status_no checking,purpose_business,purpose_domestic appliance,purpose_education,purpose_furniture/equipment,...,housing_own,housing_rent,job_high qualif/self emp/mgmt,job_skilled,job_unemp/unskilled non res,job_unskilled resident,own_telephone_none,own_telephone_yes,foreign_worker_no,foreign_worker_yes
29,3.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
535,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
695,2.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
557,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
836,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [9]:
df_feat_ts.head()

Unnamed: 0,oe_credit_history,oe_savings_status,checking_status_0<=X<200,checking_status_<0,checking_status_>=200,checking_status_no checking,purpose_business,purpose_domestic appliance,purpose_education,purpose_furniture/equipment,...,housing_own,housing_rent,job_high qualif/self emp/mgmt,job_skilled,job_unemp/unskilled non res,job_unskilled resident,own_telephone_none,own_telephone_yes,foreign_worker_no,foreign_worker_yes
521,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
737,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
740,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
660,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
411,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


ဒါမပြီးသေးဘူး၊ အပိုင်း ၁ မှာ Feature Engineering တွေ ရှိသေးတယ်။ 

ဒီလိုတွေ ရှုပ်ရှုပ်ထွေးထွေးတွေ ရေးနေမဲ့အစား ... 

In [33]:
from sklearn import pipeline as sk_pipe
from sklearn import base as sk_base

In [34]:
class ColumnSelector(sk_base.TransformerMixin):
    def __init__(self, cols_to_select) -> None:
        super().__init__()
        self.cols_to_select = cols_to_select

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None):
        return X[self.cols_to_select]
    
ordinal_columns = ['credit_history', 'savings_status']
norminal_columns = [c for c in df_X_tr.select_dtypes(include=["category"]).columns if c not in ordinal_columns]
numeric_columns = list(df_X_tr.select_dtypes(include=["number"]).columns)

one_hot_pipeline = sk_pipe.Pipeline(steps=[
    ("norminal_selector", ColumnSelector(cols_to_select=norminal_columns)),
    ("one_hot_encoder", sk_pp.OneHotEncoder(sparse=False, handle_unknown="ignore"))
], verbose=True)

ordinal_pipeline = sk_pipe.Pipeline(steps=[
    ("ordinal_selector", ColumnSelector(cols_to_select=ordinal_columns)),
    ("ordinal_encoder", sk_pp.OrdinalEncoder(
        categories=[
            ['no credits/all paid', 'all paid', 'existing paid', 'delayed previously', 'critical/other existing credit'],
            ['no known savings', '<100', '100<=X<500', '500<=X<1000', '>=1000']
        ], 
    handle_unknown="use_encoded_value", unknown_value=np.nan
    ))
], verbose=True)

numeric_pipeline = sk_pipe.Pipeline(steps=[
    ("to_scale", sk_pipe.FeatureUnion(transformer_list=[
        ("ordinal_pipeline", ordinal_pipeline),
        ("numeric_selector", ColumnSelector(cols_to_select=numeric_columns))
    ], n_jobs=2)),
    ("scalers", sk_pipe.FeatureUnion(transformer_list=[
        ("ss", sk_pp.StandardScaler()),
        ("mms", sk_pp.MinMaxScaler()),
        ("mas", sk_pp.MaxAbsScaler())
    ], n_jobs=2, verbose=True))
])

In [35]:
from sklearn import feature_selection as sk_fs
from sklearn import svm
from sklearn import ensemble

model_pipeline = sk_pipe.Pipeline(steps=[
    ("preprocessing", sk_pipe.FeatureUnion(transformer_list=[
        ("ordinal_pipe", one_hot_pipeline),
        ("numeric_pipe", numeric_pipeline)
    ])),
    ("feature_select", sk_fs.SequentialFeatureSelector(
        estimator=svm.NuSVC(class_weight={0: 1, 1: 5}, random_state=42), n_features_to_select=50, direction="backward", cv=5, n_jobs=4)
    ),
    ("gbm", ensemble.GradientBoostingClassifier(n_estimators=100))
])

In [39]:
y_tr = [0 if _y == "good" else 1 for _y in ds_y_tr]
y_ts = [0 if _y == "good" else 1 for _y in ds_y_ts]

model_pipeline.fit(df_X_tr, y_tr)

[Pipeline] . (step 1 of 2) Processing norminal_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing one_hot_encoder, total=   0.0s
[Pipeline] .. (step 1 of 2) Processing ordinal_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing ordinal_encoder, total=   0.0s
[FeatureUnion] ............ (step 1 of 3) Processing ss, total=   0.0s
[FeatureUnion] ........... (step 2 of 3) Processing mms, total=   0.0s
[FeatureUnion] ........... (step 3 of 3) Processing mas, total=   0.0s


Pipeline(steps=[('preprocessing',
                 FeatureUnion(transformer_list=[('ordinal_pipe',
                                                 Pipeline(steps=[('norminal_selector',
                                                                  <__main__.ColumnSelector object at 0x7f22d012a438>),
                                                                 ('one_hot_encoder',
                                                                  OneHotEncoder(handle_unknown='ignore',
                                                                                sparse=False))],
                                                          verbose=True)),
                                                ('numeric_pipe',
                                                 Pipeline(steps=[('to_scale',
                                                                  FeatureUnion(n_jobs=2,
                                                                               transformer_list=[('ord

In [40]:
from sklearn import metrics
y_hat = model_pipeline.predict(df_X_ts)
print (metrics.classification_report(y_ts, y_hat, sample_weight=[1 if _y==0 else 5 for _y in y_ts]))

              precision    recall  f1-score   support

           0       0.43      0.89      0.58     141.0
           1       0.90      0.44      0.59     295.0

    accuracy                           0.59     436.0
   macro avg       0.66      0.67      0.59     436.0
weighted avg       0.75      0.59      0.59     436.0

