## Standard ML pipeline

### Dummy data 

In [1]:
import pandas as pd
import numpy as np

data = {
    'a': np.random.randint(1, 10, size=100),
    'b': np.random.randint(1, 10, size=100),
    'c': np.random.choice(['category1', 'category2'], size=100),
    'd': np.random.choice(['value1', 'value2'], size=100),
    'target': np.random.randint(0, 2, size=100)
}

df = pd.DataFrame(data)

print(df)

df.to_csv('misc/file.csv', index=False)

    a  b          c       d  target
0   9  1  category2  value1       0
1   4  1  category1  value1       1
2   9  8  category2  value1       1
3   7  1  category1  value2       0
4   6  6  category2  value1       0
.. .. ..        ...     ...     ...
95  1  3  category2  value1       0
96  2  8  category2  value1       1
97  9  3  category2  value2       0
98  7  8  category2  value2       1
99  5  8  category1  value2       1

[100 rows x 5 columns]


### Loading data

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('misc/file.csv')

Y = df['target']
X = df.drop(['target'], axis=1)

### Preprocessing

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [4]:
def check_data(df):
    print(f'Shape of the dataset : {df.shape}')
    # Count missing values for each column where count > 0
    if df.isna().sum().sum() > 0:
        print(df.isna().sum()[df.isna().sum() > 0])

#### Outlier

In [5]:
from ipywidgets import interact
import seaborn as sns
import matplotlib.pyplot as plt

def plot(column):
    if df[column].dtype in ['int64', 'float64']:
        sns.boxplot(x=df[column])
    else:
        sns.countplot(x=df[column])
    plt.show()

interact(plot, column=df.columns.tolist())

interactive(children=(Dropdown(description='column', options=('a', 'b', 'c', 'd', 'target'), value='a'), Outpu…

<function __main__.plot(column)>

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, columns_factors, verbose=False):
        self.columns_factors = columns_factors
        self.verbose = verbose

    def fit(self, X, y=None):
        self.iqr_ranges_ = {}
        for col, factor in self.columns_factors.items():
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            self.iqr_ranges_[col] = (Q1 - factor * IQR, Q3 + factor * IQR)
        return self

    def transform(self, X, y=None):
        X_out = X.copy()
        for col, factor in self.columns_factors.items():
            low, high = self.iqr_ranges_[col]
            outliers = X_out[(X_out[col] < low) | (X_out[col] > high)]
            if self.verbose:
                print(f"Removing {len(outliers)} outliers from column {col}")
            X_out = X_out[(X_out[col] >= low) & (X_out[col] <= high)]
        return X_out
  
# Parameters (1.5 is recommended factor)
outlier_dict = {
    'a': 1.5,
    'b': 1.5
}

# Execute outlier to see how many row will be affected
OutlierRemover(columns_factors = outlier_dict, verbose = True).fit_transform(X)

outlier_remover = OutlierRemover(columns_factors = outlier_dict)

Removing 0 outliers from column a
Removing 0 outliers from column b


#### Missing data

- DropNA, KNNImputer or IterativeImputer

In [7]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
import pandas as pd

# Get all col containing missing values
na_cols = [col for col in df.columns if df[col].isna().sum() > 0]

imputer = ColumnTransformer(
    transformers=[
        ('imputer', IterativeImputer(random_state=0), na_cols)
    ]
)

print(na_cols)


[]


#### Numerical data
- PCA
- StandardScaler

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [9]:
X.select_dtypes(include=['int64', 'float64']).columns.tolist()

['a', 'b']

In [10]:
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(f'Numeric columns : {numeric_columns}')

Numeric columns : ['a', 'b']


In [11]:
num_pipeline = ColumnTransformer(
    transformers=[
    ('scaler', StandardScaler(), numeric_columns), 
    ('pca', PCA(n_components=0.95), numeric_columns) 
])

#### Categorical data

- One-hot encoding
- Ordinal encoding


In [12]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Définir les colonnes pour chaque transformation
one_hot_cols = ['c', 'd']
ordinal_cols = []

# Créer le transformateur de colonnes
cat_pipeline = ColumnTransformer(
    transformers=[
        ('one_hot', OneHotEncoder(), one_hot_cols),
        ('ordinal', OrdinalEncoder(), ordinal_cols)
    ])

#### Preprocessing pipeline 

In [13]:
# Build preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('outlier', outlier_remover, list(outlier_dict.keys())),
        ('imputer', imputer, na_cols),
        ('num', num_pipeline, numeric_columns),
        ('cat', cat_pipeline, one_hot_cols)
    ])

# Check for errors in preprocessing
preprocessor.fit(X)

#### Splitting dataset

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

### Model training

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [16]:
from sklearn import set_config
set_config(display='diagram')

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', None)
])

pipeline

# Définir la grille de paramètres
param_grid =  [
    {
        'regressor' : [GradientBoostingRegressor()],
        'regressor__n_estimators': [2, 3, 5, 10, 30],
        'regressor__max_depth': [2, 3, 5, 10, 20, 40],
    },
    {
        'regressor' : [RandomForestRegressor()],
        'regressor__n_estimators': [2, 3, 5, 10, 30],
        'regressor__max_depth': [2, 3, 5, 10, 20, 40],
    },
]

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring = 'r2')
grid_search.fit(X_train, y_train)

### Model selection

In [17]:
# grid_search contains a param_grid with different regressors
# for each type of regressor :
# extract regressor who have the best score
# print the score and the best params
 
results = {}
for i in range(len(grid_search.cv_results_['params'])):
    regressor_name = grid_search.cv_results_['params'][i]['regressor'].__class__.__name__
    # Append or instantiate a list at results[regressor_name] containing a dict with scoring and params
    if regressor_name not in results.keys():
        results[regressor_name] = []
    results[regressor_name].append({
        'score': grid_search.cv_results_['mean_test_score'][i],
        'params': grid_search.cv_results_['params'][i]
    })
    
# Get best score and params for each regressor
for regressor_name, regressor_results in results.items():
    best_score = 0
    best_params = None
    for regressor_result in regressor_results:
        if regressor_result['score'] > best_score:
            best_score = regressor_result['score']
            best_params = regressor_result['params']
    print(f'{regressor_name} :')
    print(f'    score : {best_score}')
    print(f'    params : {best_params}')

GradientBoostingRegressor :
    score : 0
    params : None
RandomForestRegressor :
    score : 0
    params : None


In [18]:
# Select best model
best_model = grid_search.best_estimator_
print(best_model.get_params())

{'memory': None, 'steps': [('preprocessor', ColumnTransformer(transformers=[('outlier',
                                 OutlierRemover(columns_factors={'a': 1.5,
                                                                 'b': 1.5}),
                                 ['a', 'b']),
                                ('imputer',
                                 ColumnTransformer(transformers=[('imputer',
                                                                  IterativeImputer(random_state=0),
                                                                  [])]),
                                 []),
                                ('num',
                                 ColumnTransformer(transformers=[('scaler',
                                                                  StandardScaler(),
                                                                  ['a', 'b']),
                                                                 ('pca',
                              

### Saving model

In [19]:
import pickle

with open('misc/model.pkl', 'wb') as file:
    pickle.dump(best_model, file)