<a href="https://colab.research.google.com/github/maceteligolden/machine_learning_patterns/blob/main/DatapreprocessingPipelineExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [69]:
# Small hand-written dataset used to demonstrate the data preprocessing
# pipeline pattern. Note that 'Age' contains one missing value (None).
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Gender=['Female', 'Male', 'Male'],
    Job=['Engineer', 'Doctor', 'Teacher'],
    Age=[25, None, 35],
    City=['New York', 'San Francisco', 'Los Angeles'],
)

# one column per key, in the order the keys were given
df = pd.DataFrame.from_dict(data)

df

Unnamed: 0,Name,Gender,Job,Age,City
0,Alice,Female,Engineer,25.0,New York
1,Bob,Male,Doctor,,San Francisco
2,Charlie,Male,Teacher,35.0,Los Angeles


In [71]:
class ColumnDropper(BaseEstimator, TransformerMixin):
  """
  Transformer that removes columns from a DataFrame.

  Parameters:
    columns: a single label or list-like of labels to drop. When None
      (the default) the 'Name', 'City' and 'Job' columns are dropped.
  """
  def __init__(self, columns=None):
    # kept exactly as passed in; the default is resolved in transform()
    self.columns = columns

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns self so calls can be chained."""
    return self

  def transform(self, X):
    """
    Return a new DataFrame equal to X without the configured columns.

    X itself is not modified (inplace=False). pandas raises a KeyError
    when one of the requested columns is not present in X.
    """
    to_drop = ['Name', 'City', 'Job'] if self.columns is None else self.columns
    return X.drop(to_drop, axis="columns", inplace=False)

class AgeImputer(BaseEstimator, TransformerMixin):
  """
  Fills missing values of the 'Age' column with the column mean.

  The mean is computed by SimpleImputer(strategy='mean') over the
  non-missing 'Age' values seen in fit(), so data passed to transform()
  later is filled with the value learned from the fitting data rather
  than with its own mean.
  """

  def fit(self, X, y=None):
    """Learn the mean of X['Age'] and return self."""
    # trailing underscore: attribute that only exists after fitting
    self.imputer_ = SimpleImputer(strategy='mean').fit(X[['Age']])
    return self

  def transform(self, X):
    """
    Return a copy of X whose missing 'Age' values are replaced by the mean.

    The caller's DataFrame is left untouched.
    """
    imputer = getattr(self, 'imputer_', None)
    if imputer is None:
      # transform() called without fit(): keep the previous behaviour of
      # imputing with the mean of X itself instead of failing
      imputer = SimpleImputer(strategy='mean').fit(X[['Age']])
    # work on a copy so the column assignment below does not write into
    # the DataFrame owned by the caller
    X = X.copy()
    X['Age'] = imputer.transform(X[['Age']])
    return X

class GenderEncoder(BaseEstimator, TransformerMixin):
  """
  Replaces the strings of the 'Gender' column by numbers:
  'Female' becomes 0 and 'Male' becomes 1.
  """

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns self so calls can be chained."""
    return self

  def transform(self, X):
    """
    Return a new DataFrame with the 'Gender' column encoded as 0 / 1.

    The caller's DataFrame is left untouched. Values missing from the
    mapping are turned into NaN by Series.map.
    """
    gender_dist = { 'Female': 0, 'Male': 1 }
    # assign() builds a new frame instead of writing into X, and keeps
    # 'Gender' at its original column position
    return X.assign(Gender=X['Gender'].map(gender_dist))

class JobEncoder(BaseEstimator, TransformerMixin):
  """
  One hot encodes the 'Job' column: one 'Job_<title>' column is appended
  per job title, holding 1.0 on the rows with that title and 0.0 elsewhere.
  The original 'Job' column is kept.

  The set of job titles is learned in fit(), so transform() always produces
  the same columns, whatever titles occur in the data it receives.
  """
  def fit(self, X, y=None):
    """Learn the distinct job titles of X['Job'] and return self."""
    # handle_unknown='ignore': a title that was not seen during fit is
    # encoded as all zeros instead of raising in transform()
    self.encoder_ = OneHotEncoder(handle_unknown='ignore').fit(X[['Job']])
    return self

  def transform(self, X):
    """Return a new DataFrame made of X plus the one hot encoded job columns."""
    encoder = getattr(self, 'encoder_', None)
    if encoder is None:
      # transform() called without fit(): keep the previous behaviour of
      # deriving the job titles from X itself instead of failing
      encoder = OneHotEncoder().fit(X[['Job']])
    encoded_jobs = encoder.transform(X[['Job']]).toarray()
    # fetch all the learned job titles dynamically
    jobs = encoder.get_feature_names_out(['Job'])
    # reuse the index of X: concat aligns rows on the index, so a default
    # RangeIndex here would produce NaN rows whenever X is not indexed 0..n-1
    encoded_df = pd.DataFrame(encoded_jobs, columns=jobs, index=X.index)
    return pd.concat([X, encoded_df], axis="columns")

In [72]:
from sklearn.pipeline import Pipeline

# The preprocessing pipeline: the steps run in the order they are listed.
# The job encoder has to come before the column dropper, because the
# dropper removes the 'Job' column that the encoder reads.
preprocessing_pipeline = Pipeline([
    ("job_encoder", JobEncoder()),
    ("age_imputer", AgeImputer()),
    ("gender_encoder", GenderEncoder()),
    ("drop_column", ColumnDropper())
])

# Execution of the preprocessing pipeline. It is fed a frame rebuilt from
# the raw `data` rather than `df`: `df` is rebound to the result, so passing
# it back in would make a second run of this cell fail (its 'Job' column
# would already have been dropped).
df = preprocessing_pipeline.fit_transform(pd.DataFrame(data))


In [73]:
# final result after preprocessing: `df` now refers to the frame returned
# by the pipeline
df

Unnamed: 0,Gender,Age,Job_Doctor,Job_Engineer,Job_Teacher
0,0,25.0,0.0,1.0,0.0
1,1,30.0,1.0,0.0,0.0
2,1,35.0,0.0,0.0,1.0
