In [2]:
import pandas as pd

In [169]:
from pathlib import Path

data_path = Path(r"F:\machine learning\mlops\end to end machine learning pipeline\MLOPs_workflow\data\raw\AB_NYC_2019.csv")

data = pd.read_csv(data_path)

In [170]:
data.head(2)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer


def delta_date_feature(dates):
    """
    Given a 2D array containing dates, returns the delta in days between each date 
    and the most recent date in its column.
    """
    dates = pd.DataFrame(x, columns=["last_review"])
    dates['last_review'] = pd.to_datetime(dates["last_review"], format=f"%Y-%m-%d", errors="coerce")

    max_dates = dates['last_review'].max()
    return dates['last_review'].apply(lambda d : (max_dates - d)).dt.days.fillna(max_dates).to_numpy().reshape(-1, 1)


def get_feature_transformation_pipeline():
    """
    Constructs a feature transformation pipeline.

    Returns:
    --------
    feature_transformation_pipeline : Pipeline
        Scikit-learn pipeline for feature preprocessing.
    processed_features : list
        List of input features before transformation.
    new_features : list
        List of transformed feature names.
    """

    # Categorical Features
    # ordinal_categorical = ["room_type"]
    non_ordinal_categorical = ["neighbourhood_group"]

    ordinal_categorical_preproc = OrdinalEncoder()

    non_ordinal_categorical_preproc = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(handle_unknown="ignore")
    )

    ordinal_categorical = ["room_type"]
    non_ordinal_categorical = ["neighbourhood_group"]
    # Numerical Features with Zero Imputation
    zero_imputed_columns = [
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "longitude",
        "latitude"
    ]
    zero_imputer = SimpleImputer(strategy="constant", fill_value=0)

    # Date Transformation
    date_imputer = make_pipeline(
        SimpleImputer(strategy='constant', fill_value='2010-01-01'),
        FunctionTransformer(delta_date_feature, validate=False)
    )

    # Text Feature Engineering for 'name' column
    name_tfidf = make_pipeline(
        SimpleImputer(strategy="constant", fill_value=""),
        FunctionTransformer(lambda x: x.ravel(), validate=False),  # Ensures 1D input for TF-IDF
        TfidfVectorizer(binary=False, max_features=5, stop_words='english')
    )

    # Column Transformer
    preprocessor = ColumnTransformer(
        transformers=[
            # ("ordinal_cat", ordinal_categorical_preproc, ordinal_categorical),
            ("non_ordinal_cat", non_ordinal_categorical_preproc, non_ordinal_categorical),
            ("impute_zero", zero_imputer, zero_imputed_columns),
            ("transform_date", date_imputer, ["last_review"]),
            ("transform_name", name_tfidf, ["name"])
        ],
        remainder="drop"  # Drops unused columns
    )

    # Feature Lists
    processed_features = ordinal_categorical + non_ordinal_categorical + zero_imputed_columns + ["last_review", "name"]

    new_features = ordinal_categorical + \
                   ['neighbourhood_group_Bronx', 'neighbourhood_group_Brooklyn', 
                    'neighbourhood_group_Manhattan', 'neighbourhood_group_Queens', 
                    'neighbourhood_group_Staten Island'] + \
                   zero_imputed_columns + ['last_review'] + \
                   ['apartment', 'bedroom', 'cozy', 'private', 'room']

    # Final Pipeline
    # feature_transformation_pipeline = Pipeline(
    #     steps=[("preprocessor", preprocessor)]
    # )

    return feature_transformation_pipeline, processed_features, new_features


In [7]:
pipeline, processed_features, new_features = get_feature_transformation_pipeline()

In [None]:
transformed_data = pipeline.fit_transform(data)

In [None]:
for step_name, step in pipeline.named_steps.items():
    print(f"🔍 Checking step: {step_name}")
    try:
        transformed = step.fit_transform(data)
        print(f" Step '{step_name}' completed successfully")
    except Exception as e:
        print(f" Error in step '{step_name}': {e}")


In [25]:
from pathlib import Path
import pandas as pd

parquet_file_path = Path("F://machine learning//mlops//end to end machine learning pipeline//MLOPs_workflow//data//processed//target.parquet")

data = pd.read_parquet(parquet_file_path)


In [None]:
data.columns

Index(['price', 'id', 'event_timestamp'], dtype='object')

: 

In [None]:
'room_type', 'neighbourhood_group_Bronx',
       'neighbourhood_group_Brooklyn', 'neighbourhood_group_Manhattan',
       'neighbourhood_group_Queens', 'id', 'event_timestamp']


'neighbourhood_group_Staten Island', 'minimum_nights',
       'number_of_reviews', 'reviews_per_month', 'id', 'event_timestamp'


'calculated_host_listings_count', 'availability_365', 'longitude',
       'latitude', 'id', 'event_timestamp'


'last_review', 'apartment', 'bedroom', 'cozy', 'private', 'room', 'id',
       'event_timestamp'


'price', 'id', 'event_timestamp'

In [None]:
# step1: room type ko OrdinalEncoder se encode karo 

# step2: non ordinal category ke liye phele most frequent fill karo then one hot encode karo

# step3: zero impute karo using SimpleImputer wth constant value 

# step4: date transformation karo using SimpleImputer with constant value and then FunctionTransformer)

# step5: name column ke liye phele most frequent fill karo then TfidfVectorizer use karo 

In [167]:
import pickle
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.base import TransformerMixin, BaseEstimator


class OrdinalEncoderTransformer(BaseEstimator, TransformerMixin):
    """Applies Ordinal Encoding to a specified column and persists the fitted encoder."""

    def __init__(self):
        """
        Initializes the transformer with a specific column to encode.

        Parameters:
        -----------
        column_name : str
            The name of the column to apply ordinal encoding.
        """
        self.encoder = OrdinalEncoder()

    def fit(self, X, column_name):
        """
        Fits the ordinal encoder on the specified column.

        Parameters:
        -----------
        X : pd.DataFrame
            Input dataframe containing the column to be encoded.
        """
        if column_name not in X:
            raise ValueError(f"Column '{column_name}' not found in the DataFrame.")
        
        self.encoder.fit(X[[column_name]])
        return self

    def transform(self, X, column_name):
        """
        Transforms the specified column using the fitted encoder.

        Parameters:
        -----------
        X : pd.DataFrame
            Input dataframe containing the column to be transformed.

        Returns:
        --------
        np.ndarray
            Transformed column as a NumPy array.
        """
        if column_name not in X:
            raise ValueError(f"Column '{column_name}' not found in the DataFrame.")
        
        # no extra column will be created here as we are transforming only one column
        return self.encoder.transform(X[[column_name]])

    def fit_transform(self, X, column_name):
        """Fits and transforms the data in one step."""
        return self.fit(X, column_name).transform(X, column_name)

    def save(self, path):
        """
        Saves the fitted encoder to a file.

        Parameters:
        -----------
        path : str
            The file path to save the encoder.
        """
        artifact_path = path / "ordinal_encoder.pkl"
        path.mkdir(parents=True, exist_ok=True)
        
      
        with open(artifact_path, "wb") as f:
            pickle.dump(self.encoder, f)
            print("file saved")


    @staticmethod
    def load(path):
        """
        Loads a previously saved encoder.

        Parameters:
        -----------
        path : str
            The file path from where to load the encoder.
        column_name : str
            The column name for which the encoder was originally created.

        Returns:
        --------
        OrdinalEncoderTransformer
            A new instance of OrdinalEncoderTransformer with the loaded encoder.



        usage : 
            # Load the trained encoder
            loaded_transformer = OrdinalEncoderTransformer.load("ordinal_encoder.pkl", column_name="room_type")
        """

        artifact_path = path.joinpath("ordinal_encoder.pkl")
        with open(artifact_path, "rb") as f:
            loaded_encoder = pickle.load(f)
        
        # create a new instance with the column name and loaded encoder
        # return the new instance to the caller
        transformer = OrdinalEncoderTransformer()
        transformer.encoder = loaded_encoder
        return transformer
    

# usage :

# Initialize and fit transformer
ordinal_transformer = OrdinalEncoderTransformer()
ordinal_transformer.fit(data, column_name="room_type")

# Transform data
transformed_data = ordinal_transformer.transform(data, column_name="room_type")
print(transformed_data)

from pathlib import Path

ordinal_transformer.save(Path("room_type"))
loaded_transformer = OrdinalEncoderTransformer.load(Path("room_type"))
x = loaded_transformer.transform(data, column_name="room_type")
# print(transformed_data.shape)



[[1.]
 [0.]
 [1.]
 ...
 [0.]
 [2.]
 [1.]]
file saved


In [168]:
pd.DataFrame(x, columns = ['room_type'])

Unnamed: 0,room_type
0,1.0
1,0.0
2,1.0
3,0.0
4,0.0
...,...
48890,1.0
48891,1.0
48892,0.0
48893,2.0


(48895, 16)

In [None]:
# 1️⃣ BaseEstimator
# ✔ Provides basic methods like get_params() and set_params()
# ✔ Allows the transformer to be used in hyperparameter tuning (GridSearchCV, RandomizedSearchCV)
# ✔ Helps maintain a consistent API across Scikit-Learn components

# 🔹 Example Usage of get_params()

# 2️⃣ TransformerMixin
# ✔ Ensures that the transformer supports fit_transform(X, y=None) method
# ✔ Reduces redundant code by automatically defining fit_transform() as fit().transform(X)
# ✔ Makes the transformer pipeline-compatible

# Without TransformerMixin, we would need to manually implement:

In [None]:
# # Categorical Features
# # ordinal_categorical = ["room_type"]
# non_ordinal_categorical = ["neighbourhood_group"]

# # ordinal_categorical_preproc = OrdinalEncoder()

# non_ordinal_categorical_preproc = make_pipeline(
#     SimpleImputer(strategy="most_frequent"),
#     OneHotEncoder(handle_unknown="ignore")
#     )

In [51]:
import pickle
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

class NonOrdinalCategoricalTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer for handling non-ordinal categorical features.
    1. Imputes missing values using the most frequent category.
    2. Applies One-Hot Encoding with `handle_unknown="ignore"` to avoid errors during inference.

    Attributes:
    ------------
    imputer : SimpleImputer
        Imputer for handling missing categorical values.
    encoder : OneHotEncoder
        One-hot encoder for transforming categorical values.
    """

    def __init__(self):
        self.imputer = SimpleImputer(strategy="most_frequent")
        self.encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    def fit(self, X, column_name):
        """
        Fits the imputer and the one-hot encoder.
        """
        out = self.imputer.fit_transform(X[column_name].values.reshape(-1, 1))  # Fit imputer
        # print(X[column_name])
        self.encoder.fit(out)  # Fit encoder on imputed data
        return self  # Return self for method chaining

    def transform(self, X, column_name):
        """
        Applies imputation and one-hot encoding to the data.
        """
        out = self.imputer.transform(X[column_name].values.reshape(-1, 1))# Impute missing values
        X_encoded = self.encoder.transform(out)  # One-hot encode
        # print(X_encoded.shape)

        return pd.DataFrame(X_encoded, columns=[self.get_feature_names(column_name)])
        

    def fit_transform(self, X, column_name):
        """
        Combines fit and transform for efficiency.
        """
        return self.fit(X, column_name).transform(X, column_name)

    def get_feature_names(self, column_name):
        """
        Returns the feature names for the encoded categories.
        """
        return self.encoder.get_feature_names_out([column_name])

    def save(self, path):
        """
        Saves the transformer (including imputer and encoder) as a pickle file.
        """

        # save the imputer first and then the encode

        imputer_path = path / "imputer.pkl"
        encoder_path = path / "encoder.pkl"


        path.mkdir(parents=True, exist_ok=True)

        with open(imputer_path, "wb") as f:
            pickle.dump(self.imputer, f)

        with open(encoder_path, "wb") as f:
            pickle.dump(self.encoder, f)

    @staticmethod
    def load(path):
        """
        Loads a saved transformer from a pickle file.
        """

        # from the current class's artifact folder get the artifacts 
        for artifact in path.iterdir():
            if "imputer" in str(artifact):
                with open(artifact, "rb") as f:
                    imputer_pkl = pickle.load(f)
            else:
                with open(artifact, "rb") as f:
                    encoder_pkl = pickle.load(f)
            

        
        # create a new instance with the loaded pickle file 
        non_ordinal_categorical_transformer = NonOrdinalCategoricalTransformer()
        non_ordinal_categorical_transformer.imputer = imputer_pkl
        non_ordinal_categorical_transformer.encoder = encoder_pkl

        # return the instance of this class with arguments already loaded
        # this can be directly used for transformation
        # example provided below 
        return non_ordinal_categorical_transformer


NonOrdinalCategoricalTransformer_obj = NonOrdinalCategoricalTransformer()
NonOrdinalCategoricalTransformer_obj.fit(data, column_name="neighbourhood_group")
transformed_data_nonordinal = NonOrdinalCategoricalTransformer_obj.transform(data, column_name="neighbourhood_group")

NonOrdinalCategoricalTransformer_obj.save(Path("neighborhood"))


# this instance will be loaded with 
non_ordinal_categorical_transformer = NonOrdinalCategoricalTransformer.load(Path("neighborhood"))
non_ordinal_categorical_transformer.encoder.get_feature_names_out(['neighborhood'])


array(['neighborhood_Bronx', 'neighborhood_Brooklyn',
       'neighborhood_Manhattan', 'neighborhood_Queens',
       'neighborhood_Staten Island'], dtype=object)

In [66]:
# Numerical Features with Zero Imputation
mean_imputed_columns = [
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
    "longitude",
    "latitude"
]
mean_imputer = SimpleImputer(strategy="mean", fill_value=0)
mean_imputed = mean_imputer.fit_transform(data[mean_imputed_columns])

mean_imputed_df = pd.DataFrame(mean_imputed, columns=mean_imputed_columns)


In [None]:
# Date Transformation
# date_imputer = make_pipeline(
#     SimpleImputer(strategy='constant', fill_value='2010-01-01'),
#     FunctionTransformer(delta_date_feature, validate=False)
# )

# date_imputer.fit_transform(data[["last_review"]])

In [None]:
# data_copy = data.copy()
# imputer = SimpleImputer(strategy='constant', fill_value='2010-01-01')
# # first fill null values
# data_copy['last_review_date'] = imputer.fit_transform(data_copy['last_review'].values.reshape(-1, 1)).reshape(-1,)

# # then convert to pd datetime format
# data_copy['last_review_date']  = pd.to_datetime(data_copy['last_review_date'])

# # then get the max value
# np.max(data_copy['last_review_date'])


# (np.max(data_copy['last_review_date']) - data_copy['last_review_date']).dt.days.fillna(0).values
# imputer.fit_transform(data_copy['last_review'].values.reshape(-1, 1)).reshape(-1,)

In [163]:
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin


class DeltaDatetimeFeature(BaseEstimator, TransformerMixin):
    def __init__(self):
        """ we need a imputer to fill null values and a function transformer to transform the date"""
        self.imputer = SimpleImputer(strategy='constant', fill_value='2010-01-01')
        self.max_date = None
        
        

    def fit(self, X, column_name):
        X_copy = X.copy()
        
        # Convert column to string before applying SimpleImputer
        X_copy[column_name] = X_copy[column_name].astype(str)
        
        # Impute missing values with "2010-01-01"
        X_copy[column_name] = self.imputer.fit_transform(X_copy[[column_name]]).ravel()

        # Convert back to datetime
        X_copy[column_name] = pd.to_datetime(X_copy[column_name], format="%Y-%m-%d", errors="coerce")

        # Compute max date while handling NaT values
        self.max_date = np.max(X_copy[column_name])
        return self
    
    def transform(self, X, column_name):
        # now caluclate the delta date
        data_copy = X.copy()
        # use the imputer to fill null values
        data_copy['last_review_date'] = imputer.fit_transform(data_copy['last_review'].values.reshape(-1, 1)).reshape(-1,)

        # Convert to datetime, handling errors gracefully
        data_copy['last_review_date'] = pd.to_datetime(data_copy['last_review_date'], format="%Y-%m-%d", errors="coerce")

        if pd.isna(self.max_date):  # If all values are NaT, return a default fill value
            return np.full((len(data_copy), 1), fill_value=-1)  # -1 can indicate missing values

        # Vectorized computation of delta in days
        delta_days = (self.max_date - data_copy['last_review_date']).dt.days.fillna(0).values
        return pd.DataFrame(delta_days,columns=["days_from_max_date"])    
    
    def fit_transform(self, X, column_name):
        return self.fit(X, column_name).transform(X, column_name)
    
    def save(self, path):
        """ save the artifacts of the model"""

        imputer_path = path / "imputer.pkl"
        max_date_val = path / "max_value.pkl"

        path.mkdir(parents=True, exist_ok=True)

        with open(imputer_path, "wb") as f:
            pickle.dump(self.imputer, f)
        
        with open(max_date_val, "wb") as f:
            pickle.dump(self.max_date, f)
    
    
    @staticmethod
    def load(path):
        
        for artifact in path.iterdir():
            print(artifact)
            if "imputer" in str(artifact):
                with open(artifact, "rb") as f:
                    imputer = pickle.load(f)
            else:
                with open(artifact, "rb") as f:
                    max_value = pickle.load(f)
        
        # create a new instance of the class
        delta_date_feature = DeltaDatetimeFeature()
        delta_date_feature.imputer = imputer   
        delta_date_feature.max_date = max_value

        return delta_date_feature



deltadatetimefeature_obj = DeltaDatetimeFeature()
deltadatetimefeature_obj.fit(data, 'last_review')
out = deltadatetimefeature_obj.transform(data, 'last_review')

deltadatetimefeature_obj.save(Path("src\project\prod\prod_artifacts\DateFeature"))
new_obj = DeltaDatetimeFeature.load(Path("src\project\prod\prod_artifacts\DateFeature"))

src\project\prod\prod_artifacts\DateFeature\imputer.pkl
src\project\prod\prod_artifacts\DateFeature\max_value.pkl


In [None]:
class tfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.imputer = SimpleImputer(strategy="constant", fill_value="")
        self.vectorizer = TfidfVectorizer(binary=False, max_features=5, stop_words='english')

    def fit(self, X, column_name):
        new = self.imputer.fit_transform([X[column_name]])        # or [[]] --> .reshape(-1, 1)
        self.vectorizer.fit(new.ravel())
        return self
    
    def transform(self, X, column_name):
        data_copy = X.copy()
        new = self.imputer.transform([data_copy[column_name]])
        print(new.shape)
        vectorized = self.vectorizer.transform(new.ravel())
        
        # vectorized are csr_matrix, we need to convert it into a dataframe 
        return pd.DataFrame(vectorized.toarray(), columns=self.get_feature_names())
    
    def get_feature_names(self,):
        return self.vectorizer.get_feature_names_out().tolist()
    
    def fit_transform(self, X, column_name):
        return self.fit(X, column_name).transform(X, column_name)

    def save(self, path):
        imputer_path = path / "imputer.pkl"
        vectorizer_path = path / "vectorizer.pkl"

        path.mkdir(parents=True, exist_ok=True)
        
        with open(imputer_path, "wb") as f:
            pickle.dump(self, f)
        
        with open(vectorizer_path, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(path):
        for artifact in path.iterdir():
            print(artifact)
            if "imputer" in str(artifact):
                with open(artifact, "rb") as f:
                    imputer = pickle.load(f)
            else:
                with open(artifact, "rb") as f:
                    vectorizer = pickle.load(f)
        
        tfidfVectorizerobj = tfidfVectorizer()
        tfidfVectorizerobj.imputer = imputer
        tfidfVectorizerobj.vectorizer = vectorizer

        return tfidfVectorizerobj

In [None]:
# tfidfVectorizerobj = tfidfVectorizer()
# tfidfVectorizerobj.fit(data, "name")
# tfidfVectorizerobj.transform(data, 'name')


(1, 48895)


Unnamed: 0,apartment,bedroom,cozy,private,room
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
48890,0.0,1.0,0.0,0.0,0.0
48891,0.0,0.0,0.0,0.0,1.0
48892,0.0,0.0,0.0,0.0,0.0
48893,0.0,0.0,1.0,0.0,0.0


In [None]:
# combine the full pipeline output 

# har ek class se khali load karo artifacts and transform karo data
# original dataframe se kaunse kaunse features nahi hai transformed mein wo add karke dekhna hai 
# ek new dataframe banao having transformed dataset
# 
def MeanImputer(columns):
    """ 
    impute null values with mean values , doest word column wise but will work on overall data 
    
    """
    # Numerical Features with Zero Imputation
    mean_imputer = SimpleImputer(strategy="mean", fill_value=0)
    mean_imputed = mean_imputer.fit_transform(data[columns])

    mean_imputed_df = pd.DataFrame(mean_imputed, columns=columns)
    return mean_imputed_df


def pipeline(df, path, training=True):
    """

    will take a dataframe as input and process the required features and return the transformed dataframe,
    if the training args is true that means we need to train the sklearn artifacts used in each classes and save them in respectve 
    folders

    Parameters:
    ---------------------
    df : pd.DataFrame
        dataframe to preprocess
    
    path : pathlib.Path
        path to store or retrieve artifacts
    
    training : boolean
        whether to fit and save the artifacts or retrieve the fitter artifacts


    """
    # get the columns that need processing
    ordinal_categorical = ["room_type"]
    non_ordinal_categorical = ["neighbourhood_group"]
        
    # Numerical Features with Zero Imputation
    mean_imputed_columns = [
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "longitude",
        "latitude"
        ]
    
    
    
    if training:
        # we will only save the artifacts in the respective folder

        # first for the given dataframe, process the ordinal categorial features 
        oheobj = OrdinalEncoderTransformer()
        oheobj.fit(df, column_name=ordinal_categorical[0])
        oheobj.save(path / "OrdinalCat")

        nonordinalprocess = NonOrdinalCategoricalTransformer()
        nonordinalprocess.fit(df, column_name=non_ordinal_categorical[0])
        nonordinalprocess.save(path / "NonOrdinalCat")

        datefeatureprocess = DeltaDatetimeFeature()
        datefeatureprocess.fit(df, column_name="last_review")
        datefeatureprocess.save(path / "DateFeature")

        # training doesnt need mean imputation
        # meanimputerprocessed = MeanImputer(mean_imputed_columns)

        nameprocess = tfidfVectorizer()
        nameprocess.fit(df, column_name="name")
        nameprocess.save(path / "tfidf")
        return "done"

    else:

        # load the required classes 
        oheobj = OrdinalEncoderTransformer.load()
        onehotencoded = oheobj.transform(df, column_name=ordinal_categorical[0])
        print(onehotencoded.shape)
        print(onehotencoded.columns)
        print("\n\n")


        nonordinalprocess = NonOrdinalCategoricalTransformer().load()
        nonordinalencoded = nonordinalprocess.transform(df, "neighbourhood_group")
        print(nonordinalencoded.shape)
        print(nonordinalencoded.columns)
        print("\n\n")


        datefeatureprocess = DeltaDatetimeFeature().load()
        date_transformed = datefeatureprocess.transform(df, 'last_review')
        print(date_transformed.shape)
        print(date_transformed.columns)
        print("\n\n")


        nameprocess = tfidfVectorizer().load()
        name_transfored = nameprocess.transform(df, 'name')
        print(name_transfored.shape)
        print(name_transfored.columns)
        print("\n\n")

        meanimputerprocessed = MeanImputer(mean_imputed_columns)
        print(meanimputerprocessed.shape)
        print(meanimputerprocessed.columns)
        print("\n\n")

        # datalist = [onehotencoded, nonordinalencoded, date_transformed, name_transfored, meanimputerprocessed]

        # final_df = CombineDataFrames(datalist)
        # return final_df
    
        return "DOne"



def CombineDataFrames(dataframes):
    """ 
    combine list of dataframes
    """
    result = pd.concat(dataframes, ignore_index=True)
    return result
