In [1]:
import pandas as pd

In [15]:
from pathlib import Path

# Absolute location of the raw listings CSV on the author's Windows drive.
# NOTE(review): this path is machine-specific; consider deriving it from a configurable data directory.
data_path = Path(r"F:\machine learning\mlops\end to end machine learning pipeline\MLOPs_workflow\data\raw\AB_NYC_2019.csv")

# Load the raw CSV into the `data` DataFrame used by the exploration cells that follow.
data = pd.read_csv(data_path)

In [16]:
# Preview the first two rows to check the column names and value formats.
data.head(2)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355


In [15]:
# Raw values of the first five rows as a NumPy object array (column labels dropped).
data.iloc[:5].to_numpy()

array([[2539, 'Clean & quiet apt home by the park', 2787, 'John',
        'Brooklyn', 'Kensington', 40.64749, -73.97237, 'Private room',
        149, 1, 9, '2018-10-19', 0.21, 6, 365],
       [2595, 'Skylit Midtown Castle', 2845, 'Jennifer', 'Manhattan',
        'Midtown', 40.75362, -73.98377, 'Entire home/apt', 225, 1, 45,
        '2019-05-21', 0.38, 2, 355],
       [3647, 'THE VILLAGE OF HARLEM....NEW YORK !', 4632, 'Elisabeth',
        'Manhattan', 'Harlem', 40.80902, -73.9419, 'Private room', 150,
        3, 0, nan, nan, 1, 365],
       [3831, 'Cozy Entire Floor of Brownstone', 4869, 'LisaRoxanne',
        'Brooklyn', 'Clinton Hill', 40.68514, -73.95976,
        'Entire home/apt', 89, 1, 270, '2019-07-05', 4.64, 1, 194],
       [5022, 'Entire Apt: Spacious Studio/Loft by central park', 7192,
        'Laura', 'Manhattan', 'East Harlem', 40.79851, -73.94399,
        'Entire home/apt', 80, 10, 9, '2018-11-19', 0.1, 1, 0]],
      dtype=object)

In [16]:
# Summarise column dtypes and non-null counts to see which columns need imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [72]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer


def delta_date_feature(dates):
    """
    Convert dates into "days before the most recent date" features.

    Each column is processed independently: its values are parsed as
    ``YYYY-MM-DD`` dates and replaced by the number of days between that date
    and the most recent date found in the same column.

    Parameters
    ----------
    dates : array-like
        1D or 2D collection of date strings (one feature per column), e.g. the
        array produced by an upstream imputer.

    Returns
    -------
    numpy.ndarray
        2D int64 array of shape (n_rows, n_columns) with the day deltas.
        Values that cannot be parsed are treated as the most recent date and
        therefore map to 0.
    """
    # Use the argument that was passed in; never rely on notebook globals.
    frame = pd.DataFrame(dates)

    deltas = {}
    for position in range(frame.shape[1]):
        parsed = pd.to_datetime(frame.iloc[:, position], format="%Y-%m-%d", errors="coerce")
        latest = parsed.max()
        # NaT entries produce NaN day counts; filling them with 0 is equivalent to
        # substituting the most recent date before subtracting.
        deltas[position] = (latest - parsed).dt.days.fillna(0)

    # Building the frame from per-column results keeps the output 2D for any column count.
    return pd.DataFrame(deltas, index=frame.index).astype("int64").to_numpy()


def get_feature_transformation_pipeline():
    """
    Construct the (unfitted) feature transformation pipeline.

    The pipeline wraps a single ColumnTransformer that:
      * ordinal-encodes ``room_type``;
      * imputes (most frequent) and one-hot encodes ``neighbourhood_group``;
      * zero-imputes the numerical columns;
      * turns ``last_review`` into a day delta via ``delta_date_feature``;
      * extracts up to 5 TF-IDF terms from ``name``.
    All other input columns are dropped.

    Returns:
    --------
    feature_transformation_pipeline : Pipeline
        Scikit-learn pipeline for feature preprocessing.
    processed_features : list
        List of input features before transformation.
    new_features : list
        List of transformed feature names. These names are hard-coded, so they
        only describe the output when the fitted encoders produce exactly these
        categories / terms in this order.
    """

    # Categorical Features
    ordinal_categorical = ["room_type"]
    non_ordinal_categorical = ["neighbourhood_group"]

    ordinal_categorical_preproc = OrdinalEncoder()

    non_ordinal_categorical_preproc = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(handle_unknown="ignore")
    )

    # Numerical Features with Zero Imputation
    zero_imputed = [
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "longitude",
        "latitude"
    ]
    zero_imputer = SimpleImputer(strategy="constant", fill_value=0)

    # Date Transformation: missing dates get a fixed placeholder before the delta is computed
    date_imputer = make_pipeline(
        SimpleImputer(strategy='constant', fill_value='2010-01-01'),
        FunctionTransformer(delta_date_feature, validate=False)
    )

    # Text Feature Engineering for 'name' column
    name_tfidf = make_pipeline(
        SimpleImputer(strategy="constant", fill_value=""),
        # TF-IDF needs a 1D sequence of documents. np.ravel is used instead of a
        # lambda because a FunctionTransformer holding a lambda cannot be pickled,
        # which would prevent saving the fitted pipeline.
        FunctionTransformer(np.ravel, validate=False),
        TfidfVectorizer(binary=False, max_features=5, stop_words='english')
    )

    # Column Transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ("ordinal_cat", ordinal_categorical_preproc, ordinal_categorical),
            ("non_ordinal_cat", non_ordinal_categorical_preproc, non_ordinal_categorical),
            ("impute_zero", zero_imputer, zero_imputed),
            ("transform_date", date_imputer, ["last_review"]),
            ("transform_name", name_tfidf, ["name"])
        ],
        remainder="drop"  # Drops unused columns
    )

    # Feature Lists
    processed_features = ordinal_categorical + non_ordinal_categorical + zero_imputed + ["last_review", "name"]

    # Output names follow the transformer order declared above.
    new_features = ordinal_categorical + \
                   ['neighbourhood_group_Bronx', 'neighbourhood_group_Brooklyn',
                    'neighbourhood_group_Manhattan', 'neighbourhood_group_Queens',
                    'neighbourhood_group_Staten Island'] + \
                   zero_imputed + ['last_review'] + \
                   ['apartment', 'bedroom', 'cozy', 'private', 'room']

    # Final Pipeline
    feature_transformation_pipeline = Pipeline(
        steps=[("preprocessor", preprocessor)]
    )

    return feature_transformation_pipeline, processed_features, new_features


In [73]:
# Build the unfitted pipeline together with its input and output feature name lists.
pipeline, processed_features, new_features = get_feature_transformation_pipeline()

In [74]:
# Fit the pipeline on the loaded DataFrame and keep the transformed feature matrix.
transformed_data = pipeline.fit_transform(data)

In [None]:
# Debug helper: run the pipeline one step at a time to find which step fails.
# Each step is fitted on the output of the previous step (as Pipeline.fit_transform
# does); fitting every step on the raw frame would only be valid for the first step.
step_input = data
for step_name, step in pipeline.named_steps.items():
    print(f"🔍 Checking step: {step_name}")
    try:
        transformed = step.fit_transform(step_input)
        print(f" Step '{step_name}' completed successfully")
    except Exception as e:
        print(f" Error in step '{step_name}': {e}")
        # Later steps depend on this step's output, so stop at the first failure.
        break
    step_input = transformed


In [25]:
from pathlib import Path
import pandas as pd

# Absolute, machine-specific location of the processed target parquet file.
parquet_file_path = Path("F://machine learning//mlops//end to end machine learning pipeline//MLOPs_workflow//data//processed//target.parquet")

# NOTE(review): this rebinds `data`, replacing the raw listings frame loaded from the CSV earlier;
# cells that run after this one see the parquet contents instead.
data = pd.read_parquet(parquet_file_path)


In [26]:
# List the columns stored in the parquet file that was just loaded.
data.columns

Index(['price', 'id', 'event_timestamp'], dtype='object')

In [None]:
# Scratch notes (not executable code): column lists that appear to have been
# pasted from the `.columns` output of several processed parquet files.
#
# 'room_type', 'neighbourhood_group_Bronx',
#        'neighbourhood_group_Brooklyn', 'neighbourhood_group_Manhattan',
#        'neighbourhood_group_Queens', 'id', 'event_timestamp']
#
#
# 'neighbourhood_group_Staten Island', 'minimum_nights',
#        'number_of_reviews', 'reviews_per_month', 'id', 'event_timestamp'
#
#
# 'calculated_host_listings_count', 'availability_365', 'longitude',
#        'latitude', 'id', 'event_timestamp'
#
#
# 'last_review', 'apartment', 'bedroom', 'cozy', 'private', 'room', 'id',
#        'event_timestamp'
#
#
# 'price', 'id', 'event_timestamp'

In [29]:
# Categorical Features
# Scratch copy of the categorical preprocessing defined inside
# get_feature_transformation_pipeline, rebuilt at module level for step-by-step debugging.
ordinal_categorical = ["room_type"]
non_ordinal_categorical = ["neighbourhood_group"]

# room_type is mapped to integer codes.
ordinal_categorical_preproc = OrdinalEncoder()

# neighbourhood_group: fill gaps with the most frequent value, then one-hot encode;
# categories not seen during fit are ignored at transform time.
non_ordinal_categorical_preproc = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
    )

In [None]:
# Numerical Features with Zero Imputation
# Scratch copy of the numeric/date preprocessing, rebuilt at module level for debugging.
zero_imputed = [
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
    "longitude",
    "latitude"
]
# Missing numeric values are replaced by the constant 0.
zero_imputer = SimpleImputer(strategy="constant", fill_value=0)

# Date Transformation
# Missing review dates are replaced by the placeholder '2010-01-01' before the
# day-delta transformation is applied.
date_imputer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='2010-01-01'),
    FunctionTransformer(delta_date_feature, validate=False)
)




# Try the date branch on its own.
# NOTE(review): in top-to-bottom order `data` was last assigned from target.parquet, whose
# displayed columns are price / id / event_timestamp, so this lookup of "last_review"
# presumably needs the raw listings frame — confirm before relying on Run All.
date_imputer.fit_transform(data[["last_review"]])

In [None]:
# Run only the imputation half of the date branch: missing `last_review` values are
# replaced by the placeholder '2010-01-01'. The resulting array is kept in `x` for
# the scratch cells that follow.
x = SimpleImputer(strategy='constant', fill_value='2010-01-01').fit_transform(data[["last_review"]])
# dates = FunctionTransformer(delta_date_feature, validate=False).fit_transform(x)

In [56]:
# Earlier attempts, kept for reference:
# pd.to_datetime(pd.DataFrame(x, columns=["last_review"]))
# max_dates = date_sanitized.max()
# return date_sanitized.apply(lambda d: (max_dates - d.fillna(max_dates)).dt.days, axis=0).values

# Wrap the imputed array in a single-column frame and parse the strings as
# YYYY-MM-DD dates; values that do not match the format become NaT.
dates = pd.DataFrame(x, columns=["last_review"]).assign(
    last_review=lambda frame: pd.to_datetime(
        frame["last_review"], format="%Y-%m-%d", errors="coerce"
    )
)


In [68]:
# Most recent review date in the column; every delta is measured back from it.
max_dates = dates['last_review'].max()

# Days between each review date and the most recent one (vectorised subtraction).
# Missing dates (NaT) are replaced by the most recent date *before* subtracting, so they
# yield 0 days; filling after `.dt.days` would insert a Timestamp among the day counts.
(max_dates - dates['last_review'].fillna(max_dates)).dt.days

0         262
1          48
2        3475
3           3
4         231
         ... 
48890    3475
48891    3475
48892    3475
48893    3475
48894    3475
Name: last_review, Length: 48895, dtype: int64