In [21]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

In [22]:
# Read the raw training set into a DataFrame (path is relative to the notebook)
df = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [23]:
# Parse 'Dates' into datetimes, then derive one calendar column per component.
# Columns are added in the order listed below.
df['Dates'] = pd.to_datetime(df['Dates'])
for feature_name, dt_attr in (
    ('Hour', 'hour'),
    ('Day', 'day'),          # day of the month
    ('Month', 'month'),
    ('Weekday', 'weekday'),  # pandas convention: Monday=0 ... Sunday=6
):
    df[feature_name] = getattr(df['Dates'].dt, dt_attr)

In [24]:
# Feature groups routed to the encoder / scaler of the ColumnTransformer
categorical_cols = [
    'DayOfWeek',
    'PdDistrict',
]
numeric_cols = [
    'X',
    'Y',
    'Hour',
    'Day',
    'Month',
    'Weekday',
]

In [25]:
# Build the per-column-group preprocessing:
#   * 'cat': one-hot encode the categorical columns, dropping the first
#     category of each feature and returning a dense array.
#   * 'num': rescale the numeric columns with min-max scaling.
# Columns not listed in either group are dropped (ColumnTransformer default).
# NOTE(review): in the output displayed further down, the scaled X and Y values
# all sit close to 0 (Y around 0.0001-0.002) -- possibly an extreme coordinate
# stretching the min-max range; TODO confirm against the raw data.
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
min_max_scaler = MinMaxScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_cols),
        ('num', min_max_scaler, numeric_cols),
    ]
)

In [26]:
# Wrap the ColumnTransformer in a single-step Pipeline named 'preprocessor'
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [27]:
# Fit the pipeline on the full frame and produce the transformed feature matrix
features_scaled = pipeline.fit_transform(X=df)

In [28]:
# Recover column names for the transformed matrix.
#
# The fitted ColumnTransformer is asked for its own output names so the labels
# always follow the real column order of `features_scaled`, instead of relying
# on the transformers being declared in a particular order.
fitted_preprocessor = pipeline.named_steps['preprocessor']

# One-hot column names only; the encoder is looked up by its step name ('cat')
# rather than by position in `transformers_`.
cat_feature_names = list(
    fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
)

# By default ColumnTransformer prefixes each name with "<transformer>__"
# (e.g. "cat__DayOfWeek_Monday", "num__X"); strip that prefix so the saved CSV
# keeps the plain names.
feature_names = [
    name.split('__', 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

In [29]:
# Label the transformed matrix with the recovered feature names and write it
# out as CSV (without the index column)
preprocessed_df = pd.DataFrame(data=features_scaled, columns=feature_names)
preprocessed_df.to_csv(path_or_buf="../data/preprocessed_data.csv", index=False)

# Persist the fitted pipeline; joblib.dump returns the list of written
# filenames, which the notebook shows as this cell's output
joblib.dump(value=pipeline, filename='../models/preprocessing_pipeline.joblib')

['../models/preprocessing_pipeline.joblib']

In [30]:
# Display the preprocessed frame (rendered below as a truncated preview)
preprocessed_df

Unnamed: 0,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,...,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,X,Y,Hour,Day,Month,Weekday
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.043578,0.001276,1.0,0.400000,0.363636,0.333333
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.043578,0.001276,1.0,0.400000,0.363636,0.333333
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.044337,0.001770,1.0,0.400000,0.363636,0.333333
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.043030,0.001778,1.0,0.400000,0.363636,0.333333
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.037198,0.001217,1.0,0.400000,0.363636,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.027120,0.000118,0.0,0.166667,0.000000,0.000000
878045,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.032915,0.000460,0.0,0.166667,0.000000,0.000000
878046,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.054752,0.001384,0.0,0.166667,0.000000,0.000000
878047,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.061138,0.001391,0.0,0.166667,0.000000,0.000000


In [31]:
# List the final column names: one-hot columns first, then the scaled numeric columns
preprocessed_df.columns

Index(['DayOfWeek_Monday', 'DayOfWeek_Saturday', 'DayOfWeek_Sunday',
       'DayOfWeek_Thursday', 'DayOfWeek_Tuesday', 'DayOfWeek_Wednesday',
       'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE', 'PdDistrict_MISSION',
       'PdDistrict_NORTHERN', 'PdDistrict_PARK', 'PdDistrict_RICHMOND',
       'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN',
       'X', 'Y', 'Hour', 'Day', 'Month', 'Weekday'],
      dtype='object')