In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class DataTypeHandler(BaseEstimator, TransformerMixin):
    """
    Enforces correct data types for numerical and categorical features.

    Numerical columns are converted with ``pandas.to_numeric`` (invalid
    values become NaN) and categorical columns are converted to the pandas
    ``category`` dtype, so later statistical or encoding steps see
    consistent dtypes.

    The transformer learns nothing from the data values; ``fit`` only
    records the input column names so that ``get_feature_names_out`` can
    be called without arguments.

    Parameters
    ----------
    numerical_cols : list of str or str, optional
        Column name(s) that should be treated as numerical features.
        Any non-convertible values will be coerced to NaN.
        ``None`` (the default) means "no numerical columns".

    categorical_cols : list of str or str, optional
        Column name(s) that should be treated as categorical features.
        These columns will be converted to pandas ``category`` dtype.
        ``None`` (the default) means "no categorical columns".

    Attributes
    ----------
    numerical_cols : list of str, str or None
        The ``numerical_cols`` argument, stored exactly as passed.

    categorical_cols : list of str, str or None
        The ``categorical_cols`` argument, stored exactly as passed.

    feature_names_in_ : numpy.ndarray of object
        Column names of the frame seen by ``fit``. Only set when ``fit``
        received an object exposing a ``columns`` attribute.
    """

    def __init__(self, numerical_cols=None, categorical_cols=None):
        """
        Initialize the DataTypeHandler transformer.

        Parameters
        ----------
        numerical_cols : list of str or str, optional
            Names of columns to be converted to numeric types.

        categorical_cols : list of str or str, optional
            Names of columns to be converted to categorical types.
        """
        # Store the arguments untouched. scikit-learn's clone() rebuilds the
        # estimator from get_params() and raises RuntimeError when __init__
        # swaps a parameter for a different object (e.g. None/[] -> new []).
        # Defaults are resolved lazily in transform() instead.
        self.numerical_cols = numerical_cols
        self.categorical_cols = categorical_cols

    @staticmethod
    def _as_column_list(cols):
        """Normalize a column spec (None, one name, or an iterable) to a list."""
        if cols is None:
            return []
        if isinstance(cols, str):
            # A bare string would otherwise be iterated character by character.
            return [cols]
        return list(cols)

    def fit(self, X, y=None):
        """
        Fit the transformer.

        No statistics are learned from the data. When ``X`` exposes a
        ``columns`` attribute, the column names are recorded in
        ``feature_names_in_``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input feature matrix.

        y : None, optional
            Target values (ignored).

        Returns
        -------
        self : DataTypeHandler
            Returns the instance itself.
        """
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X):
        """
        Apply data type enforcement to the input DataFrame.

        Numerical columns are converted using ``pandas.to_numeric`` with
        ``errors='coerce'`` so invalid values become NaN.
        Categorical columns are converted to pandas ``category`` dtype.
        Configured columns that are absent from ``X`` are skipped silently.
        The numerical pass runs first, so a column listed in both
        collections ends up as ``category``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input feature matrix to be transformed. It is not modified;
            a copy is returned.

        Returns
        -------
        X_transformed : pandas.DataFrame
            Transformed DataFrame with enforced data types.
        """
        X = X.copy()

        for col in self._as_column_list(self.numerical_cols):
            if col in X.columns:
                X[col] = pd.to_numeric(X[col], errors='coerce')

        for col in self._as_column_list(self.categorical_cols):
            if col in X.columns:
                X[col] = X[col].astype('category')

        return X

    def get_feature_names_out(self, input_features=None):
        """
        Get output feature names for transformation.

        The transformer neither adds, removes nor renames columns, so the
        output names equal the input names.

        Parameters
        ----------
        input_features : array-like of str, optional
            Input feature names. When omitted, the names recorded by
            ``fit`` are used if available.

        Returns
        -------
        feature_names : array-like of str or None
            ``input_features`` unchanged when given; otherwise
            ``feature_names_in_`` if ``fit`` recorded it, else ``None``.
        """
        if input_features is None:
            # Fall back to the names seen during fit, if any were recorded.
            return getattr(self, "feature_names_in_", None)
        return input_features


In [2]:
# Load the trip records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='fordgobike-tripdataFor201902.csv')

In [3]:
# Inspect the dtypes pandas inferred when reading the CSV.
df.dtypes

duration_sec                 int64
start_time                  object
end_time                    object
start_station_id           float64
start_station_name          object
start_station_latitude     float64
start_station_longitude    float64
end_station_id             float64
end_station_name            object
end_station_latitude       float64
end_station_longitude      float64
bike_id                      int64
user_type                   object
member_birth_year          float64
member_gender               object
bike_share_for_all_trip     object
dtype: object

In [5]:
# Columns to be coerced to numeric dtypes.
numerical_cols = [
    'duration_sec',
    'start_station_latitude', 'start_station_longitude',
    'end_station_latitude', 'end_station_longitude',
    'member_birth_year',
]

# Columns to be converted to the pandas ``category`` dtype.
categorical_cols = [
    'start_station_name', 'end_station_name',
    'user_type', 'member_gender',
    'bike_share_for_all_trip',
]


In [8]:
# Enforce the configured dtypes on the trip frame; `df` is rebound to the
# converted copy returned by the transformer.
handlr = DataTypeHandler(
    numerical_cols=numerical_cols,
    categorical_cols=categorical_cols,
)
df = handlr.fit_transform(df)

In [9]:
# Re-check the dtypes after DataTypeHandler ran: the columns listed in
# categorical_cols should now report ``category``.
df.dtypes

duration_sec                  int64
start_time                   object
end_time                     object
start_station_id            float64
start_station_name         category
start_station_latitude      float64
start_station_longitude     float64
end_station_id              float64
end_station_name           category
end_station_latitude        float64
end_station_longitude       float64
bike_id                       int64
user_type                  category
member_birth_year           float64
member_gender              category
bike_share_for_all_trip    category
dtype: object

In [2]:
# Create "dirty" sample data
# Small, deliberately "dirty" frame used to exercise the preprocessing steps.
dirty_data = pd.DataFrame({
    # Contains text ("Error"), an outlier value (150), and a missing value
    'Age': [25, 30, "Error", 22, 150, 28, np.nan, 32],
    # Contains an outlier value (300) and a missing value
    'Fare': [10, 15, 12, 300, 14, np.nan, 18, 20],
    # Contains a missing value
    'Gender': ['M', 'F', 'M', np.nan, 'F', 'M', 'F', 'F'],
})

# `dirty_data` is already a DataFrame, so it is printed directly without
# being wrapped in pd.DataFrame a second time.
print("--- Data before preprocessing ---")
print(dirty_data)


--- Data before preprocessing ---
     Age   Fare Gender
0     25   10.0      M
1     30   15.0      F
2  Error   12.0      M
3     22  300.0    NaN
4    150   14.0      F
5     28    NaN      M
6    NaN   18.0      F
7     32   20.0      F


In [3]:
# 'Age' and 'Fare' are the numeric features; 'Gender' is categorical only.
# (Listing 'Gender' as numerical would coerce every value to NaN.)
num_cols = ['Age', 'Fare']
cat_cols = ['Gender']

# NOTE: despite its name, `x_train` holds the transformer, not the data.
x_train = DataTypeHandler(num_cols, cat_cols)
# Fit on the sample frame; a transformer cannot be fitted on itself.
x_train.fit(dirty_data)

AttributeError: 'DataTypeHandler' object has no attribute 'fit'

In [4]:
from sklearn.pipeline import Pipeline

# Column configuration for the sample frame.
num_cols = ['Age', 'Fare']
cat_cols = ['Gender']

# A one-step pipeline whose only stage enforces the column dtypes.
pipeline = Pipeline(
    steps=[
        (
            'step1_types',
            DataTypeHandler(numerical_cols=num_cols, categorical_cols=cat_cols),
        ),
    ]
)

# Fit and transform in a single call, then report the resulting dtypes.
clean_data = pipeline.fit_transform(dirty_data)
print("\n--- Data after preprocessing (final result) ---")
new = pd.DataFrame(clean_data)
print(new.dtypes)


TypeError: Last step of Pipeline should implement fit or be the string 'passthrough'. 'DataTypeHandler(categorical_cols=['Gender'], numerical_cols=['Age', 'Fare'])' (type <class '__main__.DataTypeHandler'>) doesn't

In [16]:
# unit test

In [None]:
import unittest
import pandas as pd

class TestDataTypeHandler(unittest.TestCase):
    def setUp(self):
        self.raw_data = pd.DataFrame({'age': ['25', '30'], 'city': ['Cairo', 'Giza']})
        self.handler = DataTypeHandler(numerical_cols=['age'], categorical_cols=['city'])

    def test_conversion(self):
        cleaned_df = self.handler.fit_transform(self.raw_data)
        self.assertTrue(pd.api.types.is_numeric_dtype(cleaned_df['age']))

if __name__ == '__main__':
    # Pass an explicit placeholder argv so unittest does not parse the
    # process's own command-line arguments, and exit=False so it does not
    # call sys.exit() once the tests finish.
    unittest.main(
        argv=['first-arg-is-ignored'],
        exit=False,
    )

.
----------------------------------------------------------------------
Ran 1 test in 0.005s

OK
