In [18]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
# Percentage thresholds for categorical handling.
# NOTE(review): nothing in this cell reads these two names; the FeaturePreprocessor
# defined further down hard-codes the same values (10 and 8) as its defaults.
CAT_THRESHOLD = 10   # Missing-value % at or below which a column is imputed with its mode; above it, missing values become 'Unknown'
MERGE_THRESHOLD = 8  # Levels whose share is below this % are merged into a single 'Other' level

class ColumnProcessor:
    """Handles column normalization"""

    @staticmethod
    def normalize_column_names(df):
        """Return a copy of ``df`` whose column labels contain only letters and digits.

        Every character outside ``[a-zA-Z0-9]`` (whitespace, punctuation,
        underscores, ...) is removed from each label. Labels that are not
        strings (for example integer column positions) are converted with
        ``str()`` first instead of raising ``AttributeError``.

        The input frame is not modified. Distinct labels that differ only in
        removed characters end up with the same name; no de-duplication is done.

        Parameters:
        df (pandas.DataFrame): Frame whose column labels should be cleaned.

        Returns:
        pandas.DataFrame: Copy of ``df`` with cleaned column labels.
        """
        df_copy = df.copy()
        # No explicit strip() needed: whitespace is already outside the kept character class.
        df_copy.columns = [re.sub(r'[^a-zA-Z0-9]', '', str(col)) for col in df_copy.columns]
        return df_copy

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Appends derived housing features to a DataFrame.

    The derived columns are TotalSqFt, HouseAge, TotalBaths and YrRemodAge.
    The source columns they are computed from are kept in the output; nothing
    is dropped by this transformer.
    """

    def __init__(self):
        # Bookkeeping slots; only ``original_features`` is assigned later (in fit).
        self.original_features = None
        self.engineered_features = None
        self.features_to_drop = None

    def fit(self, X, y=None):
        """Remember the incoming column labels; no statistics are learned."""
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        self.original_features = frame.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every derivable feature appended.

        A feature is only added when all of its source columns are present;
        otherwise the missing source columns are reported on stdout and the
        feature is skipped. Progress is printed for every feature.
        """
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        data = X.copy()

        print("\nFeature Engineering Step:")
        print("Input columns:", data.columns.tolist())

        def number(name, fill_gaps=False):
            # Source column as float, optionally treating missing values as 0.
            values = data[name].fillna(0) if fill_gaps else data[name]
            return values.astype(float)

        # (new feature, required source columns, deferred computation)
        recipes = [
            (
                'TotalSqFt',
                ['GrLivArea', 'TotalBsmtSF'],
                lambda: number('GrLivArea') + number('TotalBsmtSF', True),
            ),
            (
                'HouseAge',
                ['YrSold', 'YearBuilt'],
                lambda: (number('YrSold') - number('YearBuilt')).abs(),
            ),
            (
                'TotalBaths',
                ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'],
                # Half baths count as 0.5 of a bathroom.
                lambda: (
                    number('FullBath')
                    + 0.5 * number('HalfBath', True)
                    + number('BsmtFullBath', True)
                    + 0.5 * number('BsmtHalfBath', True)
                ),
            ),
            (
                'YrRemodAge',
                ['YrSold', 'YearRemodAdd'],
                lambda: (number('YrSold') - number('YearRemodAdd')).abs(),
            ),
        ]

        for feature, needed, compute in recipes:
            absent = set(needed) - set(data.columns)
            if absent:
                print(f"Missing columns for {feature}:", absent)
            else:
                data[feature] = compute()
                print(f"Added {feature}")

        added = [col for col in data.columns if col not in X.columns]
        print("\nEngineered columns added:", added)
        print("Final columns:", data.columns.tolist())
        return data

class DataTransformer:
    """Loads a housing CSV and runs it through the preprocessing pipeline.

    The pipeline imputes/consolidates raw columns, derives engineered
    features, then standard-scales the numeric columns and one-hot encodes
    the categorical ones.
    """

    def __init__(self):
        # Raw columns that FeatureEngineer combines into derived features.
        self.engineering_features = [
            'GrLivArea', 'TotalBsmtSF', 'YrSold', 'YearBuilt',
            'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
            'YearRemodAdd',
        ]

        # Raw numeric columns that go straight into the final scaler.
        self.original_numeric_features = [
            'OverallQual', 'TotRmsAbvGrd', 'GarageCars',
            'Fireplaces', 'LotFrontage', 'MasVnrArea',
        ]

        # Columns produced by FeatureEngineer.
        self.engineered_features = ['TotalSqFt', 'HouseAge', 'TotalBaths', 'YrRemodAge']

        # Everything the final StandardScaler sees.
        self.numeric_features = [*self.original_numeric_features, *self.engineered_features]

        self.categorical_features = ['Neighborhood', 'FireplaceQu', 'KitchenQual', 'BsmtExposure']

        self.column_processor = ColumnProcessor()

    def create_pipeline(self):
        """Build the three-stage pipeline: clean-up, feature engineering, scaling/encoding."""
        cleanup = FeaturePreprocessor(
            numeric_features=self.engineering_features + self.original_numeric_features,
            categorical_features=self.categorical_features
        )
        engineer = FeatureEngineer()

        scaler = StandardScaler()
        encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

        # Only the listed numeric and categorical columns survive; the rest is dropped.
        finalizer = ColumnTransformer(
            transformers=[
                ('num', scaler, self.numeric_features),
                ('cat', encoder, self.categorical_features),
            ],
            remainder='drop'
        )

        steps = [
            ('initial_preprocessing', cleanup),
            ('feature_engineering', engineer),
            ('final_preprocessing', finalizer),
        ]
        return Pipeline(steps)

    def transform_data(self, data_path):
        """Read the CSV at ``data_path``, fit the pipeline on it and return the transformed frame."""
        data = self.column_processor.normalize_column_names(pd.read_csv(data_path))

        print("\nFeature Groups After Normalization:")
        groups = (
            ("Engineering features:", self.engineering_features),
            ("Original numeric features:", self.original_numeric_features),
            ("Engineered features to create:", self.engineered_features),
            ("All numeric features for final transform:", self.numeric_features),
            ("Categorical features:", self.categorical_features),
        )
        for label, members in groups:
            print(label, members)

        pipeline = self.create_pipeline()
        transformed_data = pipeline.fit_transform(data)

        # Column order of the ColumnTransformer output: numeric block first, then one-hot block.
        fitted = pipeline.named_steps['final_preprocessing'].named_transformers_
        feature_names = np.concatenate([
            fitted['num'].get_feature_names_out(),
            fitted['cat'].get_feature_names_out(),
        ])

        transformed_df = pd.DataFrame(transformed_data, columns=feature_names)

        print("\nFinal Transformation Summary:")
        print(f"Original shape: {data.shape}")
        print(f"Transformed shape: {transformed_df.shape}")
        print("Final columns:", transformed_df.columns.tolist())

        return transformed_df

# Usage example: fit the pipeline on the Ames housing CSV and keep the transformed frame.
# NOTE(review): DataTransformer.create_pipeline() looks up FeaturePreprocessor, which is
# only defined further down in this cell. Running the cell top-to-bottom in a fresh
# interpreter would raise NameError here unless that class already exists in the session.
if __name__ == "__main__":
    transformer = DataTransformer()
    transformed_data = transformer.transform_data('./data/raw_data/AmesHousing.csv')
       
class FeaturePreprocessor(BaseEstimator, TransformerMixin):
    """Imputes numeric columns and consolidates categorical levels using thresholds.

    Numeric columns are imputed with their median. Each categorical column is
    handled in one of two ways, decided once at ``fit`` time:

    * more than ``cat_threshold`` percent missing: missing values become the
      explicit level ``'Unknown'``;
    * otherwise: missing values are replaced by the column's mode, and levels
      whose share is below ``merge_threshold`` percent are merged into
      ``'Other'``.

    Parameters:
    numeric_features (list): Columns passed to the median imputer.
    categorical_features (list): Columns to consolidate; columns absent from
        the data are skipped.
    cat_threshold (float): Missing-value percentage separating the two modes.
    merge_threshold (float): Minimum percentage share for a level to be kept.

    Attributes set by ``fit``:
    cat_modes (dict): column -> mode, for columns handled by mode imputation.
    category_maps (dict): column -> list of levels expected after transform.
    """

    def __init__(
        self,
        numeric_features,
        categorical_features,
        cat_threshold=10,    # Missing % at or below which the mode is imputed
        merge_threshold=8    # Levels below this % share are merged into 'Other'
    ):
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.cat_threshold = cat_threshold
        self.merge_threshold = merge_threshold
        self.num_imputer = SimpleImputer(strategy='median')
        self.cat_modes = {}
        self.category_maps = {}

    def fit(self, X, y=None):
        """Learn numeric medians and the per-column categorical handling from ``X``."""
        data = X.copy()

        self.num_imputer.fit(data[self.numeric_features])

        # Start from a clean slate so a refit never carries over columns or
        # levels learned from previously seen data.
        self.cat_modes = {}
        self.category_maps = {}

        for col in self.categorical_features:
            if col not in data.columns:
                continue

            missing_ratio = data[col].isnull().mean() * 100
            if missing_ratio > self.cat_threshold:
                # Too much is missing to trust the mode: keep all levels, add 'Unknown'.
                self.category_maps[col] = list(data[col].dropna().unique()) + ['Unknown']
                continue

            self.cat_modes[col] = data[col].mode()[0]
            shares = data[col].value_counts(normalize=True) * 100
            frequent = shares >= self.merge_threshold
            keep_categories = shares[frequent].index.tolist()
            if not frequent.all():
                keep_categories.append('Other')
            self.category_maps[col] = keep_categories

        return self

    def transform(self, X):
        """Apply the handling learned in ``fit`` to a copy of ``X``.

        The choice between 'Unknown' filling and mode imputation comes from the
        fitted state, not from the missing ratio of ``X``, so new data is
        treated exactly like the training data. Categorical columns that were
        not seen during ``fit`` are left untouched.
        """
        data = X.copy()

        data[self.numeric_features] = self.num_imputer.transform(data[self.numeric_features])

        for col in self.categorical_features:
            if col not in data.columns or col not in self.category_maps:
                continue

            if col not in self.cat_modes:
                # Column was in the high-missing regime at fit time.
                data[col] = data[col].fillna('Unknown')
                continue

            mode = self.cat_modes[col]
            known_categories = self.category_maps[col]
            filled = data[col].fillna(mode)
            # When nothing was merged at fit time every training level was kept,
            # so the mode is guaranteed to be a known level and serves as a
            # deterministic fallback for levels never seen before.
            fallback = 'Other' if 'Other' in known_categories else mode
            data[col] = filled.where(filled.isin(known_categories), fallback)

        return data

class DataTransformer:
    """Data transformer with feature engineering, custom categorical handling, and SalePrice.

    After ``transform_data`` has run, the instance exposes ``original_data``
    (the CSV as read), ``original_prices`` (the SalePrice column of the
    name-normalized data) and ``pipeline`` (the fitted pipeline).
    """

    def __init__(self):
        # Raw columns that FeatureEngineer combines into derived features.
        self.engineering_features = [
            'GrLivArea', 'TotalBsmtSF',
            'YrSold', 'YearBuilt',
            'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
            'YearRemodAdd'
        ]

        # Raw numeric columns that go straight into the final scaler.
        self.original_numeric_features = [
            'OverallQual',
            'TotRmsAbvGrd',
            'GarageCars',
            'Fireplaces',
            'LotFrontage',
            'MasVnrArea',
            'SalePrice'  # Added SalePrice
        ]

        # Columns produced by FeatureEngineer.
        self.engineered_features = [
            'TotalSqFt',
            'HouseAge',
            'TotalBaths',
            'YrRemodAge'
        ]

        self.numeric_features = self.original_numeric_features + self.engineered_features

        self.categorical_features = [
            'Neighborhood',
            'FireplaceQu',
            'KitchenQual',
            'BsmtExposure'
        ]

        self.column_processor = ColumnProcessor()

    @staticmethod
    def _normalize_names(names):
        """Clean feature names the same way the data's column labels are cleaned."""
        return [re.sub(r'[^a-zA-Z0-9]', '', name.strip()) for name in names]

    def create_pipeline(self):
        """Creates preprocessing pipeline with feature engineering and custom categorical handling"""
        # Initial preprocessing with custom categorical handling
        initial_preprocessor = FeaturePreprocessor(
            numeric_features=self.engineering_features + self.original_numeric_features,
            categorical_features=self.categorical_features
        )

        # Feature engineering step
        feature_engineer = FeatureEngineer()

        # Final transformers
        numeric_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(
            drop='first',
            sparse_output=False,
            handle_unknown='ignore'
        )

        # Final preprocessing: columns not listed here are dropped
        final_preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            remainder='drop'
        )

        return Pipeline([
            ('initial_preprocessing', initial_preprocessor),
            ('feature_engineering', feature_engineer),
            ('final_preprocessing', final_preprocessor)
        ])

    def transform_data(self, data_path):
        """Load the CSV at ``data_path``, fit the pipeline on it and transform it.

        Parameters:
        data_path (str): Path of the CSV file to read.

        Returns:
        tuple: ``(transformed_df, original_data)`` where ``transformed_df``
        holds the scaled numeric and one-hot encoded columns (same row index
        as the loaded data) and ``original_data`` is the CSV exactly as read.
        """
        # Load data and normalize column names
        data = pd.read_csv(data_path)
        self.original_data = data.copy()  # Store original data
        data = self.column_processor.normalize_column_names(data)

        # Update feature names to match normalized column names
        self.engineering_features = self._normalize_names(self.engineering_features)
        self.original_numeric_features = self._normalize_names(self.original_numeric_features)
        self.categorical_features = self._normalize_names(self.categorical_features)
        # Rebuild the derived list so the final scaler never sees stale,
        # un-normalized names.
        self.numeric_features = self.original_numeric_features + self.engineered_features

        # Create and fit pipeline
        pipeline = self.create_pipeline()
        transformed_data = pipeline.fit_transform(data)

        # Get feature names (numeric block first, then the one-hot block)
        final_preprocessor = pipeline.named_steps['final_preprocessing']
        numeric_features = final_preprocessor.named_transformers_['num'].get_feature_names_out()
        categorical_features = final_preprocessor.named_transformers_['cat'].get_feature_names_out()
        feature_names = np.concatenate([numeric_features, categorical_features])

        # Keep the input's row index so transformed rows can be matched back
        # to the rows of the loaded data.
        transformed_df = pd.DataFrame(transformed_data, columns=feature_names, index=data.index)

        # Store original prices for reference
        self.original_prices = data['SalePrice'].copy()

        # Print transformation summary
        print("\nTransformation Summary:")
        print("Original columns:", data.columns.tolist())
        print("Normalized columns:", transformed_df.columns.tolist())
        print(f"\nOriginal shape: {data.shape}")
        print(f"Transformed shape: {transformed_df.shape}")
        print("\nEngineered features:", self.engineered_features)
        print("\nCategorical value handling:")
        initial_preprocessor = pipeline.named_steps['initial_preprocessing']
        for col in self.categorical_features:
            print(f"\n{col}:")
            # unique() counts a missing value as its own entry
            print(f"  Original categories: {len(data[col].unique())}")
            print(f"  Final categories: {len(initial_preprocessor.category_maps[col])}")
            print(f"  Categories kept: {initial_preprocessor.category_maps[col]}")

        # Store the pipeline for inverse transforms if needed
        self.pipeline = pipeline

        return transformed_df, self.original_data

# Usage example: transform_data() returns a pair, the transformed feature frame
# and the CSV exactly as it was read.
if __name__ == "__main__":
    transformer = DataTransformer()
    transformed_data, original_data = transformer.transform_data('./data/raw_data/AmesHousing.csv')


Feature Groups After Normalization:
Engineering features: ['GrLivArea', 'TotalBsmtSF', 'YrSold', 'YearBuilt', 'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath', 'YearRemodAdd']
Original numeric features: ['OverallQual', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'LotFrontage', 'MasVnrArea']
Engineered features to create: ['TotalSqFt', 'HouseAge', 'TotalBaths', 'YrRemodAge']
All numeric features for final transform: ['OverallQual', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'LotFrontage', 'MasVnrArea', 'TotalSqFt', 'HouseAge', 'TotalBaths', 'YrRemodAge']
Categorical features: ['Neighborhood', 'FireplaceQu', 'KitchenQual', 'BsmtExposure']

Feature Engineering Step:
Input columns: ['PID', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exter

In [78]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

# Percentage thresholds used as FeaturePreprocessor defaults.
CAT_THRESHOLD = 10   # Missing-value % at or below which a column is imputed with its mode; above it, missing values become 'Unknown'
MERGE_THRESHOLD = 8  # Levels whose share is below this % are merged into a single 'Other' level

class ColumnProcessor:
    """Handles column normalization"""

    @staticmethod
    def normalize_column_names(df):
        """Return a copy of ``df`` whose column labels contain only letters and digits.

        Every character outside ``[a-zA-Z0-9]`` (whitespace, punctuation,
        underscores, ...) is removed from each label. Labels that are not
        strings (for example integer column positions) are converted with
        ``str()`` first instead of raising ``AttributeError``.

        The input frame is not modified. Distinct labels that differ only in
        removed characters end up with the same name; no de-duplication is done.

        Parameters:
        df (pandas.DataFrame): Frame whose column labels should be cleaned.

        Returns:
        pandas.DataFrame: Copy of ``df`` with cleaned column labels.
        """
        df_copy = df.copy()
        # No explicit strip() needed: whitespace is already outside the kept character class.
        df_copy.columns = [re.sub(r'[^a-zA-Z0-9]', '', str(col)) for col in df_copy.columns]
        return df_copy

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Appends derived housing features to a DataFrame.

    The derived columns are TotalSqFt, HouseAge, TotalBaths and YrRemodAge.
    The source columns they are computed from are kept in the output; nothing
    is dropped by this transformer.
    """

    def __init__(self):
        # Bookkeeping slots; only ``original_features`` is assigned later (in fit).
        self.original_features = None
        self.engineered_features = None
        self.features_to_drop = None

    def fit(self, X, y=None):
        """Remember the incoming column labels; no statistics are learned."""
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        self.original_features = frame.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every derivable feature appended.

        A feature is only added when all of its source columns are present;
        otherwise the missing source columns are reported on stdout and the
        feature is skipped. Progress is printed for every feature.
        """
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        data = X.copy()

        print("\nFeature Engineering Step:")
        print("Input columns:", data.columns.tolist())

        def number(name, fill_gaps=False):
            # Source column as float, optionally treating missing values as 0.
            values = data[name].fillna(0) if fill_gaps else data[name]
            return values.astype(float)

        # (new feature, required source columns, deferred computation)
        recipes = [
            (
                'TotalSqFt',
                ['GrLivArea', 'TotalBsmtSF'],
                lambda: number('GrLivArea') + number('TotalBsmtSF', True),
            ),
            (
                'HouseAge',
                ['YrSold', 'YearBuilt'],
                lambda: (number('YrSold') - number('YearBuilt')).abs(),
            ),
            (
                'TotalBaths',
                ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'],
                # Half baths count as 0.5 of a bathroom.
                lambda: (
                    number('FullBath')
                    + 0.5 * number('HalfBath', True)
                    + number('BsmtFullBath', True)
                    + 0.5 * number('BsmtHalfBath', True)
                ),
            ),
            (
                'YrRemodAge',
                ['YrSold', 'YearRemodAdd'],
                lambda: (number('YrSold') - number('YearRemodAdd')).abs(),
            ),
        ]

        for feature, needed, compute in recipes:
            absent = set(needed) - set(data.columns)
            if absent:
                print(f"Missing columns for {feature}:", absent)
            else:
                data[feature] = compute()
                print(f"Added {feature}")

        added = [col for col in data.columns if col not in X.columns]
        print("\nEngineered columns added:", added)
        print("Final columns:", data.columns.tolist())
        return data

class FeaturePreprocessor(BaseEstimator, TransformerMixin):
    """Imputes numeric columns and consolidates categorical levels using thresholds.

    Numeric columns are imputed with their median. Each categorical column is
    handled in one of two ways, decided once at ``fit`` time:

    * more than ``cat_threshold`` percent missing: missing values become the
      explicit level ``'Unknown'``;
    * otherwise: missing values are replaced by the column's mode, and levels
      whose share is below ``merge_threshold`` percent are merged into
      ``'Other'``.

    Parameters:
    numeric_features (list): Columns passed to the median imputer.
    categorical_features (list): Columns to consolidate; columns absent from
        the data are skipped.
    cat_threshold (float): Missing-value percentage separating the two modes.
    merge_threshold (float): Minimum percentage share for a level to be kept.

    Attributes set by ``fit``:
    cat_modes (dict): column -> mode, for columns handled by mode imputation.
    category_maps (dict): column -> list of levels expected after transform.
    """

    def __init__(
        self,
        numeric_features,
        categorical_features,
        cat_threshold=CAT_THRESHOLD,    # Missing % at or below which the mode is imputed
        merge_threshold=MERGE_THRESHOLD    # Levels below this % share are merged into 'Other'
    ):
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.cat_threshold = cat_threshold
        self.merge_threshold = merge_threshold
        self.num_imputer = SimpleImputer(strategy='median')
        self.cat_modes = {}
        self.category_maps = {}

    def fit(self, X, y=None):
        """Learn numeric medians and the per-column categorical handling from ``X``."""
        data = X.copy()

        self.num_imputer.fit(data[self.numeric_features])

        # Start from a clean slate so a refit never carries over columns or
        # levels learned from previously seen data.
        self.cat_modes = {}
        self.category_maps = {}

        for col in self.categorical_features:
            if col not in data.columns:
                continue

            missing_ratio = data[col].isnull().mean() * 100
            if missing_ratio > self.cat_threshold:
                # Too much is missing to trust the mode: keep all levels, add 'Unknown'.
                self.category_maps[col] = list(data[col].dropna().unique()) + ['Unknown']
                continue

            self.cat_modes[col] = data[col].mode()[0]
            shares = data[col].value_counts(normalize=True) * 100
            frequent = shares >= self.merge_threshold
            keep_categories = shares[frequent].index.tolist()
            if not frequent.all():
                keep_categories.append('Other')
            self.category_maps[col] = keep_categories

        return self

    def transform(self, X):
        """Apply the handling learned in ``fit`` to a copy of ``X``.

        The choice between 'Unknown' filling and mode imputation comes from the
        fitted state, not from the missing ratio of ``X``, so new data is
        treated exactly like the training data. Categorical columns that were
        not seen during ``fit`` are left untouched.
        """
        data = X.copy()

        data[self.numeric_features] = self.num_imputer.transform(data[self.numeric_features])

        for col in self.categorical_features:
            if col not in data.columns or col not in self.category_maps:
                continue

            if col not in self.cat_modes:
                # Column was in the high-missing regime at fit time.
                data[col] = data[col].fillna('Unknown')
                continue

            mode = self.cat_modes[col]
            known_categories = self.category_maps[col]
            filled = data[col].fillna(mode)
            # When nothing was merged at fit time every training level was kept,
            # so the mode is guaranteed to be a known level and serves as a
            # deterministic fallback for levels never seen before.
            fallback = 'Other' if 'Other' in known_categories else mode
            data[col] = filled.where(filled.isin(known_categories), fallback)

        return data

class DataTransformer:
    """Data transformer with feature engineering, custom categorical handling, and SalePrice.

    After ``transform_data`` has run, the instance exposes ``original_data``
    (the CSV as read), ``original_prices`` (the SalePrice column of the
    name-normalized data) and ``pipeline`` (the fitted pipeline).
    """

    def __init__(self):
        # Raw columns that FeatureEngineer combines into derived features.
        self.engineering_features = [
            'GrLivArea', 'TotalBsmtSF',
            'YrSold', 'YearBuilt',
            'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
            'YearRemodAdd'
        ]

        # Raw numeric columns that go straight into the final scaler.
        self.original_numeric_features = [
            'OverallQual',
            'TotRmsAbvGrd',
            'GarageCars',
            'Fireplaces',
            'LotFrontage',
            'MasVnrArea',
            'SalePrice'  # Added SalePrice
        ]

        # Columns produced by FeatureEngineer.
        self.engineered_features = [
            'TotalSqFt',
            'HouseAge',
            'TotalBaths',
            'YrRemodAge'
        ]

        self.numeric_features = self.original_numeric_features + self.engineered_features

        self.categorical_features = [
            'Neighborhood',
            'FireplaceQu',
            'KitchenQual',
            'BsmtExposure'
        ]

        self.column_processor = ColumnProcessor()

    @staticmethod
    def _normalize_names(names):
        """Clean feature names the same way the data's column labels are cleaned."""
        return [re.sub(r'[^a-zA-Z0-9]', '', name.strip()) for name in names]

    def create_pipeline(self):
        """Creates preprocessing pipeline with feature engineering and custom categorical handling"""
        # Initial preprocessing with custom categorical handling
        initial_preprocessor = FeaturePreprocessor(
            numeric_features=self.engineering_features + self.original_numeric_features,
            categorical_features=self.categorical_features,
            cat_threshold=CAT_THRESHOLD,
            merge_threshold=MERGE_THRESHOLD
        )

        # Feature engineering step
        feature_engineer = FeatureEngineer()

        # Final transformers
        numeric_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(
            drop='first',
            sparse_output=False,
            handle_unknown='ignore'
        )

        # Final preprocessing: columns not listed here are dropped
        final_preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            remainder='drop'
        )

        return Pipeline([
            ('initial_preprocessing', initial_preprocessor),
            ('feature_engineering', feature_engineer),
            ('final_preprocessing', final_preprocessor)
        ])

    def transform_data(self, data_path):
        """Load the CSV at ``data_path``, fit the pipeline on it and transform it.

        Parameters:
        data_path (str): Path of the CSV file to read.

        Returns:
        tuple: ``(transformed_df, original_data)`` where ``transformed_df``
        holds the scaled numeric and one-hot encoded columns (same row index
        as the loaded data) and ``original_data`` is the CSV exactly as read.
        """
        # Load data and normalize column names
        data = pd.read_csv(data_path)
        self.original_data = data.copy()  # Store original data
        data = self.column_processor.normalize_column_names(data)

        # Update feature names to match normalized column names
        self.engineering_features = self._normalize_names(self.engineering_features)
        self.original_numeric_features = self._normalize_names(self.original_numeric_features)
        self.categorical_features = self._normalize_names(self.categorical_features)
        # Rebuild the derived list so the final scaler never sees stale,
        # un-normalized names.
        self.numeric_features = self.original_numeric_features + self.engineered_features

        # Create and fit pipeline
        pipeline = self.create_pipeline()
        transformed_data = pipeline.fit_transform(data)

        # Get feature names (numeric block first, then the one-hot block)
        final_preprocessor = pipeline.named_steps['final_preprocessing']
        numeric_features = final_preprocessor.named_transformers_['num'].get_feature_names_out()
        categorical_features = final_preprocessor.named_transformers_['cat'].get_feature_names_out()
        feature_names = np.concatenate([numeric_features, categorical_features])

        # Keep the input's row index so transformed rows can be matched back
        # to the rows of the loaded data.
        transformed_df = pd.DataFrame(transformed_data, columns=feature_names, index=data.index)

        # Store original prices for reference
        self.original_prices = data['SalePrice'].copy()

        # Print transformation summary
        print("\nTransformation Summary:")
        print("Original columns:", data.columns.tolist())
        print("Normalized columns:", transformed_df.columns.tolist())
        print(f"\nOriginal shape: {data.shape}")
        print(f"Transformed shape: {transformed_df.shape}")
        print("\nEngineered features:", self.engineered_features)
        print("\nCategorical value handling:")
        initial_preprocessor = pipeline.named_steps['initial_preprocessing']
        for col in self.categorical_features:
            print(f"\n{col}:")
            # unique() counts a missing value as its own entry
            print(f"  Original categories: {len(data[col].unique())}")
            print(f"  Final categories: {len(initial_preprocessor.category_maps[col])}")
            print(f"  Categories kept: {initial_preprocessor.category_maps[col]}")

        # Store the pipeline for inverse transforms if needed
        self.pipeline = pipeline

        return transformed_df, self.original_data
    
# Usage example: transform_data() returns a pair, the transformed feature frame
# and the CSV exactly as it was read. Both names are inspected by later cells.
if __name__ == "__main__":
    transformer = DataTransformer()
    transformed_data, original_data = transformer.transform_data('./data/raw_data/AmesHousing.csv')


Feature Engineering Step:
Input columns: ['PID', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3Ssn

In [79]:
# Display the column labels of the transformed feature frame.
transformed_data.columns

Index(['OverallQual', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces',
       'LotFrontage', 'MasVnrArea', 'SalePrice', 'TotalSqFt', 'HouseAge',
       'TotalBaths', 'YrRemodAge', 'Neighborhood_NAmes',
       'Neighborhood_OldTown', 'Neighborhood_Other', 'FireplaceQu_Fa',
       'FireplaceQu_Gd', 'FireplaceQu_Po', 'FireplaceQu_TA',
       'FireplaceQu_Unknown', 'KitchenQual_Other', 'KitchenQual_TA',
       'BsmtExposure_Gd', 'BsmtExposure_Mn', 'BsmtExposure_No'],
      dtype='object')

In [80]:
# Display the transformed feature frame itself.
transformed_data

Unnamed: 0,OverallQual,TotRmsAbvGrd,GarageCars,Fireplaces,LotFrontage,MasVnrArea,SalePrice,TotalSqFt,HouseAge,TotalBaths,YrRemodAge,Neighborhood_NAmes,Neighborhood_OldTown,Neighborhood_Other,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_Po,FireplaceQu_TA,FireplaceQu_Unknown,KitchenQual_Other,KitchenQual_TA,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No
0,-0.067254,0.354167,0.306589,2.162180,3.375742,0.061046,0.428229,0.229567,0.447912,-0.269932,1.269572,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-0.776079,-0.917535,-1.008648,-0.925143,0.514952,-0.566039,-0.948957,-0.961044,0.414893,-1.508619,1.221616,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,-0.067254,-0.281684,-1.008648,-0.925143,0.561850,0.038650,-0.110125,0.132628,0.513951,-0.889276,1.365485,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.641571,0.990018,0.306589,2.162180,1.124628,-0.566039,0.791305,2.073895,0.183758,1.588098,0.885920,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.776079,-0.281684,0.306589,0.618518,0.233563,-0.566039,0.113980,0.007104,-0.773800,0.349411,-0.552777,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,-0.067254,-0.281684,0.306589,-0.925143,-1.501671,-0.566039,-0.479462,-0.677684,-0.476627,-0.269932,-0.073211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2926,-0.776079,-0.917535,0.306589,-0.925143,-0.047827,-0.566039,-0.623440,-0.975958,-0.443607,-0.269932,-0.025255,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2927,-0.776079,-0.281684,-2.323886,-0.925143,-0.329216,-0.566039,-0.610920,-0.831792,-0.740781,-0.889276,-0.456864,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2928,-0.776079,-0.281684,0.306589,0.618518,0.374257,-0.566039,-0.135165,0.281765,-0.146434,-0.269932,0.358398,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [84]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

class RecommendationSystem:
    """Nearest-neighbour recommender over the transformed housing features.

    ``fit`` obtains a transformed feature frame and the matching original
    frame from the data transformer; ``recommend`` looks up one row of the
    original frame and returns the rows whose transformed features lie
    closest to it. Rows of the two frames are matched by position.
    """

    def __init__(self, data_transformer):
        self.data_transformer = data_transformer
        self.original_data = None
        self.transformed_data = None
        self.feature_names = None

    def fit(self, data_path):
        """Fit the recommendation system on the given data"""
        self.transformed_data, self.original_data = self.data_transformer.transform_data(data_path)
        self.feature_names = self.transformed_data.columns.tolist()

    def recommend(self, input_filters, quantity_similar_items=5):
        """
        Return the rows most similar to the first row matching ``input_filters``.

        Parameters:
        input_filters (dict): A dictionary of input filters, e.g. {'SalePrice': 200000, 'Neighborhood': 'NAmes'}.
            Keys are column names of the original data; a row must equal every given value.
        quantity_similar_items (int): The number of similar items to return

        Returns:
        tuple: ``(similar_items, input_item)``. ``similar_items`` is a copy of
        the matching original rows with an added 'Score' column holding the
        distance to the input item, ordered from most similar (smallest
        distance) to least similar. ``input_item`` is the row used as the
        query. Both are ``None`` when no row matches the filters.

        Raises:
        ValueError: If ``quantity_similar_items`` is negative.
        KeyError: If a filter names a column missing from the original data.
        """
        if quantity_similar_items < 0:
            raise ValueError("quantity_similar_items must be non-negative")

        # Build the mask on the data's own index so the comparisons below
        # line up row by row whatever that index looks like.
        mask = pd.Series(True, index=self.original_data.index)
        for col, value in input_filters.items():
            mask &= (self.original_data[col] == value)

        matches = mask.to_numpy()
        if not matches.any():
            return None, None

        # Position (not label) of the first matching row.
        query_pos = int(matches.argmax())
        input_item = self.original_data.iloc[query_pos]

        # Ask for one extra neighbour because the query row is part of the
        # fitted data, but never for more rows than exist.
        n_neighbors = min(quantity_similar_items + 1, len(self.transformed_data))
        query = self.transformed_data.iloc[[query_pos]]
        nearest_neighbor = NearestNeighbors(n_neighbors=n_neighbors).fit(self.transformed_data)
        distances, indices = nearest_neighbor.kneighbors(query)
        distances, indices = distances[0], indices[0]

        # Drop the query row itself wherever it appears: with exact duplicates
        # it is not guaranteed to be the first hit.
        others = indices != query_pos
        neighbor_positions = indices[others][:quantity_similar_items]
        neighbor_distances = distances[others][:quantity_similar_items]

        # kneighbors reports hits by increasing distance, so the frame is
        # already ordered from most to least similar.
        similar_items = self.original_data.iloc[neighbor_positions].copy()
        similar_items['Score'] = neighbor_distances

        return similar_items, input_item

if __name__ == "__main__":
    # Fit the recommender on the Ames housing data.
    data_transformer = DataTransformer()
    recommender = RecommendationSystem(data_transformer)
    recommender.fit('./data/raw_data/AmesHousing.csv')

    # Example usage: look up one listing and ask for its three closest matches.
    input_filters = dict(SalePrice=200000, Neighborhood='NAmes')
    similar_items, original_item = recommender.recommend(
        input_filters,
        quantity_similar_items=3,
    )

    # recommend() hands back (None, None) when no row matches the filters.
    if similar_items is None or original_item is None:
        print("No similar items found.")
    else:
        print("Original item:")
        print(original_item)
        print("\nSimilar items:")
        print(similar_items)


Feature Engineering Step:
Input columns: ['PID', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3Ssn

