In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line in walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [4]:
# Load the competition training set into a DataFrame.
train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(train_csv_path)

In [5]:
# Preview the first rows of the loaded training data.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# (rows, columns) of the loaded training data.
df.shape

(1460, 81)

In [7]:
from sklearn.model_selection import train_test_split

# Separate the prediction target from the remaining columns.
target_column = 'SalePrice'
X, y = df.drop(columns=[target_column]), df[target_column]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
import numpy as np
import pandas as pd
from typing import Dict, Union, List, Optional

class DataCleaner:
    """
    Learn missing-value handling from one DataFrame and re-apply it to others.

    ``fit`` decides which columns to drop (NaN ratio at or above
    ``drop_threshold``) and computes one fill value per remaining column.
    ``transform`` applies those decisions to any DataFrame without modifying
    its input.

    Supported strategies:
        'mode'   - most frequent non-NaN value (``default_fill_value`` if none)
        'median' - column median for numeric columns, else ``default_fill_value``
        'mean'   - column mean for numeric columns, else ``default_fill_value``
        'value'  - always ``default_fill_value``
        'none'   - fills with 0
                   NOTE(review): the name suggests "leave NaN untouched";
                   the 0 fill is kept as-is pending confirmation of intent.
    """

    def __init__(self,
                 fill_strategies: Optional[Dict[str, Union[str, float, int]]] = None,
                 default_strategy: str = 'mode',
                 default_fill_value: Optional[Union[float, int, str]] = None,
                 drop_threshold: float = 0.8,
                 verbose: bool = True):
        """
        Args:
            fill_strategies: Dictionary of {column: strategy} pairs
            default_strategy: Strategy for unspecified columns
            default_fill_value: Value used by the 'value' strategy and as the
                fallback when another strategy cannot produce a fill value
            drop_threshold: Drop columns with NaN ratio >= this value
            verbose: Whether to print processing details
        """
        self.fill_strategies = fill_strategies or {}
        self.default_strategy = default_strategy
        self.default_fill_value = default_fill_value
        self.drop_threshold = drop_threshold
        self.verbose = verbose
        # State learned by fit(): column -> fill value, and the columns to drop.
        self._fill_values = {}
        self._dropped_columns = []

    def _compute_fill_value(self, series: pd.Series, strategy):
        """Return the fill value for one column under the given strategy.

        Raises:
            ValueError: if ``strategy`` is not a supported strategy name.
        """
        if strategy == 'mode':
            # mode() ignores NaN; it is empty when the column has no values.
            modes = series.mode()
            return self.default_fill_value if modes.empty else modes.iloc[0]

        if strategy in ('median', 'mean'):
            if not pd.api.types.is_numeric_dtype(series):
                return self.default_fill_value
            stat = series.median() if strategy == 'median' else series.mean()
            # A column with no observed values yields NaN, which cannot fill
            # anything; fall back to the configured default instead.
            return self.default_fill_value if pd.isna(stat) else stat

        if strategy == 'value':
            # The entry in fill_strategies is the strategy name itself, so the
            # constant has to come from default_fill_value.
            return self.default_fill_value

        if strategy == 'none':
            return 0

        raise ValueError(f"Unknown strategy: {strategy}")

    def fit(self, df: pd.DataFrame) -> None:
        """Analyze data and compute fill values.

        Records the columns whose NaN ratio is >= ``drop_threshold`` and a
        fill value for every other column.

        Raises:
            ValueError: if a column resolves to an unsupported strategy.
        """
        na_counts = df.isna().sum()
        total_rows = len(df)

        if self.verbose:
            print("Initial missing value analysis:")
            print(na_counts[na_counts > 0])

        # Identify columns to drop. An empty frame has no meaningful NaN
        # ratio, so nothing is dropped (and no division by zero occurs).
        if total_rows:
            self._dropped_columns = [
                col for col in df.columns
                if na_counts[col] / total_rows >= self.drop_threshold
            ]
        else:
            self._dropped_columns = []

        if self.verbose and self._dropped_columns:
            print(f"\nDropping columns with ≥{self.drop_threshold:.0%} NaN values:")
            for col in self._dropped_columns:
                na_count = na_counts[col]
                print(f"- {col}: {na_count}/{total_rows} NA values ({na_count/total_rows:.1%})")

        # Compute fill values for remaining columns
        dropped = set(self._dropped_columns)
        self._fill_values = {}

        for col in df.columns:
            if col in dropped:
                continue

            strategy = self.fill_strategies.get(col, self.default_strategy)
            fill_value = self._compute_fill_value(df[col], strategy)
            self._fill_values[col] = fill_value

            na_count = na_counts[col]
            if self.verbose and na_count > 0:
                if fill_value is None:
                    action = "will remain NA"
                else:
                    action = f"will be filled with {fill_value}"
                print(f"{col: <15} {na_count: >4} NA values {action} ({strategy} strategy)")

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the cleaning transformations learned by ``fit``.

        Returns a new DataFrame; ``df`` itself is not modified. Columns that
        were dropped during fit but are absent from ``df`` are ignored.
        """
        # Drop high-NA columns (drop() returns a new frame)
        df_filled = df.drop(columns=self._dropped_columns, errors='ignore')

        # Fill remaining NAs; a fill value of None means "leave as NaN"
        for col, fill_value in self._fill_values.items():
            if col in df_filled.columns and fill_value is not None:
                df_filled[col] = df_filled[col].fillna(fill_value)

        if self.verbose:
            na_per_column = df_filled.isna().sum()
            remaining_na = na_per_column.sum()
            print("\nPost-cleaning report:")
            print(f"- Dropped {len(self._dropped_columns)} columns")
            print(f"- Remaining NA values: {remaining_na}")
            if remaining_na > 0:
                print("Columns with remaining NAs:")
                print(na_per_column[na_per_column > 0])

        return df_filled

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit on ``df`` and return its cleaned copy."""
        self.fit(df)
        return self.transform(df)

In [9]:
# Only GarageYrBlt gets an explicit strategy; every other column uses the default.
column_strategies = {'GarageYrBlt': 'value'}

cleaner = DataCleaner(
    fill_strategies=column_strategies,
    default_strategy='mode',
    default_fill_value=0,
    drop_threshold=0.8,
    verbose=True,
)

# Learn the cleaning rules from the training split and apply them to it.
train_clean = cleaner.fit_transform(X_train)

Initial missing value analysis:
LotFrontage      217
Alley           1094
MasVnrType       683
MasVnrArea         6
BsmtQual          28
BsmtCond          28
BsmtExposure      28
BsmtFinType1      28
BsmtFinType2      28
Electrical         1
FireplaceQu      547
GarageType        64
GarageYrBlt       64
GarageFinish      64
GarageQual        64
GarageCond        64
PoolQC          1162
Fence            935
MiscFeature     1122
dtype: int64

Dropping columns with ≥80% NaN values:
- Alley: 1094/1168 NA values (93.7%)
- PoolQC: 1162/1168 NA values (99.5%)
- Fence: 935/1168 NA values (80.1%)
- MiscFeature: 1122/1168 NA values (96.1%)
LotFrontage      217 NA values will be filled with 60.0 (mode strategy)
MasVnrType       683 NA values will be filled with BrkFace (mode strategy)
MasVnrArea         6 NA values will be filled with 0.0 (mode strategy)
BsmtQual          28 NA values will be filled with TA (mode strategy)
BsmtCond          28 NA values will be filled with TA (mode strategy)
Bsmt