In [21]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

pd.options.display.max_rows = 200

raw_train_data = pd.read_csv('train.csv',index_col='Id')

data = raw_train_data.copy(deep=True )



## Data cleanage ##

In [22]:
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [23]:
feature_groups = {
    'numerical': [
        "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
        "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
        "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
        "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
        "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
        "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal"
    ],
    'time': ["MoSold", "YrSold", "GarageYrBlt", "YearBuilt", "YearRemodAdd"]
}
feature_groups['categorical'] = list(set(data.columns) - set(feature_groups['numerical'] + feature_groups['time'] + ['SalePrice']))

print(data.isnull().sum().sort_values(ascending=False))

PoolQC           1453
MiscFeature      1406
Alley            1369
Fence            1179
MasVnrType        872
FireplaceQu       690
LotFrontage       259
GarageQual         81
GarageType         81
GarageFinish       81
GarageCond         81
GarageYrBlt        81
BsmtExposure       38
BsmtFinType2       38
BsmtQual           37
BsmtFinType1       37
BsmtCond           37
MasVnrArea          8
Electrical          1
BldgType            0
Neighborhood        0
LandSlope           0
Condition2          0
Condition1          0
LandContour         0
LotShape            0
Street              0
LotArea             0
MSSubClass          0
MSZoning            0
LotConfig           0
Utilities           0
HouseStyle          0
Foundation          0
ExterQual           0
ExterCond           0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
BsmtFinSF1          0
Exterior2nd         0
Exterior1st         0
RoofMatl            0
RoofStyle           0
YearRemodAdd        0
OverallQua

In [24]:
data = data.dropna(subset=['Electrical'])

### Imputing categorical data ###

## Data scaling ##

In [25]:
y = data['SalePrice']
X = data.drop(['SalePrice'], axis=1)


In [26]:

data = data.dropna(subset=['Electrical'])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(
            steps=[
                ('imputer' , SimpleImputer(strategy='constant', fill_value='No')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ]
        ), feature_groups['categorical']),
        
        ('time', Pipeline([
            ('imputer', SimpleImputer(strategy='median'))
        ]), feature_groups['time']),
        
        ('num', Pipeline(
            steps=[
                ('imputer' , SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()) 
            ]
        ), feature_groups['numerical'])
    ]
)

preprocessor.fit(X)



In [27]:
rf = RandomForestRegressor(random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
  
    ('classifier', rf)  # Replace with your model
])

param_grid = {
     'classifier__n_estimators': [100, 200, 300],           
    'classifier__max_features': ['auto', 'sqrt', None],  
    'classifier__max_depth': [None, 10, 20, 30],           
    'classifier__min_samples_split': [2, 5, 10],           
    'classifier__min_samples_leaf': [1, 2, 4],             
    'classifier__bootstrap': [True, False]                
}

X = pd.DataFrame(X)


X_train,X_test, y_train,  y_test = train_test_split(X,y,test_size=0.2, random_state=1)

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

best_model = grid_search.best_estimator_

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


1080 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
330 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Max\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Max\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Max\AppData\Loc

Best parameters found:  {'classifier__bootstrap': False, 'classifier__max_depth': 20, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
Best cross-validation score:  0.8571293310010903


In [28]:
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
print(f"Mean CV score: {cv_scores.mean():.4f}")



Mean CV score: 0.8443


In [29]:
pipeline.fit(X_train, y_train)
from sklearn.metrics import r2_score
y_pred = pipeline.predict(X_test)
test_score = r2_score(y_test, y_pred)
print(f"Test R2 score: {test_score:.4f}")




Test R2 score: 0.8864
