# imports and setup

In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [97]:
# dagshub setup
import dagshub

dagshub.init(repo_owner='lmamu21', repo_name='House-Prices', mlflow=True)

In [98]:
import os
os.environ['MLFLOW_TRACKING_USERNAME'] = 'lmamu21' 
os.environ['MLFLOW_TRACKING_PASSWORD'] = 'd392b86afa2c37911a6814230b474c4b5df06fcb'
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/lmamu21/House-Prices.mlflow'

In [99]:
import mlflow
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House-Prices")

<Experiment: artifact_location='mlflow-artifacts:/f9ed4fdc67b34e6b8b9d059aa67ad3d5', creation_time=1744051643876, experiment_id='3', last_update_time=1744051643876, lifecycle_stage='active', name='House-Prices', tags={}>

In [100]:
test = pd.read_csv("../data/test.csv")
train = pd.read_csv("../data/train.csv")
sample = pd.read_csv("../data/sample_submission.csv")


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# data cleaning

In [144]:
train_ids = X_train.pop('Id')
test_ids = X_test.pop('Id')

### NA values

In [102]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

LotFrontage     0.185788
Alley           0.936644
MasVnrType      0.584760
MasVnrArea      0.005137
BsmtQual        0.023973
BsmtCond        0.023973
BsmtExposure    0.023973
BsmtFinType1    0.023973
BsmtFinType2    0.023973
Electrical      0.000856
FireplaceQu     0.468322
GarageType      0.054795
GarageYrBlt     0.054795
GarageFinish    0.054795
GarageQual      0.054795
GarageCond      0.054795
PoolQC          0.994863
Fence           0.800514
MiscFeature     0.960616
dtype: float64


In [103]:
print(X_train['PoolArea'].value_counts())
print(X_train['PoolQC'].value_counts())
print(X_train['PoolQC'].isna().mean())    # 99.4% of PoolQC records are NA

PoolArea
0      1162
648       1
555       1
480       1
512       1
519       1
738       1
Name: count, dtype: int64
PoolQC
Fa    2
Ex    2
Gd    2
Name: count, dtype: int64
0.9948630136986302


In [None]:
X_train.pop('PoolQC')
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

In [81]:
X_train['MiscFeature'].value_counts()
X_train['MiscVal'].value_counts()

MiscVal
0        1124
400         8
500         8
700         5
450         4
2000        4
1200        2
480         2
15500       1
600         1
3500        1
8300        1
1300        1
800         1
350         1
620         1
1150        1
1400        1
2500        1
Name: count, dtype: int64

In [107]:
X_train.pop('MiscFeature')
X_train.pop('MiscVal')

254     0
1066    0
638     0
799     0
380     0
       ..
1095    0
1130    0
1294    0
860     0
1126    0
Name: MiscVal, Length: 1168, dtype: int64

In [108]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

LotFrontage     0.185788
Alley           0.936644
MasVnrType      0.584760
MasVnrArea      0.005137
BsmtQual        0.023973
BsmtCond        0.023973
BsmtExposure    0.023973
BsmtFinType1    0.023973
BsmtFinType2    0.023973
Electrical      0.000856
FireplaceQu     0.468322
GarageType      0.054795
GarageYrBlt     0.054795
GarageFinish    0.054795
GarageQual      0.054795
GarageCond      0.054795
Fence           0.800514
dtype: float64


In [109]:
X_train['Alley'].value_counts()

Alley
Grvl    44
Pave    30
Name: count, dtype: int64

In [110]:
X_train.pop('Alley')

254      NaN
1066     NaN
638      NaN
799      NaN
380     Pave
        ... 
1095     NaN
1130     NaN
1294     NaN
860      NaN
1126     NaN
Name: Alley, Length: 1168, dtype: object

In [111]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

LotFrontage     0.185788
MasVnrType      0.584760
MasVnrArea      0.005137
BsmtQual        0.023973
BsmtCond        0.023973
BsmtExposure    0.023973
BsmtFinType1    0.023973
BsmtFinType2    0.023973
Electrical      0.000856
FireplaceQu     0.468322
GarageType      0.054795
GarageYrBlt     0.054795
GarageFinish    0.054795
GarageQual      0.054795
GarageCond      0.054795
Fence           0.800514
dtype: float64


In [112]:
X_train['Fence'].value_counts()

Fence
MnPrv    128
GdPrv     50
GdWo      46
MnWw       9
Name: count, dtype: int64

In [113]:
X_train['Fence'] = X_train['Fence'].notna()

In [114]:
X_train['Fence'].value_counts()

Fence
False    935
True     233
Name: count, dtype: int64

In [115]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

LotFrontage     0.185788
MasVnrType      0.584760
MasVnrArea      0.005137
BsmtQual        0.023973
BsmtCond        0.023973
BsmtExposure    0.023973
BsmtFinType1    0.023973
BsmtFinType2    0.023973
Electrical      0.000856
FireplaceQu     0.468322
GarageType      0.054795
GarageYrBlt     0.054795
GarageFinish    0.054795
GarageQual      0.054795
GarageCond      0.054795
dtype: float64


In [127]:
X_train['MasVnrType'].isna().mean()

np.float64(0.5847602739726028)

In [128]:
X_train.pop('MasVnrType')

254         NaN
1066        NaN
638         NaN
799     BrkFace
380         NaN
         ...   
1095        NaN
1130        NaN
1294        NaN
860         NaN
1126    BrkFace
Name: MasVnrType, Length: 1168, dtype: object

In [129]:
X_train['MasVnrArea'] = X_train['MasVnrArea'].fillna(0)

In [142]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

LotFrontage     0.185788
BsmtQual        0.023973
BsmtCond        0.023973
BsmtExposure    0.023973
BsmtFinType1    0.023973
BsmtFinType2    0.023973
Electrical      0.000856
FireplaceQu     0.468322
GarageType      0.054795
GarageYrBlt     0.054795
GarageFinish    0.054795
GarageQual      0.054795
GarageCond      0.054795
dtype: float64


In [171]:
print(X_train['Fireplaces'].value_counts(dropna=False))
print(X_train['FireplaceQu'].value_counts(dropna=False))

Fireplaces
0    547
1    524
2     93
3      4
Name: count, dtype: int64
FireplaceQu
0    547
4    305
3    252
2     27
5     21
1     16
Name: count, dtype: int64


In [173]:
fireplace_qu_map = {
    'No Fireplace': 0,
    'Po': 1,
    'Fa': 2,
    'TA': 3,
    'Gd': 4,
    'Ex': 5
}

X_train['FireplaceQu'] = X_train['FireplaceQu'].fillna('No Fireplace')
X_train['FireplaceQu'] = X_train['FireplaceQu'].replace(fireplace_qu_map)

In [175]:
print(X_train['Fireplaces'].value_counts(dropna=False))
print(X_train['FireplaceQu'].value_counts(dropna=False))

Fireplaces
0    547
1    524
2     93
3      4
Name: count, dtype: int64
FireplaceQu
0    547
4    305
3    252
2     27
5     21
1     16
Name: count, dtype: int64


In [176]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

LotFrontage     0.185788
BsmtQual        0.023973
BsmtCond        0.023973
BsmtExposure    0.023973
BsmtFinType1    0.023973
BsmtFinType2    0.023973
Electrical      0.000856
GarageType      0.054795
GarageYrBlt     0.054795
GarageFinish    0.054795
GarageQual      0.054795
GarageCond      0.054795
dtype: float64


In [180]:
X_train['LotFrontage'] = X_train['LotFrontage'].fillna(0)
print(X_train['LotFrontage'].value_counts(dropna=False))

LotFrontage
0.0      217
60.0     112
70.0      57
80.0      54
50.0      47
        ... 
182.0      1
174.0      1
38.0       1
101.0      1
153.0      1
Name: count, Length: 108, dtype: int64


In [183]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

BsmtQual        0.023973
BsmtCond        0.023973
BsmtExposure    0.023973
BsmtFinType1    0.023973
BsmtFinType2    0.023973
Electrical      0.000856
GarageType      0.054795
GarageYrBlt     0.054795
GarageFinish    0.054795
GarageQual      0.054795
GarageCond      0.054795
dtype: float64


In [187]:
X_train['Electrical'].value_counts(dropna=False)

Electrical
SBrkr    1072
FuseA      69
FuseF      24
FuseP       3
Name: count, dtype: int64

In [188]:
most_common = X_train['Electrical'].mode()[0]
X_train['Electrical'].fillna(most_common, inplace=True)


In [189]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

BsmtQual        0.023973
BsmtCond        0.023973
BsmtExposure    0.023973
BsmtFinType1    0.023973
BsmtFinType2    0.023973
GarageType      0.054795
GarageYrBlt     0.054795
GarageFinish    0.054795
GarageQual      0.054795
GarageCond      0.054795
dtype: float64


In [191]:
no_basement_value = "No Basement"
basement_fields = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
# Replace NaN values in the basement columns with 'No Basement'
for field in basement_fields:
    X_train[field].fillna(no_basement_value, inplace=True)

In [212]:
no_garage_value = "No Garage"

garage_cat_fields = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

for field in garage_cat_fields:
    X_train[field].fillna(no_garage_value, inplace=True)

X_train['GarageYrBlt'].fillna(X_train['GarageYrBlt'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train['GarageYrBlt'].fillna(X_train['GarageYrBlt'].median(), inplace=True)


In [216]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])

Series([], dtype: float64)
