# imports and setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# dagshub setup
import dagshub

dagshub.init(repo_owner='lmamu21', repo_name='House-Prices', mlflow=True)

In [3]:
import os
os.environ['MLFLOW_TRACKING_USERNAME'] = 'lmamu21' 
os.environ['MLFLOW_TRACKING_PASSWORD'] = 'd392b86afa2c37911a6814230b474c4b5df06fcb'
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/lmamu21/House-Prices.mlflow'

In [4]:
import mlflow
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/House-Prices.mlflow")
mlflow.set_experiment("House-Prices")

<Experiment: artifact_location='mlflow-artifacts:/f9ed4fdc67b34e6b8b9d059aa67ad3d5', creation_time=1744051643876, experiment_id='3', last_update_time=1744051643876, lifecycle_stage='active', name='House-Prices', tags={}>

In [5]:
test = pd.read_csv("../data/test.csv")
train = pd.read_csv("../data/train.csv")
sample = pd.read_csv("../data/sample_submission.csv")


In [6]:
from sklearn.model_selection import train_test_split

X = train.drop(columns=['SalePrice'])  # train dataframe remains unchanged, X is now train dataframe without SalePrice
y = train['SalePrice']                 # train dataframe SalePrices is copied to y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# data cleaning

In [7]:
train_ids = X_train.pop('Id')
test_ids = X_test.pop('Id')

### NA values

In [8]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])
    print(X_test.isna().mean()[X_test.isna().mean() > 0])

LotFrontage     0.185788
Alley           0.936644
MasVnrType      0.584760
MasVnrArea      0.005137
BsmtQual        0.023973
BsmtCond        0.023973
BsmtExposure    0.023973
BsmtFinType1    0.023973
BsmtFinType2    0.023973
Electrical      0.000856
FireplaceQu     0.468322
GarageType      0.054795
GarageYrBlt     0.054795
GarageFinish    0.054795
GarageQual      0.054795
GarageCond      0.054795
PoolQC          0.994863
Fence           0.800514
MiscFeature     0.960616
dtype: float64
LotFrontage     0.143836
Alley           0.941781
MasVnrType      0.647260
MasVnrArea      0.006849
BsmtQual        0.030822
BsmtCond        0.030822
BsmtExposure    0.034247
BsmtFinType1    0.030822
BsmtFinType2    0.034247
FireplaceQu     0.489726
GarageType      0.058219
GarageYrBlt     0.058219
GarageFinish    0.058219
GarageQual      0.058219
GarageCond      0.058219
PoolQC          0.996575
Fence           0.835616
MiscFeature     0.972603
dtype: float64


In [9]:
print(X_train['PoolArea'].value_counts(dropna=False))
print(X_train['PoolQC'].value_counts(dropna=False))
print(X_train['PoolQC'].isna().mean())    # 99.4% of PoolQC records are NA

print(X_test['PoolArea'].value_counts(dropna=False))
print(X_test['PoolQC'].value_counts(dropna=False))
print(X_test['PoolQC'].isna().mean()) 

PoolArea
0      1162
648       1
555       1
480       1
512       1
519       1
738       1
Name: count, dtype: int64
PoolQC
NaN    1162
Fa        2
Ex        2
Gd        2
Name: count, dtype: int64
0.9948630136986302
PoolArea
0      291
576      1
Name: count, dtype: int64
PoolQC
NaN    291
Gd       1
Name: count, dtype: int64
0.9965753424657534


In [10]:
PoolQC_train = X_train.pop('PoolQC')
PoolQC_test = X_test.pop('PoolQC')

In [11]:
print(X_train['MiscFeature'].value_counts(dropna=False))
print(X_train['MiscVal'].value_counts(dropna=False))

MiscFeature
NaN     1122
Shed      41
Othr       2
Gar2       2
TenC       1
Name: count, dtype: int64
MiscVal
0        1124
400         8
500         8
700         5
450         4
2000        4
1200        2
480         2
15500       1
600         1
3500        1
8300        1
1300        1
800         1
350         1
620         1
1150        1
1400        1
2500        1
Name: count, dtype: int64


In [12]:
MiscFeature_train = X_train.pop('MiscFeature')
MiscVal_train = X_train.pop('MiscVal')

MiscFeature_test = X_test.pop('MiscFeature')
MiscVal_test = X_test.pop('MiscVal')

In [13]:
X_train['Alley'].value_counts(dropna=False)

Alley
NaN     1094
Grvl      44
Pave      30
Name: count, dtype: int64

In [14]:
Alley_train = X_train.pop('Alley')
Alley_test = X_test.pop('Alley')

In [15]:
X_train['Fence'].value_counts(dropna=False)

Fence
NaN      935
MnPrv    128
GdPrv     50
GdWo      46
MnWw       9
Name: count, dtype: int64

In [16]:
X_train['Fence'] = X_train['Fence'].notna()  # changed Fence to binary data -> has Fend or not
X_test['Fence'] = X_test['Fence'].notna()

In [17]:
X_train['Fence'].value_counts(dropna=False)

Fence
False    935
True     233
Name: count, dtype: int64

In [18]:
X_train['MasVnrType'].isna().mean()

np.float64(0.5847602739726028)

In [19]:
MasVnrType_train = X_train.pop('MasVnrType')
MasVnrType_test = X_test.pop('MasVnrType')

In [20]:
X_train['MasVnrArea'] = X_train['MasVnrArea'].fillna(0) 
X_test['MasVnrArea'] = X_test['MasVnrArea'].fillna(0)

In [21]:
print(X_train['Fireplaces'].value_counts(dropna=False))
print(X_train['FireplaceQu'].value_counts(dropna=False))

Fireplaces
0    547
1    524
2     93
3      4
Name: count, dtype: int64
FireplaceQu
NaN    547
Gd     305
TA     252
Fa      27
Ex      21
Po      16
Name: count, dtype: int64


In [22]:
fireplace_qu_map = {
    'No Fireplace': 0,
    'Po': 1,
    'Fa': 2,
    'TA': 3,
    'Gd': 4,
    'Ex': 5
}

X_train['FireplaceQu'] = X_train['FireplaceQu'].fillna('No Fireplace')
X_train['FireplaceQu'] = X_train['FireplaceQu'].replace(fireplace_qu_map)

X_test['FireplaceQu'] = X_test['FireplaceQu'].fillna('No Fireplace')
X_test['FireplaceQu'] = X_test['FireplaceQu'].replace(fireplace_qu_map)

  X_train['FireplaceQu'] = X_train['FireplaceQu'].replace(fireplace_qu_map)
  X_test['FireplaceQu'] = X_test['FireplaceQu'].replace(fireplace_qu_map)


In [23]:
print(X_train['Fireplaces'].value_counts(dropna=False))
print(X_train['FireplaceQu'].value_counts(dropna=False))

Fireplaces
0    547
1    524
2     93
3      4
Name: count, dtype: int64
FireplaceQu
0    547
4    305
3    252
2     27
5     21
1     16
Name: count, dtype: int64


In [24]:
X_train['LotFrontage'] = X_train['LotFrontage'].fillna(0)
print(X_train['LotFrontage'].value_counts(dropna=False))

X_test['LotFrontage'] = X_test['LotFrontage'].fillna(0)
print(X_test['LotFrontage'].value_counts(dropna=False))

LotFrontage
0.0      217
60.0     112
70.0      57
80.0      54
50.0      47
        ... 
182.0      1
174.0      1
38.0       1
101.0      1
153.0      1
Name: count, Length: 108, dtype: int64
LotFrontage
0.0      42
60.0     31
80.0     15
70.0     13
75.0     11
         ..
32.0      1
107.0     1
41.0      1
36.0      1
124.0     1
Name: count, Length: 70, dtype: int64


In [25]:
X_train['Electrical'].value_counts(dropna=False)

Electrical
SBrkr    1071
FuseA      69
FuseF      24
FuseP       3
NaN         1
Name: count, dtype: int64

In [26]:
most_common = X_train['Electrical'].mode()[0]
X_train['Electrical'].fillna(most_common, inplace=True)
X_test['Electrical'].fillna(most_common, inplace=True)   # filling test data from most_common of train data is intentional to avoid data leakage

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train['Electrical'].fillna(most_common, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['Electrical'].fillna(most_common, inplace=True)   # filling test data from most_common of train data is intentional to avoid data leakage


In [27]:
no_basement_value = "No Basement"
basement_fields = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
# Replace NaN values in the basement columns with 'No Basement'
for field in basement_fields:
    X_train[field].fillna(no_basement_value, inplace=True)
    X_test[field].fillna(no_basement_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[field].fillna(no_basement_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[field].fillna(no_basement_value, inplace=True)


In [28]:
no_garage_value = "No Garage"

garage_cat_fields = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

for field in garage_cat_fields:
    X_train[field].fillna(no_garage_value, inplace=True)
    X_test[field].fillna(no_garage_value, inplace=True)


median = X_train['GarageYrBlt'].median()

X_train['GarageYrBlt'].fillna(median, inplace=True)
X_test['GarageYrBlt'].fillna(median, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[field].fillna(no_garage_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[field].fillna(no_garage_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are set

In [29]:
with pd.option_context('display.max_rows', None):
    print(X_train.isna().mean()[X_train.isna().mean() > 0])
    print(X_test.isna().mean()[X_test.isna().mean() > 0])

Series([], dtype: float64)
Series([], dtype: float64)
