In [3]:
import pandas as pd

#Load raw training data into a DataFrame

df = pd.read_csv("../data/train.csv")
df.shape

(1460, 81)

In [4]:
#Count missing values per column, sorted descending
missing_counts = df.isnull().sum().sort_values(ascending = False)
missing_counts.head(10)

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
MasVnrType      872
FireplaceQu     690
LotFrontage     259
GarageYrBlt      81
GarageCond       81
GarageType       81
dtype: int64

In [5]:
#Drop columns where >50% of values are missing
thresh = len(df) * 0.5
cols_to_drop = missing_counts[missing_counts > thresh].index.tolist()
print("Dropping: ", cols_to_drop)
df = df.drop(columns=cols_to_drop)
df.shape

Dropping:  ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType']


(1460, 76)

In [26]:
#Impute numeric columns with their median
num_cols = df.select_dtypes(include="number").columns
for col in num_cols:
    if df[col].isnull().sum():
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f"Filled NA in {col} with median = {median_val}")

Null counts by column:
Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 76, dtype: int64


In [17]:
#Impute categorical columns with their mode
cat_cols = df.select_dtypes(include = "object").columns
for col in cat_cols:
    if df[col].isnull().any():
        mode_val = df[col].mode()
        df[col] = df[col].fillna(mode_val)
        print(f"Filled NA in {col} with mode = {mode_val}")

Filled NA in BsmtQual with mode = 0    TA
Name: BsmtQual, dtype: object
Filled NA in BsmtCond with mode = 0    TA
Name: BsmtCond, dtype: object
Filled NA in BsmtExposure with mode = 0    No
Name: BsmtExposure, dtype: object
Filled NA in BsmtFinType1 with mode = 0    Unf
Name: BsmtFinType1, dtype: object
Filled NA in BsmtFinType2 with mode = 0    Unf
Name: BsmtFinType2, dtype: object
Filled NA in Electrical with mode = 0    SBrkr
Name: Electrical, dtype: object
Filled NA in FireplaceQu with mode = 0    Gd
Name: FireplaceQu, dtype: object
Filled NA in GarageType with mode = 0    Attchd
Name: GarageType, dtype: object
Filled NA in GarageFinish with mode = 0    Unf
Name: GarageFinish, dtype: object
Filled NA in GarageQual with mode = 0    TA
Name: GarageQual, dtype: object
Filled NA in GarageCond with mode = 0    TA
Name: GarageCond, dtype: object


In [None]:
#Save cleaned df to CSV for future use
df.to_csv("data/processed_train.csv", index = False)