In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Directory that holds the raw CSV files -- change to your path.
# Kept in one place so train and test are always read from the same folder.
DATA_DIR = 'D:\\TANVI_COLLEGE_FILES\\PROGRAMMING_SCRIPTS\\PERSONAL_PROJECTS\\DevOps-CA1\\Devops-CA1\\data'

train = pd.read_csv(DATA_DIR + '\\train.csv')
# Fix: the test split was previously read from train.csv as well, so the
# "test" output was just a copy of the training rows.
# NOTE(review): assumes test.csv sits next to train.csv -- confirm.
test = pd.read_csv(DATA_DIR + '\\test.csv')

train_target = train["SalePrice"]  # keeping the target variable

# Tag every row with its origin so the two splits can be separated again
# after they have been preprocessed together.
train["source"] = "train"
test["source"] = "test"
test["SalePrice"] = np.nan  # placeholder so both frames share the same columns
combined = pd.concat([train, test], ignore_index=True)

# PREPROCESSING
# 1) Fill missing values, 2) one-hot encode every text (object) column,
# 3) standardize the numeric feature columns.
# Text columns are converted to numeric indicator columns because the
# downstream ML models cannot work directly with strings/categories.

# Missing values in these columns are replaced by the literal category "None".
none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
             'BsmtFinType2', 'MasVnrType']
for col in none_cols:
    combined[col] = combined[col].fillna("None")

# Missing values in these numeric columns are replaced by 0.
zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars',
             'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
             'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']
for col in zero_cols:
    combined[col] = combined[col].fillna(0)

# Missing values in these columns are replaced by the column's most frequent
# value, computed over train and test together.
mode_cols = ['Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd',
             'SaleType', 'Functional', 'MSZoning', 'Utilities']
for col in mode_cols:
    combined[col] = combined[col].fillna(combined[col].mode()[0])

# LotFrontage: fill gaps with the median LotFrontage of the same Neighborhood.
# NOTE(review): a neighborhood with no LotFrontage values at all would keep
# its NaNs here -- confirm this cannot happen in the data.
combined["LotFrontage"] = combined.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)


combined.drop("Utilities", axis=1, inplace=True)

# One-hot encode all object columns. This includes the "source" marker, which
# becomes the "source_train" / "source_test" indicator columns used below.
categorical_cols = combined.select_dtypes(include=["object"]).columns
combined = pd.get_dummies(combined, columns=categorical_cols)

# Standardize every int64/float64 column except the target.
# NOTE(review): this also standardizes any integer identifier column that may
# be present, and the scaler is fit on train and test rows together --
# confirm both are intended.
numerical_cols = combined.select_dtypes(include=["int64", "float64"]).columns
numerical_cols = numerical_cols.drop("SalePrice")  # don't scale the target
scaler = StandardScaler()
combined[numerical_cols] = scaler.fit_transform(combined[numerical_cols])

# Split back into the two original sets; the test set has no target column.
train_cleaned = combined[combined["source_train"] == 1].drop(["source_train", "source_test"], axis=1)
test_cleaned = combined[combined["source_test"] == 1].drop(["source_train", "source_test", "SalePrice"], axis=1)

# save
train_cleaned.to_csv("../data/train_cleaned.csv", index=False)
test_cleaned.to_csv("../data/test_cleaned.csv", index=False)
