In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
prices = pd.read_csv('House Prices/train.csv')

In [4]:
prices.shape

(1460, 81)

In [5]:
prices.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
#percent of missing values
freq_na = prices.isna().mean()
freq_na.sort_values(ascending=False).head(20).round(2)

PoolQC          1.00
MiscFeature     0.96
Alley           0.94
Fence           0.81
MasVnrType      0.60
FireplaceQu     0.47
LotFrontage     0.18
GarageYrBlt     0.06
GarageCond      0.06
GarageType      0.06
GarageFinish    0.06
GarageQual      0.06
BsmtFinType2    0.03
BsmtExposure    0.03
BsmtQual        0.03
BsmtCond        0.03
BsmtFinType1    0.03
MasVnrArea      0.01
Electrical      0.00
Id              0.00
dtype: float64

In [7]:
# drop columns with na share > 40%
prices.drop(prices.columns[freq_na > 0.4], inplace=True, axis=1)

In [8]:
# number of each type of column
prices.dtypes.value_counts()

object     37
int64      35
float64     3
Name: count, dtype: int64

In [9]:
# number of unique classes in each object column
prices.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

MSZoning          5
Street            2
LotShape          4
LandContour       4
Utilities         2
LotConfig         5
LandSlope         3
Neighborhood     25
Condition1        9
Condition2        8
BldgType          5
HouseStyle        8
RoofStyle         6
RoofMatl          8
Exterior1st      15
Exterior2nd      16
ExterQual         4
ExterCond         5
Foundation        6
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Heating           6
HeatingQC         5
CentralAir        2
Electrical        5
KitchenQual       4
Functional        7
GarageType        6
GarageFinish      3
GarageQual        5
GarageCond        5
PavedDrive        3
SaleType          9
SaleCondition     6
dtype: int64

In [10]:
# X = prices.drop(columns = ['SalePrice'])
# y = prices['SalePrice']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# pipeline = make_pipeline(TargetEncoder(),
#     SimpleImputer(strategy = 'constant'),
#                      MinMaxScaler(feature_range = (0, 1)),
#                      KNeighborsRegressor(n_neighbors = 4, weights='distance'))
                     
# pipeline.fit(X_train, y_train)
# preds = pipeline.predict(X_test)

# # te = TargetEncoder()
# # te.fit(X_train, y_train)
# # X_train = te.transform(X_train)
# # X_test = te.transform(X_test)

# # imputer = SimpleImputer(strategy = 'constant')
# # imputer.fit(X_train)
# # X_train = imputer.transform(X_train)
# # X_test = imputer.transform(X_test)

# # scaler = MinMaxScaler(feature_range = (0, 1))
# # scaler.fit(X_train)
# # X_train = scaler.transform(X_train)
# # X_test = scaler.transform(X_test)

# # knn = KNeighborsRegressor(n_neighbors = 4, weights='distance')
# # knn.fit(X_train, y_train)
# # preds = knn.predict(X_test)

# print('MAE: ' + str(mean_absolute_error(y_test, preds)))
# print('MSE: ' + str(mean_squared_error(y_test, preds)))

In [11]:
# pipeline for for encoding, scaling, filling gaps and training, cross validation
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

X = prices.drop(columns=['SalePrice'])
y = prices['SalePrice']

pipeline = make_pipeline(TargetEncoder(),             # Encode categorical features based on the target
                         SimpleImputer(strategy='constant'),  # Handle missing values
                         MinMaxScaler(feature_range=(0, 1)),  # Scale features to the range [0, 1]
                         KNeighborsRegressor(n_neighbors=4, weights='distance')  # KNN Regressor
                        )

cv_results = cross_validate(pipeline,
                            X, y,
                            cv=5,
                            scoring='neg_mean_squared_error',
                            return_train_score=False
                           )

mean_test_score = -cv_results['test_score'].mean()

print(f"MSE from Cross-Validation: {mean_test_score:.2f}")
print("Fold Scores:", -cv_results['test_score'])

MSE from Cross-Validation: 1687667610.58
Fold Scores: [1.44909915e+09 1.87693883e+09 1.90396148e+09 1.13513187e+09
 2.07320673e+09]


In [12]:
#load test data and align with train
prices_test = pd.read_csv('House Prices/test.csv')

prices_labels = prices['SalePrice']
prices, prices_test  = prices.align(prices_test, join = 'inner', axis = 1)
prices['SalePrice'] = prices_labels

print('Training Data Shape: ', prices.shape)
print('Testing Data Shape: ', prices_test.shape)

Training Data Shape:  (1460, 75)
Testing Data Shape:  (1459, 74)


In [13]:
#train on full train set and making predictions with test data
X = prices.drop(columns=['SalePrice'])
y = prices['SalePrice']

pipeline = make_pipeline(TargetEncoder(),             # Encode categorical features based on the target
                         SimpleImputer(strategy='constant'),  # Handle missing values
                         MinMaxScaler(feature_range=(0, 1)),  # Scale features to the range [0, 1]
                         KNeighborsRegressor(n_neighbors=4, weights='distance')  # KNN Regressor
                        )

pipeline.fit(X, y)
preds = pipeline.predict(prices_test)

In [14]:
#create submission
submit_prices = pd.DataFrame(prices_test['Id'])
submit_prices['SalePrice'] = preds
submit_prices.head()

Unnamed: 0,Id,SalePrice
0,1461,129390.293889
1,1462,140904.571794
2,1463,198811.051398
3,1464,211070.560337
4,1465,196307.368756


In [15]:
submit_prices.to_csv('submit_prices_1.csv', index=False)