In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
import pickle

# Silence three specific runtime-warning messages (matched against the start of
# the warning text); all other warnings are still reported.
for _muted_message in (
    "divide by zero encountered in log",
    "invalid value encountered in divide",
    "divide by zero encountered in divide",
):
    warnings.filterwarnings("ignore", message=_muted_message)

In [2]:
# Load the source CSV into the working DataFrame; the path is resolved relative
# to the directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer='../csv/Old_Dataset.csv')

## Data Cleaning

In [3]:
# Keep only rows whose city occurs at least 1000 times in the dataset.
city_counts = df['city'].value_counts()
frequent_cities = city_counts[city_counts >= 1000].index

# .copy() detaches the filtered frame from the original one. Without it the
# column assignments in the following cells operate on a slice and trigger
# pandas' SettingWithCopyWarning.
df = df[df['city'].isin(frequent_cities)].copy()

In [4]:
# Move bed/bath/zip_code to pandas' best-fitting nullable dtypes.
# convert_dtypes() takes boolean switches rather than a target dtype, so it is
# called with its defaults.
df["bed"] = df["bed"].convert_dtypes()
df["bath"] = df["bath"].convert_dtypes()
df["zip_code"] = df["zip_code"].convert_dtypes()

# Render zip codes as zero-padded, 5-character strings.
# The NA-preserving "string" dtype is used so that missing values are still
# recognisable when fillna('') runs; a plain astype(str) would first turn them
# into literal text such as "<NA>"/"nan", which fillna cannot detect.
# Missing zip codes therefore end up as "00000".
df['zip_code'] = (
    df['zip_code']
    .astype("string")
    .fillna('')
    .str.zfill(5)
    .astype(object)  # plain Python strings for the downstream encoder
)

In [5]:
# Keep the characters from position 3 onward, i.e. the last two characters of a
# 5-character zip code.
# NOTE(review): if the leading regional prefix was intended instead, the slice
# would have to cover the first characters — confirm the intent.
df['zip_code'] = df['zip_code'].str.slice(start=3)

In [6]:
# Derive a yes/no flag from the presence of a previous sale date, then discard
# the date column itself.
df['sold'] = df['prev_sold_date'].isna().map({True: 'no', False: 'yes'})
df.drop(columns=["prev_sold_date"], inplace=True)

In [48]:
# Keep only rows with a price above 1000. A boolean mask is used rather than
# dropping by index label, so the filter stays row-exact even if the index
# contains repeated labels.
df = df[df["price"].notna() & (df["price"] > 1000)]

# Discard rows in which all four property attributes are missing.
spec_cols = ['bed', 'bath', 'house_size', 'acre_lot']
df = df.dropna(subset=spec_cols, how='all').copy()

# Build price bins from the 2 %, 4 %, ..., 100 % quantiles, with the minimum
# price prepended as the lowest edge.
n_bins = 50
quantiles = df['price'].quantile(np.linspace(0, 1, n_bins + 1)[1:])
min_price = df['price'].min()

bins = quantiles.tolist()
bins.insert(0, min_price)

# duplicates='drop': when many rows share a price, neighbouring quantiles can
# coincide, and pd.cut would otherwise raise "Bin edges must be unique".
price_categories = pd.cut(
    df['price'], bins=bins, include_lowest=True, duplicates='drop'
)
df['price_category'] = price_categories

# Per-price-bin means used by the imputation cells below.
# observed=True restricts the result to bins that actually contain rows (the
# only bins that can ever be looked up) and makes the grouping independent of
# the pandas version's default for categorical keys.
price_groups = df.groupby('price_category', observed=True)
price_category_bed_means = price_groups['bed'].mean().astype(int)    # truncated to whole rooms
price_category_bath_means = price_groups['bath'].mean().astype(int)  # truncated to whole rooms
price_category_size_means = price_groups['house_size'].mean()
price_category_acre_means = price_groups['acre_lot'].mean()

In [49]:
def fill_bed(row):
    """Return the row's bed count, imputing it from the price bin when missing.

    A missing ``row['bed']`` is replaced by the entry of
    ``price_category_bed_means`` for the row's ``price_category``; a present
    value is returned unchanged.
    """
    price_bin = row['price_category']
    beds = row['bed']
    if not pd.isna(beds):
        return beds
    return price_category_bed_means.loc[price_bin]

# Apply the imputation row by row.
df["bed"] = df.apply(fill_bed, axis="columns")

def fill_bath(row):
    """Return the row's bath count, imputing it from the price bin when missing.

    A missing ``row['bath']`` is replaced by the entry of
    ``price_category_bath_means`` for the row's ``price_category``; a present
    value is returned unchanged.
    """
    price_bin = row['price_category']
    baths = row['bath']
    if not pd.isna(baths):
        return baths
    return price_category_bath_means.loc[price_bin]

# Apply the imputation row by row.
df["bath"] = df.apply(fill_bath, axis="columns")


In [50]:
n_bins = 10  # kept for compatibility; the explicit edge lists below do not use it


def _edges_covering(values, edges):
    """Return a copy of `edges` whose outer edges are widened to span `values`."""
    widened = list(edges)
    widened[0] = min(widened[0], values.min())
    widened[-1] = max(widened[-1], values.max())
    return widened


# Hand-picked bed-count bin edges. The outer edges are widened to the observed
# range when necessary: a count outside the bins would get a NaN category, and
# the later `.loc[category]` lookups in the imputation functions would fail.
bins2 = _edges_covering(df['bed'], [1,2,3,4,5,6,10,15,25,142])
bed_categories = pd.cut(df['bed'], bins=bins2, include_lowest=True)
df['bed_category'] = bed_categories

# observed=True: only bins that contain rows are ever looked up, and the
# result no longer depends on the pandas default for categorical group keys.
bed_groups = df.groupby('bed_category', observed=True)
bed_category_size_means = bed_groups['house_size'].mean()
bed_category_acre_means = bed_groups['acre_lot'].mean()

# Same scheme for bath counts.
bins3 = _edges_covering(df['bath'], [1,2,3,4,5,6,10,15,25,198])
bath_categories = pd.cut(df['bath'], bins=bins3, include_lowest=True)
df['bath_category'] = bath_categories

bath_groups = df.groupby('bath_category', observed=True)
bath_category_size_means = bath_groups['house_size'].mean()
bath_category_acre_means = bath_groups['acre_lot'].mean()

In [51]:
def fill_house_size(row):
    """Return the row's house size, imputing a missing one from group means.

    A missing ``row['house_size']`` is replaced by the average of the mean
    house sizes of the row's price, bed and bath categories. Group means that
    are themselves NaN (a category with no recorded house size) are left out
    of the average, so one empty group no longer turns the imputed value into
    NaN. NaN is returned only when none of the three means is available.
    A present value is returned unchanged.
    """
    category1 = row['price_category']
    category2 = row['bed_category']
    category3 = row['bath_category']
    if not pd.isna(row['house_size']):
        return row['house_size']

    estimates = [
        price_category_size_means.loc[category1],
        bed_category_size_means.loc[category2],
        bath_category_size_means.loc[category3],
    ]
    usable = [value for value in estimates if not pd.isna(value)]
    return sum(usable) / len(usable) if usable else np.nan

# Apply the imputation row by row.
df["house_size"] = df.apply(fill_house_size, axis=1)

def fill_acre_lot(row):
    """Return the row's lot size, imputing a missing one from group means.

    A missing ``row['acre_lot']`` is replaced by the average of the mean lot
    sizes of the row's price, bed and bath categories. Group means that are
    themselves NaN (a category with no recorded lot size) are left out of the
    average, so one empty group no longer turns the imputed value into NaN.
    NaN is returned only when none of the three means is available.
    A present value is returned unchanged.
    """
    category1 = row['price_category']
    category2 = row['bed_category']
    category3 = row['bath_category']
    if not pd.isna(row['acre_lot']):
        return row['acre_lot']

    estimates = [
        price_category_acre_means.loc[category1],
        bed_category_acre_means.loc[category2],
        bath_category_acre_means.loc[category3],
    ]
    usable = [value for value in estimates if not pd.isna(value)]
    return sum(usable) / len(usable) if usable else np.nan

# Apply the imputation row by row.
df["acre_lot"] = df.apply(fill_acre_lot, axis=1)
# Remove rows whose lot size is exactly zero (log10 is applied to this column
# in the scaling cell).
zero_lot_labels = df.loc[df["acre_lot"] == 0].index
df.drop(index=zero_lot_labels, inplace=True)

# The category columns were only needed for imputation.
df.drop(columns=["price_category", "bed_category", "bath_category"], inplace=True)

## Scaling

In [61]:
# Standardise the room counts (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split and, unlike the encoder, is not saved — confirm this is intended.
scaler = StandardScaler()
columns = ['bed', 'bath']

# index=df.index is essential: df still carries the labels left over from the
# earlier row filtering. Without it the scaled frame would get a fresh
# 0..n-1 index, and the label-aligned log10 assignments below would attach
# values to the wrong rows (or NaN where no label matches).
numeric_df = pd.DataFrame(
    scaler.fit_transform(df[columns].astype('float64')),
    columns=columns,
    index=df.index,
)

# Log-transform the two size columns; NumPy floating-point problems are
# reported as warnings.
np.seterr(all='warn')
numeric_df["acre_lot"] = np.log10(df["acre_lot"])
numeric_df["house_size"] = np.log10(df["house_size"])

In [62]:
# Give both frames a fresh 0..n-1 index (the old labels are discarded).
numeric_df, df = (
    frame.reset_index(drop=True) for frame in (numeric_df, df)
)

In [71]:
# Overwrite the raw numeric columns in df with their transformed versions.
for _transformed in ("bed", "bath", "house_size", "acre_lot"):
    df[_transformed] = numeric_df[_transformed]

## One-Hot Encoding

In [73]:
# Report the number of distinct values (missing values count as one) for each
# categorical column that is about to be one-hot encoded.
for _label, _column in (
    ("status", "status"),
    ("cities", "city"),
    ("states", "state"),
    ("zipcodes", "zip_code"),
    ("sold", "sold"),
):
    print(f"Num of {_label}: ", df[_column].nunique(dropna=False))

Num of status:  2
Num of cities:  415
Num of states:  17
Num of zipcodes:  100
Num of sold:  2


In [8]:
# One-hot encode the categorical columns into a dense int8 DataFrame.
# Categories not seen during fitting are ignored at transform time instead of
# raising.
cols = ["city", "state", "zip_code", "status", "sold"]
OHE = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
    dtype='int8',
)
OHE = OHE.set_output(transform='pandas')
NewDF = OHE.fit_transform(df[cols])

In [9]:
# Persist the fitted encoder next to the notebook.
with open('encoder.pkl', mode='wb') as encoder_file:
    pickle.dump(OHE, encoder_file)

## Finalizing the Dataframe

In [75]:
# Attach the transformed numeric features to the encoded frame ...
for _numeric_column in ('bed', 'bath', 'acre_lot', 'house_size'):
    NewDF[_numeric_column] = df[_numeric_column]

# ... followed by the target, rounded to whole units.
NewDF['price'] = df['price'].round(decimals=0)

NewDF = NewDF.reset_index(drop=True)

## Training & Test sets

In [81]:
def create_bins(df, bins=None):
    """Assign every value of a numeric Series to one of six labelled bands.

    Parameters
    ----------
    df : pandas.Series
        Values to band. Despite the name, a Series is expected.
    bins : sequence of 7 increasing numbers, optional
        Explicit band edges. When omitted, the edges are the
        0 / 10 / 25 / 50 / 75 / 90 / 100 % quantiles of ``df`` itself.
        Passing the edges computed on one dataset (e.g. the training prices)
        lets another dataset be banded with identical boundaries.

    Returns
    -------
    pandas.Series
        Categorical Series aligned with ``df``. Values that are missing, or
        that fall outside explicitly supplied edges, stay NaN.

    Notes
    -----
    The label texts are fixed strings; they are not derived from the edges
    actually used.
    """
    labels = [
        "1k - 89k",
        "90k - 187k",
        "188k - 373k",
        "374k - 699k",
        "700k - 1424k",
        "1425k - 875000k",
    ]
    if bins is None:
        bins = df.quantile([0, 0.1, 0.25, 0.5, 0.75, 0.9, 1]).tolist()
    # include_lowest=True places the minimum in the first band directly, so
    # no blanket fillna is needed, and genuinely missing values are not
    # silently relabelled as the cheapest band.
    return pd.cut(df, bins=bins, labels=labels, include_lowest=True)

In [82]:
# Separate the target from the feature matrix.
label = NewDF["price"]
features = NewDF.drop(columns=['price'])

# 80/20 split with a fixed seed; every part gets a fresh 0..n-1 index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(features, label, test_size=0.2, random_state=42)
)

# Banded versions of the targets.
# NOTE(review): each call derives its band edges from the Series it receives,
# so the train and test bands need not share boundaries — confirm this is
# intended.
y_test_binned = create_bins(y_test)
y_train_binned = create_bins(y_train)

In [85]:
# Build the export frames from copies of the feature matrices. A plain
# `Train = X_train` would only create a second name for the same object, so
# adding the label columns would also write them into X_train / X_test and
# leak the target into anything that later uses those as features.
Train = X_train.copy()
Train['label'] = y_train
Train['binned_label'] = y_train_binned

Test = X_test.copy()
Test['label'] = y_test
Test['binned_label'] = y_test_binned

In [86]:
# Write both sets to the working directory. The row index is written too, as
# the first (unnamed) column.
for _frame, _destination in ((Train, "Train_Set.csv"), (Test, "Test_Set.csv")):
    _frame.to_csv(_destination)