In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [48]:
# Load the raw listings table from disk, then print its schema summary
# (column names, dtypes and memory footprint).
df = pd.read_csv(filepath_or_buffer='Old_Dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2001666 entries, 0 to 2001665
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   status          object 
 1   bed             float64
 2   bath            float64
 3   acre_lot        float64
 4   city            object 
 5   state           object 
 6   zip_code        float64
 7   house_size      float64
 8   prev_sold_date  object 
 9   price           float64
dtypes: float64(6), object(4)
memory usage: 152.7+ MB


Null Percentage:

In [49]:
# Percentage of missing values in each column, relative to the row count.
nulls_percent = (
    df.isna()        # True wherever a value is missing
      .sum()         # missing count per column
      .mul(100)
      .div(len(df))  # scale by the total number of rows
)
print(nulls_percent)

status             0.000000
bed               16.914210
bath              17.122037
acre_lot          20.921372
city               0.015037
state              0.000000
zip_code           0.025878
house_size        32.233150
prev_sold_date    49.691457
price              0.013539
dtype: float64


No feature has a null percentage above 70%, so the missing values can be handled rather than dropping any column from the dataset.

These statistics will be useful for encoding the categorical data (OHE)

Correlation

In [51]:
# Report the correlation (as a percentage, 2 decimals) between every
# non-object column among the first nine columns and the "price" column.
for feature in df.columns[:9]:
    if df[feature].dtype == "object":
        continue  # skip text columns; only non-object dtypes are correlated
    corr_pct = round(df[feature].corr(df["price"]) * 100, 2)
    print("Correlation between", feature, "and the label \"price\": ", corr_pct)

Correlation between bed and the label "price":  16.01
Correlation between bath and the label "price":  27.79
Correlation between acre_lot and the label "price":  -0.1
Correlation between zip_code and the label "price":  -7.42
Correlation between house_size and the label "price":  14.15


No feature is highly correlated with the label, so none of them is redundant. It is important to note, however, that the city, the state and the zip code are strongly related to one another, since the city and the state can be determined from the zip code. This will be dealt with later on.

Fixing Data Types

In [52]:
# Let pandas pick the best nullable dtype for the whole-number columns.
# convert_dtypes() takes boolean flags, not a target type: the previous
# `convert_dtypes(int)` only worked because `int` was bound to the truthy
# `infer_objects` flag, so the call is made without arguments here.
for column in ("bed", "bath", "zip_code"):
    df[column] = df[column].convert_dtypes()

# Render zip codes as zero-padded strings of at least 5 characters
# (e.g. 601 -> "00601"), using the vectorised string accessor.
# NOTE(review): in the run recorded in this notebook, astype(str) turned
# missing zip codes into the literal text "<NA>" (padded to "0<NA>"), so the
# fillna('') has nothing left to fill. The later zip_code cell relies on that
# text (it replaces "A>"), so the behaviour is intentionally left unchanged.
df['zip_code'] = df['zip_code'].astype(str).fillna('').str.zfill(5)

df.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3,2,0.12,Adjuntas,Puerto Rico,601,920.0,,105000.0
1,for_sale,4,2,0.08,Adjuntas,Puerto Rico,601,1527.0,,80000.0
2,for_sale,2,1,0.15,Juana Diaz,Puerto Rico,795,748.0,,67000.0
3,for_sale,4,2,0.1,Ponce,Puerto Rico,731,1800.0,,145000.0
4,for_sale,6,2,0.05,Mayaguez,Puerto Rico,680,,,65000.0


We prepare `zip_code` by keeping only its last two digits: the first three digits indicate the state and city, which are already present in the dataset. Null values are set to the sentinel value 100 (after the string conversion above they show up as "A>" once the first three characters are removed).

In [68]:
# Keep only the characters after the first three of each zero-padded zip
# code. The slice is applied only to values that still have 5+ characters, so
# re-running this cell leaves already-shortened codes (and the "100"
# sentinel) untouched instead of truncating them to empty strings.
zip_codes = df['zip_code']
needs_shortening = zip_codes.str.len() >= 5
df['zip_code'] = zip_codes.mask(needs_shortening, zip_codes.str[3:])

# Missing zip codes reach this cell as the text "0<NA>", so after the slice
# they read "A>"; map that artefact to the sentinel value "100".
df['zip_code'] = df['zip_code'].replace("A>", "100")
df.head()

Unnamed: 0,bed,bath,acre_lot,house_size,status,city,state,zip_code,Sold,price
0,3,2,0.12,920.0,for_sale,Adjuntas,Puerto Rico,1,0,105000.0
1,4,2,0.08,1527.0,for_sale,Adjuntas,Puerto Rico,1,0,80000.0
2,2,1,0.15,748.0,for_sale,Juana Diaz,Puerto Rico,95,0,67000.0
3,4,2,0.1,1800.0,for_sale,Ponce,Puerto Rico,31,0,145000.0
4,6,2,0.05,2143.26,for_sale,Mayaguez,Puerto Rico,80,0,65000.0


We replace `prev_sold_date` with a binary `Sold` feature (1/0) indicating whether the house has a previous sale date or not.


In [61]:
# Derive the binary "Sold" flag (1 if a previous sale date is present, else 0)
# and drop the raw date column. The guard makes the cell safe to re-run:
# once "prev_sold_date" has been removed there is nothing left to derive.
if "prev_sold_date" in df.columns:
    df["Sold"] = df["prev_sold_date"].notna().astype("int64")
    df = df.drop(columns=["prev_sold_date"])

# Reorder by column name (numeric features, categorical features, Sold, then
# the "price" label last). Selecting by name instead of by position fails
# loudly if a column is missing rather than silently picking the wrong one.
df = df[[
    "bed", "bath", "acre_lot", "house_size",
    "status", "city", "state", "zip_code",
    "Sold", "price",
]]
df.head()

Unnamed: 0,bed,bath,acre_lot,house_size,status,city,state,zip_code,Sold,price
0,3,2,0.12,920.0,for_sale,Adjuntas,Puerto Rico,1,0,105000.0
1,4,2,0.08,1527.0,for_sale,Adjuntas,Puerto Rico,1,0,80000.0
2,2,1,0.15,748.0,for_sale,Juana Diaz,Puerto Rico,95,0,67000.0
3,4,2,0.1,1800.0,for_sale,Ponce,Puerto Rico,31,0,145000.0
4,6,2,0.05,,for_sale,Mayaguez,Puerto Rico,80,0,65000.0


We will handle the nulls in the numerical features by replacing them with the column mean (truncated to an integer for `bed` and `bath`).

In [66]:
# Column means used as fill values; the bed/bath means are truncated to
# whole numbers with int().
Bed_Mean = int(df['bed'].mean())
Bath_Mean = int(df['bath'].mean())
Acre_Mean = df['acre_lot'].mean()
Size_Mean = df['house_size'].mean()

# Replace the missing entries of each numeric feature with its mean.
fill_values = {
    'bed': Bed_Mean,
    'bath': Bath_Mean,
    'acre_lot': Acre_Mean,
    'house_size': Size_Mean,
}
for column, fill_value in fill_values.items():
    df[column] = df[column].fillna(fill_value)

# Re-check the share of missing values per column after imputation.
nulls_percent = df.isnull().sum()*100 / df.shape[0]
print(nulls_percent)

# Keep two decimals on the continuous features.
for column in ('acre_lot', 'house_size'):
    df[column] = df[column].round(2)
df.head()

bed           0.000000
bath          0.000000
acre_lot      0.000000
house_size    0.000000
status        0.000000
city          0.015037
state         0.000000
zip_code      0.000000
Sold          0.000000
price         0.013539
dtype: float64


Unnamed: 0,bed,bath,acre_lot,house_size,status,city,state,zip_code,Sold,price
0,3,2,0.12,920.0,for_sale,Adjuntas,Puerto Rico,1,0,105000.0
1,4,2,0.08,1527.0,for_sale,Adjuntas,Puerto Rico,1,0,80000.0
2,2,1,0.15,748.0,for_sale,Juana Diaz,Puerto Rico,95,0,67000.0
3,4,2,0.1,1800.0,for_sale,Ponce,Puerto Rico,31,0,145000.0
4,6,2,0.05,2143.26,for_sale,Mayaguez,Puerto Rico,80,0,65000.0


One-Hot Encoding

In [54]:
# Number of distinct values in each categorical column, printed ahead of
# one-hot encoding. The first label previously read "cities" although it
# counts the "status" column. nunique(dropna=False) counts a missing value as
# its own category, which matches the former len(Series.unique()).
for label, column in (
    ("statuses", "status"),
    ("cities", "city"),
    ("states", "state"),
    ("zipcodes", "zip_code"),
):
    print(f"Num of {label}: ", df[column].nunique(dropna=False))

Num of cities:  5000
Num of states:  24
Num of zipcodes:  101
