# Real Estate Pricing Tier Classification by Thai, Laxmi, and Daniel

### Data Cleaning & Preparation - Thai

In [17]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [23]:
# Load the dataset. The cell re-reads the file on every run, so re-executing
# it always starts from the untouched source data.
real_estate = pd.read_csv('realtor-data.zip.csv')
print("Original dataset:")
print(real_estate.head())

# Print all columns
print("All columns in the dataset:")
print(real_estate.columns)

# Remove irrelevant columns (reassignment instead of inplace=True keeps the
# step chainable and avoids mutating a frame that was just displayed).
real_estate = real_estate.drop(columns=['prev_sold_date'])

# Handle missing values for numerical columns by imputing with the median.
# fillna() with a Series aligns on column names, so each column is filled
# with its own median.
# NOTE(review): 'price' is imputed here as well. If price is what the pricing
# tier is derived from, imputing it fabricates labels -- confirm whether rows
# with a missing price should be dropped instead.
numerical_columns = ['bed', 'bath', 'acre_lot', 'house_size', 'price']
real_estate[numerical_columns] = real_estate[numerical_columns].fillna(
    real_estate[numerical_columns].median()
)

# Handle missing values for categorical columns by imputing with the mode.
# mode() can return several values on ties; the first one is used.
categorical_columns = ['city', 'state']
for col in categorical_columns:
    most_frequent = real_estate[col].mode().iloc[0]
    real_estate[col] = real_estate[col].fillna(most_frequent)

# Convert categorical variables into numerical representations using Label
# Encoding (optional: based on algorithm). The integer codes carry no real
# ordering, so this suits models that do not assume one.
# One encoder is fitted per column and kept in `label_encoders`, so the codes
# can be decoded later, e.g. label_encoders['city'].inverse_transform(...).
# `label_encoder` is still bound to the last fitted encoder after the loop.
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    real_estate[col] = label_encoder.fit_transform(real_estate[col])
    label_encoders[col] = label_encoder

print("Clean dataset:")
print(real_estate.head())

# Save the cleaned dataset
real_estate.to_csv('cleaned_realtor_data.csv', index=False)

Original dataset:
     status  bed  bath  acre_lot        city        state  zip_code  \
0  for_sale  3.0   2.0      0.12    Adjuntas  Puerto Rico     601.0   
1  for_sale  4.0   2.0      0.08    Adjuntas  Puerto Rico     601.0   
2  for_sale  2.0   1.0      0.15  Juana Diaz  Puerto Rico     795.0   
3  for_sale  4.0   2.0      0.10       Ponce  Puerto Rico     731.0   
4  for_sale  6.0   2.0      0.05    Mayaguez  Puerto Rico     680.0   

   house_size prev_sold_date     price  
0       920.0            NaN  105000.0  
1      1527.0            NaN   80000.0  
2       748.0            NaN   67000.0  
3      1800.0            NaN  145000.0  
4         NaN            NaN   65000.0  
All columns in the dataset:
Index(['status', 'bed', 'bath', 'acre_lot', 'city', 'state', 'zip_code',
       'house_size', 'prev_sold_date', 'price'],
      dtype='object')
Clean dataset:
     status  bed  bath  acre_lot  city  state  zip_code  house_size     price
0  for_sale  3.0   2.0      0.12    36     2

### Exploratory Data Analysis - Laxmi