# Real Estate Pricing Tier Classification by Thai, Laxmi, and Daniel

### Data Cleaning & Preparation - Thai

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [12]:
# Clean the raw realtor listings: drop irrelevant columns, remove rows with
# missing values, rename columns, and save the result.
#
# The untouched data is kept in `real_estate_raw`; the cleaned frame is bound to
# `real_estate`, the same name this cell has always produced.

# Load the dataset
real_estate_raw = pd.read_csv('realtor-data.zip.csv')

# Print the first few rows of the original dataset
print("Original dataset:")
print(real_estate_raw.head())

# Print all columns in the original dataset
print("\nAll columns in the original dataset:")
print(real_estate_raw.columns)

# Display the data types of each column
print("\nData types of each column:")
print(real_estate_raw.dtypes)

# Check the dimensions of the dataset
print("\nNumber of rows and columns in the dataset:", real_estate_raw.shape)

# Check for missing values
print("\nMissing values in the dataset:")
print(real_estate_raw.isnull().sum())

# Columns that are not used after cleaning.
irrelevant_columns = ['prev_sold_date']

# Define the list of categorical columns (optional based on algorithms)
# categorical_columns = ['status', 'city', 'state']  # Add more columns if needed

# Convert categorical variables into numerical representations using Label Encoding
# label_encoder = LabelEncoder()
# for col in categorical_columns:
    # real_estate[col] = label_encoder.fit_transform(real_estate[col])

# Define a dictionary to map old column names to new column names for the columns to be renamed
column_mapping = {
    'acre_lot': 'lot_size',
    'zip_code': 'zipcode',
    'price': 'sales_price'
}

# Order matters: the irrelevant columns are removed BEFORE dropna(). Calling
# dropna() first would discard every listing whose prev_sold_date is missing
# (all five rows in the head() preview are), even though that column is thrown
# away anyway. After the drop, only gaps in the retained columns remove a row.
real_estate = (
    real_estate_raw
    .drop(columns=irrelevant_columns)
    .dropna()
    .rename(columns=column_mapping)
)

# Print cleaned dataset
print("\nCleaned dataset:")
print(real_estate.head())

# Print the last few rows of the cleaned dataset
print("\nLast few rows of the cleaned dataset:")
print(real_estate.tail())

# Print the number of rows in the cleaned dataset
print("\nNumber of rows in the cleaned dataset:", real_estate.shape)

# Save the cleaned dataset
real_estate.to_csv('cleaned_realtor_data.csv', index=False)

Original dataset:
     status  bed  bath  acre_lot        city        state  zip_code  \
0  for_sale  3.0   2.0      0.12    Adjuntas  Puerto Rico     601.0   
1  for_sale  4.0   2.0      0.08    Adjuntas  Puerto Rico     601.0   
2  for_sale  2.0   1.0      0.15  Juana Diaz  Puerto Rico     795.0   
3  for_sale  4.0   2.0      0.10       Ponce  Puerto Rico     731.0   
4  for_sale  6.0   2.0      0.05    Mayaguez  Puerto Rico     680.0   

   house_size prev_sold_date     price  
0       920.0            NaN  105000.0  
1      1527.0            NaN   80000.0  
2       748.0            NaN   67000.0  
3      1800.0            NaN  145000.0  
4         NaN            NaN   65000.0  

All columns in the original dataset:
Index(['status', 'bed', 'bath', 'acre_lot', 'city', 'state', 'zip_code',
       'house_size', 'prev_sold_date', 'price'],
      dtype='object')

Data types of each column:
status             object
bed               float64
bath              float64
acre_lot          flo

### Exploratory Data Analysis - Laxmi