# PROBLEM
#### DATA COLLECTION
#### DATA PREPROCESSING
#### FEATURE ENGINEERING?
#### TRAINING SPLIT
#### CREATING THE MODEL
#### TRAINING THE MODEL
#### HYPERPARAMETER TUNING
#### TEST DATA EVALUATION

Imports

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the listings from a CSV file in the working directory into df
df = pd.read_csv("realtor-data.csv")
# Preview the first rows (shown as the cell's output)
df.head()

In [None]:
# Summary of df: column names, non-null counts and dtypes
df.info()

# Data Cleaning

In [None]:
# Number of missing values in each column, largest first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# sns.heatmap(df.isnull(), yticklabels=False)

In [None]:
# Keep only listings that are for sale (rows with a missing status are removed too,
# exactly as the previous `!= 'for_sale'` drop did)
df = df[df['status'] == 'for_sale']
# Every remaining row has the same status, so the column carries no information
df = df.drop(columns='status')

# Price is the dependent variable; rows without it cannot be used
df = df.dropna(subset=['price'])

# Drop prev_sold_date, city, and zip_code, which are treated as unimportant here
df = df.drop(columns=['prev_sold_date', 'city', 'zip_code'])

# Remove rows with two or more NaN values.
# dropna's `thresh` is the minimum number of NON-NaN values a row needs in order
# to be kept, so "at most one NaN" means (number of columns - 1). The previous
# thresh=2 only required two non-NaN values and so did not match this intent.
df = df.dropna(thresh=df.shape[1] - 1)

In [None]:
# Count the missing values per column once, then report them both as absolute
# counts and as a percentage of all rows, largest first
null_counts = df.isnull().sum()
null_percentages = (null_counts / len(df)) * 100

print("Raw numbers")
print(null_counts.sort_values(ascending=False))
print("\nPercentages")
print(null_percentages.sort_values(ascending=False))

In [None]:
# Pass the whole DataFrame to seaborn's boxplot for a first look at spread and outliers
sns.boxplot(df)

In [None]:
# Rows missing bed or bath account for less than 15% of the 904966-entry
# dataset (figures as noted by the author), so they are dropped rather than imputed
required_cols = ['bed', 'bath']
df = df.dropna(subset=required_cols, how='any')

In [None]:
#Imputation based on mean per state for acre_lot and house_size

#acre_mean = df.groupby('state')['acre_lot'].mean()
#df['acre_lot'].fillna(df['state'].map(acre_mean), inplace = True)

#size_mean = df.groupby('state')['house_size'].mean()
#df['house_size'].fillna(df['state'].map(size_mean), inplace = True)

In [None]:
# Train a random forest that predicts 'acre_lot' from the other listing attributes.
# This cell only trains, evaluates and demonstrates the model; it does not write
# any predictions back into df.
feature_cols = ['bed', 'bath', 'state', 'house_size', 'price']

notnull_houseSize = df.dropna(subset=['house_size'])
# The target must be known to train and evaluate, so rows whose acre_lot is
# missing are excluded here (a regressor cannot be fitted on NaN targets)
labelled_rows = notnull_houseSize.dropna(subset=['acre_lot'])

# 'state' is text, but the mean imputer and the random forest need numeric input.
# Map each state name to an integer code (order of first appearance).
state_codes = {
    state: code
    for code, state in enumerate(labelled_rows['state'].dropna().unique())
}

X = labelled_rows[feature_cols].copy()
X['state'] = X['state'].map(state_codes)
y = labelled_rows['acre_lot']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill any remaining missing feature values with the training-set column means
# (statistics are learned on the training split only and reused for the test split)
preprocessor = SimpleImputer(strategy='mean')  # You can use other strategies as well
X_train_imputed = preprocessor.fit_transform(X_train)
X_test_imputed = preprocessor.transform(X_test)

# Create a Random Forest Regressor model
model = RandomForestRegressor(random_state=42)

# Fit the model to the training data
model.fit(X_train_imputed, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_imputed)

# Evaluate the model performance (optional)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Now, you can use the model to predict 'acre_lot' for new data.
# The example row uses a state that exists in the training data and goes through
# the same state encoding and imputation as the training features.
example_state = next(iter(state_codes))
new_data = pd.DataFrame({'bed': [3], 'bath': [2], 'state': [example_state], 'house_size': [1800], 'price': [250000]})
new_data['state'] = new_data['state'].map(state_codes)
new_data_imputed = preprocessor.transform(new_data[feature_cols])
prediction = model.predict(new_data_imputed)
print(f'Predicted acre_lot: {prediction[0]}')

In [None]:
# Box plot of the price column, to inspect its outliers before filtering them
sns.boxplot(df['price'])

In [None]:
# Removing price outliers with the 1.5 * IQR rule

Q1 = np.quantile(df['price'], 0.25)
Q3 = np.quantile(df['price'], 0.75)
IQR = Q3 - Q1

lower_range = Q1 - 1.5 * IQR
upper_range = Q3 + 1.5 * IQR

# Keep only rows whose price lies strictly between the two fences.
# A vectorized boolean mask selects the same rows as the previous approach
# (a Python list of every in-range price, misleadingly named `outliers`,
# followed by isin) without materialising that list.
within_range = (df['price'] > lower_range) & (df['price'] < upper_range)
df = df.loc[within_range]

In [None]:
# Re-check the per-column null counts after cleaning.
# NOTE(review): the original note here stated that no null values remain, but the
# visible cells above never fill acre_lot or house_size in df (the mean imputation
# is commented out and the model's predictions are not written back) — confirm.
remaining_nulls = df.isna().sum()
print(remaining_nulls.sort_values(ascending=False))

In [None]:
# Number of listings per state
df["state"].value_counts() 

In [None]:
# Georgia, Wyoming, and West Virginia together account for only 54 entries
# (count as noted by the author), so listings from those states are removed
sparse_states = ["Georgia", "Wyoming", "West Virginia"]
in_sparse_state = df['state'].isin(sparse_states)
df = df[~in_sparse_state]
print(df['state'].unique())

In [None]:
# Give every distinct state an integer code, numbered in order of first appearance

state_names = df['state'].unique()
state_mapping = dict(zip(state_names, range(len(state_names))))
df['state #'] = df['state'].map(state_mapping)

# Print the resulting state -> code table
for state_name, state_code in state_mapping.items():
    print(f"{state_name}: {state_code}")

# Kept for later use:
#   df = df.drop('state', axis = 1)
#   numeric_value_for_desired_state = state_mapping[desired_state]

# Data Visualization