In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [28]:
df = pd.read_csv('../../data/final_clean_data.csv')
df = df.drop(columns=['Unnamed: 0','date'])

In [29]:
x = df[['sqft_living','bedrooms','grade']]
y = df['price']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state = 23)

In [30]:
cat_cols = ['grade']
encoder = OneHotEncoder(handle_unknown='error',drop='first',categories='auto')
ct = ColumnTransformer(transformers=[('ohe', encoder, cat_cols)],remainder='passthrough',sparse_threshold=0)
ct.fit(x_train)
x_train_enc = ct.transform(x_train)

In [31]:
scaler = StandardScaler()
scaler.fit(x_train_enc)
x_train_scaled = scaler.transform(x_train_enc)

In [32]:
lr = LinearRegression()
lr = lr.fit(x_train_scaled,y_train)

In [33]:
mean_sqft = x_train['sqft_living'].mean()
median_bedrooms = x_train['bedrooms'].median()
mode_grade = x_train['grade'].mode()

In [34]:
def take_inputs():
    sqft_living = input("Enter the sqare foot living area of the house:\t")
    bedrooms = input("Enter the number of bedrooms in the house:\t")
    grade = str(input("Enter the grade of the house:\t"))
    
    
    for unique_grade in [*x_train['grade'].value_counts().index]:
        if str(grade) in unique_grade:
            grade = unique_grade
    return [sqft_living, bedrooms, grade]

In [35]:
def predict_price(fitted_ct,
                  fitted_scaler,
                  fitted_lr,
                  sqft_living = mean_sqft,
                  bedrooms = median_bedrooms,
                  grade = mode_grade
                 ):
    '''
    Takes in information about a house and uses the linear regression model, column transformer, and scaler passed to it
    to predict the value of a house matching the values passed in.
    If a value is not passed to the function it will use a measure of central tendency depending on the column.
    '''
    # create a single row dataframe to test the model on and get the price prediction
    test_df = pd.DataFrame({'sqft_living': [sqft_living], 'bedrooms': [bedrooms], 'grade': [grade]})
    
    # encode categorical values
    test_df_enc = ct.transform(test_df)
    
    # scale data
    test_df_scaled = scaler.transform(test_df_enc)
    
    # run the linear regression and return the prediction
    prediction = lr.predict(test_df_scaled)
    
    print(F"\nPredicted price of this house:\t{int(prediction[0])}.")
    return

In [39]:
user_input = take_inputs()
function_input = [ct,scaler,lr,*user_input]
predict_price(*function_input)

Enter the sqare foot living area of the house:	1
Enter the number of bedrooms in the house:	1
Enter the grade of the house:	1

Predicted price of this house:	554426.
