In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import random

In [2]:
# Load the cleaned dataset and discard the two columns that are not needed here:
# the 'Unnamed: 0' column and the 'date' column.
df = (
    pd.read_csv('../../data/final_clean_data.csv')
    .drop(columns=['Unnamed: 0', 'date'])
)

In [3]:
# Target is the sale price; predictors are the six columns listed below.
y = df['price']
x = df[['sqft_living', 'floors', 'sqft_basement', 'yr_built', 'bedrooms', 'grade']]

# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=23,
)

In [4]:
# Count how many rows of the full dataset fall into each grade category.
df['grade'].value_counts()

7 Average        8520
8 Good           5656
9 Better         2330
6 Low Average    1936
10 Very Good      947
Name: grade, dtype: int64

In [4]:
# Display the training predictors to check the selected columns and the raw grade labels.
x_train

Unnamed: 0,sqft_living,floors,sqft_basement,yr_built,bedrooms,grade
7662,3020,2.0,0.0,1989,3,9 Better
12756,2660,1.0,910.0,1977,4,8 Good
17759,3070,2.0,0.0,2005,4,10 Very Good
19012,2890,1.0,1060.0,1976,6,8 Good
477,1250,1.0,0.0,1964,3,7 Average
...,...,...,...,...,...,...
6175,1820,1.5,0.0,1921,4,8 Good
9704,1890,2.0,0.0,1993,3,8 Good
11190,760,1.0,0.0,1941,2,6 Low Average
9256,2760,1.0,1380.0,1964,5,7 Average


In [5]:
# One-hot encode the categorical column(s); every other column is passed through unchanged.
# drop='first' omits one dummy column per categorical feature, and handle_unknown='error'
# makes transform raise if it later sees a category that was absent during fitting.
cat_cols = ['grade']
encoder = OneHotEncoder(
    handle_unknown='error',
    drop='first',
    categories='auto',
)
ct = ColumnTransformer(
    transformers=[('ohe', encoder, cat_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)
# Fit on the training split only and encode it in the same step.
x_train_enc = ct.fit_transform(x_train)

In [6]:
# Count how many rows of the training split fall into each grade category.
x_train['grade'].value_counts()

7 Average        6338
8 Good           4250
9 Better         1783
6 Low Average    1465
10 Very Good      705
Name: grade, dtype: int64

In [7]:
# Standardize the encoded training matrix; the scaler is fitted on training data only
# so the same fitted object can be reused to transform new rows later.
scaler = StandardScaler().fit(x_train_enc)
x_train_scaled = scaler.transform(x_train_enc)

In [8]:
# Fit an ordinary linear regression of price on the scaled, encoded training features.
lr = LinearRegression().fit(x_train_scaled, y_train)

In [9]:
# Training-set fallbacks used when a feature value is not supplied:
# means for the two area columns, medians for floors / year built / bedrooms,
# and the most frequent label for grade.
mean_living = x_train.sqft_living.mean()
mean_basement = x_train.sqft_basement.mean()

median_floors = x_train.floors.median()
median_yr = x_train.yr_built.median()
median_bedrooms = x_train.bedrooms.median()

mode_grade = x_train.grade.mode().values[0]

# Numeric fallbacks only, listed in the order
# sqft_living, floors, sqft_basement, yr_built, bedrooms (grade is not included).
central_tendency = [
    mean_living,
    median_floors,
    mean_basement,
    median_yr,
    median_bedrooms,
]

In [10]:
def take_inputs():
    '''
    Interactively prompts for the six house features used by the model and returns them as a list
    ordered [sqft_living, floors, sqft_basement, yr_built, bedrooms, grade].

    The five numeric answers are converted to float. A blank answer, or one that cannot be read as a
    finite number, is replaced by the matching entry of `central_tendency`.

    The grade answer is compared (case-insensitively) with each grade label found in `x_train`. It is
    accepted if it equals the whole label ("7 Average"), the part before the first space ("7"), or the
    part after it ("average"). Anything else, including a blank answer, falls back to `mode_grade`.
    '''
    prompts = [
        "Enter the square foot living area of the house:\t",
        "Enter the number of floors the house has:\t",
        "Enter the square foot basement area of the house:\t",
        "Enter the year the house was built:\t",
        "Enter the number of bedrooms in the house:\t",
    ]

    # Prompts and central_tendency share the same feature order, so they can be walked together.
    feature_list = []
    for prompt, fallback in zip(prompts, central_tendency):
        answer = input(prompt).strip()
        try:
            value = float(answer)
        except ValueError:
            value = None
        if value is None or not np.isfinite(value):
            # Stay silent for a blank answer (that is the documented way to ask for the default),
            # but tell the user when something they typed had to be discarded.
            if answer:
                print(f"Could not read '{answer}' as a number, using {float(fallback)} in place.")
            value = float(fallback)
        feature_list.append(value)

    grade_answer = str(input("Enter the grade of the house:\t")).strip().lower()

    # Match on whole tokens rather than substrings: a substring test would map "1" or "0"
    # onto "10 Very Good" and would let a blank answer match every label.
    grade_val = None
    if grade_answer:
        for unique_grade in [*x_train['grade'].value_counts().index]:
            number, _, name = unique_grade.partition(' ')
            if grade_answer in (unique_grade.lower(), number.lower(), name.lower()):
                grade_val = unique_grade
                break

    if grade_val is None:
        grade_val = mode_grade
        # Report the actual fallback instead of a hard-coded label.
        print(f"Value not in model, using '{mode_grade}' in place.")
    else:
        print(f"Using {grade_val}")
    return [*feature_list, grade_val]

In [11]:
def predict_price(fitted_ct,
                  fitted_scaler,
                  fitted_lr,
                  sqft_living = mean_living,
                  floors = median_floors,
                  sqft_basement = mean_basement,
                  yr_built = median_yr,
                  bedrooms = median_bedrooms,
                  grade = mode_grade
                 ):
    '''
    Takes in information about a house and uses the linear regression model, column transformer, and scaler passed to it
    to predict the value of a house matching the values passed in.
    If a value is not passed to the function it will use a measure of central tendency depending on the column.

    Parameters:
        fitted_ct: already-fitted transformer whose .transform() encodes the feature dataframe.
        fitted_scaler: already-fitted scaler whose .transform() is applied to the encoded row.
        fitted_lr: already-fitted regressor whose .predict() produces the price.
        sqft_living, floors, sqft_basement, yr_built, bedrooms, grade: feature values for the house.
            Their defaults are captured when this function is defined, not when it is called.

    Displays the single-row dataframe, prints the predicted price truncated to an integer, and returns None.
    '''
    # create a single row dataframe to test the model on and get the price prediction
    # (every value is wrapped in a list so each column is built the same way)
    test_df = pd.DataFrame({'sqft_living': [sqft_living],
                            'floors': [floors],
                            'sqft_basement': [sqft_basement],
                            'yr_built': [yr_built],
                            'bedrooms': [bedrooms],
                            'grade': [grade]
                           })
    display(test_df)
    # encode categorical values
    test_df_enc = fitted_ct.transform(test_df)

    # scale data
    test_df_scaled = fitted_scaler.transform(test_df_enc)

    # run the linear regression that was passed in (not the notebook-level `lr`) and get the prediction
    prediction = fitted_lr.predict(test_df_scaled)

    print(F"\nPredicted price of this house:\t{int(prediction[0])}.")
    return

In [12]:
# Show the training-split grade counts again; these are the labels the grade prompt is matched against.
x_train['grade'].value_counts()

7 Average        6338
8 Good           4250
9 Better         1783
6 Low Average    1465
10 Very Good      705
Name: grade, dtype: int64

In [38]:
# Pick one random row of the test split and show its features next to its actual price.
# random.randrange excludes its upper bound, so the result is always a valid positional index.
# The earlier random.randint(0, 4848) included 4848 itself, which is one past the last row
# whenever the test split has 4848 rows, and would also go stale if the data changed.
rand_int = random.randrange(len(x_test))
display(x_test.iloc[rand_int],
        y_test.iloc[rand_int])

sqft_living           2200
floors                 1.5
sqft_basement            0
yr_built              1971
bedrooms                 4
grade            7 Average
Name: 4811, dtype: object

522500.0

In [39]:
# Prompt for the house details, then pass the fitted transformer, scaler and model
# followed by those details, positionally, to the prediction helper.
user_input = take_inputs()
function_input = [ct, scaler, lr] + list(user_input)
predict_price(*function_input)

Enter the square foot living area of the house:	2200
Enter the number of floors the house has:	1.5
Enter the square foot basement area of the house:	0
Enter the year the house was built:	1971
Enter the number of bedrooms in the house:	4
Enter the grade of the house:	7
Using 7 Average


Unnamed: 0,sqft_living,floors,sqft_basement,yr_built,bedrooms,grade
0,2200,1.5,0,1971,4,7 Average



Predicted price of this house:	416022.
