# Importing Packages

In [1]:
import json
import math

import pandas as pd
from joblib import load

import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# may also hide library version-mismatch warnings emitted when the saved
# scalers/models are loaded further down — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Loading and Pre-processing Data

In [2]:
# Read the new user-supplied property record(s) to be priced.
# The path is relative, so the CSV must sit in the notebook's working directory.
user_data = pd.read_csv("New User Data.csv")

In [3]:
# Preview the raw input before any encoding or scaling is applied.
user_data.head()

Unnamed: 0,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,...,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,Lilburn,GA,Village Green,Ct,30047,33.873543,-84.117197,Single Family Residential,R100-Single Family Residence,0.46,...,1971,1,6,4,2,Forced air,Yes,Crawl Space/Raised,Masonry,2021


In [4]:
# Keep the raw lot size (NumPy array) before the column is standardized later in
# the notebook; it is used near the end to compute the price per square foot.
# NOTE(review): the name is misspelled ("Sizq") but a later cell references it
# by this exact name, so it is left unchanged here.
lotSizqSqFt = user_data["lotSizeSquareFeet"].values

In [5]:
# Loading the Encoding Database to encode the categorical data into numbers.
# It is indexed below as encodings_database[column_name][lower-cased value] -> code.

file_path = "../Output/encodings_database.json"
# The context manager closes the file handle as soon as the JSON is parsed
# (previously the handle was left open for the life of the kernel).
with open(file_path, "r", encoding="utf-8") as file:
    encodings_database = json.load(file)

In [6]:
# Columns holding text categories; these are label-encoded via the encodings database.
categorical_cols = [
    "city",
    "state",
    "street",
    "streetSuffix",
    "landUseDescription",
    "zoningDescription",
    "lotTopography",
    "architecturalStyle",
    "condition",
    "heating",
    "airConditioning",
    "foundation",
    "fireplace",
]

# Columns holding numbers; these are passed to the standard scaler.
numerical_cols = [
    "zip",
    "latitude",
    "longitude",
    "lotSizeAcres",
    "lotSizeSquareFeet",
    "yearBuilt",
    "totalStories",
    "totalRooms",
    "bedrooms",
    "baths",
    "soldYear",
]

In [7]:
def encode_state(state):
    '''
    Look up the numeric code of a state in the encodings database.

    The lookup is case-insensitive: the input is lower-cased before it is
    matched against the keys of encodings_database["state"].

    Args:
        state: str
            - state value as entered by the user
            - any non-string value (e.g. the float NaN pandas uses for a
              missing cell) is treated as invalid instead of raising

    Returns:
        state_code: int
            - -1 for invalid
            - otherwise the code stored for that state in
              encodings_database["state"]
    '''
    # A missing CSV cell is not a str and has no .lower(); report it as
    # invalid, as the contract above promises, rather than crashing.
    if not isinstance(state, str):
        print("Please enter one of the states of USA!")
        return -1

    state_codes = encodings_database["state"]
    state_key = state.lower()
    if state_key not in state_codes:
        print("Please enter one of the states of USA!")
        return -1

    print("State Encoded")
    return state_codes[state_key]

In [8]:
# Validate and encode the state given in the first input row (-1 means invalid)
first_state = list(user_data["state"])[0]
state_code = encode_state(first_state)
print(state_code)

State Encoded
9


In [9]:
def validate_numerical_data(num_col_val):
    '''
    Check that a value from a numerical column really is a usable number.

    Args:
        num_col_val: str or number
            - e.g. '56', 56 or 33.87
    Returns:
        valid_number_flag: bool
            - True if the value converts to a finite real number
            - False otherwise (text, None, NaN such as pandas uses for a
              missing cell, or +/- infinity)
    '''
    try:
        parsed_value = float(num_col_val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "not a number"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        parsed_value = None

    # float() happily accepts NaN and infinity, but neither is a real number
    # that can be fed to the scaler/model.
    if parsed_value is None or not math.isfinite(parsed_value):
        print("Please enter a valid number!")
        return False

    print("Numerical Data: Detected and Validated")
    return True

In [10]:
# Validate the first-row value of every numerical column and echo the result
for col in numerical_cols:
    first_value = list(user_data[col])[0]
    valid_number_flag = validate_numerical_data(first_value)
    print(first_value)
    print(valid_number_flag)
    print("\n")

Numerical Data: Detected and Validated
30047
True


Numerical Data: Detected and Validated
33.873543
True


Numerical Data: Detected and Validated
-84.11719699999999
True


Numerical Data: Detected and Validated
0.46
True


Numerical Data: Detected and Validated
20037.6
True


Numerical Data: Detected and Validated
1971
True


Numerical Data: Detected and Validated
1
True


Numerical Data: Detected and Validated
6
True


Numerical Data: Detected and Validated
4
True


Numerical Data: Detected and Validated
2
True


Numerical Data: Detected and Validated
2021
True




In [11]:
def validate_categorical_data(cat_col_val):
    '''
    Check that a value from a categorical column really is text.

    Args:
        cat_col_val: str
            - actual string value
    Returns:
        valid_categorical_flag: bool
            - True if the value is a non-blank string that does not parse
              as a number
            - False if it parses as a number, is empty or whitespace-only,
              or is not a string at all (None, list, ...)
    '''
    try:
        float(cat_col_val)
    except (TypeError, ValueError, OverflowError):
        # Not parseable as a number, so it may be categorical; it still has to
        # be an actual, non-blank string to count as valid.
        if not isinstance(cat_col_val, str) or not cat_col_val.strip():
            print("Please enter a non-empty text value!")
            return False
        print("Categorical Data: Detected and Validated!")
        return True

    # float() succeeded, so the value is numeric and does not belong here.
    print("You tried to enter numerical data in categorical column!")
    return False

In [12]:
# Validate the first-row value of every categorical column and echo the result
for col in categorical_cols:
    first_value = list(user_data[col])[0]
    valid_categorical_flag = validate_categorical_data(first_value)
    print(first_value)
    print(valid_categorical_flag)
    print("\n")

Categorical Data: Detected and Validated!
Lilburn
True


Categorical Data: Detected and Validated!
GA
True


Categorical Data: Detected and Validated!
Village Green
True


Categorical Data: Detected and Validated!
Ct
True


Categorical Data: Detected and Validated!
Single Family Residential
True


Categorical Data: Detected and Validated!
R100-Single Family Residence
True


Categorical Data: Detected and Validated!
Low Elevation
True


Categorical Data: Detected and Validated!
Ranch/Rambler
True


Categorical Data: Detected and Validated!
Average
True


Categorical Data: Detected and Validated!
Forced air
True


Categorical Data: Detected and Validated!
Yes
True


Categorical Data: Detected and Validated!
Crawl Space/Raised
True


Categorical Data: Detected and Validated!
Masonry
True




In [13]:
# Show the frame as it is before the categorical columns are label-encoded.
user_data

Unnamed: 0,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,...,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,Lilburn,GA,Village Green,Ct,30047,33.873543,-84.117197,Single Family Residential,R100-Single Family Residence,0.46,...,1971,1,6,4,2,Forced air,Yes,Crawl Space/Raised,Masonry,2021


In [14]:
# Replace each categorical column with the integer code of its first-row value.
# Only row 0 is read; the resulting scalar is broadcast to the whole column.
for name in categorical_cols:
    raw_value = user_data[name].values[0]
    # A non-string here means a missing value, or that this cell was re-run
    # after the column had already been encoded.
    if not isinstance(raw_value, str):
        raise TypeError(
            f"Column {name!r} holds {raw_value!r}, expected a text category "
            "(missing value, or column already encoded?)"
        )
    column_codes = encodings_database[name]
    lookup_key = raw_value.lower()
    if lookup_key not in column_codes:
        raise KeyError(
            f"Value {raw_value!r} of column {name!r} is not present in the encodings database"
        )
    user_data[name] = column_codes[lookup_key]

In [15]:
# Show the frame after label encoding: categorical columns now hold integer codes.
user_data

Unnamed: 0,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,...,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,11,9,8470,14,30047,33.873543,-84.117197,13,43,0.46,...,1971,1,6,4,2,1,1,0,0,2021


In [16]:
# Loading the MinMaxScalar to normalize the categorical features
# NOTE(review): joblib files are pickle-based, so only load artifacts from a
# trusted source; presumably this scaler was fitted during training — confirm.

filename = "../Output/minMaxScalar.joblib"
minMaxScalar = load(filename)

In [17]:
# MINMAX NORMALIZATION:

# Scale every encoded categorical column except "state" with the loaded scaler.
# A filtered copy is built instead of calling categorical_cols.remove("state"),
# so the shared column list stays intact and re-running this cell no longer
# raises ValueError on the second removal.
minmax_cols = [col_name for col_name in categorical_cols if col_name != "state"]
categorical_data = user_data[minmax_cols]

user_data[minmax_cols] = minMaxScalar.transform(categorical_data)

#Encoding state separately, since we did not have all the 50 states in the data
# NOTE(review): 49 is presumably the largest state code (50 states -> 0..49) — confirm.
# This in-place division is applied again each time the cell is executed.
user_data["state"] /= 49

In [18]:
# Show the frame after the categorical columns have been min-max scaled.
user_data

Unnamed: 0,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,...,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,0.55,0.183673,0.923261,0.215385,30047,33.873543,-84.117197,0.8125,0.494253,0.46,...,1971,1,6,4,2,0.25,1.0,0.0,0.0,2021


In [19]:
# Now we have all numerical values but in string format, so we convert all the values to numeric
# errors="coerce" turns any cell that cannot be parsed into NaN instead of
# raising, so a bad input value will surface later as NaN rather than here.
user_data = user_data.apply(pd.to_numeric, errors="coerce")

In [20]:
# Loading the StandardScalar to normalize the numerical features
# NOTE(review): `filename` is reused from the MinMax scaler cell above and now
# points at a different artifact; joblib files are pickle-based, so load only
# trusted files.

filename = "../Output/standardScalar.joblib"
standardScalar = load(filename)

In [21]:
# Standardize the numerical columns with the loaded scaler, overwriting them in place.
# NOTE(review): the columns are passed in numerical_cols order; presumably this
# matches the order the scaler was fitted with — confirm against the training notebook.
numerical_data = user_data[numerical_cols]

user_data[numerical_cols] = standardScalar.transform(numerical_data)

In [22]:
# Sanity check of the final feature matrix dimensions (rows, columns).
user_data.shape

(1, 24)

In [31]:
# Print every preprocessed column, one Series at a time, for inspection
for column_name, column_values in user_data.items():
    print(column_values)

0    0.55
Name: city, dtype: float64
0    0.183673
Name: state, dtype: float64
0    0.923261
Name: street, dtype: float64
0    0.215385
Name: streetSuffix, dtype: float64
0   -0.427423
Name: zip, dtype: float64
0    0.074451
Name: latitude, dtype: float64
0   -0.10837
Name: longitude, dtype: float64
0    0.8125
Name: landUseDescription, dtype: float64
0    0.494253
Name: zoningDescription, dtype: float64
0   -0.006182
Name: lotSizeAcres, dtype: float64
0   -0.006182
Name: lotSizeSquareFeet, dtype: float64
0    0.4
Name: lotTopography, dtype: float64
0    0.0
Name: condition, dtype: float64
0    1.0
Name: architecturalStyle, dtype: float64
0   -1.726295
Name: yearBuilt, dtype: float64
0   -1.10741
Name: totalStories, dtype: float64
0   -0.742079
Name: totalRooms, dtype: float64
0    0.573313
Name: bedrooms, dtype: float64
0   -1.007367
Name: baths, dtype: float64
0    0.25
Name: heating, dtype: float64
0    1.0
Name: airConditioning, dtype: float64
0    0.0
Name: foundation, dtype: floa

In [26]:
# Load a saved model from the notebook's working directory and inspect its type.
# NOTE(review): a later cell reloads rf_model from "../Output/final_rf_model.joblib",
# so this looks like an earlier experiment — confirm which artifact is intended.
rf_model = load("rf_model_95_86_randomState27.joblib")
type(rf_model)

sklearn.ensemble._forest.RandomForestRegressor

In [29]:
# Predict from the first row of the preprocessed frame (iloc[:1, :] keeps it 2-D).
# NOTE(review): the recorded run of this cell failed with
#   AttributeError: 'DecisionTreeRegressor' object has no attribute 'n_features_'
# presumably because the model was saved with a different scikit-learn version
# than the one installed here — confirm and re-export the model if so.
preds = rf_model.predict(user_data.iloc[:1,:])

AttributeError: 'DecisionTreeRegressor' object has no attribute 'n_features_'

In [None]:
# Inspect the preprocessed features as a plain NumPy array, the form passed to predict() below.
user_data.values

In [None]:
# Now the new test data is ready to feed into the Random Forest Regression Model
rf_model = load("../Output/final_rf_model.joblib")

# predict() returns a NumPy array with one value per input row. The built-in
# round() is not supported on arrays, so use the array's own round method.
preds = rf_model.predict(user_data.values)
preds = preds.round(3)
print("Predicted Housing Price:", preds)

In [None]:
# Dividing the predicted housing price by lotSizeSquareFeet
# lotSizqSqFt holds the raw lot size captured before scaling, so the result is a
# price per square foot of lot.
# NOTE(review): a lot size of 0 would give inf/NaN here — confirm inputs are positive.

predPerSqFt = preds/lotSizqSqFt

In [None]:
# Display the computed price per square foot.
predPerSqFt

In [None]:
# Load the amortization schedule that the predicted price is joined onto below.
# NOTE(review): this is a hard-coded absolute Google Drive path, unlike the
# relative "../Output/..." paths used elsewhere in this notebook; it only
# resolves where that drive is mounted — replace with a project-relative path.
amortization_table = pd.read_csv("/content/drive/MyDrive/Software Engg Regression Analysis/[LATEST] Manish New Data/Amortization_table.csv")

In [None]:
# Preview the amortization schedule as loaded.
amortization_table.head()

In [None]:
# The input frame holds a single property, so preds has one element. Assign it
# as a scalar so pandas broadcasts it to every row of the schedule; assigning
# the length-1 array raises ValueError when the table has more than one row.
# NOTE(review): if several properties are ever predicted at once, only the
# first prediction is used here.
amortization_table["Predicted House Price"] = preds[0]

In [None]:
# Preview the schedule with the predicted house price column added.
amortization_table.head()

In [None]:
# predPerSqFt is a one-element array (single input property); assign its value
# as a scalar so it is broadcast to every row instead of raising a length
# mismatch ValueError on a multi-row schedule.
amortization_table["Predicted Square Feet Price"] = predPerSqFt[0]

In [None]:
# Equity per row = predicted house price minus the principal still owed
predicted_price = amortization_table["Predicted House Price"]
remaining_principal = amortization_table["Remaining Principal"]
amortization_table["Equity Value"] = predicted_price - remaining_principal

In [None]:
# Display the full schedule with the prediction, per-square-foot and equity columns.
amortization_table

In [None]:
# Write the enriched schedule to the working directory; index=False leaves the
# row index out of the file. An existing file of the same name is overwritten.
amortization_table.to_csv("amortization_table_output.csv", index=False)