# Importing Packages

In [942]:
import pandas as pd
import json
from joblib import load

import warnings
warnings.filterwarnings("ignore")

In [943]:
# Mount Google Drive at /content/drive so the data and model files referenced
# below can be read (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading and Pre-processing Data

In [944]:
# Read the demo test data (one row per user id) from the mounted Drive.
user_data_2 = pd.read_csv("/content/drive/MyDrive/Software Engg Regression Analysis/[LATEST] Manish New Data/demo_test_data.csv")

In [945]:
# Id of the user to run the pipeline for (the original note lists users 108 and 113).
user_id = 108

In [946]:
# Keep only the rows whose id matches the selected user, then display them.
is_selected_user = user_data_2["id"] == user_id
user_data = user_data_2.loc[is_selected_user]
user_data

Unnamed: 0,id,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,lotSizeSquareFeet,lotTopography,condition,architecturalStyle,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,108,Stone Mountain,GA,Dee,Ct,30087,33.857828,-84.167465,Single Family Residential,R100-Single Family Residence,0.41,17859.6,Level Grade,Average,Ranch/Rambler,1975,1,8,4,2,Forced air,Yes,Crawl Space/Raised,Masonry,2021


In [947]:
# Average price per zip code, keyed by the integer zip.
zip_average_price = {30047: 224924.8452,
                     30087: 190259.9042}

# Take the scalar explicitly: calling int() on a one-element Series is deprecated
# in pandas 2.x and is slated to raise a TypeError.
# NOTE: the name `zip` shadows the builtin; it is kept because a later cell reads it.
zip = int(user_data["zip"].iloc[0])
print(f"User {user_id} belongs to the zip {zip}, and the average price for that zip is {zip_average_price[zip]}")

User 108 belongs to the zip 30087, and the average price for that zip is 190259.9042


In [948]:
# Capture the id as a plain int (via .iloc[0]; int() on a Series is deprecated in
# pandas 2.x), then keep the first matching row without the "id" column.
# The column is dropped by name rather than by position so a change in the CSV
# column order cannot silently remove a feature, and .copy() detaches the row
# from user_data_2 so the in-place encoding/scaling below writes to its own frame.
user_id = int(user_data["id"].iloc[0])
user_data = user_data.iloc[:1].drop(columns=["id"]).copy()

In [949]:
# Display the user id extracted from the data.
user_id

108

In [950]:
# Display the single-row feature frame (the id column has been removed).
user_data

Unnamed: 0,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,lotSizeSquareFeet,lotTopography,condition,architecturalStyle,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,Stone Mountain,GA,Dee,Ct,30087,33.857828,-84.167465,Single Family Residential,R100-Single Family Residence,0.41,17859.6,Level Grade,Average,Ranch/Rambler,1975,1,8,4,2,Forced air,Yes,Crawl Space/Raised,Masonry,2021


In [951]:
# Keep the raw lot size (as a NumPy array) before the numerical columns are
# overwritten by the scaler; it is used later to compute the price per square foot.
lotSizqSqFt = user_data["lotSizeSquareFeet"].values

In [952]:
# Loading the Encoding Database to encode the categorical data into numbers

file_path = "/content/drive/MyDrive/Software Engg Regression Analysis/[LATEST] Manish New Data/encodings_database.json"
# The context manager closes the file handle as soon as the JSON is parsed
# (a bare open() left it open for the lifetime of the kernel).
with open(file_path, "r") as file:
    encodings_database = json.load(file)

In [953]:
# Feature columns holding categorical values (encoded via the encodings database).
# The order is significant: the scalers are applied to the columns in this order.
categorical_cols = [
    "city",
    "state",
    "street",
    "streetSuffix",
    "landUseDescription",
    "zoningDescription",
    "lotTopography",
    "architecturalStyle",
    "condition",
    "heating",
    "airConditioning",
    "foundation",
    "fireplace",
]

# Feature columns holding numerical values.
numerical_cols = [
    "zip",
    "latitude",
    "longitude",
    "lotSizeAcres",
    "lotSizeSquareFeet",
    "yearBuilt",
    "totalStories",
    "totalRooms",
    "bedrooms",
    "baths",
    "soldYear",
]

In [954]:
def encode_state(state):
    '''
    Return the numeric code stored for a state in the encodings database.

    The lookup is case-insensitive: the input is lower-cased before it is
    matched against the keys of encodings_database["state"].

    Args:
        state: str
            - state value as it appears in the data (e.g. 'GA')

    Returns:
        state_code: int
            - -1 for invalid input (unknown state, or a non-string value
              such as None or NaN)
            - otherwise the code stored in the encodings database
    '''
    # A missing value arrives from pandas as None/NaN, which has no .lower();
    # treat any non-string as invalid instead of raising AttributeError.
    state_key = state.lower() if isinstance(state, str) else None

    if state_key is None or state_key not in encodings_database["state"]:
        print("Please enter one of the states of USA!")
        return -1

    print("State Encoded")
    return encodings_database["state"][state_key]

In [955]:
# Encode the state of the selected row; a result of -1 means the state is unknown.
first_state = user_data["state"].iloc[0]
state_code = encode_state(first_state)
print(state_code)

State Encoded
9


In [956]:
def validate_numerical_data(num_col_val):
    '''
    Check that a value from a numerical column is a finite real number.

    Args:
        num_col_val: str | int | float
            - the value to check, e.g. '56' or 17859.6
    Returns:
        valid_number_flag: bool
            - True if the value converts to a finite float
            - False if it cannot be converted, or converts to NaN / infinity
              (a missing pandas value is NaN and must not pass as a number)
    '''
    import math  # local import: the notebook's import cell does not provide it

    try:
        number = float(num_col_val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "not a number"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        number = None

    valid_number_flag = number is not None and math.isfinite(number)
    if valid_number_flag:
        print("Numerical Data: Detected and Validated")
    else:
        print("Please enter a valid number!")

    return valid_number_flag

In [957]:
# Validate the first-row value of every numerical column and echo the result.
for col in numerical_cols:
    value = list(user_data[col])[0]
    valid_number_flag = validate_numerical_data(value)
    print(value)
    print(valid_number_flag)
    print("\n")

Numerical Data: Detected and Validated
30087
True


Numerical Data: Detected and Validated
33.857828000000005
True


Numerical Data: Detected and Validated
-84.167465
True


Numerical Data: Detected and Validated
0.41
True


Numerical Data: Detected and Validated
17859.6
True


Numerical Data: Detected and Validated
1975
True


Numerical Data: Detected and Validated
1
True


Numerical Data: Detected and Validated
8
True


Numerical Data: Detected and Validated
4
True


Numerical Data: Detected and Validated
2
True


Numerical Data: Detected and Validated
2021
True




In [958]:
def validate_categorical_data(cat_col_val):
    '''
    Check that a value from a categorical column is a usable, non-numeric value.

    Args:
        cat_col_val: str
            - actual string value
    Returns:
        valid_categorical_flag: bool
            - True if the value is non-empty and cannot be parsed as a number
            - False if it parses as a number, or is empty / None / blank
              (whitespace-only) text
    '''
    try:
        float(cat_col_val)
    except (TypeError, ValueError):
        # Expected path for text: the value is not a number.
        looks_numeric = False
    except OverflowError:
        # An integer too large for float is still numerical data.
        looks_numeric = True
    else:
        looks_numeric = True

    if looks_numeric:
        print("You tried to enter numerical data in categorical column!")
        return False

    # Reject empty values (None, '') and strings that contain only whitespace;
    # neither can name a category.
    is_blank_text = isinstance(cat_col_val, str) and not cat_col_val.strip()
    if is_blank_text or not cat_col_val:
        return False

    print("Categorical Data: Detected and Validated!")
    return True

In [959]:
# Validate the first-row value of every categorical column and echo the result.
for col in categorical_cols:
    value = list(user_data[col])[0]
    valid_categorical_flag = validate_categorical_data(value)
    print(value)
    print(valid_categorical_flag)
    print("\n")

Categorical Data: Detected and Validated!
Stone Mountain
True


Categorical Data: Detected and Validated!
GA
True


Categorical Data: Detected and Validated!
Dee
True


Categorical Data: Detected and Validated!
Ct
True


Categorical Data: Detected and Validated!
Single Family Residential
True


Categorical Data: Detected and Validated!
R100-Single Family Residence
True


Categorical Data: Detected and Validated!
Level Grade
True


Categorical Data: Detected and Validated!
Ranch/Rambler
True


Categorical Data: Detected and Validated!
Average
True


Categorical Data: Detected and Validated!
Forced air
True


Categorical Data: Detected and Validated!
Yes
True


Categorical Data: Detected and Validated!
Crawl Space/Raised
True


Categorical Data: Detected and Validated!
Masonry
True




In [960]:
# Display the feature row again before the categorical values are encoded.
user_data

Unnamed: 0,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,lotSizeSquareFeet,lotTopography,condition,architecturalStyle,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,Stone Mountain,GA,Dee,Ct,30087,33.857828,-84.167465,Single Family Residential,R100-Single Family Residence,0.41,17859.6,Level Grade,Average,Ranch/Rambler,1975,1,8,4,2,Forced air,Yes,Crawl Space/Raised,Masonry,2021


In [961]:
# Replace each categorical value with its code from the encodings database.
# Keys are looked up in lower case; an unseen value raises KeyError.
for name in categorical_cols:
    raw_value = user_data[name].values[0]
    column_codes = encodings_database[name]
    user_data[name] = column_codes[raw_value.lower()]

In [962]:
# Display the row after the categorical values were replaced by their codes.
user_data

Unnamed: 0,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,lotSizeSquareFeet,lotTopography,condition,architecturalStyle,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,18,9,2214,14,30087,33.857828,-84.167465,13,43,0.41,17859.6,1,0,5,1975,1,8,4,2,1,1,0,0,2021


In [963]:
# Load the saved min-max scaler used to normalize the encoded categorical features.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.

filename = "/content/drive/MyDrive/Software Engg Regression Analysis/[LATEST] Manish New Data/minMaxScalar.joblib"
minMaxScalar = load(filename)

In [964]:
# MINMAX NORMALIZATION:
# "state" is scaled separately below, so it is taken out of the column list first.
# NOTE: remove() mutates the shared list and the scaling overwrites user_data,
# so this cell must be run exactly once per kernel session.
categorical_cols.remove("state")

categorical_data = user_data.loc[:, categorical_cols]
scaled_categoricals = minMaxScalar.transform(categorical_data)
user_data[categorical_cols] = scaled_categoricals

# Encoding state separately, since we did not have all the 50 states in the data
user_data["state"] = user_data["state"] / 49

In [965]:
# Display the row after min-max scaling of the categorical columns and the state code.
user_data

Unnamed: 0,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,lotSizeSquareFeet,lotTopography,condition,architecturalStyle,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,0.9,0.183673,0.241334,0.215385,30087,33.857828,-84.167465,0.8125,0.494253,0.41,17859.6,0.2,0.0,1.0,1975,1,8,4,2,0.25,1.0,0.0,0.0,2021


In [966]:
# Coerce every column to a numeric dtype; with errors="coerce" any value that
# cannot be parsed becomes NaN instead of raising.
user_data = user_data.apply(pd.to_numeric, errors="coerce")

In [967]:
# Load the saved standard scaler used to normalize the numerical features.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.

filename = "/content/drive/MyDrive/Software Engg Regression Analysis/[LATEST] Manish New Data/standardScalar.joblib"
standardScalar = load(filename)

In [968]:
# Standardize the numerical columns with the loaded scaler; the columns are
# passed in the order given by numerical_cols and overwritten in place.
numerical_data = user_data[numerical_cols]

user_data[numerical_cols] = standardScalar.transform(numerical_data)

In [969]:
# Check the frame shape as (rows, columns) before prediction.
user_data.shape

(1, 24)

In [970]:
# Display the fully scaled feature row that is passed to the model.
user_data

Unnamed: 0,city,state,street,streetSuffix,zip,latitude,longitude,landUseDescription,zoningDescription,lotSizeAcres,lotSizeSquareFeet,lotTopography,condition,architecturalStyle,yearBuilt,totalStories,totalRooms,bedrooms,baths,heating,airConditioning,foundation,fireplace,soldYear
0,0.9,0.183673,0.241334,0.215385,-0.179133,0.06974,-0.11446,0.8125,0.494253,-0.051856,-0.051856,0.2,0.0,1.0,-1.429949,-1.10741,0.325799,0.573313,-1.007367,0.25,1.0,0.0,0.0,0.0


In [971]:
# The prepared row is now fed into the saved Random Forest regression model.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
model_path = "/content/drive/MyDrive/Software Engg Regression Analysis/[LATEST] Manish New Data/rf_model_95_86_randomState27.joblib"
rf_model = load(model_path)

# predict() returns one value per input row; keep the single value, rounded to 3 decimals.
raw_predictions = rf_model.predict(user_data.values)
pred = round(raw_predictions[0], 3)
print("Predicted Housing Price:", pred)

Predicted Housing Price: 200529.766


In [972]:
# Price per square foot: predicted price divided by the raw lotSizeSquareFeet
# value saved before scaling, rounded to 3 decimals.
price_per_sqft_values = pred / lotSizqSqFt
predPerSqFt = round(price_per_sqft_values[0], 3)
print(f"House Price per SqFt: {predPerSqFt}")

House Price per SqFt: 11.228


In [973]:
# Read the demo amortization table (mortgage details per user id) from the mounted Drive.
amortization_table = pd.read_csv("/content/drive/MyDrive/Software Engg Regression Analysis/[LATEST] Manish New Data/demo_amortization_table.csv")

In [974]:
# Preview the first rows of the amortization table.
amortization_table.head()

Unnamed: 0,id,Mortgage Starting Date,House Value,Paid In Cash,Principal Loan Amount,Annual Interest Rate,Total Months Elapsed,Loan Period (Years),Monthly Payment Amount,Total Principal Paid,Total Interest Paid,Remaining Principal
0,108.0,5/13/2011,196100.0,74518.0,"$121,582",4%,126.0,30.0,($580.45),-27374.96705,-45761.86708,94207.03295
1,113.0,2/4/2014,254700.0,28017.0,"$226,683",4%,93.0,30.0,"($1,082.22)",-35540.5563,-65105.83992,191142.4437
2,,,,,,,,,,,,
3,,,,,,,,,,,,


In [975]:
# Select the amortization row(s) of the current user.
# .copy() makes this an independent frame: the following cells add new columns to
# it, and assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
is_current_user = amortization_table["id"] == user_id
user_amortization_data = amortization_table.loc[is_current_user].copy()
user_amortization_data

Unnamed: 0,id,Mortgage Starting Date,House Value,Paid In Cash,Principal Loan Amount,Annual Interest Rate,Total Months Elapsed,Loan Period (Years),Monthly Payment Amount,Total Principal Paid,Total Interest Paid,Remaining Principal
0,108.0,5/13/2011,196100.0,74518.0,"$121,582",4%,126.0,30.0,($580.45),-27374.96705,-45761.86708,94207.03295


In [976]:
# Attach the model's predicted price to the user's amortization row.
user_amortization_data["Predicted House Price"] = pred

In [977]:
# Display the amortization row with the new "Predicted House Price" column.
user_amortization_data.head()

Unnamed: 0,id,Mortgage Starting Date,House Value,Paid In Cash,Principal Loan Amount,Annual Interest Rate,Total Months Elapsed,Loan Period (Years),Monthly Payment Amount,Total Principal Paid,Total Interest Paid,Remaining Principal,Predicted House Price
0,108.0,5/13/2011,196100.0,74518.0,"$121,582",4%,126.0,30.0,($580.45),-27374.96705,-45761.86708,94207.03295,200529.766


In [978]:
# Attach the predicted price per square foot to the user's amortization row.
user_amortization_data["Predicted Square Feet Price"] = predPerSqFt

In [979]:
# Equity = predicted house price minus the principal still owed on the loan.
predicted_price = user_amortization_data["Predicted House Price"]
remaining_principal = user_amortization_data["Remaining Principal"]
user_amortization_data["Equity Value"] = predicted_price - remaining_principal

In [980]:
# Display the amortization row with the predicted price, price per sq ft and equity value.
user_amortization_data

Unnamed: 0,id,Mortgage Starting Date,House Value,Paid In Cash,Principal Loan Amount,Annual Interest Rate,Total Months Elapsed,Loan Period (Years),Monthly Payment Amount,Total Principal Paid,Total Interest Paid,Remaining Principal,Predicted House Price,Predicted Square Feet Price,Equity Value
0,108.0,5/13/2011,196100.0,74518.0,"$121,582",4%,126.0,30.0,($580.45),-27374.96705,-45761.86708,94207.03295,200529.766,11.228,106322.73305


In [981]:
!pip install cryptography



In [982]:
# Encrypting the data to get the cipher text using Fernet Encryption

from cryptography.fernet import Fernet

# Messages to encrypt, as text.  Scalars are taken with .iloc[0] because calling
# int()/float() directly on a one-element Series is deprecated in pandas 2.x.
# The id is stored as `user_id_text` so the builtin `id` is no longer shadowed.
user_id_text = str(int(user_amortization_data["id"].iloc[0]))
house_price = str(float(user_amortization_data["Predicted House Price"].iloc[0]))
house_price_per_sqft = str(float(user_amortization_data["Predicted Square Feet Price"].iloc[0]))
equity_value = str(round(float(user_amortization_data["Equity Value"].iloc[0]), 3))

# Generating the Key
# SECURITY: the key is deliberately NOT printed.  Anyone holding the key can
# decrypt the tokens below, so showing it in the notebook output next to the
# ciphertext would defeat the encryption.  Persist it in a secret store if the
# tokens must be decrypted outside this kernel session.
key = Fernet.generate_key()
fernet = Fernet(key)

print("Encryption key generated (value intentionally not displayed).\n")

# Encrypting the messages
enc_id = fernet.encrypt(user_id_text.encode())
enc_house_price = fernet.encrypt(house_price.encode())
enc_equity_value = fernet.encrypt(equity_value.encode())
enc_house_price_per_sqft = fernet.encrypt(house_price_per_sqft.encode())

print("User ID: ", user_id_text)
print(f"Encrypted User ID: {enc_id}\n")

print("Predicted House Price: ", house_price)
print(f"Encrypted House Price: {enc_house_price}\n")

print("Predicted House Price Per Sqft: ", house_price_per_sqft)
print(f"Encrypted House Price Per Sqft:  {enc_house_price_per_sqft}\n")

print("Predicted Equity Value: ", equity_value)
print(f"Encrypted Equity Value: {enc_equity_value}\n")

Encryption Key: b'dvvYw10XikryZ7lFhkec9M5G8DPJlPxXhvl3gBAoT6g='

User ID:  108
Encrypted User ID: b'gAAAAABhsq3oMsah0aHeNL45dHXx-LRsqJumxIQ-JOSWWwe6hKPSwnK5B2_mEtPXeRlqjgyirMis_pwvIzdguyb8FqJQv3rRLg=='

Predicted House Price:  200529.766
Encrypted House Price: b'gAAAAABhsq3oOUzU6NwUqSlj2PS3TZBeo9D5m8z0QtvhT7rCZBMbNUUl37BqFCB1HVyeshoSWJAhlGlCJZWPH6uF9L_XTGt_uQ=='

Predicted House Price Per Sqft:  11.228
Encrypted House Price Per Sqft:  b'gAAAAABhsq3oIFGEtPDgNkqicel_CKCaOr-GkQOMNxt63o3b0xaK9vrNGFmzwbWqM5TtYsrnFJ1kNKTevJL1w3PRe4QbmGt2Ig=='

Predicted Equity Value:  106322.733
Encrypted Equity Value: b'gAAAAABhsq3oijXnXZRznQ-NlptbLG-i_7rM-J4GgeRQd8g7_l0cy-uUyGIlltfQRJAs_qaQJfCZiYCWhSSKUpOThVvxdHS2Vw=='



In [983]:
# Repeat the zip-level summary for comparison with the predicted price.
# `zip` here is the integer zip code assigned in an earlier cell, not the builtin.
print(f"User {user_id} belongs to the zip {zip}, and the average price for that zip is {zip_average_price[zip]}")

User 108 belongs to the zip 30087, and the average price for that zip is 190259.9042


In [984]:
# Decrypt each token with the same Fernet instance and show the recovered text.
def _decrypt_to_text(token):
    """Return the text recovered from a Fernet token."""
    return fernet.decrypt(token).decode()


dec_id = _decrypt_to_text(enc_id)
dec_house_price = _decrypt_to_text(enc_house_price)
dec_equity_value = _decrypt_to_text(enc_equity_value)
dec_house_price_per_sqft = _decrypt_to_text(enc_house_price_per_sqft)

print(f"Decrypted User ID: {dec_id}\n")
print("Decrypted House Price: ", dec_house_price)
print("Decrypted House Price Per Sqft: ", dec_house_price_per_sqft)
print("Decrypted Equity Value: ", dec_equity_value)


Decrypted User ID: 108

Decrypted House Price:  200529.766
Decrypted House Price Per Sqft:  11.228
Decrypted Equity Value:  106322.733
