In [1]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.1'
spark_version = 'spark-3.5.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,376 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,128 kB]
Get:13 http://archive.ubuntu.com/ubuntu jamm

In [2]:
 # Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [44]:
 # Read in real_estate_sales.csv data from the S3 Bucket realestateproj
from pyspark import SparkFiles
url ="https://realestateproj.s3.ap-southeast-2.amazonaws.com/real-estate-sales.csv"
spark.sparkContext.addFile(url)
df_real_estate_sales = spark.read.csv(SparkFiles.get("real-estate-sales.csv"), header=True, inferSchema=True)

# Show DataFrame
df_real_estate_sales.show(3)

+-------------+---------+-------------+-------+---------------+--------------+-----------+-----------+-------------+----------------+------------+----------------+-----------+--------------------+
|Serial Number|List Year|Date Recorded|   Town|        Address|Assessed Value|Sale Amount|Sales Ratio|Property Type|Residential Type|Non Use Code|Assessor Remarks|OPM remarks|            Location|
+-------------+---------+-------------+-------+---------------+--------------+-----------+-----------+-------------+----------------+------------+----------------+-----------+--------------------+
|      2020348|     2020|   09/13/2021|Ansonia|230 WAKELEE AVE|      150500.0|   325000.0|      0.463|   Commercial|            NULL|        NULL|            NULL|       NULL|                NULL|
|        20002|     2020|   10/02/2020|Ashford|390 TURNPIKE RD|      253000.0|   430000.0|     0.5883|  Residential|   Single Family|        NULL|            NULL|       NULL|                NULL|
|       210317|

In [45]:
# Drop rows with missing values
df_real_estate_sales = df_real_estate_sales.dropna()

# Select relevant columns
df_real_estate_sales = df_real_estate_sales.select(
    'List Year', 'Assessed Value', 'Sale Amount', 'Sales Ratio', 'Town', 'Property Type', 'Residential Type', 'Non Use Code'
)

# Show the cleaned DataFrame
df_real_estate_sales.show(3)

+---------+--------------+-----------+-----------+---------+-------------+----------------+------------+
|List Year|Assessed Value|Sale Amount|Sales Ratio|     Town|Property Type|Residential Type|Non Use Code|
+---------+--------------+-----------+-----------+---------+-------------+----------------+------------+
|     2021|      172300.0|   240000.0|0.717916667|   Berlin|  Residential|   Single Family|  25 - Other|
|     2021|      264320.0|   334000.0|0.791377246|Fairfield|  Residential|   Single Family|  25 - Other|
|     2021|      146600.0|   285000.0|0.514385965|  Danbury|  Residential|           Condo|  25 - Other|
+---------+--------------+-----------+-----------+---------+-------------+----------------+------------+
only showing top 3 rows



In [63]:
# Convert Spark DataFrame to Pandas DataFrame
df_real_estate_sales_pandas = df_real_estate_sales.toPandas()

# Print the first few rows of the Pandas DataFrame
df_real_estate_sales_pandas.head(10)

Unnamed: 0,List Year,Assessed Value,Sale Amount,Sales Ratio,Town,Property Type,Residential Type,Non Use Code
0,2021,172300.0,240000.0,0.717917,Berlin,Residential,Single Family,25 - Other
1,2021,264320.0,334000.0,0.791377,Fairfield,Residential,Single Family,25 - Other
2,2021,146600.0,285000.0,0.514386,Danbury,Residential,Condo,25 - Other
3,2021,151600.0,350000.0,0.433143,Danbury,Residential,Condo,25 - Other
4,2021,130600.0,75000.0,1.741333,East Granby,Residential,Two Family,07 - Change in Property
5,2021,88800.0,232000.0,0.382759,Ansonia,Residential,Single Family,07 - Change in Property
6,2021,131840.0,210000.0,0.62781,Litchfield,Residential,Single Family,25 - Other
7,2021,159700.0,335000.0,0.476716,Berlin,Residential,Single Family,25 - Other
8,2020,121600.0,146216.0,0.831646,Danbury,Residential,Single Family,25 - Other
9,2020,263600.0,415000.0,0.635181,Danbury,Residential,Single Family,25 - Other


In [66]:
# Check the data type of panda df
df_real_estate_sales_pandas.dtypes

# Generate categorical variable list if data type is object
categorical_variables = df_real_estate_sales_pandas.dtypes[df_real_estate_sales_pandas.dtypes == 'object'].index.tolist()
df_real_estate_sales_pandas[categorical_variables].nunique()

Town                87
Property Type        6
Residential Type     5
Non Use Code        12
dtype: int64

In [68]:
# Create a OneHotEncoder instance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import pandas as pd

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# fit and transform the onehotencoder using the category variable list
encode_df = pd.DataFrame(enc.fit_transform(df_real_estate_sales_pandas[categorical_variables]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(categorical_variables)
encode_df.head()



Unnamed: 0,Town_Ansonia,Town_Ashford,Town_Avon,Town_Barkhamsted,Town_Berlin,Town_Bethel,Town_Bridgeport,Town_Bristol,Town_Brookfield,Town_Burlington,...,Non Use Code_07 - Change in Property,Non Use Code_08 - Part Interest,Non Use Code_14 - Foreclosure,Non Use Code_17 - Two Towns,Non Use Code_22 - Money and Personal Property,Non Use Code_24 - Plottage,Non Use Code_25 - Other,Non Use Code_26 - Rehabilitation Deferred,Non Use Code_28 - Use Assessment,Non Use Code_30 - Auction
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
# Merge one-hot encoded features and drop the originals
df_real_estate_sales_df = df_real_estate_sales_pandas.merge(encode_df,left_index=True,right_index=True)
df_real_estate_sales_df = df_real_estate_sales_df.drop(columns = categorical_variables)
df_real_estate_sales_df

Unnamed: 0,List Year,Assessed Value,Sale Amount,Sales Ratio,Town_Ansonia,Town_Ashford,Town_Avon,Town_Barkhamsted,Town_Berlin,Town_Bethel,...,Non Use Code_07 - Change in Property,Non Use Code_08 - Part Interest,Non Use Code_14 - Foreclosure,Non Use Code_17 - Two Towns,Non Use Code_22 - Money and Personal Property,Non Use Code_24 - Plottage,Non Use Code_25 - Other,Non Use Code_26 - Rehabilitation Deferred,Non Use Code_28 - Use Assessment,Non Use Code_30 - Auction
0,2021,172300.0,240000.0,0.717917,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2021,264320.0,334000.0,0.791377,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2021,146600.0,285000.0,0.514386,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2021,151600.0,350000.0,0.433143,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2021,130600.0,75000.0,1.741333,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,2019,73640.0,68000.0,1.082941,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
382,2019,69790.0,35000.0,1.994000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
383,2019,68200.0,157000.0,0.434400,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
384,2019,398690.0,465000.0,0.857398,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [74]:
# Define the target variable
target = 'Sale Amount'

# Split the data into features and target
X = df_real_estate_sales_df.drop(columns=[target])
y = df_real_estate_sales_df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data
X_scaler = scaler.fit(X_train)

# Transform the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [82]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the neural network model
model = Sequential()

# Add input layer and first hidden layer
model.add(Dense(units=128, input_dim=X_train_scaled.shape[1], activation='relu'))

# Add a second hidden layer
model.add(Dense(units=64, activation='relu'))

# Add the output layer
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [83]:
# Evaluate the model on the test data
test_loss, test_mae = model.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

Test Loss: 140813041664.0, Test MAE: 268702.90625
