# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>

#### <center> **Final Project: Batch Processing** </center>
---

**Date**: October, 2025

**Student Name**:

**Professor**: Pablo Camarillo Ramirez

# Introduction

OrangeStone wishes to enter the House Market, and has access to a database that is updated monthly with information pertaining to global house purchases. They want to use the most of these information to make decisions on the real state they buy. As such, they decided to transform their raw data into a star model for ease of analysis.

# Dataset

The dataset they have access to is https://www.kaggle.com/datasets/mohankrishnathalla/global-house-purchase-decision-dataset


# Transformations and Actions

## Transformations

In [24]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ML: Logistic Regression") \
    .master("spark://spark-master:7077") \
    .config("spark.jars", "/opt/spark/work-dir/jars/postgresql-42.7.8.jar") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

# Optimization (reduce the number of shuffle partitions)
spark.conf.set("spark.sql.shuffle.partitions", "5")

25/10/27 02:07:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
from regalado_floriano.spark_utils import SparkUtils 

In [3]:
houses_schema = SparkUtils.generate_schema(
 
    (("property_id","int"),
("country","string"),
("city","string"),
("property_type","string"),
("furnishing_status","string"),
("property_size_sqft","int"),
("price","int"),
("constructed_year","int"),
("previous_owners","int"),
("rooms","int"),
("bathrooms","int"),
("garage","bool"),
("garden","bool"),
("crime_cases_reported","int"),
("legal_cases_on_property","bool"),
("customer_salary","int"),
("loan_amount","int"),
("loan_tenure_years","int"),
("monthly_expenses","int"),
("down_payment","int"),
("emi_to_income_ratio","float"),
("satisfaction_score","int"),
("neighbourhood_rating","int"),
("connectivity_score","int"),
("decision","bool")
    )
)

In [4]:
house_df = spark.read \
                .option("header", "true") \
                .schema(houses_schema) \
                .csv("/opt/spark/work-dir/data/house_purchases")
 

In [5]:
house_df= house_df.na.fill(False)
 

In [6]:
#in order to prepare for ease of analysis, all categorical data will be put in it's own frame. That is to say, each country and city will get 
# their own id

In [7]:
from pyspark.sql.functions import col
from pyspark.sql.functions import monotonically_increasing_id

In [8]:
categories = "country city property_type furnishing_status".split()

In [9]:
_localMap = SparkUtils.generate_keyed_distinct_column(house_df)

categoricalTables =   {
     key:     _localMap(key) for key in categories
 }

In [10]:
id_house = house_df
for cat in categories:
    cur_df = categoricalTables[cat]
    id_house = SparkUtils.replace_column_for_key(id_house)(cur_df)(cat)

    

In [11]:
id_bak = id_house

In [12]:
locations = id_house.select( "country_id", "city_id").distinct().withColumn("id",monotonically_increasing_id())

In [13]:
id_house = id_house.join(
    locations ,
    on=[id_house["country_id"] == locations["country_id"],
        id_house["city_id"] == locations["city_id"]],
    how="left"
).drop("country_id", "city_id").withColumnRenamed("id", "location_id")


25/10/27 01:56:36 WARN Column: Constructing trivially true equals predicate, 'm.country_id == m.country_id'. Perhaps you need to use aliases.
25/10/27 01:56:36 WARN Column: Constructing trivially true equals predicate, 'm.city_id == m.city_id'. Perhaps you need to use aliases.


In [14]:
house_details = id_house.select("property_id","legal_cases_on_property", 
                                "previous_owners","rooms","bathrooms", "garage","garden","crime_cases_reported","legal_cases_on_property",
                               "neighbourhood_rating", "satisfaction_score", "property_size_sqft", "price"
                               ,"constructed_year", "furnishing_status_id", "property_type_id"
                               )



In [15]:
loan_details = house_df.select( 
            "property_id",
            "loan_amount","loan_tenure_years","down_payment"
                              ) 

In [16]:
buyer_details = id_house.select(
    "property_id", "customer_salary","emi_to_income_ratio", "monthly_expenses", "connectivity_score"
                               )



In [17]:
houses_k =  id_house.select("property_id", "decision", "location_id" )

In [22]:
all_details = [house_details , loan_details, buyer_details]

## Actions

In [18]:
houses_k.show()

                                                                                

+-----------+--------+-----------+
|property_id|decision|location_id|
+-----------+--------+-----------+
|          1|   false|         22|
|          2|   false|          3|
|          3|   false|          4|
|          4|   false|          5|
|          5|   false|          4|
|          6|   false|          0|
|          7|   false|         23|
|          8|   false|         14|
|          9|   false|          1|
|         10|   false|          6|
|         11|   false|         23|
|         12|   false|         23|
|         13|   false|          5|
|         14|   false|          3|
|         15|   false|          7|
|         16|   false|         24|
|         17|   false|         15|
|         18|   false|         25|
|         19|   false|          8|
|         20|   false|         32|
+-----------+--------+-----------+
only showing top 20 rows


In [19]:
locations.show()

                                                                                

+----------+-------+---+
|country_id|city_id| id|
+----------+-------+---+
|         4|      9|  0|
|        10|     33|  1|
|        12|     17|  2|
|         9|      0|  3|
|         9|     28|  4|
|         6|     31|  5|
|         5|     10|  6|
|         8|      2|  7|
|        11|     34|  8|
|         1|     35|  9|
|        12|      4| 10|
|         1|     22| 11|
|         0|      7| 12|
|        12|     39| 13|
|         7|     32| 14|
|         1|     19| 15|
|         4|     36| 16|
|         3|     21| 17|
|        11|     37| 18|
|         1|      5| 19|
+----------+-------+---+
only showing top 20 rows


In [21]:
for table in categoricalTables:
    categoricalTables[table].show()

+------------+---+
|     country| id|
+------------+---+
|      France|  0|
|         USA|  1|
|   Singapore|  2|
|       Japan|  3|
|      Canada|  4|
|   Australia|  5|
|     Germany|  6|
|      Brazil|  7|
|          UK|  8|
|South Africa|  9|
|         UAE| 10|
|       China| 11|
|       India| 12|
+------------+---+

+--------------+---+
|          city| id|
+--------------+---+
|     Cape Town|  0|
|Rio de Janeiro|  1|
|        London|  2|
|     Singapore|  3|
|       Chennai|  4|
|       Chicago|  5|
|      Brisbane|  6|
|         Paris|  7|
|     Bangalore|  8|
|      Montreal|  9|
|     Melbourne| 10|
|     Liverpool| 11|
|       Beijing| 12|
|         Tokyo| 13|
|    Birmingham| 14|
|        Munich| 15|
|        Sydney| 16|
|        Mumbai| 17|
|     Hyderabad| 18|
| San Francisco| 19|
+--------------+---+
only showing top 20 rows
+-----------------+---+
|    property_type| id|
+-----------------+---+
|           Studio|  0|
|        Apartment|  1|
|        Townhouse|  2|
|  

In [23]:
for det in all_details:
    det.show()

+-----------+-----------------------+---------------+-----+---------+------+------+--------------------+-----------------------+--------------------+------------------+------------------+-------+----------------+--------------------+----------------+
|property_id|legal_cases_on_property|previous_owners|rooms|bathrooms|garage|garden|crime_cases_reported|legal_cases_on_property|neighbourhood_rating|satisfaction_score|property_size_sqft|  price|constructed_year|furnishing_status_id|property_type_id|
+-----------+-----------------------+---------------+-----+---------+------+------+--------------------+-----------------------+--------------------+------------------+------------------+-------+----------------+--------------------+----------------+
|          1|                  false|              6|    6|        2| false| false|                   1|                  false|                   5|                 1|               991| 412935|            1989|                   1|              

# Persistence Data

In [25]:
jdbc_url = "jdbc:postgresql://postgres-iteso:5432/postgres"

In [None]:
houses_k

# DAG