In [4]:
import pyspark
import pyspark.sql
import io
import pandas as pd
import requests
from pyspark.sql.functions import col, translate, element_at, split, to_date, explode, lower

In [2]:
spark = pyspark.sql.SparkSession.builder \
                    .appName("LocalSparkProject") \
                    .master("local[4]") \
                    .config("spark.jars", "./psql_adapter/postgresql-42.7.4.jar") \
                    .config("spark.executor.memory", "6g") \
                    .getOrCreate()

24/11/11 19:32:27 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.10.229.62 instead (on interface en0)
24/11/11 19:32:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/11/11 19:32:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
def load_data_from_url_as_spark_df(url, **params):
    """Loads data from url. Optional keyword arguments are passed to pandas.read_csv."""
    response = requests.get(url)
    dx = pd.read_csv(io.BytesIO(response.content), **params)  
    return spark.createDataFrame(dx)

In [4]:
listings_url = "https://data.insideairbnb.com/france/ile-de-france/paris/2024-09-06/data/listings.csv.gz"

In [5]:
listings_df = load_data_from_url_as_spark_df(listings_url, sep=',', index_col=0, quotechar='"', compression='gzip')

In [6]:
listings_df.show()

24/11/11 19:33:19 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/11/11 19:33:19 WARN TaskSetManager: Stage 0 contains a task of very large size (41281 KiB). The maximum recommended task size is 1000 KiB.
24/11/11 19:33:24 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 0 (TID 0): Attempting to kill Python Worker
                                                                                

+--------------------+--------------+------------+---------------+--------------------+--------------------+---------------------+--------------------+-------+--------------------+-----------------+----------+-------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-------------------+-------------------------+--------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+-----------------+-----------------+--------------------+---------------+------------+---------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+-----------

## Craete table "paris" in Postgres

In [8]:
p_url = "jdbc:postgresql://localhost:5445/postgres"

properties = {
    "user": "postgres",
    "password": "1234",
    "driver": "org.postgresql.Driver"
}

listings_df.write \
    .format("jdbc") \
    .option("url", p_url) \
    .option("dbtable", "paris") \
    .option("user", properties["user"]) \
    .option("password", properties["password"]) \
    .option("driver", properties["driver"]) \
    .mode("append") \
    .save()

24/11/11 19:35:26 WARN TaskSetManager: Stage 1 contains a task of very large size (41281 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

## Check that the table exists

In [9]:
sample_df = spark.read \
    .format("jdbc") \
    .option("url", p_url) \
    .option("dbtable", "(SELECT * FROM public.paris LIMIT 1) AS temp") \
    .option("user", properties["user"]) \
    .option("password", properties["password"]) \
    .option("driver", properties["driver"]) \
    .load()

# Show the result
sample_df.show()

+--------------------+--------------+------------+---------------+--------------------+--------------------+---------------------+--------------------+---------+--------------------+-----------------+----------+-------------+----------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-------------------+-------------------------+------------------+--------------------+----------------------+-------------+----------------------+----------------------------+--------+---------+-------------+---------------+------------+---------+--------------+--------+----+--------------------+-----+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+---------------------+-----------------+--------------

In [10]:
listings_df.columns

['listing_url',
 'scrape_id',
 'last_scraped',
 'source',
 'name',
 'description',
 'neighborhood_overview',
 'picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'calendar_updated',
 'has_availability',
 'availability_30',
 'av

In [11]:
columns_to_keep = [
    'host_id','host_since','host_is_superhost','latitude','longitude','property_type','room_type','accommodates','bathrooms','bathrooms_text','bedrooms','beds','amenities','price','minimum_nights','maximum_nights', 'number_of_reviews','review_scores_rating','license','instant_bookable','reviews_per_month'
]

base_df = listings_df.select(columns_to_keep)
clean_df = base_df.dropna()
clean_df.show()

24/11/11 19:35:46 WARN TaskSetManager: Stage 3 contains a task of very large size (41281 KiB). The maximum recommended task size is 1000 KiB.
[Stage 3:>                                                          (0 + 1) / 1]

+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------------+--------+----+--------------------+---------+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|host_id|host_since|host_is_superhost|         latitude|        longitude|     property_type|      room_type|accommodates|bathrooms|bathrooms_text|bedrooms|beds|           amenities|    price|minimum_nights|maximum_nights|number_of_reviews|review_scores_rating|             license|instant_bookable|reviews_per_month|
+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------------+--------+----+--------------------+---------+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|   3631|2008-10-14|                f|        

24/11/11 19:35:50 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 6): Attempting to kill Python Worker
                                                                                

In [None]:
fixed_price_df = base_df.withColumn("price", translate(col("price"), "$,", "").cast("float"))

fixed_price_df.show()

24/11/11 19:35:53 WARN TaskSetManager: Stage 4 contains a task of very large size (41281 KiB). The maximum recommended task size is 1000 KiB.
[Stage 4:>                                                          (0 + 1) / 1]

+-------+----------+-----------------+-----------------+-----------------+--------------------+---------------+------------+---------+--------------+--------+----+--------------------+-----+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|host_id|host_since|host_is_superhost|         latitude|        longitude|       property_type|      room_type|accommodates|bathrooms|bathrooms_text|bedrooms|beds|           amenities|price|minimum_nights|maximum_nights|number_of_reviews|review_scores_rating|             license|instant_bookable|reviews_per_month|
+-------+----------+-----------------+-----------------+-----------------+--------------------+---------------+------------+---------+--------------+--------+----+--------------------+-----+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|   3631|2008-10-14|                f|         48.83

24/11/11 19:35:57 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 4 (TID 7): Attempting to kill Python Worker
                                                                                

In [13]:
clean_df = fixed_price_df.dropna()
clean_df.show()

24/11/11 19:35:59 WARN TaskSetManager: Stage 5 contains a task of very large size (41281 KiB). The maximum recommended task size is 1000 KiB.
[Stage 5:>                                                          (0 + 1) / 1]

+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------------+--------+----+--------------------+------+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|host_id|host_since|host_is_superhost|         latitude|        longitude|     property_type|      room_type|accommodates|bathrooms|bathrooms_text|bedrooms|beds|           amenities| price|minimum_nights|maximum_nights|number_of_reviews|review_scores_rating|             license|instant_bookable|reviews_per_month|
+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------------+--------+----+--------------------+------+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|   3631|2008-10-14|                f|         48.83191

24/11/11 19:36:03 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 5 (TID 8): Attempting to kill Python Worker
                                                                                

In [14]:
clean_df = clean_df.coalesce(100) 
clean_df.select("price").describe().show()

24/11/11 19:36:05 WARN TaskSetManager: Stage 6 contains a task of very large size (41282 KiB). The maximum recommended task size is 1000 KiB.


+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|            45916|
|   mean|212.8508363097831|
| stddev|404.3842515189904|
|    min|              8.0|
|    max|          24200.0|
+-------+-----------------+



In [15]:
clean_df \
    .groupBy("minimum_nights").count() \
    .orderBy(col("count").desc(), col("minimum_nights")) \
    .show()

24/11/11 19:36:08 WARN TaskSetManager: Stage 9 contains a task of very large size (41282 KiB). The maximum recommended task size is 1000 KiB.


+--------------+-----+
|minimum_nights|count|
+--------------+-----+
|             2|12533|
|             1|11501|
|             3|10124|
|             4| 3309|
|            30| 2857|
|             5| 2272|
|             7|  831|
|             6|  647|
|           365|  444|
|            90|  292|
|            31|  235|
|            10|  150|
|             8|   81|
|            91|   72|
|            14|   71|
|            15|   70|
|            60|   62|
|            28|   47|
|            20|   45|
|            12|   26|
+--------------+-----+
only showing top 20 rows



In [16]:
min_nights_df = clean_df.filter(col("minimum_nights") <= 90)

min_nights_df.show()

24/11/11 19:36:10 WARN TaskSetManager: Stage 12 contains a task of very large size (41282 KiB). The maximum recommended task size is 1000 KiB.
[Stage 12:>                                                         (0 + 1) / 1]

+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------------+--------+----+--------------------+-----+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|host_id|host_since|host_is_superhost|         latitude|        longitude|     property_type|      room_type|accommodates|bathrooms|bathrooms_text|bedrooms|beds|           amenities|price|minimum_nights|maximum_nights|number_of_reviews|review_scores_rating|             license|instant_bookable|reviews_per_month|
+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------------+--------+----+--------------------+-----+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|   3631|2008-10-14|                f|         48.83191|  

24/11/11 19:36:14 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 12 (TID 19): Attempting to kill Python Worker
                                                                                

In [17]:
min_nights_df.select('bathrooms', 'bathrooms_text').show()

24/11/11 19:36:17 WARN TaskSetManager: Stage 13 contains a task of very large size (41282 KiB). The maximum recommended task size is 1000 KiB.
[Stage 13:>                                                         (0 + 1) / 1]

+---------+--------------+
|bathrooms|bathrooms_text|
+---------+--------------+
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.5|     1.5 baths|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      2.5|     2.5 baths|
|      1.0|        1 bath|
|      1.0|        1 bath|
|      2.0|       2 baths|
+---------+--------------+
only showing top 20 rows



24/11/11 19:36:21 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 13 (TID 20): Attempting to kill Python Worker
                                                                                

In [None]:
bathrooms_df = min_nights_df.withColumn('bathrooms', element_at(split('bathrooms_text', ' '), 1).cast('double')).drop('bathrooms_text')

bathrooms_df.show()

24/11/11 19:36:23 WARN TaskSetManager: Stage 14 contains a task of very large size (41282 KiB). The maximum recommended task size is 1000 KiB.
[Stage 14:>                                                         (0 + 1) / 1]

+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------+----+--------------------+-----+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|host_id|host_since|host_is_superhost|         latitude|        longitude|     property_type|      room_type|accommodates|bathrooms|bedrooms|beds|           amenities|price|minimum_nights|maximum_nights|number_of_reviews|review_scores_rating|             license|instant_bookable|reviews_per_month|
+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------+----+--------------------+-----+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|   3631|2008-10-14|                f|         48.83191|           2.3187|Entire rental unit|Entire hom

24/11/11 19:36:27 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 14 (TID 21): Attempting to kill Python Worker
                                                                                

In [19]:
boolean_df = (
  bathrooms_df
  .withColumn('instant_bookable', col('instant_bookable') == 't')
  .withColumn('host_is_superhost', col('host_is_superhost') == 't')
)

In [None]:
amenities_df = boolean_df.withColumn('amenities', split(translate('amenities', '\\]\\[\\"', ''), ','))


amenities_df \
  .select(explode('amenities')) \
  .withColumn('item', lower('col')) \
  .groupBy('item').count() \
  .sort('count') \
  .show()


24/11/11 19:36:30 WARN TaskSetManager: Stage 15 contains a task of very large size (41282 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------------------+-----+
|                item|count|
+--------------------+-----+
| leibherr refrige...|    1|
| chaleur tournant...|    1|
| plaque de cuisso...|    1|
| kerastase /redke...|    1|
| ru00e9frigerateu...|    1|
| fast wifi u2013 ...|    1|
| speakers harman ...|    1|
| 48 inch hdtv wit...|    1|
| chaipas stainles...|    1|
| molton brown  bo...|    1|
| 27 inch hdtv wit...|    1|
| cabasse eole 4 -...|    1|
| natural conditioner|    1|
|             klorane|    1|
| toshiba sound sy...|    1|
| create sound sys...|    1|
| petit savon body...|    1|
| fast wifi u2013 ...|    1|
| distributeur ron...|    1|
| savon en bouteil...|    1|
+--------------------+-----+
only showing top 20 rows



## Ultimate function which combines all previous transformation

In [21]:
def prepare_paris_listings(raw_df):

  columns_to_keep = [
      'host_id','host_since','host_is_superhost','latitude','longitude','property_type','room_type','accommodates','bathrooms','bathrooms_text','bedrooms','beds','amenities','price','minimum_nights','maximum_nights', 'number_of_reviews','review_scores_rating','license','instant_bookable','reviews_per_month'
  ]
  
  df = (
    raw_df.select(columns_to_keep)
    .withColumn("price", translate(col("price"), "$,", "").cast("float"))
    .dropna()
    .filter(col("price") <= 10000)
    .filter(col("minimum_nights") <= 90)
    .withColumn('bathrooms', element_at(split('bathrooms_text', ' '), 1).cast('double')).drop('bathrooms_text')
    .withColumn('host_since', to_date('host_since'))
    .withColumn('instant_bookable', col('instant_bookable') == 't')
    .withColumn('host_is_superhost', col('host_is_superhost') == 't')
    .withColumn('amenities', split(translate('amenities', '\\]\\[\\"', ''), ','))
  )

  return df 

df = prepare_paris_listings(listings_df)
df.show()

24/11/11 19:36:33 WARN TaskSetManager: Stage 18 contains a task of very large size (41281 KiB). The maximum recommended task size is 1000 KiB.
[Stage 18:>                                                         (0 + 1) / 1]

+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------+----+--------------------+-----+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|host_id|host_since|host_is_superhost|         latitude|        longitude|     property_type|      room_type|accommodates|bathrooms|bedrooms|beds|           amenities|price|minimum_nights|maximum_nights|number_of_reviews|review_scores_rating|             license|instant_bookable|reviews_per_month|
+-------+----------+-----------------+-----------------+-----------------+------------------+---------------+------------+---------+--------+----+--------------------+-----+--------------+--------------+-----------------+--------------------+--------------------+----------------+-----------------+
|   3631|2008-10-14|            false|         48.83191|           2.3187|Entire rental unit|Entire hom

24/11/11 19:36:37 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 18 (TID 27): Attempting to kill Python Worker
                                                                                

24/11/11 20:12:15 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 386928 ms exceeds timeout 120000 ms
24/11/11 20:12:15 WARN SparkContext: Killing executors is not supported by current scheduler.
24/11/11 20:12:18 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$