In [1]:
!pip install --upgrade duckdb pandas

Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.0.3
    Uninstalling pandas-2.0.3:
      Successfully uninstalled pandas-2.0.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.0.3, but you have pandas 2.2.2 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.2.2


In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=1bd35d82bd9cd2f5e469cf67ed1c198313768d0be7d0b4a8a2f5e554d09087ca
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
# The duckdb Python package should be at version 0.10.1 so that the database
# can be loaded without any problem; check which version is installed now.
!pip show duckdb

Name: duckdb
Version: 0.10.2
Summary: DuckDB in-process database
Home-page: https://www.duckdb.org
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: malloy


In [4]:
!pip install -U duckdb==0.10.1

Collecting duckdb==0.10.1
  Downloading duckdb-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: duckdb
  Attempting uninstall: duckdb
    Found existing installation: duckdb 0.10.2
    Uninstalling duckdb-0.10.2:
      Successfully uninstalled duckdb-0.10.2
Successfully installed duckdb-0.10.1


In [5]:
# Download the DuckDB JDBC driver (0.10.1, the same version as the duckdb
# Python package pinned above) and save it as duckdb.jar, the file name the
# Spark session is configured to load through "spark.jars".
!wget -O "duckdb.jar" "https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar"

--2024-04-25 17:01:43--  https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64009472 (61M) [application/java-archive]
Saving to: ‘duckdb.jar’


2024-04-25 17:01:44 (143 MB/s) - ‘duckdb.jar’ saved [64009472/64009472]



## Chatbot class

In this data analysis pipeline, we will create a chatbot that will help us find an apartment based on the user's preferences. The chatbot will ask the user a series of questions to understand the user's preferences and then recommend an apartment based on the user's answers.

The questions will be:

- Are you traveling on weekdays or weekends?
- How many people are you?
- What is the maximum price you want?
- What types of rooms are you interested in?
- How many rooms do you want? (only asked when an entire home/apartment is chosen)
- What temperature do you want?
- How important is it for you that it doesn't rain?

Then, the chatbot will recommend up to ten apartments that match the user's answers, in any city, sorted by price.

In [6]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session with the DuckDB JDBC driver jar on its
# classpath, so that JDBC reads against the DuckDB database can find the driver.
spark = (
    SparkSession.builder
    .config("spark.jars", "duckdb.jar")
    .getOrCreate()
)

In [94]:
class Chatbot:
    """Console assistant that recommends apartments from the DuckDB database.

    On construction the chatbot queries the ``airbnb`` and ``weather`` tables
    (through the module-level ``spark`` session and the DuckDB JDBC driver) to
    learn the valid answer ranges. :meth:`ask` then interviews the user on
    stdin and :meth:`find_apartment` prints the cheapest matching apartments.

    Attributes:
        room_type_list: Distinct ``room_type`` values found in ``airbnb``.
        min_temp, max_temp: Range of ``avg_temperature_2m`` on
            ``TEMP_RANGE_DATE``, rounded to one decimal.
        min_sum, max_sum: Range of ``realSum`` in ``airbnb``, rounded to one
            decimal.
        min_pers, max_pers: Range of ``person_capacity`` in ``airbnb``.
    """

    # JDBC connection settings shared by every query.
    JDBC_URL = "jdbc:duckdb:exploitation_database.duckdb"
    JDBC_DRIVER = "org.duckdb.DuckDBDriver"

    # NOTE(review): the temperature range offered to the user is computed for
    # TEMP_RANGE_DATE while apartments are searched against SEARCH_DATE. The
    # two dates differ in the original code; confirm whether that is intended.
    TEMP_RANGE_DATE = "2024-03-21"
    SEARCH_DATE = "2024-03-19"

    # Apartments are matched when the temperature is within +/- this many
    # degrees of the requested one.
    TEMP_TOLERANCE = 2
    # Maximum number of apartments printed by find_apartment.
    MAX_RESULTS = 10
    # Upper bound on w.total_precipitation for each answer to the rain
    # question (1000 is presumably large enough to act as "no limit").
    RAIN_LIMITS = {"Important": 0, "Not important": 1000}

    def __init__(self):
        self.room_type_list = None
        self.min_temp = None
        self.max_temp = None
        self.min_sum = None
        self.max_sum = None
        self.min_pers = None
        self.max_pers = None
        self.initialize_parameters()

    def _run_query(self, query):
        """Run ``query`` on the DuckDB database and return a Spark DataFrame."""
        return (
            spark.read
            .format("jdbc")
            .option("url", self.JDBC_URL)
            .option("driver", self.JDBC_DRIVER)
            .option("query", query)
            .load()
        )

    def _min_max(self, query):
        """Return the ``(min, max)`` columns of the single row ``query`` yields."""
        # Fetch the row once instead of running the query once per column.
        row = self._run_query(query).first()
        return row["min"], row["max"]

    @staticmethod
    def _sql_text(value):
        """Escape ``value`` for use inside a single-quoted SQL string literal."""
        return str(value).replace("'", "''")

    @staticmethod
    def _prompt_choice(prompt, choices):
        """Keep asking ``prompt`` until the answer is one of ``choices``."""
        while True:
            answer = input(prompt).strip()
            if answer in choices:
                return answer

    @staticmethod
    def _prompt_number(prompt, low, high, cast=float):
        """Keep asking ``prompt`` until the answer parses and is in [low, high].

        ``cast`` (``int`` or ``float``) converts the typed text; an answer that
        cannot be converted is treated like an out-of-range one and the
        question is simply asked again instead of raising ``ValueError``.
        """
        while True:
            try:
                value = cast(input(prompt).strip())
            except ValueError:
                continue
            if low <= value <= high:
                return value

    def initialize_parameters(self):
        """
        Function that gets the min and max amount for some variables and the different types of rooms.
        """
        room_types = self._run_query("""
        SELECT DISTINCT
          a.room_type
        FROM
          airbnb a
        """)
        self.room_type_list = [row["room_type"] for row in room_types.collect()]

        low, high = self._min_max(f"""
        SELECT
          min(w.avg_temperature_2m) AS min, max(w.avg_temperature_2m) AS max
        FROM
          weather w
        WHERE
          w.date = '{self.TEMP_RANGE_DATE}'
        """)
        self.min_temp = round(low, 1)
        self.max_temp = round(high, 1)

        low, high = self._min_max("""
        SELECT
          min(a.realSum) AS min, max(a.realSum) AS max
        FROM
          airbnb a
        """)
        self.min_sum = round(low, 1)
        self.max_sum = round(high, 1)

        low, high = self._min_max("""
        SELECT
          min(a.person_capacity) AS min, max(a.person_capacity) AS max
        FROM
          airbnb a
        """)
        self.min_pers = int(low)
        self.max_pers = int(high)

    def ask(self):
        """Interview the user on stdin and print the matching apartments.

        Every question is repeated until a valid answer is given. The number
        of rooms is only asked when the room type is "Entire home/apt";
        otherwise ``rooms`` is passed on as ``None``.
        """
        print("-------------------------------------------------")
        print("---------------------CHATBOT---------------------")
        print("--------We will find your ideal apartment--------")
        print("-------------------------------------------------")

        print()

        day_type = self._prompt_choice(
            "Are you traveling on Weekdays or Weekends? ",
            ["Weekdays", "Weekends"],
        )

        people = self._prompt_number(
            f"How many people are you? ({self.min_pers} - {self.max_pers}): ",
            self.min_pers, self.max_pers, int,
        )

        # Price and temperature bounds are decimals, so decimals are accepted.
        price = self._prompt_number(
            f"What is the maximum price you want? ({self.min_sum}€ - {self.max_sum}€): ",
            self.min_sum, self.max_sum, float,
        )

        str_room_types = ", ".join(self.room_type_list)
        room_type = self._prompt_choice(
            f"What types of rooms are you interested in? ({str_room_types}): ",
            self.room_type_list,
        )

        rooms = None
        if room_type == "Entire home/apt":
            rooms = self._prompt_number("How many rooms do you want? (1 - 4): ", 1, 4, int)

        temp = self._prompt_number(
            f"Which temperature you want? ({self.min_temp}º - {self.max_temp}º): ",
            self.min_temp, self.max_temp, float,
        )

        rain_answer = self._prompt_choice(
            "How important is it for you that it doesn't rain? (Important, Not important): ",
            list(self.RAIN_LIMITS),
        )
        rain_importance = self.RAIN_LIMITS[rain_answer]

        self.find_apartment(people, day_type, price, room_type, temp, rain_importance, rooms)

    def find_apartment(self, people, day_type, price, room_type, temp, rain_importance, rooms):
        """Print up to ``MAX_RESULTS`` matching apartments, cheapest first.

        Args:
            people: Minimum ``person_capacity`` required.
            day_type: Value matched against ``a.day_type``.
            price: Maximum ``realSum`` accepted.
            room_type: Value matched against ``a.room_type``.
            temp: Desired temperature; apartments whose weather on
                ``SEARCH_DATE`` is within ``TEMP_TOLERANCE`` degrees match.
            rain_importance: Maximum ``total_precipitation`` accepted.
            rooms: Minimum number of bedrooms, or a falsy value to skip the
                bedroom filter.

        The numeric arguments are interpolated into the SQL text as-is and
        must therefore be real numbers (``ask`` guarantees this); the two text
        arguments are escaped before being quoted.
        """
        query = f"""SELECT *
        FROM
            airbnb a, weather w
        WHERE
            a.latitude_w = w.latitude AND
            a.longitude_w = w.longitude AND
            a.person_capacity >= '{people}' AND
            a.day_type = '{self._sql_text(day_type)}' AND
            a.realSum <= {price} AND
            a.room_type = '{self._sql_text(room_type)}' AND
            w.avg_temperature_2m >= {temp - self.TEMP_TOLERANCE} AND
            w.avg_temperature_2m <= {temp + self.TEMP_TOLERANCE} AND
            w.date = '{self.SEARCH_DATE}' AND
            w.total_precipitation <= {rain_importance}
        """
        if rooms:
            query += f"AND bedrooms >= '{rooms}' "

        apartments = (
            self._run_query(query)
            .sort("realSum")
            .limit(self.MAX_RESULTS)
            .collect()
        )

        print()
        print()
        if not apartments:
            # Without this the user would get no feedback at all.
            print("No apartments match your preferences. Try relaxing some of them.")
            return
        for fila in apartments:
            print(f"The apartment {fila['id']} with coordinates {round(fila['latitude_w'],4)} lat and {round(fila['longitude_w'],4)} lon, in the city {fila['city']}, has the sky: {fila['estat_cel']}")


## Try the chatbot

In [95]:
# Build the chatbot; the constructor immediately queries the database to load
# the room types and the valid ranges shown in the questions.
chatbot = Chatbot()

In [96]:
# Start the interactive interview; the answers are read from the keyboard and
# the matching apartments (at most ten, sorted by price) are printed.
chatbot.ask()

-------------------------------------------------
---------------------CHATBOT---------------------
--------We will find your ideal apartment--------
-------------------------------------------------

Are you traveling on Weekdays or Weekends? Weekends
How many people are you? (2 - 6): 3
What is the maximum price you want (34.8€ - 18545.5€): 4000
What types of rooms are you interested in? (Entire home/apt, Private room, Shared room): Private room
Which temperature you want? (8.6º - 15.1º): 13
How important is it for you that it doesn't rain? (Important, Not important): Important


The apartment 41384 with coordinates 38.7625 lat and -9.124 lon, in the city Lisbon, has the sky: Nublado
The apartment 43519 with coordinates 51.4527 lat and -0.0478 lon, in the city London, has the sky: Nublado
The apartment 10142 with coordinates 51.4696 lat and -0.0478 lon, in the city London, has the sky: Nublado
The apartment 33908 with coordinates 51.4443 lat and -0.0478 lon, in the city London, has th