In [1]:
from datetime import datetime, date

from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    DateType,
    TimestampType
)

spark = (
    SparkSession
    .builder
    .appName('SQL')
    .master('local[*]')
    .getOrCreate()
)

In [8]:
taxi_data_2024_09 = (
    spark
    .read
    .format("parquet")
    .load('../cubix_de_pyspark/data/yellow_tripdata_2024-09.parquet')
)

taxi_data_2024_09.createOrReplaceTempView("taxi_2024_09")

#### SQL recap

In [None]:
spark.sql("""
select * from taxi_2024_09
""").show(10)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2024-09-01 00:05:51|  2024-09-01 00:45:03|              1|          9.8|         1|                 N|         138|          48|           1|       47.8|10.25|    0.5|      13.

In [None]:
spark.sql("""
SELECT
          VendorID,
          passenger_count,
          total_amount
FROM
          taxi_2024_09
WHERE
          passenger_count > 4
          AND total_amount < 50
ORDER BY
          passenger_count DESC
""").show(10)

+--------+---------------+------------+
|VendorID|passenger_count|total_amount|
+--------+---------------+------------+
|       2|              8|       -93.0|
|       2|              7|       -74.0|
|       2|              7|        40.8|
|       2|              6|       19.68|
|       2|              6|       11.76|
|       2|              6|        20.5|
|       2|              6|       31.08|
|       2|              6|       23.16|
|       2|              6|       21.84|
|       2|              6|        21.2|
+--------+---------------+------------+
only showing top 10 rows



In [None]:
spark.sql("""
    SELECT
          VendorID,
          max(passenger_count) AS max_passenger_count,
          avg(total_amount) AS avg_total_amount
    FROM
          taxi_2024_09
    GROUP BY
          VendorID
    HAVING
          avg_total_amount > 28

""").show(10)

+--------+-------------------+------------------+
|VendorID|max_passenger_count|  avg_total_amount|
+--------+-------------------+------------------+
|       2|                  9|28.754969323884783|
|       6|               NULL| 32.86155963302752|
+--------+-------------------+------------------+



#### Advanced concepts

##### Handling dates

In [None]:
# commas needed after values inside ()
data = [
    ("12/01/2023",),
    ("11/15/2023",),
    ("10/25/2023",),
    ("09/17/2023",)
]

schema = StructType([StructField("date_string", StringType(), True)])

date_df = spark.createDataFrame(data, schema)
date_df.createOrReplaceTempView("dates")

In [None]:
# convert string to DATE to to be able to use date functions

spark.sql("""
    SELECT 
          date_string,
          TO_DATE(date_string, 'MM/dd/yyyy') AS parsed_date
    FROM
          dates
""").show()

+-----------+-----------+
|date_string|parsed_date|
+-----------+-----------+
| 12/01/2023| 2023-12-01|
| 11/15/2023| 2023-11-15|
| 10/25/2023| 2023-10-25|
| 09/17/2023| 2023-09-17|
+-----------+-----------+



In [None]:
# extracting parts from date

spark.sql("""
    SELECT 
          TO_DATE(date_string, 'MM/dd/yyyy') AS parsed_date,
          YEAR(TO_DATE(date_string, 'MM/dd/yyyy')) AS year,
          MONTH(TO_DATE(date_string, 'MM/dd/yyyy')) AS month,
          DAY(TO_DATE(date_string, 'MM/dd/yyyy')) AS day,
          DATE_FORMAT(TO_DATE(date_string, 'MM/dd/yyyy'), 'E') AS weekday_short,
          DATE_FORMAT(TO_DATE(date_string, 'MM/dd/yyyy'), 'EEEE') AS weekday
    FROM
          dates

""").show()

+-----------+----+-----+---+-------------+---------+
|parsed_date|year|month|day|weekday_short|  weekday|
+-----------+----+-----+---+-------------+---------+
| 2023-12-01|2023|   12|  1|          Fri|   Friday|
| 2023-11-15|2023|   11| 15|          Wed|Wednesday|
| 2023-10-25|2023|   10| 25|          Wed|Wednesday|
| 2023-09-17|2023|    9| 17|          Sun|   Sunday|
+-----------+----+-----+---+-------------+---------+



In [None]:
# adding and removing dates

spark.sql("""
    SELECT 
          TO_DATE(date_string, 'MM/dd/yyyy') AS parsed_date,
          DATE_ADD(TO_DATE(date_string, 'MM/dd/yyyy'), 10) AS add_10_days,
          DATE_SUB(TO_DATE(date_string, 'MM/dd/yyyy'), 5) AS subtract_5_days,
          ADD_MONTHS(TO_DATE(date_string, 'MM/dd/yyyy'), 2) AS add_2_months,
          ADD_MONTHS(TO_DATE(date_string, 'MM/dd/yyyy'), -1) AS subtract_1_month,
          ADD_MONTHS(TO_DATE(date_string, 'MM/dd/yyyy'), 12) AS add_1_year
    FROM
          dates

""").show()

+-----------+-----------+---------------+------------+----------------+----------+
|parsed_date|add_10_days|subtract_5_days|add_2_months|subtract_1_month|add_1_year|
+-----------+-----------+---------------+------------+----------------+----------+
| 2023-12-01| 2023-12-11|     2023-11-26|  2024-02-01|      2023-11-01|2024-12-01|
| 2023-11-15| 2023-11-25|     2023-11-10|  2024-01-15|      2023-10-15|2024-11-15|
| 2023-10-25| 2023-11-04|     2023-10-20|  2023-12-25|      2023-09-25|2024-10-25|
| 2023-09-17| 2023-09-27|     2023-09-12|  2023-11-17|      2023-08-17|2024-09-17|
+-----------+-----------+---------------+------------+----------------+----------+



In [None]:
# using BETWEEN and AND

# filtering variable in WHERE must be in date format if we filter dates!
spark.sql("""
    SELECT 
          date_string,
          TO_DATE(date_string, 'MM/dd/yyyy') AS parsed_date
    FROM
          dates
    WHERE
          TO_DATE(date_string, 'MM/dd/yyyy') BETWEEN '2023-11-01' AND '2023-12-31'
""").show()

+-----------+-----------+
|date_string|parsed_date|
+-----------+-----------+
| 12/01/2023| 2023-12-01|
| 11/15/2023| 2023-11-15|
+-----------+-----------+



In [None]:
# difference between dates with CTE

spark.sql("""
    WITH date_diff_example AS (
        SELECT
            TO_DATE(date_string, 'MM/dd/yyyy') AS parsed_date,
            CURRENT_DATE() AS today
        FROM
            dates
    )
    SELECT 
        parsed_date,
        today,
        DATEDIFF(today, parsed_date) AS days_difference
    FROM
        date_diff_example
""").show()

+-----------+----------+---------------+
|parsed_date|     today|days_difference|
+-----------+----------+---------------+
| 2023-12-01|2025-11-26|            726|
| 2023-11-15|2025-11-26|            742|
| 2023-10-25|2025-11-26|            763|
| 2023-09-17|2025-11-26|            801|
+-----------+----------+---------------+



In [None]:
# timestamps

data = [
    (date(2024, 12, 31), datetime(2024, 12, 31, 23, 59, 59)),
    (date(2023, 1, 1), datetime(2023, 1, 1, 10, 15, 30))
]

schema = StructType([
    StructField("date_column", DateType(), True),
    StructField("timestamp_column", TimestampType(), True)
])

df = spark.createDataFrame(data, schema)
df.createOrReplaceTempView("timestamps")

df.show()

+-----------+-------------------+
|date_column|   timestamp_column|
+-----------+-------------------+
| 2024-12-31|2024-12-31 23:59:59|
| 2023-01-01|2023-01-01 10:15:30|
+-----------+-------------------+



In [None]:
# timestamp functions

spark.sql("""
    SELECT 
          timestamp_column,
          TO_DATE(timestamp_column, 'MM/dd/yyyy') AS parsed_date,
          HOUR(timestamp_column) AS hour,
          MINUTE(timestamp_column) AS minute,
          SECOND(timestamp_column) AS second,
          DATE_TRUNC('HOUR', timestamp_column) AS truncated_hour
    FROM
          timestamps
""").show()

+-------------------+-----------+----+------+------+-------------------+
|   timestamp_column|parsed_date|hour|minute|second|     truncated_hour|
+-------------------+-----------+----+------+------+-------------------+
|2024-12-31 23:59:59| 2024-12-31|  23|    59|    59|2024-12-31 23:00:00|
|2023-01-01 10:15:30| 2023-01-01|  10|    15|    30|2023-01-01 10:00:00|
+-------------------+-----------+----+------+------+-------------------+



##### Union/Union all

In [None]:
data1 = [
    (1, "Alice", 25, "F"),
    (2, "Bob", 17, "M"),
    (3, "Catherine", 40, "F")
]

data2 = [
    (3, "Catherine", 40, "F"),
    (4, "David", 15, "M"),
    (5, "Eva", 29, "F")
]

columns = ["id", "name", "age", "gender"]

df1 = spark.createDataFrame(data1, columns)
df2 = spark.createDataFrame(data2, columns)
df1.createOrReplaceTempView("df1")
df2.createOrReplaceTempView("df2")

print("DF1:")
df1.show()

print("DF2:")
df2.show()

DF1:
+---+---------+---+------+
| id|     name|age|gender|
+---+---------+---+------+
|  1|    Alice| 25|     F|
|  2|      Bob| 17|     M|
|  3|Catherine| 40|     F|
+---+---------+---+------+

DF2:
+---+---------+---+------+
| id|     name|age|gender|
+---+---------+---+------+
|  3|Catherine| 40|     F|
|  4|    David| 15|     M|
|  5|      Eva| 29|     F|
+---+---------+---+------+



In [None]:
# number of columns and data types must be the same

spark.sql("""
    SELECT * FROM df1
    UNION
    SELECT * FROM df2
""").show()

+---+---------+---+------+
| id|     name|age|gender|
+---+---------+---+------+
|  1|    Alice| 25|     F|
|  2|      Bob| 17|     M|
|  3|Catherine| 40|     F|
|  4|    David| 15|     M|
|  5|      Eva| 29|     F|
+---+---------+---+------+



In [None]:
spark.sql("""
    SELECT * FROM df1
    UNION ALL
    SELECT * FROM df2
""").show()

+---+---------+---+------+
| id|     name|age|gender|
+---+---------+---+------+
|  1|    Alice| 25|     F|
|  2|      Bob| 17|     M|
|  3|Catherine| 40|     F|
|  3|Catherine| 40|     F|
|  4|    David| 15|     M|
|  5|      Eva| 29|     F|
+---+---------+---+------+



##### Case when

In [None]:
data = [
    (1, "Alice", 25, "F"),
    (2, "Bob", 17, "M"),
    (3, "Catherine", 40, "F"),
    (4, "David", 15, "M"),
    (5, "Eva", 29, "F")
]

columns = ["id", "name", "age", "gender"]

df = spark.createDataFrame(data, columns)
df.createOrReplaceTempView("people")

df.show()

+---+---------+---+------+
| id|     name|age|gender|
+---+---------+---+------+
|  1|    Alice| 25|     F|
|  2|      Bob| 17|     M|
|  3|Catherine| 40|     F|
|  4|    David| 15|     M|
|  5|      Eva| 29|     F|
+---+---------+---+------+



In [None]:
spark.sql("""
   SELECT
          id,
          name,
          age,
          CASE
            WHEN age >= 18 THEN 'Adult'
            ELSE 'Minor'
          END AS age_group
    FROM people
""").show()

+---+---------+---+---------+
| id|     name|age|age_group|
+---+---------+---+---------+
|  1|    Alice| 25|    Adult|
|  2|      Bob| 17|    Minor|
|  3|Catherine| 40|    Adult|
|  4|    David| 15|    Minor|
|  5|      Eva| 29|    Adult|
+---+---------+---+---------+



In [None]:
spark.sql("""
   SELECT
          id,
          name,
          gender,
          CASE
            WHEN gender == 'F' THEN 'Eligible'
            ELSE 'Not eligible'
          END AS eligibility
    FROM people
""").show()

+---+---------+------+------------+
| id|     name|gender| eligibility|
+---+---------+------+------------+
|  1|    Alice|     F|    Eligible|
|  2|      Bob|     M|Not eligible|
|  3|Catherine|     F|    Eligible|
|  4|    David|     M|Not eligible|
|  5|      Eva|     F|    Eligible|
+---+---------+------+------------+



In [None]:
spark.sql("""
   SELECT
          id,
          name,
          age,
          CASE
            WHEN age >= 18 and GENDER == 'F' THEN 'Adult Female'
            WHEN age >= 18 and GENDER == 'M' THEN 'Adult Male'
          ELSE 'Minor'
          END AS category
    FROM people
""").show()

+---+---------+---+------------+
| id|     name|age|    category|
+---+---------+---+------------+
|  1|    Alice| 25|Adult Female|
|  2|      Bob| 17|       Minor|
|  3|Catherine| 40|Adult Female|
|  4|    David| 15|       Minor|
|  5|      Eva| 29|Adult Female|
+---+---------+---+------------+



##### Coalesce

In [None]:
data = [
    (1, "Alice", 25, "F"),
    (2, None, 17, "M"),
    (3, "Catherine", None, "F"),
    (4, "David", 15, "M"),
    (5, "Eva", 29, "F")
]

columns = ["id", "name", "age", "gender"]

df = spark.createDataFrame(data, columns)
df.createOrReplaceTempView("people")

df.show()

+---+---------+----+------+
| id|     name| age|gender|
+---+---------+----+------+
|  1|    Alice|  25|     F|
|  2|     NULL|  17|     M|
|  3|Catherine|NULL|     F|
|  4|    David|  15|     M|
|  5|      Eva|  29|     F|
+---+---------+----+------+



In [None]:
spark.sql("""
   SELECT
          id,
          COALESCE(name, "unknown"),
          COALESCE(age, 0),
          gender
    FROM people
""").show()

+---+-----------------------+----------------+------+
| id|coalesce(name, unknown)|coalesce(age, 0)|gender|
+---+-----------------------+----------------+------+
|  1|                  Alice|              25|     F|
|  2|                unknown|              17|     M|
|  3|              Catherine|               0|     F|
|  4|                  David|              15|     M|
|  5|                    Eva|              29|     F|
+---+-----------------------+----------------+------+



##### Subquery and CTE

In [2]:
employee_data = [
    (1, "Alice", "HR", 55000, "2020-01-01"),
    (2, "Bob", "Engineering", 80000, "2020-01-01"),
    (3, "Charlie", "HR", 60000, "2019-05-15"),
    (4, "David", "Engineering", 95000, "2018-07-01"),
    (5, "Eva", "Marketing", 70000, "2022-09-10"),
    (6, "Frank", "Marketing", 72000, "2021-12-05"),
    (7, "Grace", "Engineering", 85000, "2019-11-23"),
    (8, "Hank", "HR", 63000, "2020-11-11")
]

employee_columns = ["employee_id", "employee_name", "department", "salary", "hire_date"]

employee_df = spark.createDataFrame(employee_data, employee_columns)
employee_df.createOrReplaceTempView("employees")

employee_df.show()

departments_data = [
    (1, "HR", "Human Resources"),
    (2, "Engineering", "Software Engineering"),
    (3, "Marketing", "Marketing Department")
]

departments_columns = ["department_id", "department_code", "department_name"]

departments_df = spark.createDataFrame(departments_data, departments_columns)
departments_df.createOrReplaceTempView("departments")

departments_df.show()

+-----------+-------------+-----------+------+----------+
|employee_id|employee_name| department|salary| hire_date|
+-----------+-------------+-----------+------+----------+
|          1|        Alice|         HR| 55000|2020-01-01|
|          2|          Bob|Engineering| 80000|2020-01-01|
|          3|      Charlie|         HR| 60000|2019-05-15|
|          4|        David|Engineering| 95000|2018-07-01|
|          5|          Eva|  Marketing| 70000|2022-09-10|
|          6|        Frank|  Marketing| 72000|2021-12-05|
|          7|        Grace|Engineering| 85000|2019-11-23|
|          8|         Hank|         HR| 63000|2020-11-11|
+-----------+-------------+-----------+------+----------+

+-------------+---------------+--------------------+
|department_id|department_code|     department_name|
+-------------+---------------+--------------------+
|            1|             HR|     Human Resources|
|            2|    Engineering|Software Engineering|
|            3|      Marketing|Marketi

In [3]:
spark.sql("""
   SELECT
          employee_id,
          employee_name,
          department,
          salary
    FROM 
          employees e
    WHERE
          salary > (
            SELECT AVG(salary)
            FROM employees
            WHERE department = e.department
          )
""").show()

+-----------+-------------+-----------+------+
|employee_id|employee_name| department|salary|
+-----------+-------------+-----------+------+
|          3|      Charlie|         HR| 60000|
|          4|        David|Engineering| 95000|
|          6|        Frank|  Marketing| 72000|
|          8|         Hank|         HR| 63000|
+-----------+-------------+-----------+------+



In [5]:
spark.sql("""
          
    WITH avg_salaries AS (
            SELECT department, AVG(salary) AS avg_salary
            FROM employees
            GROUP BY department
    )
    SELECT
          e.employee_id,
          e.employee_name,
          e.department,
          e.salary
    FROM 
          employees e
    JOIN
          avg_salaries a
          ON e.department = a.department
    WHERE
          e.salary > a.avg_salary
    
""").show()

+-----------+-------------+-----------+------+
|employee_id|employee_name| department|salary|
+-----------+-------------+-----------+------+
|          3|      Charlie|         HR| 60000|
|          4|        David|Engineering| 95000|
|          6|        Frank|  Marketing| 72000|
|          8|         Hank|         HR| 63000|
+-----------+-------------+-----------+------+



In [7]:
spark.sql("""
          
    WITH avg_salaries AS (
            SELECT department, AVG(salary) AS avg_salary
            FROM employees
            GROUP BY department
    ),
    employee_info AS (
            SELECT e.employee_id, e.employee_name, e.department, e.salary
            FROM employees e     
    )
    SELECT
          e.employee_id,
          e.employee_name,
          e.salary,
          d.department_name
    FROM 
          employee_info e
    JOIN
          avg_salaries a
          ON e.department = a.department
    JOIN 
          departments d
          ON e.department = d.department_code
    WHERE
          e.salary > a.avg_salary
    
""").show()

+-----------+-------------+------+--------------------+
|employee_id|employee_name|salary|     department_name|
+-----------+-------------+------+--------------------+
|          4|        David| 95000|Software Engineering|
|          3|      Charlie| 60000|     Human Resources|
|          8|         Hank| 63000|     Human Resources|
|          6|        Frank| 72000|Marketing Department|
+-----------+-------------+------+--------------------+



##### Window functions

In [None]:
# aggregation

spark.sql("""
SELECT
          VendorID,
          fare_amount,
          COUNT(*) OVER (PARTITION BY VendorID) AS trip_count,
          MIN(fare_amount) OVER (PARTITION BY VendorID) AS min_fare,
          MAX(fare_amount) OVER (PARTITION BY VendorID) AS max_fare,
          AVG(fare_amount) OVER (PARTITION BY VendorID) AS avg_fare
FROM
          taxi_2024_09
WHERE
          VendorID = 1
""").show(10)

+--------+-----------+----------+--------+--------+-----------------+
|VendorID|fare_amount|trip_count|min_fare|max_fare|         avg_fare|
+--------+-----------+----------+--------+--------+-----------------+
|       1|       47.8|    827157|     0.0|   788.4|19.46594648899902|
|       1|        5.1|    827157|     0.0|   788.4|19.46594648899902|
|       1|        8.6|    827157|     0.0|   788.4|19.46594648899902|
|       1|       0.01|    827157|     0.0|   788.4|19.46594648899902|
|       1|       44.3|    827157|     0.0|   788.4|19.46594648899902|
|       1|        6.5|    827157|     0.0|   788.4|19.46594648899902|
|       1|       19.1|    827157|     0.0|   788.4|19.46594648899902|
|       1|        3.0|    827157|     0.0|   788.4|19.46594648899902|
|       1|        3.0|    827157|     0.0|   788.4|19.46594648899902|
|       1|        3.0|    827157|     0.0|   788.4|19.46594648899902|
+--------+-----------+----------+--------+--------+-----------------+
only showing top 10 

In [None]:
# ROW_NUMBER
# WHERE clause is to show different VendorIDs, if needed

spark.sql("""
SELECT
          VendorID,
          PULocationID,
          DOLocationID,
          fare_amount,
          ROW_NUMBER() OVER (PARTITION BY VendorID ORDER BY fare_amount DESC) AS row_num
FROM
          taxi_2024_09
ORDER BY
          row_num
""").show(10)

+--------+------------+------------+-----------+-------+
|VendorID|PULocationID|DOLocationID|fare_amount|row_num|
+--------+------------+------------+-----------+-------+
|       1|         132|         265|      788.4|      1|
|       2|         132|         265|     1862.2|      1|
|       6|         124|         223|        5.8|      1|
|       2|          10|          10|      999.0|      2|
|       1|         132|         265|      657.5|      2|
|       6|          85|          77|        5.8|      2|
|       2|         197|         197|      999.0|      3|
|       1|         215|         265|      600.0|      3|
|       6|         244|         223|        5.8|      3|
|       2|          14|          14|      975.0|      4|
+--------+------------+------------+-----------+-------+
only showing top 10 rows



In [12]:
# RANK: assign a rank based on a specific column and partition by another column, 1,2,2,4!,5
# DENSE RANK: doesn't skip duplicates, 1,2,2,3,4

spark.sql("""
SELECT
          VendorID,
          PULocationID,
          DOLocationID,
          fare_amount,
          RANK() OVER (PARTITION BY VendorID ORDER BY fare_amount DESC) AS rank,
          DENSE_RANK() OVER (PARTITION BY VendorID ORDER BY fare_amount DESC) AS dense_rank
FROM
          taxi_2024_09
WHERE
          VendorID = 1
""").show(10)

+--------+------------+------------+-----------+----+----------+
|VendorID|PULocationID|DOLocationID|fare_amount|rank|dense_rank|
+--------+------------+------------+-----------+----+----------+
|       1|         132|         265|      788.4|   1|         1|
|       1|         132|         265|      657.5|   2|         2|
|       1|         215|         265|      600.0|   3|         3|
|       1|         132|         265|      600.0|   3|         3|
|       1|         132|         265|      598.7|   5|         4|
|       1|         145|         145|     500.55|   6|         5|
|       1|          10|         265|      500.0|   7|         6|
|       1|         264|         264|      500.0|   7|         6|
|       1|          82|         265|      499.0|   9|         7|
|       1|         265|         265|      434.0|  10|         8|
+--------+------------+------------+-----------+----+----------+
only showing top 10 rows



In [13]:
# SUM in window functions: calculate cumulative sum of fare_amount by VendorID

spark.sql("""
SELECT
          VendorID,
          fare_amount,
          tpep_pickup_datetime,
          SUM(fare_amount) OVER (PARTITION BY VendorID ORDER BY tpep_pickup_datetime) AS cumulative_fare
FROM
          taxi_2024_09
""").show(10)

+--------+-----------+--------------------+------------------+
|VendorID|fare_amount|tpep_pickup_datetime|   cumulative_fare|
+--------+-----------+--------------------+------------------+
|       1|       24.0| 2024-09-01 00:00:01|              24.0|
|       1|       24.7| 2024-09-01 00:00:04|              48.7|
|       1|       63.9| 2024-09-01 00:00:06|             112.6|
|       1|       12.1| 2024-09-01 00:00:09|             148.0|
|       1|       23.3| 2024-09-01 00:00:09|             148.0|
|       1|       28.9| 2024-09-01 00:00:19|            185.41|
|       1|       8.51| 2024-09-01 00:00:19|            185.41|
|       1|        7.9| 2024-09-01 00:00:22|            193.31|
|       1|       12.1| 2024-09-01 00:00:30|            205.41|
|       1|       10.7| 2024-09-01 00:00:32|216.10999999999999|
+--------+-----------+--------------------+------------------+
only showing top 10 rows



In [14]:
# LEAD and LAG: get the next and previous fare_amount by VendorID
spark.sql("""
SELECT
          VendorID,
          fare_amount,
          LAG(fare_amount) OVER (PARTITION BY VendorID ORDER BY tpep_pickup_datetime) AS prev_fare,
          LEAD(fare_amount) OVER (PARTITION BY VendorID ORDER BY tpep_pickup_datetime) AS next_fare
FROM
          taxi_2024_09
""").show(10)

+--------+-----------+---------+---------+
|VendorID|fare_amount|prev_fare|next_fare|
+--------+-----------+---------+---------+
|       1|       24.0|     NULL|     24.7|
|       1|       24.7|     24.0|     63.9|
|       1|       63.9|     24.7|     12.1|
|       1|       12.1|     63.9|     23.3|
|       1|       23.3|     12.1|     28.9|
|       1|       28.9|     23.3|     8.51|
|       1|       8.51|     28.9|      7.9|
|       1|        7.9|     8.51|     12.1|
|       1|       12.1|      7.9|     10.7|
|       1|       10.7|     12.1|      7.9|
+--------+-----------+---------+---------+
only showing top 10 rows



#### SQLite

In [3]:
# ACID

# poetry add pandas

import pandas as pd
import sqlite3

conn = sqlite3.connect('C:\cubix_de/python_projects\cubix_de_pyspark\src\cubix_de_pyspark\data\imdb_database.db')


In [4]:
# create title.basics and title.ratings tables to our db

title_basics = pd.read_csv('C:/cubix_de/python_projects/cubix_de_pyspark/src/cubix_de_pyspark/data/title.basics.tsv.gz', sep='\t')
title_basics.to_sql('title_basics', conn, if_exists='replace', index=False)

title_ratings = pd.read_csv('C:/cubix_de/python_projects/cubix_de_pyspark/src/cubix_de_pyspark/data/title.ratings.tsv.gz', sep='\t')
title_ratings.to_sql('title_ratings', conn, if_exists='replace', index=False)

  title_basics = pd.read_csv('C:/cubix_de/python_projects/cubix_de_pyspark/src/cubix_de_pyspark/data/title.basics.tsv.gz', sep='\t')


1518645

In [5]:
# trditional way to get the first 10 rows of the title_basics table

cursor = conn.cursor()

cursor.execute("SELECT * FROM title_basics LIMIT 10;")
rows = cursor.fetchall()

for row in rows:
    print(row)

('tt0000001', 'short', 'Carmencita', 'Carmencita', '0', '1894', '\\N', '1', 'Documentary,Short')
('tt0000002', 'short', 'Le clown et ses chiens', 'Le clown et ses chiens', '0', '1892', '\\N', '5', 'Animation,Short')
('tt0000003', 'short', 'Poor Pierrot', 'Pauvre Pierrot', '0', '1892', '\\N', '5', 'Animation,Comedy,Romance')
('tt0000004', 'short', 'Un bon bock', 'Un bon bock', '0', '1892', '\\N', '12', 'Animation,Short')
('tt0000005', 'short', 'Blacksmith Scene', 'Blacksmith Scene', '0', '1893', '\\N', '1', 'Short')
('tt0000006', 'short', 'Chinese Opium Den', 'Chinese Opium Den', '0', '1894', '\\N', '1', 'Short')
('tt0000007', 'short', 'Corbett and Courtney Before the Kinetograph', 'Corbett and Courtney Before the Kinetograph', '0', '1894', '\\N', '1', 'Short,Sport')
('tt0000008', 'short', 'Edison Kinetoscopic Record of a Sneeze', 'Edison Kinetoscopic Record of a Sneeze', '0', '1894', '\\N', '1', 'Documentary,Short')
('tt0000009', 'movie', 'Miss Jerry', 'Miss Jerry', '0', '1894', '\\N',

In [6]:
# pandas to fetch SQL results is not mthe most effective way to deal with large datasets, as the entire result of the query is fethed 
# from the db and loaded into memory, but good for exploration purposes

df = pd.read_sql_query("SELECT * FROM title_basics LIMIT 10;", conn)

df.head(10)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,\N,1,"Documentary,Short"
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,\N,1,"Documentary,Short"


#### Indexes