<a href="https://colab.research.google.com/github/luismiguelmartinluengo/PySpark_Demos/blob/main/Basicos_PySpark_Sql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import lower, upper, avg, min, max, sum

In [3]:
sparkSession = SparkSession.builder.appName('Basicos Datafrae').getOrCreate()

In [9]:
sales = sparkSession.read.csv ('/content/drive/MyDrive/Colab Notebooks/data/sales_data.csv',
                            sep = ',',
                            header = True)
sales.show(5)

+--------------+----------+---------------+-------------+--------------+----------------+-------------+--------------+
|transaction_id|product_id|   product_name|quantity_sold|price_per_unit|transaction_date|       region|payment_method|
+--------------+----------+---------------+-------------+--------------+----------------+-------------+--------------+
|             1|      1002|         Laptop|            1|        754.47|      2024-02-15|       Europe|    Debit Card|
|             2|      1003|     Headphones|            2|        692.86|      2023-01-01|       Africa|   Credit Card|
|             3|      1010|Fitness Tracker|            7|         80.19|      2022-10-26|North America|    Debit Card|
|             4|      1004|         Tablet|            9|        621.92|      2024-05-04|       Africa|    Debit Card|
|             5|      1009| Gaming Console|            7|        259.42|      2023-05-08|       Africa|        PayPal|
+--------------+----------+---------------+-----

In [10]:
sales.createOrReplaceTempView('sales')

In [11]:
sqlDistinctRegions = sparkSession.sql('SELECT DISTINCT region FROM sales')
sqlDistinctRegions.show()

+-------------+
|       region|
+-------------+
|       Europe|
|       Africa|
|North America|
|South America|
|         Asia|
+-------------+



In [12]:
dfAgg = sales.groupBy('product_name').agg({'price_per_unit':'avg',
                                        'price_per_unit':'max',
                                        'price_per_unit':'min',
                                        'price_per_unit':'sum'})
dfAgg.show() #Solo muestra la última agregación
dfAgg = sales.groupBy('product_name').agg(avg('price_per_unit').alias('Avg_price_per_unit'),
                                       min('price_per_unit').alias('Min_price_per_unit'),
                                       max('price_per_unit').alias('Max_price_per_unit'),
                                       sum('price_per_unit').alias('Sum_price_per_unit'))
dfAgg.show()

+-----------------+-------------------+
|     product_name|sum(price_per_unit)|
+-----------------+-------------------+
|       Smartwatch|  55122.37000000001|
|Bluetooth Speaker| 53639.130000000005|
|           Laptop| 49244.149999999994|
|  Fitness Tracker| 45093.060000000005|
|           Camera|           56107.65|
|       Television|           43345.93|
|           Tablet|           46965.87|
|       Smartphone|  45193.32999999998|
|       Headphones|           67448.02|
|   Gaming Console|  60786.26000000001|
+-----------------+-------------------+

+-----------------+------------------+------------------+------------------+------------------+
|     product_name|Avg_price_per_unit|Min_price_per_unit|Max_price_per_unit|Sum_price_per_unit|
+-----------------+------------------+------------------+------------------+------------------+
|Bluetooth Speaker| 515.7608653846154|            100.93|            989.08|53639.130000000005|
|           Camera| 524.3705607476636|            101.3

In [25]:
customers = sparkSession.read.csv('/content/drive/MyDrive/Colab Notebooks/data/customers.csv',
                                  header = True,
                                  inferSchema = True)
customers.printSchema()
customers.show(5)

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- join_date: date (nullable = true)
 |-- country: string (nullable = true)
 |-- loyalty_points: integer (nullable = true)

+-----------+-----------------+--------------------+----------+-------------+--------------+
|customer_id|             name|               email| join_date|      country|loyalty_points|
+-----------+-----------------+--------------------+----------+-------------+--------------+
|          1|     Norma Fisher| ysullivan@yahoo.com|2023-10-23|Guinea-Bissau|          6311|
|          2|     Susan Wagner|katelynmontgomery...|2024-04-14|      Romania|          6890|
|          3|Stephanie Collins|thomas15@stewart-...|2024-06-15|      Lesotho|           663|
|          4|     Joseph Brown|cortezraymond@gar...|2021-10-30|  Saint Lucia|          4242|
|          5|         Seth Lee|lindathomas@west.net|2023-12-02|     Botswana|          8376|
+-------

In [26]:
purchases = sparkSession.read.csv('/content/drive/MyDrive/Colab Notebooks/data/purchases.csv',
                                  header = True,
                                  inferSchema = True)
purchases.printSchema()
purchases.show(5)

root
 |-- purchase_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- purchase_amount: double (nullable = true)
 |-- purchase_date: date (nullable = true)
 |-- product_category: string (nullable = true)
 |-- payment_method: string (nullable = true)

+-----------+-----------+---------------+-------------+----------------+--------------+
|purchase_id|customer_id|purchase_amount|purchase_date|product_category|payment_method|
+-----------+-----------+---------------+-------------+----------------+--------------+
|          1|         67|          263.9|   2023-09-12|          Beauty|        PayPal|
|          2|        240|         192.87|   2023-11-05|           Books|        PayPal|
|          3|        597|          93.98|   2024-09-28|     Electronics| Bank Transfer|
|          4|        517|         170.66|   2024-04-25|            Toys|   Credit Card|
|          5|        832|          73.61|   2023-02-20|           Books|    Debit Card|
+-----------+-----

In [27]:
customers = customers.filter(~customers.name.endswith('r'))
purchases = purchases.filter(purchases.product_category != 'Books')
print('Registros customers:', customers.count())
print('Registros purchases:', purchases.count())

Registros customers: 909
Registros purchases: 839


In [28]:
inner = purchases.join(customers, on = 'customer_id', how = 'inner')
print('Registros inner:', inner.count())
inner.show(5)

Registros inner: 759
+-----------+-----------+---------------+-------------+----------------+--------------+-----------------+--------------------+----------+-----------+--------------+
|customer_id|purchase_id|purchase_amount|purchase_date|product_category|payment_method|             name|               email| join_date|    country|loyalty_points|
+-----------+-----------+---------------+-------------+----------------+--------------+-----------------+--------------------+----------+-----------+--------------+
|          3|        714|          157.4|   2024-03-02|          Beauty|    Debit Card|Stephanie Collins|thomas15@stewart-...|2024-06-15|    Lesotho|           663|
|          4|        151|          28.71|   2024-02-26|     Electronics| Bank Transfer|     Joseph Brown|cortezraymond@gar...|2021-10-30|Saint Lucia|          4242|
|          5|         95|          43.99|   2023-02-04|          Beauty|   Credit Card|         Seth Lee|lindathomas@west.net|2023-12-02|   Botswana|     

In [30]:
left = purchases.join(customers, on = 'customer_id', how = 'left')
print('Registros left:', left.count())
left.filter(left.name.isNull()).show(5)

Registros left: 839
+-----------+-----------+---------------+-------------+----------------+--------------+----+-----+---------+-------+--------------+
|customer_id|purchase_id|purchase_amount|purchase_date|product_category|payment_method|name|email|join_date|country|loyalty_points|
+-----------+-----------+---------------+-------------+----------------+--------------+----+-----+---------+-------+--------------+
|        362|         10|         308.16|   2023-03-26|         Fashion|     Gift Card|NULL| NULL|     NULL|   NULL|          NULL|
|        317|         13|          16.19|   2023-10-10|         Fashion| Bank Transfer|NULL| NULL|     NULL|   NULL|          NULL|
|        503|         24|         317.64|   2023-10-20|     Electronics|    Debit Card|NULL| NULL|     NULL|   NULL|          NULL|
|        665|         29|         171.12|   2024-02-28|          Beauty|     Gift Card|NULL| NULL|     NULL|   NULL|          NULL|
|        758|         52|         497.88|   2023-02-20| 

In [32]:
right = purchases.join(customers, on = 'customer_id', how = 'right')
print('Registros right:', right.count())
right.filter(right.product_category.isNull()).show(5)

Registros right: 1160
+-----------+-----------+---------------+-------------+----------------+--------------+------------------+--------------------+----------+-----------+--------------+
|customer_id|purchase_id|purchase_amount|purchase_date|product_category|payment_method|              name|               email| join_date|    country|loyalty_points|
+-----------+-----------+---------------+-------------+----------------+--------------+------------------+--------------------+----------+-----------+--------------+
|         11|       NULL|           NULL|         NULL|            NULL|          NULL|    Susan Ferguson|martinezjacob@wil...|2020-01-25|Netherlands|          9558|
|         17|       NULL|           NULL|         NULL|            NULL|          NULL|Gabriella Williams|johnbenton@yahoo.com|2022-03-16|       Peru|          1553|
|         19|       NULL|           NULL|         NULL|            NULL|          NULL|       Jerome Page|moralesjacqueline...|2020-02-26|       Cha

In [33]:
outer = purchases.join(customers, on = 'customer_id', how = 'outer')
print('Registros outer:', outer.count())
outer.filter(outer.name.isNull() | outer.product_category.isNull()).show(5)

Registros outer: 1240
+-----------+-----------+---------------+-------------+----------------+--------------+------------------+--------------------+----------+-----------+--------------+
|customer_id|purchase_id|purchase_amount|purchase_date|product_category|payment_method|              name|               email| join_date|    country|loyalty_points|
+-----------+-----------+---------------+-------------+----------------+--------------+------------------+--------------------+----------+-----------+--------------+
|          1|        642|         317.67|   2023-02-15|          Beauty|        PayPal|              NULL|                NULL|      NULL|       NULL|          NULL|
|          2|        455|         407.47|   2023-08-20|         Fashion|     Gift Card|              NULL|                NULL|      NULL|       NULL|          NULL|
|         11|       NULL|           NULL|         NULL|            NULL|          NULL|    Susan Ferguson|martinezjacob@wil...|2020-01-25|Netherland