In [1]:
import seaborn as sns
import pandas as pd

In [None]:
from pyspark.sql import SparkSession


# Get (or reuse) a Spark session registered under the app name 'sparksql1'.
spark = (
    SparkSession.builder
    .appName('sparksql1')
    .getOrCreate()
)

# Turn seaborn's 'tips' sample dataset into a Spark DataFrame and preview three rows.
df = spark.createDataFrame(
    sns.load_dataset('tips'),
)
df.show(n=3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [None]:
# Register the DataFrame as the temporary view 'tips_table' so that it can be
# referenced by name from spark.sql(...) queries.
df.createOrReplaceTempView(name='tips_table')

In [4]:
# Row filter written with the Spark DataFrame API: keep rows whose tip exceeds 2.
from pyspark.sql.functions import col

(
    df
    .where(col('tip') > 2)  # `where` is an alias of `filter`
    .show(n=3)
)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [5]:
# The same row filter (tip > 2), this time expressed as a Spark SQL query
# against the 'tips_table' view.
(
    spark
    .sql('SELECT * FROM tips_table WHERE tip > 2;')
    .show(n=3)
)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [None]:
# Aggregate query: average total bill and average tip for each day.
(
    spark
    .sql('''
          SELECT day,
          AVG(total_bill) as avg_total_bill,
          AVG(tip) as avg_tip
          FROM tips_table
          GROUP BY day;          
          ''')
    .show()
)

+----+------------------+------------------+
| day|    avg_total_bill|           avg_tip|
+----+------------------+------------------+
| Sun|21.409999999999997|3.2551315789473683|
| Sat|20.441379310344825| 2.993103448275862|
|Thur|17.682741935483868|2.7714516129032254|
| Fri| 17.15157894736842| 2.734736842105263|
+----+------------------+------------------+



In [7]:
# Sorting with Spark SQL: the five rows with the largest total_bill,
# in descending order, showing only total_bill, tip and sex.
spark.sql(
    '''
          SELECT total_bill, tip, sex
          FROM tips_table
          ORDER BY total_bill DESC
          LIMIT 5;
          '''
).show()

+----------+----+----+
|total_bill| tip| sex|
+----------+----+----+
|     50.81|10.0|Male|
|     48.33| 9.0|Male|
|     48.27|6.73|Male|
|     48.17| 5.0|Male|
|     45.35| 3.5|Male|
+----------+----+----+



In [None]:
# Derive a new column, tip_ratio = tip / total_bill.
# Note: this is a plain ratio (fraction of the bill), not multiplied by 100.
(
    spark
    .sql('''
          SELECT *,
          tip/total_bill as tip_ratio
          FROM tips_table
          ''')
    .show(n=4)
)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|          tip_ratio|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
+----------+----+------+------+---+------+----+-------------------+
only showing top 4 rows



In [None]:
# spark.sql(...) returns a DataFrame; keep the result in a variable so it can be
# used further (here: previewed and its schema printed).
df_tips_ratio = spark.sql(
    '''
          SELECT *,
          tip/total_bill as tip_ratio
          FROM tips_table
          '''
)

# Preview two rows, then print the column names and types.
df_tips_ratio.show(n=2)
df_tips_ratio.printSchema()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|          tip_ratio|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
+----------+----+------+------+---+------+----+-------------------+
only showing top 2 rows

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)
 |-- tip_ratio: double (nullable = true)



In [None]:
# Conditional column (no rows are filtered out): label each row's tip as
# 'alta' when tip > 3 and 'baja' otherwise, using CASE WHEN.
(
    spark
    .sql('''
          SELECT 
            *,
            CASE WHEN tip > 3 THEN 'alta' ELSE 'baja' END as tip_category
          FROM tips_table;
          ''')
    .show(n=4)
)

+----------+----+------+------+---+------+----+------------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_category|
+----------+----+------+------+---+------+----+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        baja|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        baja|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        alta|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        alta|
+----------+----+------+------+---+------+----+------------+
only showing top 4 rows



In [13]:
# Group by two fields: number of rows for each (day, time) combination.
(
    spark
    .sql('''
          SELECT day, time, count(*) as count_booking
          FROM tips_table
          GROUP BY day, time;
          ''')
    .show()
)

+----+------+-------------+
| day|  time|count_booking|
+----+------+-------------+
| Sat|Dinner|           87|
| Sun|Dinner|           76|
|Thur| Lunch|           61|
| Fri|Dinner|           12|
| Fri| Lunch|            7|
|Thur|Dinner|            1|
+----+------+-------------+

