# DataFrame avanzado

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import min, max, col
from pyspark.sql.functions import sum, sum_distinct, avg, count
# Directory where the data files are stored; the read calls in this
# notebook append the file name to this prefix.
path = 'files/'

In [4]:
# Get the current Spark session, creating one if none exists yet.
sesion_builder = SparkSession.builder
spark = sesion_builder.getOrCreate()

# Flights dataset, read from the Parquet file located under `path`.
ruta_vuelos = f'{path}vuelos.parquet'
df_vuelos = spark.read.parquet(ruta_vuelos)

                                                                                

In [16]:
# Print the flights DataFrame schema as a tree: column name, type, nullability.
df_vuelos.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (nullable = true)

In [17]:
# Preview the first 5 rows; truncate=False keeps long cell values unabbreviated.
df_vuelos.show(n=5, truncate=False)

+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

## Funciones

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import approx_count_distinct

In [19]:
# Get the current Spark session, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Small example DataFrame (nombre, color, cantidad) used by the
# count / countDistinct examples in this section.
ruta_ejemplo = f'{path}dataframe.parquet'
df = spark.read.parquet(ruta_ejemplo)

# Show its column names and types.
df.printSchema()

root
 |-- nombre: string (nullable = true)
 |-- color: string (nullable = true)
 |-- cantidad: long (nullable = true)



In [20]:
# Display the rows of the example DataFrame (it contains NULLs in
# `nombre` and `color`, which matter for the counts computed next).
df.show()

+------+-----+--------+
|nombre|color|cantidad|
+------+-----+--------+
|  Jose| azul|    1900|
|  NULL| NULL|    1700|
|  NULL| rojo|    1300|
|  Juan| rojo|    1500|
+------+-----+--------+



### Count

In [25]:
# count(column) tallies only the rows where that column is not NULL,
# so the three results differ when a column contains NULLs.
conteos_por_columna = (
    count('nombre').alias('conteo_nombre'),
    count('color').alias('conteo_color'),
    count('cantidad').alias('conteo_cantidad'),
)
df.select(*conteos_por_columna).show()

+-------------+------------+---------------+
|conteo_nombre|conteo_color|conteo_cantidad|
+-------------+------------+---------------+
|            2|           3|              4|
+-------------+------------+---------------+



In [26]:
# count('*') counts every row, including the rows that contain NULLs.
total_filas = count('*').alias('conteo_general')
df.select(total_filas).show()


+--------------+
|conteo_general|
+--------------+
|             4|
+--------------+



### countDistinct

In [28]:
# countDistinct: number of distinct non-NULL values in each column.
# With '*' all columns are considered together; in the recorded output
# this gives 2 rather than 4, i.e. the rows holding a NULL are not counted.
conteos_distintos = [
    countDistinct('nombre').alias('nombre_dif'),
    countDistinct('color').alias('color_dif'),
    countDistinct('cantidad').alias('cantidad_dif'),
    countDistinct('*').alias('general_dif'),
]
df.select(*conteos_distintos).show()


+----------+---------+------------+-----------+
|nombre_dif|color_dif|cantidad_dif|general_dif|
+----------+---------+------------+-----------+
|         2|        2|           4|          2|
+----------+---------+------------+-----------+



### approx_count_distinct

In [24]:
# approx_count_distinct
# This function returns an approximation of the distinct count,
# which reduces the cost of counting. The exact countDistinct is
# selected next to it so both values can be compared.
conteo_exacto = countDistinct('AIRLINE')
conteo_aproximado = approx_count_distinct('AIRLINE')
df_vuelos.select(conteo_exacto, conteo_aproximado).show()



+-----------------------+------------------------------+
|count(DISTINCT AIRLINE)|approx_count_distinct(AIRLINE)|
+-----------------------+------------------------------+
|                     14|                            13|
+-----------------------+------------------------------+



                                                                                

### Min & Max

In [32]:
# Shortest and longest AIR_TIME, plus the smallest and largest AIRLINE_DELAY.
# `min` / `max` here are the pyspark.sql.functions aggregates imported at
# the top of the notebook, not the Python builtins they shadow.
# The AIRLINE_DELAY columns are left unaliased, so Spark names them
# min(AIRLINE_DELAY) and max(AIRLINE_DELAY).
df_vuelos.select(
    min('AIR_TIME').alias('menor_tiempo'),
    max('AIR_TIME').alias('mayor_tiempo'),
    min('AIRLINE_DELAY'),
    max('AIRLINE_DELAY')
).show()

+------------+------------+------------------+------------------+
|menor_timepo|mayor_tiempo|min(AIRLINE_DELAY)|max(AIRLINE_DELAY)|
+------------+------------+------------------+------------------+
|           7|         690|                 0|              1971|
+------------+------------+------------------+------------------+



### Sum

In [5]:
# Total of the DISTANCE column over all flights.
# `sum` is the pyspark.sql.functions aggregate, not the Python builtin.
distancia_total = sum('DISTANCE').alias('sum_dis')
df_vuelos.select(distancia_total).show()

                                                                                

+----------+
|   sum_dis|
+----------+
|4785357409|
+----------+



### SumDistinct

In [8]:
# Sum of the distinct DISTANCE values: each different distance is added once.
distancia_distinta = sum_distinct('DISTANCE').alias('sum_dis_dif')
df_vuelos.select(distancia_distinta).show()

+-----------+
|sum_dis_dif|
+-----------+
|    1442300|
+-----------+



### AVG

In [9]:
# Mean AIR_TIME computed two ways, side by side: the avg aggregate and
# a manual sum / count over the same column.
promedio_aire = avg('AIR_TIME').alias('promedio_aire')
promedio_manual = (sum('AIR_TIME') / count('AIR_TIME')).alias('prom_manual')
df_vuelos.select(promedio_aire, promedio_manual).show()

+------------------+------------------+
|     promedio_aire|       prom_manual|
+------------------+------------------+
|113.51162809012519|113.51162809012519|
+------------------+------------------+



In [None]:
# Shut down through the SparkSession API: SparkSession.stop() stops the
# underlying SparkContext, so the session is not left pointing at a
# context that was stopped behind its back.
spark.stop()