<a href="https://colab.research.google.com/github/luismiguelmartinluengo/PySpark_Demos/blob/main/Labo_DataFrame_Shipping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [54]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, lit, when, count, expr, date_format, date_add, date_diff, asc_nulls_last

In [4]:
sparkSession = SparkSession.builder.appName("Labo DataFrame Shipping").getOrCreate()

In [13]:
df = sparkSession.read.csv('/content/drive/MyDrive/Colab Notebooks/data/shipping_log.csv',
                           header = True,
                           inferSchema = True)
df.show(5)
print('Registros:', df.count())
df.printSchema()

+----------+-----------+-------------+------------+------------+----------+------+
|ShipmentID|     Origin|  Destination|ShipmentDate|DeliveryDate|    Status|  Cost|
+----------+-----------+-------------+------------+------------+----------+------+
|         1|      Miami|       Boston|  2023-04-13|  2023-04-15|In Transit|147.92|
|         2|      Miami|San Francisco|  2023-04-15|  2023-04-17| Delivered|149.56|
|         3|    Houston|San Francisco|  2023-05-18|  2023-05-19|In Transit|191.56|
|         4|    Houston|      Houston|  2023-07-26|  2023-07-31| Delivered|121.13|
|         5|Los Angeles|       Dallas|  2023-12-07|  2023-12-09|In Transit|163.55|
+----------+-----------+-------------+------------+------------+----------+------+
only showing top 5 rows

Registros: 1000
root
 |-- ShipmentID: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- ShipmentDate: date (nullable = true)
 |-- DeliveryDate: date (nullable = true)

In [10]:
df.printSchema()

root
 |-- ShipmentID: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- ShipmentDate: date (nullable = true)
 |-- DeliveryDate: date (nullable = true)
 |-- Status: string (nullable = true)
 |-- Cost: double (nullable = true)



In [15]:
#Eliminar registros con nulos
dfClean = df.dropna()
print('Registros tras eliminar nulos:', dfClean.count())
if (dfClean.count() < df.count()):
  print('Registros eliminados:', df.count() - dfClean.count())
  df = dfClean
#End if

Registros tras eliminar nulos: 1000


In [46]:
#Analizar tiempos de entrega
df = df.withColumn('Delay', expr('datediff(DeliveryDate, ShipmentDate)'))
promedioDelay = int(round(df.select(avg('Delay')).first()[0]))
print('Promedio de retraso:', promedioDelay)
dfDelays = df.groupBy('Delay').count()
dfDelays.orderBy('Delay').show()


Promedio de retraso: 4
+-----+-----+
|Delay|count|
+-----+-----+
|    1|  148|
|    2|  151|
|    3|  140|
|    4|  149|
|    5|  130|
|    6|  132|
|    7|  150|
+-----+-----+



In [47]:
#Calcular fecha estimada de entrega para los envios
df = df.withColumn('Estimated Delivery Date', date_add(col('ShipmentDate'), promedioDelay))
df.select('ShipmentDate', 'Estimated Delivery Date').sample(0.01).show()

+------------+-----------------------+
|ShipmentDate|Estimated Delivery Date|
+------------+-----------------------+
|  2023-10-20|             2023-10-24|
|  2023-05-29|             2023-06-02|
|  2023-06-16|             2023-06-20|
|  2023-03-03|             2023-03-07|
|  2023-11-26|             2023-11-30|
|  2023-06-07|             2023-06-11|
|  2023-08-20|             2023-08-24|
|  2023-01-25|             2023-01-29|
|  2023-07-25|             2023-07-29|
+------------+-----------------------+



In [48]:
#Identificar en Status los envíos entregados con retraso
df = df.withColumn('Status', when(date_diff('DeliveryDate','Estimated Delivery Date')>0, 'Delivered whith Delay').otherwise(col('Status')))
df.groupBy('Status').count().show(truncate = False)

+---------------------+-----+
|Status               |count|
+---------------------+-----+
|Delivered whith Delay|412  |
|In Transit           |214  |
|Delivered            |192  |
|Delayed              |182  |
+---------------------+-----+



In [49]:
#Promedio de envíos entregados sin retraso:
dfPerformance = df.withColumn('OnTime', when(col('Delay')<=promedioDelay, 1).otherwise(0))
dfPerformance.agg(avg('OnTime').alias('% of OnTime')).show()

+-----------+
|% of OnTime|
+-----------+
|      0.588|
+-----------+



In [30]:
#Identificar la ruta más usada
dfPaths = df.groupBy('Origin', 'Destination').count()
dfPaths.orderBy(col('count').desc()).show(1)

+--------+-----------+-----+
|  Origin|Destination|count|
+--------+-----------+-----+
|New York|    Chicago|   64|
+--------+-----------+-----+
only showing top 1 row



In [35]:
#Determinar envíos con sobrecoste
promedioCost = df.agg(avg('Cost')).collect()[0][0]
print('Costo medio:', promedioCost)
dfOverruns = df.filter(col('Cost')>promedioCost)
dfOverruns.show(5)

Costo medio: 199.13505000000015
+----------+-----------+-------------+------------+------------+----------+------+-----------------------+
|ShipmentID|     Origin|  Destination|ShipmentDate|DeliveryDate|    Status|  Cost|Estimated Delivery Date|
+----------+-----------+-------------+------------+------------+----------+------+-----------------------+
|         7|   New York|      Houston|  2023-08-28|  2023-09-02|   Delayed|222.68|             2023-09-01|
|         8|    Houston|      Houston|  2023-09-22|  2023-09-24|In Transit| 258.9|             2023-09-26|
|        10|    Chicago|      Houston|  2023-08-26|  2023-09-01|In Transit|296.93|             2023-08-30|
|        11|    Houston|       Boston|  2023-07-06|  2023-07-11|In Transit|271.69|             2023-07-10|
|        13|   New York|      Chicago|  2023-04-20|  2023-04-22|In Transit|279.13|             2023-04-24|
|        14|      Miami|       Dallas|  2023-06-01|  2023-06-02|   Delayed|224.32|             2023-06-05|
|    

In [55]:
#Análisis de envíos y costes por origen y destino
df.rollup('Origin','Destination').agg(avg('Cost').alias('Average Cost'),
                                       count('ShipmentId').alias('Total Shipments'))\
                                       .orderBy(asc_nulls_last('Origin'), asc_nulls_last('Destination')).show()

+-----------+-------------+------------------+---------------+
|     Origin|  Destination|      Average Cost|Total Shipments|
+-----------+-------------+------------------+---------------+
|    Chicago|       Boston| 203.9380952380953|             42|
|    Chicago|      Chicago| 181.5591891891892|             37|
|    Chicago|       Dallas|193.91325000000006|             40|
|    Chicago|      Houston|218.21971428571428|             35|
|    Chicago|San Francisco|200.49695652173915|             46|
|    Chicago|         NULL|199.50084999999984|            200|
|    Houston|       Boston| 208.0913888888889|             36|
|    Houston|      Chicago|192.80810810810814|             37|
|    Houston|       Dallas|184.47959183673467|             49|
|    Houston|      Houston|206.00150000000002|             40|
|    Houston|San Francisco|190.91210526315794|             38|
|    Houston|         NULL| 195.7970499999999|            200|
|Los Angeles|       Boston|201.91062499999998|         