In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import functions as F

In [2]:
conf = SparkConf().setMaster("local[4]").setAppName("transport")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [3]:
path = "file:///home/nicolas/github/improve_transport/datasets/data.parquet/"

In [4]:
df = sqlContext.read.parquet(path)

In [5]:
df.show()

+-------------------+-------------+------------------+-----------+-----------+--------------------+
|       fechahoratrx|codigoentidad|     nombreentidad|codigositio|nombresitio|          nrotarjeta|
+-------------------+-------------+------------------+-----------+-----------+--------------------+
|2019-02-28 21:00:15|           13|       U6 - Redbus|       2596|    CJRC-35|612d6a01a6158dd63...|
|2019-02-28 21:02:39|            5|U5 - Metropolitana|       7660|    FLXG-38|eb7a60b4eb2e7b0a9...|
|2019-02-28 21:11:25|            5|U5 - Metropolitana|      11338|    FLXT-94|b4a501a26eaab4f8a...|
|2019-02-28 21:14:22|           13|       U6 - Redbus|       2596|    CJRC-35|a727f39a7a85cb6c1...|
|2019-02-28 21:17:02|            5|U5 - Metropolitana|      14379|    BJFJ-15|dcb9033eb5920016b...|
|2019-02-28 21:17:07|            5|U5 - Metropolitana|      14379|    BJFJ-15|8647ac944c4fa5928...|
|2019-02-28 21:20:12|           13|       U6 - Redbus|       2596|    CJRC-35|4339814f270bfab3d...|


In [6]:
df.count()

2433466

In [7]:
df.select("nombreentidad").distinct().count()

8

In [9]:
df.select("nombreentidad").distinct().show()

+------------------+
|     nombreentidad|
+------------------+
|       U2 - Su Bus|
|          U7 - STP|
|        METRO - OT|
|      Tren Central|
|       U6 - Redbus|
|         U3 - Vule|
|      U4 - Express|
|U5 - Metropolitana|
+------------------+



In [10]:
df.select("nrotarjeta").distinct().count()

1161560

In [13]:
df.select("nrotarjeta").groupBy("nrotarjeta").count().orderBy("count", ascending=False).show()

+--------------------+-----+
|          nrotarjeta|count|
+--------------------+-----+
|f4221acc3ec26146b...|   25|
|0b286ef4a356304ef...|   24|
|f1cc16d8093662c10...|   21|
|e21255cc625840b88...|   20|
|9bc9ad4835dae1ca9...|   20|
|4650034e1690026c9...|   20|
|ba602fc72c91fc9bd...|   20|
|e9904008854abc186...|   20|
|05ce66ed7948936ac...|   18|
|5b040cedf7146d46b...|   18|
|b5a4026d77cad264d...|   18|
|f975001ffe1b87792...|   18|
|ca3454d9131f8b02a...|   18|
|c58c3d4bd2924126d...|   18|
|12b382c2f5297029b...|   17|
|64db36ddb326c94b0...|   17|
|198ff7943b4d2a61b...|   17|
|8697342494cdf1400...|   16|
|b50ddb477cb5d0957...|   16|
|4d68bcadbc68638f9...|   16|
+--------------------+-----+
only showing top 20 rows



In [14]:
df.select("nombresitio").groupBy("nombresitio").count().orderBy("count", ascending=False).show()

+--------------------+-----+
|         nombresitio|count|
+--------------------+-----+
|    Estacion Central|49772|
|      Plaza de Armas|31163|
|         Cal y Canto|27702|
|Universidad de Sa...|27541|
|Universidad de Chile|23872|
|      Manquehue - L1|23307|
|Plaza de Puente Alto|22499|
|         TOBALABA_L1|21939|
|      La Cisterna L2|21821|
|    Plaza Maip√∫ - L5|21634|
|           Las Rejas|21096|
|         Santa Lucia|20836|
|     Escuela Militar|18962|
|    Estacion Alameda|16873|
|       Pudahuel - L5|16739|
|  Los Dominicos - L1|16437|
| San Alberto Hurtado|15712|
|      Vespucio Norte|15670|
|           Pajaritos|15009|
|          La Florida|14938|
+--------------------+-----+
only showing top 20 rows



In [20]:
df.select(F.date_format("fechahoratrx", "dd-MM-yyyy").alias("date")).show()

+----------+
|      date|
+----------+
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
|28-02-2019|
+----------+
only showing top 20 rows



In [23]:
df.select(F.date_format("fechahoratrx", "dd-MM-yyyy").alias("date")).groupBy("date").count().orderBy("count", ascending=False).show()

+----------+-------+
|      date|  count|
+----------+-------+
|02-03-2019|1745562|
|03-03-2019| 625026|
|01-03-2019|  41399|
|28-02-2019|   9516|
|27-02-2019|   6374|
|26-02-2019|   3338|
|25-02-2019|   1829|
|24-02-2019|    290|
|23-02-2019|    132|
+----------+-------+



In [25]:
df.select("fechahoratrx").show()

+-------------------+
|       fechahoratrx|
+-------------------+
|2019-02-28 21:00:15|
|2019-02-28 21:02:39|
|2019-02-28 21:11:25|
|2019-02-28 21:14:22|
|2019-02-28 21:17:02|
|2019-02-28 21:17:07|
|2019-02-28 21:20:12|
|2019-02-28 21:21:34|
|2019-02-28 21:21:42|
|2019-02-28 21:21:44|
|2019-02-28 21:31:21|
|2019-02-28 21:37:10|
|2019-02-28 22:01:06|
|2019-02-28 22:01:13|
|2019-02-28 22:01:17|
|2019-02-28 22:01:20|
|2019-02-28 22:01:23|
|2019-02-28 22:01:32|
|2019-02-28 22:01:35|
|2019-02-28 22:03:17|
+-------------------+
only showing top 20 rows



In [28]:
df.select(F.date_format("fechahoratrx", "HH").alias("hour")).groupBy("hour").count().orderBy("hour").show()

+----+------+
|hour| count|
+----+------+
|  00|  1040|
|  01|   977|
|  02|  4735|
|  03| 44862|
|  04| 94140|
|  05|117877|
|  06|132925|
|  07|140874|
|  08|151073|
|  09|163193|
|  10|178809|
|  11|167546|
|  12|153121|
|  13|151113|
|  14|155574|
|  15|160705|
|  16|159717|
|  17|159276|
|  18|144508|
|  19|106242|
+----+------+
only showing top 20 rows

