In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import functions as F

In [2]:
conf = SparkConf().setMaster("local[4]").setAppName("transport")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [3]:
path = "file:///home/nicolas/github/improve_transport/datasets/data.parquet/"

In [4]:
df = sqlContext.read.parquet(path)

In [5]:
df.show()

+-------------------+-------------+------------------+-----------+-----------+--------------------+
|       fechahoratrx|codigoentidad|     nombreentidad|codigositio|nombresitio|          nrotarjeta|
+-------------------+-------------+------------------+-----------+-----------+--------------------+
|2019-02-28 21:00:15|           13|       U6 - Redbus|       2596|    CJRC-35|612d6a01a6158dd63...|
|2019-02-28 21:02:39|            5|U5 - Metropolitana|       7660|    FLXG-38|eb7a60b4eb2e7b0a9...|
|2019-02-28 21:11:25|            5|U5 - Metropolitana|      11338|    FLXT-94|b4a501a26eaab4f8a...|
|2019-02-28 21:14:22|           13|       U6 - Redbus|       2596|    CJRC-35|a727f39a7a85cb6c1...|
|2019-02-28 21:17:02|            5|U5 - Metropolitana|      14379|    BJFJ-15|dcb9033eb5920016b...|
|2019-02-28 21:17:07|            5|U5 - Metropolitana|      14379|    BJFJ-15|8647ac944c4fa5928...|
|2019-02-28 21:20:12|           13|       U6 - Redbus|       2596|    CJRC-35|4339814f270bfab3d...|


In [6]:
df.printSchema()

root
 |-- fechahoratrx: timestamp (nullable = true)
 |-- codigoentidad: long (nullable = true)
 |-- nombreentidad: string (nullable = true)
 |-- codigositio: long (nullable = true)
 |-- nombresitio: string (nullable = true)
 |-- nrotarjeta: string (nullable = true)



In [7]:
df.count()

2433466

In [8]:
df.groupBy("nrotarjeta").count().orderBy("count", ascending=False).show()

+--------------------+-----+
|          nrotarjeta|count|
+--------------------+-----+
|f4221acc3ec26146b...|   25|
|0b286ef4a356304ef...|   24|
|f1cc16d8093662c10...|   21|
|e21255cc625840b88...|   20|
|4650034e1690026c9...|   20|
|9bc9ad4835dae1ca9...|   20|
|ba602fc72c91fc9bd...|   20|
|e9904008854abc186...|   20|
|f975001ffe1b87792...|   18|
|ca3454d9131f8b02a...|   18|
|b5a4026d77cad264d...|   18|
|5b040cedf7146d46b...|   18|
|c58c3d4bd2924126d...|   18|
|05ce66ed7948936ac...|   18|
|12b382c2f5297029b...|   17|
|198ff7943b4d2a61b...|   17|
|64db36ddb326c94b0...|   17|
|8052f93490092aa4a...|   16|
|55070b9c28e40eed0...|   16|
|f8cda3a4bbbf1c2ef...|   16|
+--------------------+-----+
only showing top 20 rows



In [9]:
df.createOrReplaceTempView('df')

In [23]:
query = """
    SELECT nrotarjeta, nombreentidad, COUNT(*) as count
    FROM df
    GROUP BY nrotarjeta, nombreentidad
"""
result = sqlContext.sql(query)
result.show()

+--------------------+------------------+-----+
|          nrotarjeta|     nombreentidad|count|
+--------------------+------------------+-----+
|6cbcf07f6b9502933...|       U6 - Redbus|    1|
|494c93b4f91813b1a...|U5 - Metropolitana|    1|
|d4d5aefb798c8a9fe...|U5 - Metropolitana|    1|
|b0bb29fde7565da7c...|U5 - Metropolitana|    1|
|4cd7438eace49cfd7...|       U6 - Redbus|    2|
|a3b629b2a227e9e23...|U5 - Metropolitana|    1|
|33447cc0954101c9a...|       U6 - Redbus|    1|
|84f192fe4e3298eb1...|U5 - Metropolitana|    1|
|8f5a57848989c0f00...|U5 - Metropolitana|    1|
|27689d10932716361...|U5 - Metropolitana|    1|
|dbad6f8dd5196036b...|         U3 - Vule|    1|
|7709d1be22bf32852...|U5 - Metropolitana|    1|
|668a196b4437e23fc...|U5 - Metropolitana|    2|
|d9ecd35d5c6d7617d...|       U6 - Redbus|    1|
|cef59dd4cfa92d770...|U5 - Metropolitana|    2|
|b56b54f26203e2162...|U5 - Metropolitana|    1|
|2c3e783d7f8aba7d7...|         U3 - Vule|    1|
|b7017908a27ee4e25...|U5 - Metropolitana

In [33]:
query = """
    SELECT nrotarjeta, nombreentidad, count
    FROM (
        SELECT nrotarjeta, nombreentidad, count,
            DENSE_RANK() OVER(
                PARTITION BY nombreentidad
                ORDER BY count DESC
            ) AS rank
        FROM (
            SELECT nrotarjeta, nombreentidad, COUNT(*) as count
            FROM df
            GROUP BY nrotarjeta, nombreentidad
            ORDER BY count DESC
        )
    )
    WHERE rank <= 2
    ORDER BY count DESC
"""
result = sqlContext.sql(query)
result.show()

+--------------------+------------------+-----+
|          nrotarjeta|     nombreentidad|count|
+--------------------+------------------+-----+
|0b286ef4a356304ef...|        METRO - OT|   24|
|f1cc16d8093662c10...|        METRO - OT|   21|
|ae472f38ce14d7347...|U5 - Metropolitana|   11|
|c909309d0c5c313ea...|U5 - Metropolitana|   11|
|3ed0d4f78603b46e2...|      U4 - Express|   11|
|4e9f3beb896b8273b...|      Tren Central|   10|
|60c31e815e3f380d8...|U5 - Metropolitana|   10|
|314f4188e4c591a28...|       U2 - Su Bus|   10|
|5b040cedf7146d46b...|       U6 - Redbus|   10|
|f947f0bcc72db9ad8...|U5 - Metropolitana|   10|
|37c17fc6843cd5a7d...|          U7 - STP|    9|
|45d4c63334c9dacd2...|       U2 - Su Bus|    9|
|862447def346871c4...|       U2 - Su Bus|    9|
|27d9e5f3e90685047...|       U2 - Su Bus|    9|
|63d9ab80e538a085c...|         U3 - Vule|    9|
|bab194b3a8930aed8...|       U2 - Su Bus|    9|
|f45d87cd753b25fae...|       U2 - Su Bus|    9|
|eb6fc0bba47e80ba5...|         U3 - Vule

In [None]:
SELECT
  product,
  category,
  revenue
FROM (
  SELECT
    product,
    category,
    revenue,
    dense_rank() OVER (PARTITION BY category ORDER BY revenue DESC) as rank
  FROM productRevenue) tmp
WHERE
  rank <= 2

In [13]:
query = """
    SELECT *, SUM(codigoentidad)
    OVER (partition by nrotarjeta) AS total
    FROM df
"""

result = sqlContext.sql(query)

In [14]:
result.show()

+-------------------+-------------+------------------+-----------+--------------------+--------------------+-----+
|       fechahoratrx|codigoentidad|     nombreentidad|codigositio|         nombresitio|          nrotarjeta|total|
+-------------------+-------------+------------------+-----------+--------------------+--------------------+-----+
|2019-03-02 08:35:55|            1|        METRO - OT|        151|     Laguna Sur - L5|0019a77744e1d066e...|    1|
|2019-03-02 16:57:59|            1|        METRO - OT|        100|      Manquehue - L1|00262da83972ed902...|    2|
|2019-03-02 20:05:11|            1|        METRO - OT|         13|Universidad de Chile|00262da83972ed902...|    2|
|2019-03-03 08:32:44|            9|          U7 - STP|       7265|             LDJJ-67|00285c70482182094...|   22|
|2019-03-03 10:23:28|           13|       U6 - Redbus|      11319|             FLXT-88|00285c70482182094...|   22|
|2019-03-03 11:06:25|            5|U5 - Metropolitana|      13593|             B