In [24]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.window import Window

In [3]:
# Local Spark with 4 worker threads, application name "transport".
conf = SparkConf().setMaster("local[4]").setAppName("transport")

# getOrCreate() makes this cell safe to re-run: constructing a second
# SparkContext in the same kernel raises "Cannot run multiple SparkContexts
# at once". Note that `conf` is only applied when no context exists yet.
sc = SparkContext.getOrCreate(conf=conf)

# SQL entry point used by every query and read below.
sqlContext = SQLContext(sc)

In [4]:
# Location of the Parquet dataset. The default is the original author's local
# checkout; set the TRANSPORT_DATA_PATH environment variable to point the
# notebook at another copy without editing this cell.
path = os.environ.get(
    "TRANSPORT_DATA_PATH",
    "file:///home/nicolas/github/improve_transport/datasets/data.parquet/",
)

In [5]:
# Load the Parquet dataset located at `path` into a Spark DataFrame.
df = sqlContext.read.format("parquet").load(path)

In [14]:
# Expose the DataFrame to Spark SQL under the view name "df".
# NOTE(review): the execution counts show this cell (In [14]) was run after the
# card filter (In [9]), but in file order it comes first, so on a top-to-bottom
# run the view is registered on the full, unfiltered dataset at this point.
df.createOrReplaceTempView("df")

In [9]:
# Card identifiers (values of `nrotarjeta`) whose transactions are inspected.
cards = [
    "cee991f606e867cd3232dfdb0b3dab066aa01281f222c8602513148d860f343c",
    "a48e9d07b409708c52cd9804ad2fd9af88a0691682ccb9a5bb3987338e0bed39",
    "27d9e5f3e906850470885c810a58d5be56d051240012d9fd8ae2cb60871143e2",
    "533515c696a82329ffe62e53a215fc10f01e97cb763ba3ccaf2e6b3a489c14e6",
]

# Keep only the card, site and transaction-time columns, restricted to `cards`.
df = df.select("nrotarjeta", "nombresitio", "fechahoratrx").where(col("nrotarjeta").isin(cards))

# A temp view is bound to the DataFrame it was created from, not to the Python
# name `df`. Re-register it here so the SQL cells below query the filtered rows
# when the notebook is run top to bottom (the recorded outputs show 38 rows).
df.createOrReplaceTempView("df")

In [10]:
# Row count of the current `df`; the recorded output for the four selected
# cards is 38.
df.count()

38

In [15]:
# Same row count, this time through Spark SQL against the temp view "df".
# NOTE(review): the result reflects whichever DataFrame was registered as the
# view, which is not necessarily the current Python variable `df`.
query = """
    SELECT COUNT(*)
    FROM df
"""
sqlContext.sql(query).show()

+--------+
|count(1)|
+--------+
|      38|
+--------+



In [19]:
# List the view's rows grouped by card and in chronological order within each
# card; show(40) prints up to 40 rows.
query = """
    SELECT * 
    FROM df
    ORDER BY nrotarjeta, fechahoratrx
"""
sqlContext.sql(query).show(40)

+--------------------+--------------+-------------------+
|          nrotarjeta|   nombresitio|       fechahoratrx|
+--------------------+--------------+-------------------+
|27d9e5f3e90685047...|       CJRP-81|2019-03-02 07:09:28|
|27d9e5f3e90685047...|La Cisterna L2|2019-03-02 07:27:05|
|27d9e5f3e90685047...|       BJFK-11|2019-03-02 15:45:15|
|27d9e5f3e90685047...|       BJFK-60|2019-03-02 15:55:39|
|27d9e5f3e90685047...|       WA-9796|2019-03-02 18:01:24|
|27d9e5f3e90685047...|       BJFK-37|2019-03-03 10:22:03|
|27d9e5f3e90685047...|       WB-9729|2019-03-03 10:43:02|
|27d9e5f3e90685047...|       BJFG-89|2019-03-03 13:11:28|
|27d9e5f3e90685047...|       CJRR-69|2019-03-03 15:39:14|
|27d9e5f3e90685047...|       BJFF-99|2019-03-03 16:22:20|
|27d9e5f3e90685047...|       BJFF-99|2019-03-03 16:45:41|
|27d9e5f3e90685047...|       RM-0812|2019-03-03 17:10:27|
|533515c696a82329f...|       FLXF-49|2019-03-02 19:49:19|
|533515c696a82329f...|       FLXJ-54|2019-03-02 20:07:23|
|533515c696a82

In [None]:
# For every transaction, count the transactions of the same card that fall in
# the 5 minutes up to and including it.
#
# The original used RANK() with this RANGE frame. Ranking functions only accept
# their fixed frame (ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), so Spark
# rejects the query at analysis time. COUNT(*) is an aggregate and honours the
# sliding frame.
#
# The ORDER BY key is cast to TIMESTAMP because an interval-based RANGE frame
# needs a date/time ordering column; the cast is a no-op if `fechahoratrx` is
# already a timestamp (its stored type is not visible in this notebook).
query = """
    SELECT *,
        COUNT(*) OVER (
            PARTITION BY nrotarjeta
            ORDER BY CAST(fechahoratrx AS TIMESTAMP)
            RANGE BETWEEN INTERVAL 5 MINUTES PRECEDING AND CURRENT ROW
        ) AS count
    FROM df
"""
result = sqlContext.sql(query)
result.show(40)

In [28]:
# Add `timestamp`: the transaction time converted to a Spark timestamp and then
# to a long (epoch seconds, as the recorded output shows), so it can serve as a
# numeric ordering key for a range-based window.
df = df.withColumn(
    "timestamp",
    col("fechahoratrx").cast("Timestamp").cast("long"),
)

In [29]:
# Preview the DataFrame, now including the numeric `timestamp` column.
df.show()

+--------------------+--------------+-------------------+----------+
|          nrotarjeta|   nombresitio|       fechahoratrx| timestamp|
+--------------------+--------------+-------------------+----------+
|a48e9d07b409708c5...|       BJFJ-91|2019-03-02 05:58:14|1551517094|
|27d9e5f3e90685047...|       CJRP-81|2019-03-02 07:09:28|1551521368|
|27d9e5f3e90685047...|La Cisterna L2|2019-03-02 07:27:05|1551522425|
|a48e9d07b409708c5...|       BJFF-70|2019-03-02 15:19:31|1551550771|
|a48e9d07b409708c5...|       ZU-5557|2019-03-02 15:33:56|1551551636|
|27d9e5f3e90685047...|       BJFK-11|2019-03-02 15:45:15|1551552315|
|a48e9d07b409708c5...|       WC-1129|2019-03-02 15:50:18|1551552618|
|27d9e5f3e90685047...|       BJFK-60|2019-03-02 15:55:39|1551552939|
|cee991f606e867cd3...|       CJRH-10|2019-03-02 17:08:36|1551557316|
|cee991f606e867cd3...|       CJRH-10|2019-03-02 17:08:47|1551557327|
|cee991f606e867cd3...|       CJRH-10|2019-03-02 17:08:55|1551557335|
|27d9e5f3e90685047...|       WA-97

In [30]:
# Per-card sliding window ordered by `timestamp`: for each row it spans the
# rows whose `timestamp` lies within the preceding 2 hours (2 * 60 * 60 units of
# the ordering column) up to and including the current row's value.
w = (
    Window.partitionBy("nrotarjeta")
    .orderBy("timestamp")
    .rangeBetween(-2 * 60 * 60, 0)
)

In [31]:
# Add `trips`: the number of rows with a non-null `timestamp` that fall inside
# window `w` for each row.
df = df.withColumn(
    "trips",
    F.count("timestamp").over(w),
)

In [33]:
# Print up to 40 rows sorted by card and then by transaction time, so each
# card's rows appear together in chronological order.
df.orderBy("nrotarjeta", "fechahoratrx").show(40)

+--------------------+--------------+-------------------+----------+-----------+
|          nrotarjeta|   nombresitio|       fechahoratrx| timestamp|occurrences|
+--------------------+--------------+-------------------+----------+-----------+
|27d9e5f3e90685047...|       CJRP-81|2019-03-02 07:09:28|1551521368|          1|
|27d9e5f3e90685047...|La Cisterna L2|2019-03-02 07:27:05|1551522425|          2|
|27d9e5f3e90685047...|       BJFK-11|2019-03-02 15:45:15|1551552315|          1|
|27d9e5f3e90685047...|       BJFK-60|2019-03-02 15:55:39|1551552939|          2|
|27d9e5f3e90685047...|       WA-9796|2019-03-02 18:01:24|1551560484|          1|
|27d9e5f3e90685047...|       BJFK-37|2019-03-03 10:22:03|1551619323|          1|
|27d9e5f3e90685047...|       WB-9729|2019-03-03 10:43:02|1551620582|          2|
|27d9e5f3e90685047...|       BJFG-89|2019-03-03 13:11:28|1551629488|          1|
|27d9e5f3e90685047...|       CJRR-69|2019-03-03 15:39:14|1551638354|          1|
|27d9e5f3e90685047...|      