In [1]:
import json
import pyspark.sql.functions as F
from utils import get_range, count_occurrences, origin_destination_product
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType

In [2]:
CELL_AREA    = '/data/cell_area.csv'
PARQUETS_DIR = '/data/parquets'
CURRENT_DIR  = '/home/hellscream/Documents/backendSpark'

## DataFrame Schema

## Create Spark Session

In [3]:
class SparkSessionBase():
    def __init__(self):
        self.spark = SparkSession\
                     .builder\
                     .appName('Mobility')\
                     .getOrCreate()
        
        self.cell_area_df = self.spark.read.format('csv').options(header='true', delimiter='\t')\
                         .load(CURRENT_DIR + CELL_AREA)\
                         .select('id', 'area_correlator', 'latitude', 'longitude', 'province')

In [4]:
class Mobility(SparkSessionBase):
    def __init__(self, date, time_start_lower, time_start_high, time_end_lower,
        time_end_high, time_sleep_lower='01:00', time_sleep_high='04:00'):
        super().__init__()
        self.date = date
        
    def get_mobility_at_time_interval(self, time_start, time_end):
        # load correspondent parquet
        df = self.spark.read.parquet(CURRENT_DIR + PARQUETS_DIR + '/' + self.date)

        get_range_udf = F.udf(lambda elems, a, b : get_range(elems, a, b), ArrayType(IntegerType()))

        df = df.withColumn('range', get_range_udf(df.times, F.lit(time_start), F.lit(time_end)))\
               .select(df.code,\
               F.slice(df.towers, F.col('range')[0], F.col('range')[1]).alias('towers'),\
               F.slice(df.times,  F.col('range')[0], F.col('range')[1]).alias('times'))\
               .where(F.size(F.col('towers')) > 0)
        
        count_occurrences_udf = F.udf(lambda x : count_occurrences(x), ArrayType(ArrayType(StringType())))
        
        df = df.select('code', count_occurrences_udf(F.col('towers')).alias('towers-count'))
        
        df.printSchema()
        df.show(10)
        
        return df

In [5]:
mobility_instance = Mobility('sample', None, None, None, None)
user_cells_start = mobility_instance.get_mobility_at_time_interval(30, 1000)
user_cells_end   = mobility_instance.get_mobility_at_time_interval(1400, 2700)

root
 |-- code: string (nullable = true)
 |-- towers-count: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)

+-------+--------------------+
|   code|        towers-count|
+-------+--------------------+
|4582376|        [[d7b9d, 1]]|
|6892952|       [[d5yy1, 10]]|
| 819378|       [[d5zur, 39]]|
|1588148|        [[d78yd, 2]]|
|6901043|        [[d5yvc, 1]]|
|1229925|        [[d795s, 1]]|
|4647869|        [[d7b9d, 1]]|
|3049134|[[d7b9d, 6], [d7b...|
|3998644|        [[d797z, 1]]|
|1705846|       [[d7c49, 12]]|
+-------+--------------------+
only showing top 10 rows

root
 |-- code: string (nullable = true)
 |-- towers-count: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)

+-------+--------------------+
|   code|        towers-count|
+-------+--------------------+
|6892952|       [[d5yy1, 13]]|
| 819378|       [[d5zur, 50]]|
|1588148|      

In [6]:
user_cells_start.show()

+-------+--------------------+
|   code|        towers-count|
+-------+--------------------+
|4582376|        [[d7b9d, 1]]|
|6892952|       [[d5yy1, 10]]|
| 819378|       [[d5zur, 39]]|
|1588148|        [[d78yd, 2]]|
|6901043|        [[d5yvc, 1]]|
|1229925|        [[d795s, 1]]|
|4647869|        [[d7b9d, 1]]|
|3049134|[[d7b9d, 6], [d7b...|
|3998644|        [[d797z, 1]]|
|1705846|       [[d7c49, 12]]|
| 379374|        [[d5yvg, 1]]|
|6202650|        [[d5yy1, 1]]|
|4230783|        [[d5zhh, 8]]|
|6620846|        [[d5yy4, 1]]|
|7075816|        [[d79r0, 1]]|
| 565307|       [[dhjcx, 10]]|
| 115154|       [[d795t, 34]]|
|5292136|        [[d5zxh, 1]]|
|2181906|[[d79eb, 22], [d7...|
|6922882|        [[d5yyn, 1]]|
+-------+--------------------+
only showing top 20 rows



In [7]:
user_cells_end.show()

+-------+--------------------+
|   code|        towers-count|
+-------+--------------------+
|6892952|       [[d5yy1, 13]]|
| 819378|       [[d5zur, 50]]|
|1588148|        [[d78yd, 1]]|
|6901043|        [[d5yvc, 1]]|
|7325994|        [[d79r1, 1]]|
|3049134|[[d7b9d, 1], [d7b...|
| 379374|        [[d5yvg, 1]]|
|4230783|        [[d5zhh, 8]]|
|6620846|        [[d5yy4, 1]]|
| 565307|       [[dhjcx, 15]]|
|2181906|[[d79eb, 22], [d7...|
|6922882|        [[d5yyn, 1]]|
|1461404|        [[d7bh0, 1]]|
| 113609|        [[d5yvg, 3]]|
| 221420|       [[d79eb, 10]]|
| 314712|       [[d7b9d, 11]]|
|7593700|        [[d78tc, 4]]|
|2097102|[[d79qz, 1], [d79...|
|7211033|        [[d5yv8, 1]]|
|4280644|       [[d5zkh, 15]]|
+-------+--------------------+
only showing top 20 rows



In [71]:
# TODO: Normalize users_cells values?

In [8]:
join_user_cells = user_cells_start\
                  .join(user_cells_end\
                        .withColumnRenamed('code', 'code_1')\
                        .withColumnRenamed('towers-count', 'towers-count_1'))
intersection = join_user_cells.select('*').filter(F.col('code') == F.col('code_1'))
intersection.show()

+-------+--------------------+-------+--------------------+
|   code|        towers-count| code_1|      towers-count_1|
+-------+--------------------+-------+--------------------+
|1023947|[[dhj7tg, 1], [dh...|1023947|[[dhj7tg, 2], [dh...|
|1028327|        [[d795t, 1]]|1028327|[[d795s, 4], [d79...|
|1029426|        [[d7c88, 1]]|1029426|        [[d7c88, 2]]|
|1030428|[[dhjgp, 1], [dhn...|1030428|[[dhn50, 1], [dhj...|
|1033423|[[dhj7z1, 18], [d...|1033423|[[dhj7z1, 13], [d...|
|1056865|[[dhj7jq, 1], [dh...|1056865|       [[dhj7jg, 2]]|
|1060235|         [[dhhc, 5]]|1060235|         [[dhhc, 1]]|
|1096857|[[dhj6, 3], [dhj4...|1096857|         [[dhj4, 3]]|
| 111710|       [[dhj7t2, 3]]| 111710|       [[dhj7t2, 5]]|
|1147404|       [[dhjcx, 14]]|1147404|       [[dhjcx, 13]]|
|1148935|       [[dhj7wx, 3]]|1148935|       [[dhj7wx, 4]]|
|1160180|        [[dhjgp, 1]]|1160180|        [[dhjgp, 1]]|
|1175424|       [[d7d4w, 46]]|1175424|       [[d7d4w, 63]]|
|1181914|[[dhj7mz, 2], [dh...|1181914|[[

In [16]:
origin_destination_product_udf = F.udf(lambda x, y : origin_destination_product(x, y), \
                                       ArrayType(ArrayType(StringType())))

matriz = intersection.select('code', origin_destination_product_udf(F.col('towers-count'), F.col('towers-count_1'))\
            .alias('start-end-val'))
matriz.show()

+-------+--------------------+
|   code|       start-end-val|
+-------+--------------------+
|1023947|[[dhj7tg, dhj7tg,...|
|1028327|[[d795t, d795s, 4...|
|1029426| [[d7c88, d7c88, 2]]|
|1030428|[[dhjgp, dhn50, 1...|
|1033423|[[dhj7z1, dhj7z1,...|
|1056865|[[dhj7jq, dhj7jg,...|
|1060235|   [[dhhc, dhhc, 5]]|
|1096857|[[dhj6, dhj4, 9],...|
| 111710|[[dhj7t2, dhj7t2,...|
|1147404|[[dhjcx, dhjcx, 1...|
|1148935|[[dhj7wx, dhj7wx,...|
|1160180| [[dhjgp, dhjgp, 1]]|
|1175424|[[d7d4w, d7d4w, 2...|
|1181914|[[dhj7mz, dhj7ky,...|
|1197655|[[dhj7mz, dhj7w6,...|
|1230899| [[d5zum, d5zum, 2]]|
|1266322|[[d7d1g, d7d1g, 18]]|
| 129880|[[dhj75y, dhj75y,...|
|1357494|[[d79kc, d79kc, 16]]|
|1380971| [[d7d6m, d7d6m, 1]]|
+-------+--------------------+
only showing top 20 rows



In [21]:
matriz.withColumn('explode', F.explode(F.col('start-end-val')))\
      .select([F.col('explode')[i] for i in range(3)])\
      .withColumnRenamed('explode[0]', 'start')\
      .withColumnRenamed('explode[1]', 'end')\
      .withColumnRenamed('explode[2]', 'count')\
      .groupBy('start', 'end')\
      .agg(F.sum(F.col('count')))\
      .show()

+------+------+----------+
| start|   end|sum(count)|
+------+------+----------+
|dhj7tg|dhj7te|      55.0|
| d7d1f| d7d1f|  185190.0|
|dhjeg3|dhjefb|    1207.0|
|dhjecc|dhjecc|    7995.0|
| d5zud| d7bh0|      18.0|
| d5zrm| d5zqg|      93.0|
| d5zt9| d5zjm|       7.0|
| d5zyh| d5zwr|     286.0|
| d79ps| d79ps|    1227.0|
| d7972| d7969|      52.0|
| d7b9d| d7b97|       9.0|
| dhn5q| dhn5w|     293.0|
| d7dtk| d7dtw|     566.0|
| d79zb| d79z9|    2933.0|
|dhj7su|  dhj4|      10.0|
| d5zhh| d5zh5|      62.0|
|dhj7hu|dhj7mp|       4.0|
| dhjgx| dhn50|      14.0|
|dhj7mm|dhj7my|       1.0|
| dhn5w| dhn7c|      16.0|
+------+------+----------+
only showing top 20 rows

