In [1]:
import json
import pyspark.sql.functions as F
from utils import get_range
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType

In [2]:
# Data locations; both data paths are appended to CURRENT_DIR when read.
CELL_AREA = '/data/cell_area.csv'
PARQUETS_DIR = '/data/parquets'
# NOTE(review): absolute, machine-specific project root -- adjust when running elsewhere.
CURRENT_DIR = '/home/hellscream/Documents/backendSpark'

## DataFrame Schema

## Create Spark Session

In [6]:
class SparkSessionBase:
    """Base class that owns the Spark session and the cell-area table.

    Attributes set on construction:
        spark: session obtained with ``getOrCreate`` under the app name 'Mobility'.
        cell_area_df: DataFrame loaded from the tab-delimited CSV at
            ``CURRENT_DIR + CELL_AREA``, restricted to the columns
            id, area_correlator, latitude, longitude and province.
    """

    def __init__(self):
        session_builder = SparkSession.builder.appName('Mobility')
        self.spark = session_builder.getOrCreate()

        # The CSV has a header row and uses tabs as the field delimiter.
        csv_reader = self.spark.read.format('csv').options(header='true', delimiter='\t')
        cell_area_all = csv_reader.load(CURRENT_DIR + CELL_AREA)
        self.cell_area_df = cell_area_all.select(
            'id', 'area_correlator', 'latitude', 'longitude', 'province'
        )

In [15]:
class Mobility(SparkSessionBase):
    """Per-user cell usage computed from one day's parquet data.

    The day is selected by ``date``, which is appended to
    ``CURRENT_DIR + PARQUETS_DIR + '/'`` to locate the parquet to read.
    """

    def __init__(self, date, time_start_lower, time_start_high, time_end_lower,
                 time_end_high, time_sleep_lower='01:00', time_sleep_high='04:00'):
        """Create the Spark session (via the base class) and record the settings.

        Args:
            date: name of the parquet entry to load, e.g. '2020-11-02'.
            time_start_lower, time_start_high: bounds of the "start" window.
            time_end_lower, time_end_high: bounds of the "end" window.
            time_sleep_lower, time_sleep_high: bounds of the "sleep" window.

        NOTE(review): the expected unit/format of the window bounds is not
        established by this class; they are only stored, no method reads them yet.
        """
        super().__init__()
        self.date = date
        # Keep the window settings on the instance instead of discarding them,
        # so callers and future methods can read what was configured.
        self.time_start_lower = time_start_lower
        self.time_start_high = time_start_high
        self.time_end_lower = time_end_lower
        self.time_end_high = time_end_high
        self.time_sleep_lower = time_sleep_lower
        self.time_sleep_high = time_sleep_high

    def get_mobility_at_time_interval(self, time_start, time_end):
        """Count how often each user appears on each cell within an interval.

        Reads the parquet for ``self.date`` (columns ``code``, ``towers`` and
        ``times`` are used), asks ``get_range`` which part of each row's
        ``times`` array matches ``time_start``/``time_end``, cuts ``towers``
        and ``times`` down to that part, and counts the remaining tower
        entries per (code, cell).

        Args:
            time_start: lower bound handed to ``get_range`` as a literal column.
            time_end: upper bound handed to ``get_range`` as a literal column.

        Returns:
            DataFrame with columns ``code``, ``cell`` and ``count``.
        """
        # Load the parquet that corresponds to the configured date.
        visits_raw = self.spark.read.parquet(CURRENT_DIR + PARQUETS_DIR + '/' + self.date)

        # get_range already has the (elems, a, b) signature, so no lambda wrapper is needed.
        get_range_udf = F.udf(get_range, ArrayType(IntegerType()))

        # NOTE(review): F.slice takes (start, length) with a 1-based start; this
        # assumes get_range returns exactly that pair rather than (start, end)
        # indices -- confirm against utils.get_range.
        slice_start = F.col('range')[0]
        slice_length = F.col('range')[1]

        visits_in_interval = (
            visits_raw
            .withColumn('range', get_range_udf(F.col('times'), F.lit(time_start), F.lit(time_end)))
            .select(
                F.col('code'),
                F.slice(F.col('towers'), slice_start, slice_length).alias('towers'),
                F.slice(F.col('times'), slice_start, slice_length).alias('times'),
            )
            # Drop rows whose slice contains no tower at all.
            .where(F.size(F.col('towers')) > 0)
        )

        # One row per tower entry, then count the entries per user and cell.
        cell_counts = (
            visits_in_interval
            .select('code', F.explode('towers').alias('cell'))
            .groupBy('code', 'cell')
            .agg(F.count('cell').alias('count'))
        )

        return cell_counts

In [20]:
# Analyse a single day; the constructor's time-window arguments are left unset.
mobility_instance = Mobility(
    date='2020-11-02',
    time_start_lower=None,
    time_start_high=None,
    time_end_lower=None,
    time_end_high=None,
)
# NOTE(review): the unit of these interval bounds is not evident from this
# notebook -- confirm against utils.get_range.
user_cells_start = mobility_instance.get_mobility_at_time_interval(time_start=30, time_end=7000)
user_cells_end = mobility_instance.get_mobility_at_time_interval(time_start=1400, time_end=2700)

In [None]:
# TODO: Normalize user_cells_start / user_cells_end?

In [18]:
# Preview the first 20 (code, cell, count) rows of the "start" interval.
user_cells_start.show(n=20, truncate=True)

+-------+------+-----+
|   code|  cell|count|
+-------+------+-----+
| 565307| dhjcx|   67|
|6962467| d7b9d|    3|
|5151469| d7b9d|    1|
| 582127| d7d1f|  161|
|7548928| d7d1g|    2|
| 712215| d79sg|    2|
|4618061| d7d1f|    3|
|4460140| d7de0|    1|
|2163495| d7dmx|   75|
| 339945| d79eb|  154|
|1048936|dhj7w4|   22|
|1508789|  dhhc|    4|
|6542869|dhj7z0|   35|
|7063108|dhjec6|   37|
|1583783|dhj7qu|   78|
|2015061|dhj7x0|    1|
|7582935|dhjeb4|  271|
| 115859|dhj7w4|    2|
|2163248|dhj7wn|    1|
|1313928|dhj7z0|  113|
+-------+------+-----+
only showing top 20 rows



In [19]:
# Preview the first 20 (code, cell, count) rows of the "end" interval.
user_cells_end.show(n=20, truncate=True)

+-------+------+-----+
|   code|  cell|count|
+-------+------+-----+
| 565307| dhjcx|   15|
|6962467| d7b9d|    1|
| 582127| d7d1f|   32|
|2163495| d7dmx|   14|
| 339945| d79eb|   27|
|1048936|dhj7w4|    3|
|6542869|dhj7z0|    7|
|7063108|dhjec6|    4|
|1583783|dhj7qu|   15|
|7582935|dhjeb4|   62|
|1313928|dhj7z0|   34|
|4616098| d7d44|    7|
|4440475| d79yb|    1|
|2686226| d7d1f|    3|
|1742092| d79yu|    4|
|2002985| d79xy|    1|
|  64543| d7b6d|   59|
|  74738| d7d46|    2|
|7124727| d7d1f|   31|
|2114013| d7d1f|    1|
+-------+------+-----+
only showing top 20 rows

