In [1]:
import json
import os
import time

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType

from utils import get_range, count_occurrences_and_normalize, flat_origin_destination_product

In [2]:
# Locations relative to the project base directory. They are joined to CURRENT_DIR by
# plain string concatenation, which is why each one starts with a slash.
CELL_AREA = '/data/cell_area.csv'
PARQUETS_DIR = '/data/parquets'

# Project base directory. Set the MOBILITY_BASE_DIR environment variable to run the
# notebook on another machine; when it is unset the original location is used.
CURRENT_DIR = os.environ.get('MOBILITY_BASE_DIR', '/home/hellscream/Documents/backendSpark')

In [3]:
# Wall-clock start of the run; the last cell subtracts this value to report elapsed minutes.
time_start = time.time()

## DataFrame Schema

## Create Spark Session

In [4]:
class SparkSessionBase:
    """Holds the Spark session and the cell-area table used by subclasses.

    Attributes set by the constructor:
        spark: session obtained through ``getOrCreate`` under the app name 'Mobility'.
        cell_area_df: the tab-delimited file at ``CURRENT_DIR + CELL_AREA`` (first line
            read as header), reduced to the columns id, area_correlator, latitude,
            longitude and province.
    """

    def __init__(self):
        session_builder = SparkSession.builder.appName('Mobility')
        self.spark = session_builder.getOrCreate()

        cell_area_path = CURRENT_DIR + CELL_AREA
        csv_reader = self.spark.read.format('csv').options(header='true', delimiter='\t')
        self.cell_area_df = csv_reader.load(cell_area_path).select(
            'id', 'area_correlator', 'latitude', 'longitude', 'province'
        )

In [17]:
class Mobility(SparkSessionBase):
    """Mobility queries over the parquet stored for a single date.

    Args:
        date: name of the parquet under ``CURRENT_DIR + PARQUETS_DIR`` (e.g. '2020-11-02').
        time_start_lower, time_start_high, time_end_lower, time_end_high,
        time_sleep_lower, time_sleep_high: window bounds. They are stored on the
            instance as given; no method of this class reads them yet.
    """

    def __init__(self, date, time_start_lower, time_start_high, time_end_lower,
                 time_end_high, time_sleep_lower='01:00', time_sleep_high='04:00'):
        super().__init__()
        self.date = date
        # These bounds used to be accepted and silently discarded; keep them so the
        # configured windows are at least available on the instance.
        self.time_start_lower = time_start_lower
        self.time_start_high = time_start_high
        self.time_end_lower = time_end_lower
        self.time_end_high = time_end_high
        self.time_sleep_lower = time_sleep_lower
        self.time_sleep_high = time_sleep_high

    def get_mobility_at_time_interval(self, time_start, time_end):
        """Return, per user code, the normalized tower counts inside one time interval.

        Args:
            time_start: lower bound handed to ``get_range`` together with the row's times.
            time_end: upper bound handed to ``get_range``.

        Returns:
            DataFrame with the columns 'code' and 'towers-count'; rows whose sliced
            tower list is empty are dropped.
        """
        # Load the parquet that corresponds to this instance's date.
        records = self.spark.read.parquet(CURRENT_DIR + PARQUETS_DIR + '/' + self.date)

        # The helpers are registered directly; wrapping them in a lambda added nothing.
        get_range_udf = F.udf(get_range, ArrayType(IntegerType()))
        count_occurrences_udf = F.udf(count_occurrences_and_normalize,
                                      ArrayType(ArrayType(StringType())))

        # F.slice takes (start, length) with a 1-based start.
        # NOTE(review): confirm utils.get_range returns that pair and not (start, end).
        slice_start = F.col('range')[0]
        slice_length = F.col('range')[1]

        in_interval = (
            records
            .withColumn('range', get_range_udf(F.col('times'), F.lit(time_start), F.lit(time_end)))
            .select(
                'code',
                F.slice(F.col('towers'), slice_start, slice_length).alias('towers'),
                F.slice(F.col('times'), slice_start, slice_length).alias('times'),
            )
            .where(F.size(F.col('towers')) > 0)
        )

        return in_interval.select(
            'code',
            count_occurrences_udf(F.col('towers')).alias('towers-count'),
        )

In [6]:
# One Mobility object for the day; the four window arguments are passed as None.
mobility_instance = Mobility(
    date='2020-11-02',
    time_start_lower=None,
    time_start_high=None,
    time_end_lower=None,
    time_end_high=None,
)

# Two intervals queried on the same day.
# NOTE(review): the unit of these bounds is defined by utils.get_range - confirm.
first_interval, second_interval = (30, 1000), (1400, 2700)
user_cells_start = mobility_instance.get_mobility_at_time_interval(*first_interval)
user_cells_end = mobility_instance.get_mobility_at_time_interval(*second_interval)

In [7]:
# Preview of the first interval's result (defaults spelled out: 20 rows, long cells truncated).
user_cells_start.show(n=20, truncate=True)

+-------+--------------------+
|   code|        towers-count|
+-------+--------------------+
|4582376|      [[d7b9d, 1.0]]|
|6892952|      [[d5yy1, 1.0]]|
| 819378|      [[d5zur, 1.0]]|
|1588148|      [[d78yd, 1.0]]|
|6901043|      [[d5yvc, 1.0]]|
|1229925|      [[d795s, 1.0]]|
|4647869|      [[d7b9d, 1.0]]|
|3049134|[[d7b9d, 0.8571],...|
|3998644|      [[d797z, 1.0]]|
|1705846|      [[d7c49, 1.0]]|
| 379374|      [[d5yvg, 1.0]]|
|6202650|      [[d5yy1, 1.0]]|
|4230783|      [[d5zhh, 1.0]]|
|6620846|      [[d5yy4, 1.0]]|
|7075816|      [[d79r0, 1.0]]|
| 565307|      [[dhjcx, 1.0]]|
| 115154|      [[d795t, 1.0]]|
|5292136|      [[d5zxh, 1.0]]|
|2181906|[[d79eb, 0.8148],...|
|6922882|      [[d5yyn, 1.0]]|
+-------+--------------------+
only showing top 20 rows



In [8]:
# Preview of the second interval's result (defaults spelled out: 20 rows, long cells truncated).
user_cells_end.show(n=20, truncate=True)

+-------+--------------------+
|   code|        towers-count|
+-------+--------------------+
|6892952|      [[d5yy1, 1.0]]|
| 819378|      [[d5zur, 1.0]]|
|1588148|      [[d78yd, 1.0]]|
|6901043|      [[d5yvc, 1.0]]|
|7325994|      [[d79r1, 1.0]]|
|3049134|[[d7b9d, 0.5], [d...|
| 379374|      [[d5yvg, 1.0]]|
|4230783|      [[d5zhh, 1.0]]|
|6620846|      [[d5yy4, 1.0]]|
| 565307|      [[dhjcx, 1.0]]|
|2181906|[[d79eb, 0.7857],...|
|6922882|      [[d5yyn, 1.0]]|
|1461404|      [[d7bh0, 1.0]]|
| 113609|      [[d5yvg, 1.0]]|
| 221420|      [[d79eb, 1.0]]|
| 314712|      [[d7b9d, 1.0]]|
|7593700|      [[d78tc, 1.0]]|
|2097102|[[d79qz, 0.3333],...|
|7211033|      [[d5yv8, 1.0]]|
|4280644|      [[d5zkh, 1.0]]|
+-------+--------------------+
only showing top 20 rows



In [9]:
# Keep only the users present in both intervals and pair their tower distributions as
# cells = [start-interval distribution, end-interval distribution].
#
# An inner join on 'code' replaces union + groupBy + collect_list: collect_list gives no
# ordering guarantee after a shuffle, so the start and end entries could come back
# swapped, and counting rows per code could not tell "once in each interval" apart from
# "twice in the same interval".
# NOTE(review): assumes 'code' is unique within each interval DataFrame - confirm.
start_cells = user_cells_start.select('code', F.col('towers-count').alias('cells_start'))
end_cells = user_cells_end.select('code', F.col('towers-count').alias('cells_end'))

union_user_cells = (
    start_cells
    .join(end_cells, on='code', how='inner')
    .select(
        'code',
        F.array('cells_start', 'cells_end').alias('cells'),
        # Kept so the (code, cells, count) schema stays the same as before.
        F.lit(2).cast('long').alias('count'),
    )
)
union_user_cells.show()

+-------+--------------------+-----+
|   code|               cells|count|
+-------+--------------------+-----+
|1023947|[[[dhj7tg, 0.5], ...|    2|
|1028327|[[[d795t, 1.0]], ...|    2|
|1029426|[[[d7c88, 1.0]], ...|    2|
|1030428|[[[dhn50, 0.3333]...|    2|
|1033423|[[[dhj7z1, 0.9286...|    2|
|1056865|[[[dhj7jg, 1.0]],...|    2|
|1060235|[[[dhhc, 1.0]], [...|    2|
|1096857|[[[dhj4, 1.0]], [...|    2|
| 111710|[[[dhj7t2, 1.0]],...|    2|
|1147404|[[[dhjcx, 1.0]], ...|    2|
|1148935|[[[dhj7wx, 1.0]],...|    2|
|1160180|[[[dhjgp, 1.0]], ...|    2|
|1175424|[[[d7d4w, 1.0]], ...|    2|
|1181914|[[[dhj7mz, 0.5], ...|    2|
|1197655|[[[dhj7w6, 0.4], ...|    2|
|1230899|[[[d5zum, 1.0]], ...|    2|
|1266322|[[[d7d1g, 1.0]], ...|    2|
| 129880|[[[dhj75y, 1.0]],...|    2|
|1357494|[[[d79kc, 1.0]], ...|    2|
|1380971|[[[d7d6m, 1.0]], ...|    2|
+-------+--------------------+-----+
only showing top 20 rows



In [10]:
# Hand each Row of the 'cells' column to flat_origin_destination_product and flatten
# whatever it returns into a single RDD of records.
cells_only = union_user_cells.select('cells')
rdd = cells_only.rdd.flatMap(flat_origin_destination_product)

In [16]:
# Name the three fields of every record, then inspect the rows and the inferred schema.
column_names = ['start', 'end', 'value']
df = rdd.toDF(schema=column_names)
df.show()
df.printSchema()

+------+------+--------------------+
| start|   end|               value|
+------+------+--------------------+
|dhj7tg|dhj7tg|             0.33335|
|dhj7tg|dhj7te|             0.16665|
|dhj7te|dhj7tg|             0.33335|
|dhj7te|dhj7te|             0.16665|
| d795t| d795s|              0.6667|
| d795t| d795t|              0.3333|
| d7c88| d7c88|                 1.0|
| dhn50| dhjgp|          0.11108889|
| dhn50| dhn50|          0.22221111|
| dhjgp| dhjgp|          0.22221111|
| dhjgp| dhn50| 0.44448888999999997|
|dhj7z1|dhj7z1|          0.79590306|
|dhj7z1|dhj7z0| 0.13269693999999999|
|dhj7z0|dhj7z1|0.061196940000000005|
|dhj7z0|dhj7z0|          0.01020306|
|dhj7jg|dhj7jq|                 0.5|
|dhj7jg|dhj7m5|                 0.5|
|  dhhc|  dhhc|                 1.0|
|  dhj4|  dhj6|                0.75|
|  dhj4|  dhj4|                0.25|
+------+------+--------------------+
only showing top 20 rows

root
 |-- start: string (nullable = true)
 |-- end: string (nullable = true)
 |-- value: double (nullable = true)

In [12]:
# Build the matrix: one row per 'start', one column per distinct 'end', each entry the
# sum of 'value'. pivot() is called without an explicit value list, so Spark first has
# to compute the distinct 'end' values itself.
by_start = df.groupBy('start')
df = by_start.pivot('end').agg(F.sum('value'))

In [13]:
# Collect the pivoted matrix to the driver and write it out as JSON.
# `orient` is passed by keyword: pandas has deprecated positional arguments after
# `path_or_buf` in DataFrame.to_json.
# NOTE(review): this path lives under mobility_spark, not under CURRENT_DIR - confirm
# that is the intended output location.
matriz_pandas = df.toPandas()
matriz_pandas.to_json('/home/hellscream/Documents/mobility_spark/data/test.json', orient='index')

In [14]:
# Report the wall-clock time since `time_start` was recorded, in minutes.
time_end = time.time()
elapsed_minutes = (time_end - time_start) / 60
print('time elapsed:' + str(elapsed_minutes))

time elapsed:4.554275365670522
