In [8]:
import json
import time
import pyspark.sql.functions as F
from utils import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType
from collections import defaultdict

In [21]:
# Inject a CSS rule so elements with class `container` use the full browser width.
# `display`/`HTML` are imported from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [9]:
import os

# Locations of the input data, expressed relative to CURRENT_DIR (they are
# concatenated as CURRENT_DIR + CELL_AREA and CURRENT_DIR + PARQUETS_DIR).
CELL_AREA    = '/data/cell_area.csv'
PARQUETS_DIR = '/data/parquets'

# Base directory of the project. Defaults to the original developer path but can be
# overridden with the MOBILITY_BASE_DIR environment variable so the notebook runs on
# other machines without editing this cell.
CURRENT_DIR  = os.environ.get('MOBILITY_BASE_DIR', '/home/hellscream/Documents/backendSpark')

In [10]:
# Wall-clock timestamp taken before the Spark work; a later cell subtracts it from
# a second time.time() reading to print the elapsed time.
time_start = time.time()

## DataFrame Schema

## Create Spark Session

In [11]:
class SparkSessionBase():
    """Shared setup: a Spark session plus the cell-id -> area lookup.

    Attributes:
        spark: the SparkSession obtained with ``getOrCreate()`` (app name 'Mobility').
        cell_area_df: Spark DataFrame with the 'id' and 'area_correlator' columns of
            the tab-separated file at ``CURRENT_DIR + CELL_AREA``.
        cell_area: plain dict mapping each 'id' to its 'area_correlator'; rows whose
            'area_correlator' is missing are left out.
    """

    def __init__(self):
        self.spark = SparkSession\
                     .builder\
                     .appName('Mobility')\
                     .getOrCreate()

        self.cell_area_df = self.spark.read.format('csv').options(header='true', delimiter='\t')\
                         .load(CURRENT_DIR + CELL_AREA)\
                         .select('id', 'area_correlator')

        # The lookup is collected to the driver as a plain dict.
        self.cell_area = self._cell_area_mapping(self.cell_area_df.toPandas())

    @staticmethod
    def _cell_area_mapping(cell_area_pandas):
        """Build the {id: area_correlator} dict from a pandas DataFrame.

        Args:
            cell_area_pandas: pandas DataFrame with 'id' and 'area_correlator' columns.

        Returns:
            dict mapping 'id' to 'area_correlator' for every row whose
            'area_correlator' is not missing (None or NaN). If an id repeats, the
            last row wins.
        """
        # dropna handles both None and NaN, unlike a comparison against None.
        valid_rows = cell_area_pandas.dropna(subset=['area_correlator'])
        return dict(zip(valid_rows['id'], valid_rows['area_correlator']))

## Mobility Matrix

In [12]:
class Mobility(SparkSessionBase):
    """Builds a start/end mobility matrix for one day of parquet data.

    Typical use: call ``get_mobility_at_time_interval`` twice (one interval for the
    origin, one for the destination) and pass both results to ``build``.
    """

    # Directory where `build` writes its JSON output when no path is given.
    OUTPUT_DIR = '/home/hellscream/Documents/mobility_spark/data'

    def __init__(self, date, time_start_lower, time_start_high, time_end_lower,
        time_end_high, time_sleep_lower='01:00', time_sleep_high='04:00'):
        """Create the Spark session / cell lookup and remember the date.

        Args:
            date: name of the parquet under ``CURRENT_DIR + PARQUETS_DIR`` to read
                (e.g. '2021-03-01').
            time_start_lower, time_start_high, time_end_lower, time_end_high,
            time_sleep_lower, time_sleep_high: accepted for interface compatibility;
                they are not stored or used by this class.
        """
        super().__init__()
        self.date = date

    def get_mobility_at_time_interval(self, time_start, time_end):
        """Return per-code tower counts restricted to one time interval.

        Args:
            time_start: lower bound of the interval; multiplied by 3600 before use
                (i.e. given in hours if the `times` column is in seconds — TODO confirm).
            time_end: upper bound of the interval, same unit as `time_start`.

        Returns:
            Spark DataFrame with columns 'code' and 'towers-count', where
            'towers-count' is the array-of-string-arrays produced by
            `count_occurrences_and_normalize`.
        """
        time_start *= 3600
        time_end *= 3600

        # Load the parquet corresponding to the configured date.
        events = self.spark.read.parquet(CURRENT_DIR + PARQUETS_DIR + '/' + self.date)

        get_range_udf = F.udf(lambda elems, a, b : get_range(elems, a, b), ArrayType(IntegerType()))

        # 'range' is fed to F.slice as (start, length); F.slice uses a 1-based start.
        # NOTE(review): this assumes get_range returns exactly that pair — confirm in utils.
        windowed = events.withColumn('range', get_range_udf(events.times, F.lit(time_start), F.lit(time_end)))\
               .select(events.code,\
               F.slice(events.cell_ids, F.col('range')[0], F.col('range')[1]).alias('towers'),\
               F.slice(events.times,  F.col('range')[0], F.col('range')[1]).alias('times'))\
               .where(F.size(F.col('towers')) > 0)

        # Bind the dict to a local so the lambda below closes over the dict only,
        # not over `self` (which also holds the SparkSession).
        mapp = self.cell_area

        mapped = windowed.rdd.flatMap(lambda x : mapp_tow_cell(x, mapp)).toDF(['code', 'towers', 'times'])
        count_occurrences_udf = F.udf(lambda x : count_occurrences_and_normalize(x),\
                                      ArrayType(ArrayType(StringType())))

        return mapped.select('code', count_occurrences_udf(F.col('towers')).alias('towers-count'))

    def build(self, users_cells_start, users_cells_end, output_path=None):
        """Combine two interval results into a start x end matrix and write it as JSON.

        Args:
            users_cells_start: DataFrame from `get_mobility_at_time_interval` for the
                origin interval.
            users_cells_end: DataFrame from `get_mobility_at_time_interval` for the
                destination interval.
            output_path: file to write; defaults to ``OUTPUT_DIR/<self.date>.json``.

        Returns:
            None. The pivoted matrix is written to `output_path` with orient='index'.
        """
        if output_path is None:
            output_path = self.OUTPUT_DIR + '/' + self.date + '.json'

        # Use the method's own arguments (previously this read notebook globals of a
        # slightly different name, so the parameters were silently ignored).
        union_user_cells = users_cells_start.union(users_cells_end)

        # Keep only codes that have exactly two rows in the union.
        union_user_cells = union_user_cells.groupBy('code')\
                                           .agg(F.collect_list('towers-count').alias('cells'), F.count('code')\
                                                .alias('count'))\
                                           .filter(F.col('count') == 2)

        rdd = union_user_cells.select('cells').rdd.flatMap(lambda x : flat_origin_destination_product(x))
        pairs = rdd.toDF(['start', 'end', 'value'])
        matrix = pairs.groupBy('start').pivot('end').agg(F.sum('value'))

        matriz_pandas = matrix.toPandas()
        # `orient` is passed by keyword: positional use is deprecated in recent pandas.
        matriz_pandas.to_json(output_path, orient='index')

### Testing Mobility

In [13]:
# Build the mobility matrix for 2021-03-01: origins come from the 6-10 interval,
# destinations from the 16-20 interval (both are multiplied by 3600 inside the method).
mobility_instance = Mobility(
    date='2021-03-01',
    time_start_lower=None,
    time_start_high=None,
    time_end_lower=None,
    time_end_high=None,
)

user_cells_start = mobility_instance.get_mobility_at_time_interval(time_start=6, time_end=10)
user_cells_end = mobility_instance.get_mobility_at_time_interval(time_start=16, time_end=20)

mobility_instance.build(
    users_cells_start=user_cells_start,
    users_cells_end=user_cells_end,
)

In [14]:
# Report the wall-clock time elapsed since `time_start`, converted from seconds to minutes.
time_end = time.time()
elapsed_minutes = (time_end - time_start) / 60
print('time elapsed:' + str(elapsed_minutes))

time elapsed:9.911337133248647


## User Mobility

In [231]:
class UserMobility(SparkSessionBase):
    """Per-user event extraction for one day of parquet data."""

    def __init__(self, date):
        """Set up the Spark session / cell lookup and remember which date to read."""
        super().__init__()
        self.date = date
        # Attributes sketched by the author but not active:
        #   self.imsi_mobility = {}      # amount of km and cell_changes
        #   self.users_cells_start = {}  # count tower night to set home

    def get_users_mobility(self, start_time=25200, end_time=72000, sleep_time_start=3600, sleep_end_time=14400):
        """Return the rows kept by `between_ab_OR_dc` that still have at least one time.

        Args:
            start_time, end_time: first window handed to `between_ab_OR_dc`
                (defaults 25200 and 72000; presumably seconds of the day — TODO confirm).
            sleep_time_start, sleep_end_time: second window handed to
                `between_ab_OR_dc` (defaults 3600 and 14400).

        Returns:
            Spark DataFrame with columns 'code', 'cell_ids', 'times', restricted to
            rows whose 'times' array is non-empty.
        """
        parquet_path = CURRENT_DIR + PARQUETS_DIR + '/' + self.date
        first_window = (start_time, end_time)
        second_window = (sleep_time_start, sleep_end_time)

        raw_events = self.spark.read.parquet(parquet_path)
        kept_rows = raw_events.rdd.flatMap(
            lambda rows : between_ab_OR_dc(rows, first_window, second_window)
        )
        users_df = kept_rows.toDF(['code', 'cell_ids', 'times'])

        # Drop users left with no events after the window filter.
        return users_df.select('*').where(F.size(users_df.times) > 0)

In [232]:
# Extract the per-user events for 2021-03-01 with the default windows and preview them.
um = UserMobility(date='2021-03-01')
set1 = um.get_users_mobility()
set1.show()

+-----+--------------------+--------------------+
| code|            cell_ids|               times|
+-----+--------------------+--------------------+
| 1804|[176-1146, 176-11...|[6469, 7968, 8016...|
| 3108|[73-1350, 475-460...|[33565, 58725, 58...|
| 3905|[475-4593, 475-45...|[13447, 27849, 42...|
| 5905|[76-11563, 107-82...|[30240, 30242, 30...|
| 6616|[476-5363, 4731-1...|[41384, 41901, 42...|
| 7216|[453-69, 453-69, ...|[10380, 39310, 46...|
| 9413|[450-289, 450-289...|[8489, 12091, 264...|
|10006| [821-251, 821-8541]|      [30451, 68885]|
|11117|[338-96, 338-19, ...|[7223, 26725, 269...|
|13219|[71-735, 71-735, ...|[10524, 32229, 57...|
|14507|         [454-17061]|             [11175]|
|17019|           [74-8262]|              [8324]|
|20210|[74-10321, 76-108...|[32205, 39372, 40...|
|20702|[173-1516, 173-80...|[34040, 34057, 34...|
|21519|[322-130, 321-331...|[25787, 25818, 26...|
|25918|[455-676, 455-59,...|[6466, 7792, 8051...|
|26909| [76-1011, 76-10131]|      [67779, 68559]|


In [226]:
'''
TODO:
1. map cell_ids

'''