In [None]:
import json
import os
import time
import numpy as np
import pyspark.sql.functions as F
from utils import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType
from collections import defaultdict
from scipy.spatial import KDTree

In [None]:
# Widen the notebook container to the full browser width.
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Locations are combined elsewhere by plain string concatenation
# (CURRENT_DIR + CELL_AREA, CURRENT_DIR + PARQUETS_DIR + '/' + date), so the
# relative parts keep their leading '/'.
CELL_AREA    = '/data/cell_area.csv'
PARQUETS_DIR = '/data/parquets'
# Project root. Set the MOBILITY_BASE_DIR environment variable to run the
# notebook on another machine; the historical location remains the default.
CURRENT_DIR  = os.environ.get('MOBILITY_BASE_DIR', '/home/hellscream/Documents/backendSpark')

In [None]:
# Wall-clock reference; a later cell subtracts it from time.time() to report
# the elapsed minutes.
time_start = time.time()

## DataFrame Schema

## Create Spark Session

In [None]:
class SparkSessionBase():
    """Shared setup: a Spark session plus lookup tables built from the cell-area CSV.

    Attributes set by ``__init__``:
        spark           -- SparkSession obtained with app name 'Mobility'.
        kdtree          -- initialised to None.
        dic_full_df     -- Spark DataFrame with the whole tab-separated CSV.
        dic_full        -- the same rows as ``{row_index: {column: value}}``.
        dic_tow_cell    -- ``{id: area_correlator}``.
        dic_cell_latlon -- ``{area_correlator: [latitude, longitude]}``.
    """

    def __init__(self):
        builder = SparkSession.builder.appName('Mobility')
        self.spark = builder.getOrCreate()

        self.kdtree = None

        # Map cell_area.csv (header row, tab-delimited)
        csv_reader = self.spark.read.format('csv').options(header='true', delimiter='\t')
        self.dic_full_df = csv_reader.load(CURRENT_DIR + CELL_AREA).select('*')

        self.dic_full = self.dic_full_df.toPandas().to_dict('index')

        # Rows are visited in dict order; on duplicate keys the last row wins.
        rows = list(self.dic_full.values())
        self.dic_tow_cell = {row['id']: row['area_correlator'] for row in rows}
        self.dic_cell_latlon = {
            row['area_correlator']: [row['latitude'], row['longitude']]
            for row in rows
        }

## Mobility Matrix

In [None]:
class Mobility(SparkSessionBase):
    """Builds an origin/destination matrix from one parquet of traces.

    Typical use: call ``get_mobility_at_time_interval`` twice (origin window
    and destination window) and pass both results to ``build``.
    """

    def __init__(self, date, time_start_lower, time_start_high, time_end_lower,
        time_end_high, time_sleep_lower='01:00', time_sleep_high='04:00'):
        """Create the Spark session/lookup tables and remember ``date``.

        date: name of the parquet under CURRENT_DIR + PARQUETS_DIR to read.
        All remaining arguments are accepted but currently neither stored
        nor used.
        """
        super().__init__()
        self.date = date

    def get_mobility_at_time_interval(self, time_start, time_end):
        """Return a DataFrame with columns 'code' and 'towers-count'.

        time_start, time_end: window bounds; both are multiplied by 3600
        before being handed to ``get_range`` (presumably hours -> seconds;
        confirm against ``get_range``).
        """
        start_bound = time_start * 3600
        end_bound = time_end * 3600

        # load correspondent parquet
        traces = self.spark.read.parquet(CURRENT_DIR + PARQUETS_DIR + '/' + self.date)

        # Keep only rows that still have at least one timestamp after get_range.
        in_window = traces.rdd.flatMap(lambda row: get_range(row, start_bound, end_bound))\
                          .toDF(['code', 'towers', 'times'])\
                          .filter(F.size(F.col('times')) > 0)

        # Bind the mapping to a local name so the lambda does not reference
        # `self` (which holds the SparkSession).
        tow_cell_map = self.dic_tow_cell
        by_cell = in_window.rdd.flatMap(lambda row: mapp_tow_cell(row, tow_cell_map))\
                           .toDF(['code', 'towers', 'times'])

        count_occurrences_udf = F.udf(count_occurrences_and_normalize,
                                      ArrayType(ArrayType(StringType())))

        return by_cell.select('code', count_occurrences_udf(F.col('towers')).alias('towers-count'))

    def build(self, users_cells_start, users_cells_end, output_path=None):
        """Pivot the two windows into a start x end matrix and write it as JSON.

        users_cells_start, users_cells_end: DataFrames with columns 'code'
            and 'towers-count' (as returned by get_mobility_at_time_interval).
        output_path: destination JSON file. When omitted, the file is
            '/home/hellscream/Documents/mobility_spark/data/<self.date>.json'.

        Returns None; the result is only written to disk.
        """
        if output_path is None:
            output_path = '/home/hellscream/Documents/mobility_spark/data/' + self.date + '.json'

        # Use the arguments that were passed in, not notebook-level globals.
        union_user_cells = users_cells_start.union(users_cells_end)

        # Keep codes that have exactly two rows after the union (presumably
        # one per window, i.e. users seen in both; confirm that each input
        # has at most one row per code).
        union_user_cells = union_user_cells.groupBy('code')\
                                           .agg(F.collect_list('towers-count').alias('cells'),
                                                F.count('code').alias('count'))\
                                           .filter(F.col('count') == 2)

        pairs = union_user_cells.select('cells').rdd.flatMap(flat_origin_destination_product)
        matrix_df = pairs.toDF(['start', 'end', 'value'])\
                         .groupBy('start').pivot('end').agg(F.sum('value'))

        matrix_pandas = matrix_df.toPandas()
        # orient is passed by keyword: positional use is deprecated in pandas.
        matrix_pandas.to_json(output_path, orient='index')

### Testing Mobility

In [None]:
# Smoke run for one day: an early window (6-10) as origin and a late window
# (16-20) as destination, then build the matrix from both.
mobility_instance = Mobility(
    date='2021-03-01',
    time_start_lower=None,
    time_start_high=None,
    time_end_lower=None,
    time_end_high=None,
)

user_cells_start = mobility_instance.get_mobility_at_time_interval(time_start=6, time_end=10)
user_cells_end = mobility_instance.get_mobility_at_time_interval(time_start=16, time_end=20)

mobility_instance.build(users_cells_start=user_cells_start, users_cells_end=user_cells_end)

In [None]:
# Report how many minutes have passed since `time_start` was taken.
time_end = time.time()
print('time elapsed:{}'.format((time_end - time_start) / 60))

## User Mobility

In [None]:
class UserMobility(SparkSessionBase):
    """Per-user mobility indicators computed from one parquet of traces."""

    def __init__(self, date):
        """Create the Spark session/lookup tables and remember ``date``.

        date: name of the parquet under CURRENT_DIR + PARQUETS_DIR to read.
        """
        super().__init__()

        self.date = date

        # Sketched but not active:
        #   self.imsi_mobility = {}      # amount of km and cell_changes
        #   self.users_cells_start = {}  # count tower night to set home

    def get_users_mobility(self, start_time=25200, end_time=72000, sleep_start_time=3600, sleep_end_time=14400):
        """Return a DataFrame with columns 'code' and 'home_tower'.

        start_time, end_time: first window handed to ``between_ab_OR_dc``.
        sleep_start_time, sleep_end_time: second window handed to
            ``between_ab_OR_dc`` and to ``accumulate_count``.
        The defaults are presumably seconds since midnight
        (25200 = 7*3600, 72000 = 20*3600, 3600 = 1*3600, 14400 = 4*3600);
        confirm against the helpers in ``utils``.

        The distinct-cells, km-displacement and distinct-towers DataFrames
        are also built here but are not part of the returned value.
        """
        traces = self.spark.read.parquet(CURRENT_DIR + PARQUETS_DIR + '/' + self.date)

        df = traces.rdd.flatMap(
            lambda rows: between_ab_OR_dc(rows, (start_time, end_time), (sleep_start_time, sleep_end_time))
        ).toDF(['code', 'cell_ids', 'times'])
        df = df.where(F.size(df.times) > 0)

        # imsi -> cell changes & amount of distinct cell
        distinct_cells_df = df.withColumn('distinct_cells', F.array_distinct(F.col('cell_ids')))\
                              .select('code', 'distinct_cells', F.size(F.col('distinct_cells')).alias('amount'))

        # Each lookup table gets its own local name: the lambdas below are
        # evaluated lazily, so re-binding one shared variable between them
        # would be fragile. Locals also keep `self` out of the closures.
        tow_cell_map = self.dic_tow_cell
        cell_latlon_map = self.dic_cell_latlon

        df = df.rdd.flatMap(lambda row: mapp_tow_cell(row, tow_cell_map)).toDF(['code', 'towers', 'times'])

        # imsi -> km displacement
        km_displacement_df = df.rdd.flatMap(lambda row: mapp_cell_latlon(row, cell_latlon_map))\
                               .toDF(['code', 'cells', 'lat_lon'])
        km_displacement_udf = F.udf(km_displacement, StringType())
        km_displacement_df = km_displacement_df.withColumn('km_displacement', km_displacement_udf(F.col('lat_lon')))\
                                               .select('code', 'km_displacement')

        # imsi -> amount of distinct towers
        distinct_towers_df = df.withColumn('distinct_towers', F.array_distinct(F.col('towers')))\
                               .select('code', 'distinct_towers', F.size(F.col('distinct_towers')).alias('amount'))

        # process to establish the sleeping zone
        users_cells_start = df.rdd.flatMap(lambda row: accumulate_count(row, sleep_start_time, sleep_end_time))\
                              .toDF(['code', 'towers', 'count'])
        users_cells_start = users_cells_start.where(F.size('towers') > 0)

        map_area_correlator_to_coord_udf = F.udf(
            lambda towers: map_area_correlator_to_coord(towers, cell_latlon_map),
            ArrayType(ArrayType(StringType())))
        users_cells_start = users_cells_start.withColumn('coords', map_area_correlator_to_coord_udf(F.col('towers')))

        def _normalize(values):
            # Sum once instead of once per element; an empty list stays empty.
            total = sum(values)
            return [val / total for val in values]

        # getting mean and home tower
        # NOTE(review): the weights are numeric but the UDF is declared as
        # ArrayType(StringType()); confirm get_mean_home_tower expects that.
        normalize_udf = F.udf(_normalize, ArrayType(StringType()))
        mean_home_tower_udf = F.udf(get_mean_home_tower, ArrayType(StringType()))

        # imsi -> sleep_area
        users_cells_start = users_cells_start.withColumn('weight', normalize_udf(F.col('count')))\
                                             .select('code', mean_home_tower_udf(F.col('weight'), F.col('coords')).alias('home_tower'))

        # Alternative results built above (not returned):
        #return distinct_cells_df
        #return km_displacement_df
        #return distinct_towers_df

        return users_cells_start

In [None]:
# Smoke run: compute the per-user 'home_tower' DataFrame for one day with the
# default time windows and print its first rows.
um = UserMobility('2021-03-01')
set1 = um.get_users_mobility()
set1.show()
#set1.printSchema()

## Traces

In [None]:
class Traces(SparkSessionBase):
    """Placeholder for trace construction; no logic has been written yet."""

    def __init__(self):
        super().__init__()

    def build(self):
        """Not implemented yet.

        Raises:
            NotImplementedError: always, until the method is written.
        """
        # The method previously had no body, which made the whole cell a
        # syntax error.
        raise NotImplementedError('Traces.build is not implemented yet')

In [None]:
# Six sample 2-D points, each stored as its own numpy array.
points = [np.array(pair) for pair in ([5, 4], [2, 6], [13, 3], [3, 1], [10, 2], [8, 7])]
points

In [None]:
# Index the sample points and define the target point used by the query cell.
kdtree = KDTree(points)
tar = np.array([9, 4])

In [None]:
# Nearest single neighbour of `tar`; q[1] is the index into kdtree.data.
q = kdtree.query(tar, 1)
res = list(kdtree.data[q[1]])
type(res)

In [None]:
import pandas as pd

# Dict of dicts: the outer keys (0, 1) become columns, the inner keys the index.
d = {
    0: {'start' : 'A', 'k1' : 0, 'k2' : 5},
    1: {'start' : 'B', 'k3' : 10},
}

df = pd.DataFrame(d)
df

In [None]:
# `df` is already a pandas DataFrame at this point, so there is no
# toPandas() to call on it (that method belongs to Spark DataFrames).
r = df
# orient has to be given by keyword: the first positional argument of to_json
# is the output path, so to_json('index') wrote a file named 'index' and
# returned None.
r1 = r.to_json(orient='index')
print(r1)

In [None]:
# Show the frame with rows and columns swapped.
df.T

In [None]:
# Dict form of the transposed frame: {column: {index: value}}.
df.T.to_dict()

In [16]:
import pandas as pd

In [17]:
def fix_json_format(jobj):
    """Re-key a mapping of records by each record's 'start' value.

    jobj: mapping ``{row_key: record}`` where every record is a dict that
        contains a 'start' entry.

    Returns a new dict ``{record['start']: record without 'start'}``. When two
    records share the same 'start', the later one wins. The input and its
    records are left untouched, so the function can be called repeatedly on
    the same object.

    Raises:
        KeyError: if a record has no 'start' entry.
    """
    return {
        record['start']: {key: value for key, value in record.items() if key != 'start'}
        for record in jobj.values()
    }

In [20]:
# The same kind of matrix in two layouts:
# d1 is keyed by row number, each value being one record with its 'start' label;
# d2 is keyed by column name, each value mapping row number -> cell value.
d1 = {"0":{"start":"B","B":0,"C":8.0},"1":{"start":"C","A":0,"C":15.0},"2":{"start":"A","B":10.0,"C":0}}

d2 = {"start":{"0":"B","1":"C","2":"A"},"B":{"0":0,"1":0,"2":10.0},"C":{"0":8.0,"1":15.0,"2":0}}

In [21]:
# Re-key d1 by its 'start' labels and view the result as a DataFrame
# (outer keys become columns, inner keys the index).
m = fix_json_format(d1)
df = pd.DataFrame(m)
df

Unnamed: 0,B,C,A
B,0.0,,10.0
C,8.0,15.0,0.0
A,,0.0,


In [25]:
# Dict form of the frame: {column: {index: value}}; missing cells show up as nan.
df.to_dict()

{'B': {'B': 0.0, 'C': 8.0, 'A': nan},
 'C': {'B': nan, 'C': 15.0, 'A': 0.0},
 'A': {'B': 10.0, 'C': 0.0, 'A': nan}}

In [22]:
# Swap rows and columns so each 'start' label becomes a row.
t = df.T
t

Unnamed: 0,B,C,A
B,0.0,8.0,
C,,15.0,0.0
A,10.0,0.0,


In [23]:
# Dict form of the transposed frame.
t.to_dict()

{'B': {'B': 0.0, 'C': nan, 'A': 10.0},
 'C': {'B': 8.0, 'C': 15.0, 'A': 0.0},
 'A': {'B': nan, 'C': 0.0, 'A': nan}}

In [26]:
# Sample input for fix_json_format: every record carries the same keys.
test = {"0":{"start":"B","B":0,"C":8},"1":{"start":"C","B":0,"C":15},"2":{"start":"A","B":10,"C":0}}

In [27]:
# Print the sample re-keyed by its 'start' labels.
print(fix_json_format(test))

{'B': {'B': 0, 'C': 8}, 'C': {'B': 0, 'C': 15}, 'A': {'B': 10, 'C': 0}}


In [None]:
from pyspark.sql import Row

# Scratch check: flatten (tower, value) pairs and sum the values per tower.
# The statements sit at top level (they were indented under the import, which
# is an IndentationError) and the session is obtained explicitly because no
# `spark_session` is defined anywhere in the notebook.
spark_session = SparkSession.builder.getOrCreate()

df = spark_session.createDataFrame([Row(1, [['a', '2'], ['b', '2']]), Row(2, [['a', '4']])], ['col1', 'col2'])
df.show()

# row[1] is the list of [tower, value] pairs stored in 'col2'.
df = df.rdd.flatMap(lambda row: ((tow, val) for tow, val in row[1])).toDF(['tower', 'val'])
res = df.groupBy('tower').agg(F.sum('val').alias('sum')).toPandas().to_dict(orient='list')
x = dict(zip(res['tower'], res['sum']))
print(x)