In [1]:
import getpass
import os

import matplotlib.cm as cm
import matplotlib.colors as colrs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import pyspark.sql.functions as spark_functions
import pyspark.sql.types as spark_types
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf, PandasUDFType,udf,broadcast

In [2]:
# Global figure typography: serif family, with 'CMU Serif Roman' placed at the
# front of matplotlib's current serif font list, and a 16 pt base size.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': ['CMU Serif Roman'] + plt.rcParams['font.serif'],
    'font.size': 16,
})

In [3]:
# Resolve the user name used to build the working directory.
# os.getlogin() needs a controlling terminal and raises OSError without one
# (e.g. a kernel started by a service, nbconvert or a scheduler), so fall back
# to getpass.getuser(), which reads the environment / password database.
try:
    USER = os.getlogin()
except OSError:
    USER = getpass.getuser()

# Directory layout, all derived from the per-user working directory.
WORKING_DIR = f'/home/{USER}/data/Land_use'
DATA_DIR = f'{WORKING_DIR}/data'
METROPOLES_SHAPE = f'{DATA_DIR}/cities'
IMG_DIR = f'{WORKING_DIR}/images'

In [4]:
# Build (or reuse) the Spark session on the standalone master.
spark = (
    SparkSession.builder
    .master('spark://santiago:7077')
    .appName('Land use - SRCA median week')
    .config('spark.network.timeout', 300)
    # Dynamic allocation: between 0 and 20 executors, starting with 1.
    .config('spark.dynamicAllocation.enabled', 'true')
    .config('spark.shuffle.service.enabled', 'true')
    .config('spark.dynamicAllocation.initialExecutors', 1)
    .config('spark.dynamicAllocation.maxExecutors', 20)
    .config('spark.dynamicAllocation.minExecutors', 0)
    # Driver / executor sizing.
    .config('spark.driver.maxResultSize', '120g')
    .config('spark.executor.cores', 1)
    .config('spark.executor.memory', '4g')
    .config('spark.memory.fraction', 0.6)
    .config('spark.cores.max', 20)
    .config('spark.executor.memoryOverhead', '8g')
    .config('spark.driver.memoryOverhead', '8g')
    .getOrCreate()
)

# Session-level SQL time zone.
spark.conf.set('spark.sql.session.timeZone', 'Europe/Paris')

2023-08-07 13:46:12,594 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-08-07 13:46:12,940 WARN spark.SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


## Load Data

### City mask

In [5]:
CITY_NAME = 'Lyon'

# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load mask files that come from a trusted source.
df_mask = pd.read_pickle(f'{DATA_DIR}/df_masks.pkl')

# First row whose 'name' equals CITY_NAME (IndexError if there is none).
city_row = df_mask.loc[df_mask['name'] == CITY_NAME].iloc[0]

# Unpack the per-city fields used by the rest of the notebook.
city_shape, city_mask = city_row['shape'], city_row['mask']
city_left_x, city_bottom_y = city_row['left_x'], city_row['bottom_y']

### Median Week

In [6]:
# Read the per-city median-week traffic maps from HDFS and preview two rows.
# The map column is already named 'traffic_map', so no rename is applied.
filename = f'hdfs://santiago:9000/land_use/{CITY_NAME}_traffic_maps_median_week.parquet'
sdf_traffic = spark.read.parquet(filename)
sdf_traffic.show(n=2)

[Stage 1:>                                                          (0 + 1) / 1]

+----+-------------------+--------+-----------+--------------------+
|city|                app|    time|day_of_week|         traffic_map|
+----+-------------------+--------+-----------+--------------------+
|Lyon|Amazon Web Services|02:30:00|  Wednesday|[[0.0, 0.0, 0.0, ...|
|Lyon|Amazon Web Services|08:00:00|  Wednesday|[[0.0, 0.0, 0.0, ...|
+----+-------------------+--------+-----------+--------------------+
only showing top 2 rows



                                                                                

In [None]:
# NOTE(review): unexecuted scratch cell — this list literal is neither assigned
# to a name nor used anywhere in the visible notebook. Presumably a list of
# app names; confirm the intent (and the spelling of the entries) before
# relying on it, or delete the cell.
[
    'fornite',
    'google docs',
    'skydrive',
    'miscrosoft store',
    'molotov tv',
    'orange tv',
    'team viewer',
    'tor',
    'web adult'
]

In [7]:
# Number of rows in the loaded traffic DataFrame.
sdf_traffic.count()

                                                                                

22848

In [10]:
# some_rows = sdf_traffic.take(10)
# data = []
# for row in some_rows:
#     data.append(row.asDict())

# df = pd.DataFrame(data)
# sdf_small = spark.createDataFrame(df)
# sdf_small.show(2)

## Symmetric RCA

### Traffic maps and Traffic per app

In [8]:
# Spark type of a traffic map: an array of arrays of FloatType values.
schema_traffic_map = spark_types.ArrayType(spark_types.ArrayType(spark_types.FloatType()))

# NOTE(review): the parameter / return annotations are kept exactly as they
# were; pandas_udf may use them to infer the UDF kind — confirm before editing.
@pandas_udf(schema_traffic_map)
def total_traffic_map(traffic_maps: pd.Series)-> schema_traffic_map:
    """Sum a group of traffic maps element-wise.

    Args:
        traffic_maps: Series whose items are nested sequences, one traffic
            map per item.

    Returns:
        The element-wise sum of all maps, as nested Python lists.
    """
    def as_array(nested_map):
        # Turn one nested sequence into a numpy array.
        return np.array(list(nested_map))

    summed_map = traffic_maps.apply(as_array).sum(axis=0)
    return summed_map.tolist()

In [9]:
# Spark type of the scalar total returned by apply_sum.
schema_sum = spark_types.FloatType()

@udf(schema_sum)
def apply_sum(traffic_map) -> schema_sum:
    """Return the sum of every cell of one traffic map as a Python float."""
    cells = np.array(list(traffic_map))
    return float(cells.sum())

In [10]:
# For each (city, time, day_of_week) group: sum the traffic maps of all rows,
# then add the scalar total of the summed map as the 'traffic' column.
sdf_traffic_time = (
    sdf_traffic
    .groupBy('city', 'time', 'day_of_week')
    .agg(total_traffic_map('traffic_map').alias('traffic_map'))
    .withColumn('traffic', apply_sum('traffic_map'))
)
sdf_traffic_time.show(2)

[Stage 7:>                                                          (0 + 1) / 1]

+----+--------+-----------+--------------------+-------------+
|city|    time|day_of_week|         traffic_map|      traffic|
+----+--------+-----------+--------------------+-------------+
|Lyon|09:00:00|  Wednesday|[[0.0, 0.0, 0.0, ...|3.79559215E11|
|Lyon|14:30:00|   Thursday|[[0.0, 0.0, 0.0, ...|4.03103777E11|
+----+--------+-----------+--------------------+-------------+
only showing top 2 rows



                                                                                

In [11]:
# Collect the aggregated maps into pandas.
df_traffic_time = sdf_traffic_time.toPandas()

# Store every map as a numpy array so it can be used directly in array maths.
df_traffic_time['traffic_map'] = df_traffic_time['traffic_map'].apply(
    lambda nested_map: np.array(nested_map)
)
df_traffic_time.head(2)

                                                                                

Unnamed: 0,city,time,day_of_week,traffic_map,traffic
0,Lyon,09:00:00,Wednesday,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",379559200000.0
1,Lyon,14:30:00,Thursday,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",403103800000.0


In [14]:
# traffic_map_time = df_traffic_time[(df_traffic_time['time'] == '10:30:00')&(df_traffic_time['day_of_week'] == 'Monday')].iloc[0]['traffic_map']

# my_cmap_traffic = cm.get_cmap('Spectral_r').copy()
# my_cmap_traffic.set_under('w', 0)
# norm_traffic = colrs.LogNorm(vmin=1e6, vmax=5e11)

# fig = plt.figure(figsize=(6, 6))
# plt.imshow(traffic_map_time, origin='lower', cmap=my_cmap_traffic, norm=norm_traffic)
# plt.colorbar()
# plt.xticks([])
# plt.yticks([])
# plt.axis('off')
# plt.show()

## Symmetric RCA

In [12]:
# Tij = traffic of app i in location j
# Tj = traffic of all apps in location j
# Ti = traffic of app i in all locations
# T = traffic of all apps in all locations
#
# RCA  = (Tij / Tj) / (Ti / T)
# SRCA = (RCA - 1) / (RCA + 1)

schema_traffic_map = spark_types.ArrayType(spark_types.ArrayType(spark_types.DoubleType()))

@udf(returnType=schema_traffic_map)
def compute_SRCA(time, day, traffic_map) -> schema_traffic_map:
    """Compute the symmetric RCA map of one traffic map.

    The all-apps map for the same (time, day_of_week) slot is looked up in the
    global pandas frame ``df_traffic_time``; cells where the global
    ``city_mask`` equals 0 are set to 0.

    Cells where the ratio is undefined (no traffic at all in the location,
    i.e. Tj == 0, or no traffic at all in this map, i.e. Ti == 0) are also set
    to 0 instead of propagating NaN from a division by zero.

    Args:
        time: value matched against ``df_traffic_time['time']``.
        day: value matched against ``df_traffic_time['day_of_week']``.
        traffic_map: nested sequence holding the traffic map of one row.

    Returns:
        The SRCA map as nested Python lists, same shape as ``traffic_map``.

    Raises:
        IndexError: if ``df_traffic_time`` has no row for (time, day).
    """
    Tij = np.array(list(traffic_map))

    same_slot = df_traffic_time[(df_traffic_time['time'] == time)&(df_traffic_time['day_of_week'] == day)]
    if same_slot.empty:
        # Same exception type as the bare .iloc[0] would raise, but with context.
        raise IndexError(f'no aggregated traffic map for time={time!r}, day_of_week={day!r}')
    Tj = same_slot.iloc[0]['traffic_map']
    Ti = Tij.sum()
    T = Tj.sum()

    # Start from 0 everywhere and only fill the cells where RCA is defined.
    SRCA = np.zeros(Tij.shape, dtype=float)
    if Ti > 0 and T > 0:
        defined = Tj > 0
        RCA = (Tij[defined] / Tj[defined]) / (Ti / T)
        # RCA >= 0 here, so the denominator is never zero.
        SRCA[defined] = (RCA - 1) / (RCA + 1)
    SRCA[ city_mask == 0 ] = 0
    return SRCA.tolist()

In [13]:
# Add the SRCA map of every row as a new 'traffic_map_srca' column.
sdf_traffic_map_rca = sdf_traffic.withColumn(
    'traffic_map_srca',
    compute_SRCA('time', 'day_of_week', 'traffic_map'),
)
sdf_traffic_map_rca.show(2)

[Stage 10:>                                                         (0 + 1) / 1]

+----+-------------------+--------+-----------+--------------------+--------------------+
|city|                app|    time|day_of_week|         traffic_map|    traffic_map_srca|
+----+-------------------+--------+-----------+--------------------+--------------------+
|Lyon|Amazon Web Services|02:30:00|  Wednesday|[[0.0, 0.0, 0.0, ...|[[0.0, 0.0, 0.0, ...|
|Lyon|Amazon Web Services|08:00:00|  Wednesday|[[0.0, 0.0, 0.0, ...|[[0.0, 0.0, 0.0, ...|
+----+-------------------+--------+-----------+--------------------+--------------------+
only showing top 2 rows



                                                                                

## Save Data

### Parquet

In [14]:
# Persist the SRCA maps to HDFS, replacing any previous output at this path.
filename = f'hdfs://santiago:9000/land_use/{CITY_NAME}_median_week_traffic_maps_srca.parquet'
sdf_traffic_map_rca.write.mode('overwrite').parquet(filename)

                                                                                

2023-08-07 13:52:16,959 WARN storage.BlockManagerMasterEndpoint: No more replicas available for broadcast_13_python !


## Test saved dataset

In [15]:
# Read the saved SRCA dataset back from HDFS and preview two rows.
# Note: this rebinds sdf_traffic to the saved (SRCA-augmented) dataset.
filename = f'hdfs://santiago:9000/land_use/{CITY_NAME}_median_week_traffic_maps_srca.parquet'
sdf_traffic = spark.read.parquet(filename)
sdf_traffic.show(n=2)

[Stage 13:>                                                         (0 + 1) / 1]

+----+-------------------+--------+-----------+--------------------+--------------------+
|city|                app|    time|day_of_week|         traffic_map|    traffic_map_srca|
+----+-------------------+--------+-----------+--------------------+--------------------+
|Lyon|Amazon Web Services|02:30:00|  Wednesday|[[0.0, 0.0, 0.0, ...|[[0.0, 0.0, 0.0, ...|
|Lyon|Amazon Web Services|08:00:00|  Wednesday|[[0.0, 0.0, 0.0, ...|[[0.0, 0.0, 0.0, ...|
+----+-------------------+--------+-----------+--------------------+--------------------+
only showing top 2 rows



                                                                                

In [16]:
# Row count of the frame currently bound to sdf_traffic, for comparison with
# the count obtained right after loading the input data.
sdf_traffic.count()

                                                                                

22848