In [171]:
from math import asin, cos, radians, sin, sqrt

from pandas import HDFStore
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 10)

## Load data

In [80]:
cellular_traffic_data = pd.read_csv('traceset/cellular_traffic.csv', delimiter = ',', decimal='.')

In [81]:
topology_data = pd.read_csv('traceset/topology.csv', delimiter = ',', decimal='.')

## Filter data

In [82]:
# keeping bs with more than 100 hours recorded
keep = []
for i in cellular_traffic_data["bs"].unique():
    if len(cellular_traffic_data[cellular_traffic_data["bs"] == i]) > 100:
        keep.append(i)

In [83]:
cellular_traffic_data.shape

(1625680, 5)

In [84]:
cellular_traffic_data = cellular_traffic_data[cellular_traffic_data["bs"].isin(keep)]

In [85]:
cellular_traffic_data = cellular_traffic_data[cellular_traffic_data.bs.isin(topology_data.bs.unique())]
topology_data = topology_data[topology_data.bs.isin(cellular_traffic_data.bs.unique())]
print(f"traffic shape {cellular_traffic_data.shape}, topology shape {topology_data.shape}")

traffic shape (1453492, 5), topology shape (8864, 3)


## Adjust time field 

In [86]:
cellular_traffic_data['time_hour'] = pd.to_datetime(cellular_traffic_data['time_hour'], unit='s', utc='Asia/Shanghai')
cellular_traffic_data['hour'] = pd.to_datetime(cellular_traffic_data['time_hour'], unit='s', utc='Asia/Shanghai').dt.hour
cellular_traffic_data['day'] = pd.to_datetime(cellular_traffic_data['time_hour'], unit='s', utc='Asia/Shanghai').dt.day

## Cluster data

In [132]:
def find_clusters(data, k):
    mat = data[['lon', 'lat']]
    kmeans = KMeans(n_clusters = k, random_state = 0).fit(mat)
    return kmeans.labels_, kmeans.cluster_centers_

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def create_dist_df(df):
    df.reset_index(drop=True, inplace=True)

    dist_dict = {"from": [], "to": [], "distance": []}
    dist_matrix = np.zeros([len(df), len(df)])
    
    fog_list = df[df["type"] == "fog"]["bs"]
    
    for fog_id in fog_list:
        connected_nodes = df[(df["fog"] == fog_id) | (df["type"] == "fog") | (df["type"] == "cloud")]

        for i in connected_nodes.itertuples():
            idx_i, bs_from, lat_i, lon_i, type_i, fog_i = i[0], i[1], i[2], i[3], i[4], i[5]
            
            if type_i == "rrh" or type_i == "cloud" or fog_i != fog_id:
                continue
                
            for j in connected_nodes.itertuples():
                idx_j, bs_to, lat_j, lon_j, type_j = j[0], j[1], j[2], j[3], j[4]
                
                dist_dict["from"].append(bs_from)
                dist_dict["to"].append(bs_to)
                dist_dict["distance"].append(haversine(lon_i, lat_i, lon_j, lat_j))
    
    return pd.DataFrame.from_dict(dist_dict)

### Topology

In [96]:
total_fogs = 100
total_rrhs_per_fog = 3

In [97]:
topology_data['cluster'], _ = find_clusters(topology_data, total_fogs)

In [98]:
clusters = []
centroid = {}

for i in range(0, total_fogs):
    cluster_i = topology_data[topology_data["cluster"] == i]
    x, c = find_clusters(cluster_i, total_rrhs_per_fog)
    clusters.extend(x)
    centroid[i] = c

topology_data['rrh'] = clusters

In [99]:
columns=["bs", 'lat', 'lon']

fogs = topology_data.groupby(['cluster']).mean().reset_index()

fogs = fogs[["cluster", 'lat', 'lon']]
fogs['type'] = 'fog'
fogs = fogs.rename(columns = {'cluster':'bs'})
fogs.bs = fogs.bs*10
fogs['fog'] = fogs.bs
fogs

Unnamed: 0,bs,lat,lon,type,fog
0,0,13.289695,111.219577,fog,0
1,10,13.145352,111.066682,fog,10
2,20,13.123322,111.100477,fog,20
3,30,13.190210,111.163722,fog,30
4,40,13.157255,111.028934,fog,40
...,...,...,...,...,...
95,950,13.199163,111.000352,fog,950
96,960,13.242540,110.990282,fog,960
97,970,13.100445,110.959278,fog,970
98,980,13.218552,111.213962,fog,980


In [100]:
rrhs = topology_data.groupby(['cluster', 'rrh']).mean().reset_index()

rrhs = rrhs[["cluster", 'rrh', 'lat', 'lon']]
rrhs=rrhs.rename(columns = {'cluster':'fog'})
rrhs.fog = rrhs.fog*10
rrhs['type'] = 'rrh'
rrhs['bs'] = rrhs.fog + rrhs.rrh + 1
del(rrhs['rrh'])
cols = rrhs.columns.tolist()
cols = cols[-1:] + cols[:-1]
rrhs = rrhs[cols]

rrhs

Unnamed: 0,bs,fog,lat,lon,type
0,1,0,13.290225,111.219599,rrh
1,2,0,13.288756,111.219051,rrh
2,3,0,13.289965,111.220029,rrh
3,11,10,13.144891,111.066653,rrh
4,12,10,13.145852,111.066902,rrh
...,...,...,...,...,...
294,982,980,13.217111,111.213376,rrh
295,983,980,13.219272,111.214256,rrh
296,991,990,13.107568,111.059648,rrh
297,992,990,13.107236,111.059821,rrh


In [101]:
cluster_topology = fogs.append(rrhs)
cluster_topology

Unnamed: 0,bs,lat,lon,type,fog
0,0,13.289695,111.219577,fog,0
1,10,13.145352,111.066682,fog,10
2,20,13.123322,111.100477,fog,20
3,30,13.190210,111.163722,fog,30
4,40,13.157255,111.028934,fog,40
...,...,...,...,...,...
294,982,13.217111,111.213376,rrh,980
295,983,13.219272,111.214256,rrh,980
296,991,13.107568,111.059648,rrh,990
297,992,13.107236,111.059821,rrh,990


### Cellular traffic

In [102]:
df1 = topology_data[['bs', 'cluster', 'rrh']]
cellular_traffic_data = pd.merge(cellular_traffic_data, df1, on='bs', right_index=False, how='inner', sort=False)
cellular_traffic_data

Unnamed: 0,bs,time_hour,users,packets,bytes,hour,day,cluster,rrh
0,1,2012-08-18 16:00:00+00:00,1,3,3473.0,16,18,37,2
1,1,2012-08-19 00:00:00+00:00,4,25,3948.0,0,19,37,2
2,1,2012-08-19 01:00:00+00:00,4,196,253027.0,1,19,37,2
3,1,2012-08-19 03:00:00+00:00,3,318,408036.0,3,19,37,2
4,1,2012-08-19 04:00:00+00:00,6,473,532999.0,4,19,37,2
...,...,...,...,...,...,...,...,...,...
1453487,12816,2012-08-26 11:00:00+00:00,3,46,20861.0,11,26,69,2
1453488,12816,2012-08-26 12:00:00+00:00,2,544,23251154.0,12,26,69,2
1453489,12816,2012-08-26 13:00:00+00:00,6,172,200592.0,13,26,69,2
1453490,12816,2012-08-26 14:00:00+00:00,8,331,411794.0,14,26,69,2


In [104]:
fogs = cellular_traffic_data.groupby(['cluster', 'time_hour']).sum().reset_index()
fogs = fogs[['cluster', 'time_hour', 'users', 'packets', 'bytes']]
fogs=fogs.rename(columns = {'cluster':'bs'})
fogs.bs = fogs.bs * 10
fogs['fog'] = fogs.bs
fogs['type'] = 'fog'
fogs

Unnamed: 0,bs,time_hour,users,packets,bytes,fog,type
0,0,2012-08-18 16:00:00+00:00,103,72955,2.102120e+08,0,fog
1,0,2012-08-18 17:00:00+00:00,78,16908,3.871999e+07,0,fog
2,0,2012-08-18 18:00:00+00:00,36,7446,9.883734e+06,0,fog
3,0,2012-08-18 19:00:00+00:00,32,3879,4.705581e+06,0,fog
4,0,2012-08-18 20:00:00+00:00,23,4819,5.276622e+06,0,fog
...,...,...,...,...,...,...,...
19116,990,2012-08-26 11:00:00+00:00,939,151167,3.444785e+08,990,fog
19117,990,2012-08-26 12:00:00+00:00,941,161767,1.750033e+09,990,fog
19118,990,2012-08-26 13:00:00+00:00,876,127739,2.422385e+08,990,fog
19119,990,2012-08-26 14:00:00+00:00,794,167430,3.814466e+08,990,fog


In [106]:
rrhs = cellular_traffic_data.groupby(['cluster', 'rrh', 'time_hour']).sum().reset_index()
rrhs = rrhs[['cluster', 'rrh', 'time_hour', 'users', 'packets', 'bytes']]
rrhs=rrhs.rename(columns = {'cluster':'fog'})
rrhs.fog = rrhs.fog*10
rrhs['type'] = 'rrh'
rrhs['bs'] = rrhs.fog + rrhs.rrh + 1
del(rrhs['rrh'])
cols = rrhs.columns.tolist()
cols = cols[-1:] + cols[:-1]
rrhs = rrhs[cols]

rrhs

Unnamed: 0,bs,fog,time_hour,users,packets,bytes,type
0,1,0,2012-08-18 16:00:00+00:00,32,57455,190571724.0,rrh
1,1,0,2012-08-18 17:00:00+00:00,31,7816,24202568.0,rrh
2,1,0,2012-08-18 18:00:00+00:00,17,1415,1695240.0,rrh
3,1,0,2012-08-18 19:00:00+00:00,14,1733,2371406.0,rrh
4,1,0,2012-08-18 20:00:00+00:00,7,284,448324.0,rrh
...,...,...,...,...,...,...,...
56699,993,990,2012-08-26 11:00:00+00:00,364,74954,167374678.0,rrh
56700,993,990,2012-08-26 12:00:00+00:00,350,86872,209868345.0,rrh
56701,993,990,2012-08-26 13:00:00+00:00,315,49698,75360216.0,rrh
56702,993,990,2012-08-26 14:00:00+00:00,285,52638,73877648.0,rrh


In [107]:
cluster_traffic_data = fogs.append(rrhs)
cluster_traffic_data

Unnamed: 0,bs,time_hour,users,packets,bytes,fog,type
0,0,2012-08-18 16:00:00+00:00,103,72955,210211956.0,0,fog
1,0,2012-08-18 17:00:00+00:00,78,16908,38719988.0,0,fog
2,0,2012-08-18 18:00:00+00:00,36,7446,9883734.0,0,fog
3,0,2012-08-18 19:00:00+00:00,32,3879,4705581.0,0,fog
4,0,2012-08-18 20:00:00+00:00,23,4819,5276622.0,0,fog
...,...,...,...,...,...,...,...
56699,993,2012-08-26 11:00:00+00:00,364,74954,167374678.0,990,rrh
56700,993,2012-08-26 12:00:00+00:00,350,86872,209868345.0,990,rrh
56701,993,2012-08-26 13:00:00+00:00,315,49698,75360216.0,990,rrh
56702,993,2012-08-26 14:00:00+00:00,285,52638,73877648.0,990,rrh


## Add "Cloud" node

In [130]:
bs = cluster_topology["bs"].max()*100
lat = cluster_topology["lat"].max()
lon = cluster_topology["lon"].max()
type_ = "cloud"
fog = bs
cluster_topology = cluster_topology.append({'bs': bs, 'lat': lat, 'lon': lon, 
                                             'type': type_, 'fog': fog}, 
                                          ignore_index=True)

In [131]:
cluster_topology

Unnamed: 0,bs,lat,lon,type,fog
0,0,13.289695,111.219577,fog,0
1,10,13.145352,111.066682,fog,10
2,20,13.123322,111.100477,fog,20
3,30,13.190210,111.163722,fog,30
4,40,13.157255,111.028934,fog,40
...,...,...,...,...,...
395,983,13.219272,111.214256,rrh,980
396,991,13.107568,111.059648,rrh,990
397,992,13.107236,111.059821,rrh,990
398,993,13.105884,111.059006,rrh,990


## Calculate distance matrix

In [151]:
dist_matrix = create_dist_df(cluster_topology) 

In [152]:
threshold = dist_matrix["distance"].mean()
dist_matrix = dist_matrix[(dist_matrix["distance"] <= threshold) | (dist_matrix["to"] == bs)]

In [156]:
sensor_ids = cluster_topology['bs'].values

## Generate HD5 file

In [173]:
users_df = cluster_traffic_data[['bs', 'time_hour', 'users']]
users_pivot = pd.pivot_table(users_df, index='time_hour', columns='bs', values='users', fill_value=0)
cloud_data = users_pivot.sum(axis=1)
users_pivot[bs] = cloud_data
users_pivot

save_hd5('users', users_pivot)

In [174]:
packets_df = cluster_traffic_data[['bs', 'time_hour', 'packets']]
packets_pivot = pd.pivot_table(packets_df, index='time_hour', columns='bs', values='packets', fill_value=0)
cloud_data = packets_pivot.sum(axis=1)
packets_pivot[bs] = cloud_data
packets_pivot

save_hd5('packets', packets_pivot)

In [177]:
bytes_df = cluster_traffic_data[['bs', 'time_hour', 'bytes']]
bytes_pivot = pd.pivot_table(bytes_df, index='time_hour', columns='bs', values='bytes', fill_value=0)
cloud_data = bytes_pivot.sum(axis=1)
bytes_pivot[bs] = cloud_data
bytes_pivot

save_hd5('bytes', bytes_pivot)

## Save everything

In [188]:
dist_matrix.to_csv('dcrnn_data/sensor_graph/sensor_distance.csv', index=False)

In [189]:
np.savetxt('dcrnn_data/sensor_graph/sensor_ids.txt', sensor_ids[np.newaxis], fmt='%d', delimiter=',')

In [172]:
def save_hd5(filename, pivot_table):
    store = HDFStore(f'dcrnn_data/{filename}/{filename}.h5')
    store.put(filename, pivot_table, format='table')
    store.close()