In [1]:
import pandas as pd
import os
import numpy as np
import json
import sys
from tqdm import tqdm
from geopy.distance import geodesic 

# Input datasets (CSV files, given as paths relative to the working directory).
user_info_filename = "user_fix.csv"
trace_info_filename = "traces_fix.csv"

# Topology description (JSON); its "layer-2" entries are read as the access nodes.
topology_filename = "topology.json"

## 1 Preprocess User info data

#### 1.1 load user
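Load the user info and the request trace, compute each request's timestamp offset, and join the two tables on `user_id`. Note: the cell below does not itself filter on country info; presumably `user_fix.csv` already excludes users with no country info (to be confirmed).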

In [7]:
# Load the per-user attributes and the request trace.
user_df = pd.read_csv(user_info_filename)
trace_df = pd.read_csv(trace_info_filename)

# The first trace row is taken as time zero: every request is stamped with its
# offset from that row's timestamp. This yields non-negative offsets only if
# the trace file is sorted by timestamp.
start_timestamp = trace_df.loc[0, "timestamp"]

def timestamp_offset(ts):
    """Return `ts` relative to `start_timestamp`.

    Plain subtraction, so it accepts a single timestamp or a whole Series.
    """
    return ts - start_timestamp

# Subtract on the whole column at once instead of calling the function once
# per row through .apply(); the resulting values are the same.
trace_df["timestamp_offset"] = timestamp_offset(trace_df["timestamp"])

# Inner join on user_id: the result has one row per (user, request) pair, and
# rows without a match on the other side are dropped. Columns present in both
# tables receive pandas' default _x / _y suffixes.
user_df = pd.merge(user_df, trace_df, on='user_id', how='inner')
print(user_df)
print(trace_df)

         user_id  n_pin  n_followers  n_following  n_like_x  \
0             82   2497          441           99       262   
1            420   3102           87           42       175   
2           1515   1726           44          105      1303   
3           1529   7884          302          177       966   
4           1550   8264          281          135       894   
...          ...    ...          ...          ...       ...   
1193754  3022154   8035          558          313      1534   
1193755  3022155    847           36          143        10   
1193756  3022156   3471          227           57        11   
1193757  3022158    574          299          285      1661   
1193758  3022165   1068          575           94       106   

         n_following_twitter  n_follower_twitter  n_tweet        lat  \
0                      238.0              8382.0   2140.0  29.760803   
1                     1586.0               761.0   2558.0  35.582684   
2                      136.

#### 1.2 load the access nodes that users will be split across

In [3]:
# Parse the topology description. JSON text is UTF-8, so the encoding is given
# explicitly rather than inherited from the platform default.
with open(topology_filename, encoding="utf-8") as json_file:
    topology_json = json.load(json_file)

# The "layer-2" entries of the topology are used as the access nodes.
access_nodes = topology_json["topology"]["layer-2"]
print(access_nodes)


[{'id': 'access-0', 'name': 'Seattle', 'location': [47.6062, 122.3321], 'domain_name': '', 'ip_address': ''}, {'id': 'access-1', 'name': 'Los Angeles', 'location': [34.0522, 118.2437], 'domain_name': '', 'ip_address': ''}, {'id': 'access-2', 'name': 'New York', 'location': [40.7128, 74.006], 'domain_name': '', 'ip_address': ''}, {'id': 'access-3', 'name': 'Atlanta', 'location': [33.7438, 84.387], 'domain_name': '', 'ip_address': ''}]


In [None]:
#user_df = user_df.head(20)
#print(user_df)

#### 1.3 allocate user to node

In [8]:
import math

def euclidean_distance(lat1, lon1, lat2, lon2):
    """Straight-line distance between (lat1, lon1) and (lat2, lon2).

    The coordinates are treated as plain Cartesian values, so the result is
    expressed in the same unit as the inputs.
    """
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    return math.sqrt(delta_lat ** 2 + delta_lon ** 2)

def calculate_distance(row, nodes, prog_bar, distance_fn=None):
    """Attach one row's user to the nearest node and return that distance.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide 'lat', 'lng' and 'user_id'.
    nodes : list of dict
        Each node needs a "location" pair (latitude first, longitude second)
        and a "users_id" list. The row's user_id is appended to the
        "users_id" list of the nearest node.
    prog_bar : object with an ``update(n)`` method
        Advanced by one on every successful call.
    distance_fn : callable, optional
        ``distance_fn(lat1, lon1, lat2, lon2)`` returning a comparable number.
        Defaults to ``euclidean_distance``.

    Returns
    -------
    The distance between the row's position and the node it was attached to.
    (Returning it gives ``DataFrame.apply`` a real value per row instead of
    ``None``.)

    Raises
    ------
    IndexError
        If ``nodes`` is empty.
    """
    if distance_fn is None:
        distance_fn = euclidean_distance
    if len(nodes) == 0:
        raise IndexError("nodes is empty: there is no access node to attach the user to")

    # The user's position does not depend on the node, so read it once.
    user_lat = row['lat']
    # The longitude is negated before comparing with the node locations.
    # NOTE(review): presumably the topology stores west longitudes as positive
    # numbers while the user data uses signed longitudes - confirm.
    user_lon = -row['lng']

    min_idx = -1
    min_distance = None
    for idx, node in enumerate(nodes):
        node_coords = node["location"]
        distance = distance_fn(user_lat, user_lon, node_coords[0], node_coords[1])
        # The first node always becomes the initial candidate. If the distance
        # is NaN (missing coordinates) no later comparison succeeds, so such a
        # row stays attached to the first node.
        if min_idx == -1 or distance < min_distance:
            min_distance = distance
            min_idx = idx

    nodes[min_idx]["users_id"].append(row["user_id"])
    prog_bar.update(1)
    return min_distance

# Start from empty membership lists so that re-running this cell does not
# accumulate entries from a previous run.
for node in access_nodes:
    node["users_id"] = []

# calculate_distance advances the bar by hand once per row, so the bar only
# needs a total (there is no iterable for it to wrap). The with-block closes
# the bar when the apply finishes or fails.
with tqdm(total=len(user_df)) as progress_bar:
    # The 'distances' column stores whatever calculate_distance returns per row.
    user_df['distances'] = user_df.apply(calculate_distance, axis=1, nodes=access_nodes, prog_bar=progress_bar)

100%|██████████| 1193759/1193759 [11:22<00:00, 1750.30it/s] 


In [10]:
# Number of user_id entries collected by each access node, one line per node.
users_per_node = [len(node["users_id"]) for node in access_nodes]
for entry_count in users_per_node:
    print(entry_count)

57289
198480
488961
449029




## 2 Extract node trace

#### 2.1 allocate trace to node

In [11]:
# Maps each access node's name to the trace rows issued by that node's users.
trace_split_result = {}

for node in access_nodes:
    # Keep only the requests whose user_id was collected by this node and
    # renumber the surviving rows from 0.
    belongs_to_node = trace_df["user_id"].isin(node["users_id"])
    trace_split_result[node["name"]] = trace_df[belongs_to_node].reset_index(drop=True)

# Show the per-node traces.
for node_name, node_trace in trace_split_result.items():
    print(node_name)
    print(node_trace)

Seattle
       user_id   timestamp method             file_url  file_size  n_like  \
0       277045  1370534496   POST     277045-69529.dat      26094       0   
1       364458  1370534499   POST     364458-69559.dat      27548       0   
2      2923017  1370534500    GET     239353-69563.dat      38575       0   
3       638653  1370534501    GET     378601-69576.dat     105661       0   
4      1092132  1370534511    GET     333785-69874.dat      29956       0   
...        ...         ...    ...                  ...        ...     ...   
57284  1219440  1374015962    GET   1929204-136827.dat      11377       2   
57285  1277511  1374030786    GET    1557893-91215.dat      33623       0   
57286   564725  1374032726    GET    2836425-76214.dat      57048       0   
57287  2500963  1374034522    GET  2874298-1114398.dat      66752       0   
57288  2725703  1374060488    GET    2866554-79480.dat      48526       0   

       timestamp_offset  
0                    91  
1              

#### 2.2 split to each node

In [12]:
# Fan each node's trace out over several container files.
# max_line: target number of trace rows per container, used to derive the
#           container count
# max_container: upper bound on the number of containers per node
max_line = 10000
max_container = 20

for node_name, node_traces in trace_split_result.items():
    traces_size = len(node_traces)
    print(node_name, " with trace size ", traces_size)
    # One container per full max_line rows, clamped to [1, max_container].
    # Because of the floor division and the upper clamp, a container can
    # receive more than max_line rows.
    container_num = min(max(traces_size // max_line, 1), max_container)

    print(node_name, " with container number ", container_num)
    # Round-robin split: container i receives rows i, i + container_num, ...
    # so every container keeps the original row order.
    split_dfs = [node_traces.iloc[i::container_num] for i in range(container_num)]
    output_dir = "./dataset/" + node_name
    os.makedirs(output_dir, exist_ok=True)

    for idx, df in enumerate(split_dfs):
        output_filename = os.path.join(output_dir, "trace-%d" % idx)
        # No header row is written; the DataFrame index is kept as the first
        # column (to_csv default).
        df.to_csv(output_filename, header=False)


Seattle  with trace size  57289
Seattle  with container number  5
Los Angeles  with trace size  198480
Los Angeles  with container number  19
New York  with trace size  488961
New York  with container number  20
Atlanta  with trace size  449029
Atlanta  with container number  20
