In [3]:
import pandas as pd
import os
import numpy as np
import json
import sys
from tqdm import tqdm
from geopy.distance import geodesic 

# system setup
user_info_filename = "user_fix.csv"
trace_info_filename = "traces_fix.csv"

# node config filename
topology_filename = "topology.json"

## 1 Preprocess User info data

#### 1.1 load user
load user and filter out users with no country info

In [4]:
# load user info

user_df = pd.read_csv(user_info_filename)
trace_df = pd.read_csv(trace_info_filename)
#print(user_df.columns)
#print(trace_df.columns)

# calculate the timestamp offset
start_timestamp = trace_df.loc[0, "timestamp"]

def timestamp_offset(ts):
    return ts - start_timestamp

trace_df["timestamp_offset"] = trace_df["timestamp"].apply(timestamp_offset)

user_df = pd.merge(user_df, trace_df, on='user_id', how='inner')
#print(len(user_df))
print(trace_df)

         user_id   timestamp method             file_url  file_size  n_like  \
0        2993811  1370534405   POST    2993811-27932.dat      35111       0   
1        2308187  1370534417   POST    2308187-21838.dat      26447       0   
2         172946  1370534418   POST     172946-21841.dat      42245       2   
3        2039079  1370534418   POST     2039079-4720.dat      35763       0   
4        1399414  1370534423   POST     1399414-1645.dat     173362       0   
...          ...         ...    ...                  ...        ...     ...   
1193754   113082  1374074924    GET    1660106-90931.dat      57386       0   
1193755  2324093  1374076605    GET   1239822-377779.dat      34419       0   
1193756  2806178  1374076607    GET  1733287-1640109.dat      21618       1   
1193757  2653174  1374076613    GET   2814889-122868.dat      23635       0   
1193758   958643  1374076614    GET   2507688-129998.dat      16721       1   

         timestamp_offset  
0                      

#### 1.2 split user to different node

In [5]:
topology_json = ""
with open(topology_filename) as json_file:
    topology_json = json.load(json_file)

access_nodes = topology_json["topology"]["layer-2"]
print(access_nodes)


[{'id': 'access-0', 'name': 'access-0', 'location': [40.76, 111.84], 'domain_name': '', 'IP_address': ''}, {'id': 'access-1', 'name': 'access-1', 'location': [43.07, 89.41], 'domain_name': ''}, {'id': 'access-2', 'name': 'access-2', 'location': [34.67, 82.84], 'domain_name': ''}]


#### 1.3 allocate user to node

In [6]:
def calculate_distance(row, nodes, prog_bar):
    min_distance = 1000000000000000000
    min_idx = 0
    for idx, node in enumerate(nodes):
        user_coords = (row['lat'], row['lng'])
        node_coords = node["location"]
        distance = geodesic(user_coords, node_coords)
        if distance < min_distance:
            min_distance = distance
            min_idx = idx
    
    nodes[min_idx]["users_id"].append(row["user_id"])
    prog_bar.update(1)

progress_bar = tqdm(user_df.iterrows(), total=len(user_df))

for node in access_nodes: 
    node["users_id"] = []

user_df['distances'] = user_df.apply(calculate_distance, axis=1, nodes=access_nodes, prog_bar=progress_bar)

100%|█████████▉| 1193636/1193759 [13:07<00:00, 1528.32it/s]

100%|██████████| 1193759/1193759 [13:20<00:00, 1528.32it/s]

## 2 Extract node trace

#### 2.1 allocate trace to node

In [7]:
trace_split_result = {}

for node in access_nodes:
    node_name = node["name"]
    users_id = node["users_id"]
    
    mask = trace_df["user_id"].isin(users_id)
    
    filtered_trace_df = trace_df[mask].reset_index(drop=True)
    
    # save result
    trace_split_result[node_name] = filtered_trace_df

for name, trace in trace_split_result.items():
    print(name)
    print(trace)

access-0
          index  user_id   timestamp method            file_url  file_size  \
0             0  2993811  1370534405   POST   2993811-27932.dat      35111   
1             1  2308187  1370534417   POST   2308187-21838.dat      26447   
2             4  1399414  1370534423   POST    1399414-1645.dat     173362   
3             5   104377  1370534427   POST     104377-2100.dat     112988   
4             6  2671645  1370534431   POST    2671645-4771.dat      40677   
...         ...      ...         ...    ...                 ...        ...   
633800  1193745  2725703  1374060488    GET   2866554-79480.dat      48526   
633801  1193748  1908400  1374062661    GET   2866554-79480.dat      48526   
633802  1193749  1083092  1374062817    GET   344824-568871.dat      97005   
633803  1193751   505727  1374065799    GET  2643015-112633.dat      10008   
633804  1193757  2653174  1374076613    GET  2814889-122868.dat      23635   

        n_like  timestamp_offset  
0            0     

#### 2.2 split to each node

In [9]:
# doing aisa node
# maximum trace line 10000
# maximum container number 10
max_line = 10000
max_container = 20

for node_name, node_traces in trace_split_result.items():
    # asia-node
    traces_size = len(node_traces)
    print(node_name, " with trace size ", traces_size)
    container_num = traces_size // max_line
    if container_num <= 0:
        container_num = 1
    elif container_num > 20:
        container_num = 20

    print(node_name, " with container number ", container_num)
    # split traces
    split_dfs = [node_traces.iloc[i::container_num] for i in range(container_num)]
    output_dir = "./dataset/" + node_name
    os.makedirs(output_dir, exist_ok=True)

    for idx, df in enumerate(split_dfs):
        output_filename = output_dir + "/" + "trace-%d" %idx
        df.to_csv(output_filename, header=False)


access-0  with trace size  633805
access-0  with container number  20
access-1  with trace size  559954
access-1  with container number  20
access-2  with trace size  0
access-2  with container number  1
