In [7]:
#!/usr/bin/python3
### Filename: udp_analysis.ipynb

"""
Analyze packet capture of udp experiments.
Need to convert .pcap into .csv by tshark with script udp_pcap_to_csv.py

Author: Yuan-Jye Chen
Update: Yuan-Jye Chen 2022/10/09
"""

"""
    Future Development Plan
        (1) Ignore filenames that start with ".~lock". (e.g., ".~lock.packet_info.csv#", ".~lock.client_pcap_BL_sm05_3210_3211_2022-09-29_16-24-57.csv#")
        (2) Output packet loss statistics and latency.
        (3) Time synchronization. (preprocessing)
        (4) Refactor the cells into functions.
        (5) Check whether the packet loss rate / packet loss number matches the summary in the screenshot.
    
"""
import os
import sys
import shutil
import argparse
import traceback
import csv
import pandas as pd
import datetime as dt
import numpy as np
from pprint import pprint
from tqdm import tqdm

In [12]:
# ******************************* User Settings *******************************
class Payload:
    """Layout of one customized UDP payload, as sliced out of the hex string in `data.data`.

    The OFS_* values are (start, end) slice bounds counted in hex digits
    (2 hex digits per byte), relative to the first hex digit of one payload.
    """
    LENGTH = 250              # size of one payload (Bytes)
    TAG = "000425d401df5e76"  # marker used to recognize our payloads: 271828 and 31415926 written as two 4-byte hex integers (8 bytes); presumably occupies hex digits 0-16 -- confirm
    OFS_TIME = (16, 24)       # epoch time of 'yyyy/mm/dd hh:mm:ss': 4-bytes
    OFS_USEC = (24, 32)       # microsecond (usec) part of the generating time: 4-bytes
    OFS_SEQN = (32, 40)       # sequence number (start from 1)     : 4-bytes
class ServerIP:
    """Public / private address pair of the measurement server.

    The active pair is labelled "2F"; the commented-out pair is labelled "3F".
    NOTE(review): presumably these labels are the server locations (floors) -- confirm.
    """
    PUBLIC = "140.112.20.183"  # 2F
    PRIVATE = "192.168.1.248"  # 2F
    # PUBLIC = "140.112.17.209"  # 3F
    # PRIVATE = "192.168.1.108"  # 3F
# Root directory that holds the experiment data, and the date sub-directory to analyze.
database = "/home/wmnlab/Desktop/testspace/bandlock_analysis_0930/"
date = "2022-09-29"
Exp_Name = {  # experiment_name: (number_of_experiment_rounds, list_of_experiment_rounds)
              # An empty list means: use every directory found under the experiment directory.
              # If the number of rounds differs from the number of directories in the list,
              # a warning is raised and the directory is skipped.
              # NOTE(review): the code applying these rules is not in the visible cells -- confirm.
    "_Bandlock_Udp":(6, []),
}
# Device names (they also appear in the capture file names, e.g. "..._sm05_...").
devices = [
    "sm05", 
    "sm06",
    "sm07",
    "sm08",
]
# Directory of the selected date: <database>/<date>
db_path = os.path.join(database, date)
# *****************************************************************************

# --------------------- Global Variables ---------------------
# Values of the `sll.pkttype` capture column that filter() matches against.
# NOTE(review): presumably the Linux cooked-capture packet types
# (4 = sent by this host, 0 = unicast addressed to this host) -- confirm.
TRANS = 4  # frames transmitted by the capturing host
RECV = 0   # frames received by the capturing host

# --------------------- Util Functions ---------------------
def makedir(dirpath, mode=0):  # mode=1: show message, mode=0 hide message
    """
    Create a directory together with any missing parent directories.

    Args:
        dirpath (str): directory to create (absolute or relative; a trailing separator is allowed)
        mode    (int): 1 to print a message when the directory already exists, 0 to stay silent
    Raises:
        ValueError: if dirpath is empty
        OSError   : if a component of the path exists but is not a directory
    """
    if not dirpath:
        raise ValueError("makedir: empty directory path")
    if os.path.isdir(dirpath):
        if mode:
            print("mkdir: cannot create directory '{}': directory has already existed.".format(dirpath))
        return
    ### collect the missing levels, deepest first
    # Drop trailing separators so "a/b/" and "a/b" are not queued as two different levels.
    separators = os.sep + (os.altsep or "")
    current = dirpath.rstrip(separators) or dirpath
    missing = []
    # `current` becomes "" once a relative path runs out of parents; stop there instead of looping forever.
    while current and not os.path.isdir(current):
        missing.append(current)
        parent = os.path.dirname(current)
        if parent == current:
            break
        current = parent
    ### create them from the shallowest level down
    for path in reversed(missing):
        print("mkdir", path)
        # exist_ok: a level may already exist by now (e.g. "x/.." after "x" was created)
        os.makedirs(path, exist_ok=True)

def filter(df, terminal, direction, protocol):
    """
    Keep only the captured frames of one traffic direction and one protocol.

    A criterion whose value is not one of the listed options is skipped,
    i.e. the dataframe is not narrowed on that criterion.
    Note: this function shadows Python's built-in `filter` inside the notebook.

    Args:
        df (pandas.Dataframe): Original dataframe (needs columns 'sll.pkttype' and '_ws.col.Protocol')
        terminal  (str)      : 'server' or 'client' (the side the capture was taken on)
        direction (str)      : 'uplink' or 'downlink'
        protocol  (str)      : 'tcp' or 'udp'
    Returns:
        df (pandas.Dataframe): Filtered dataframe
    """
    ### UpLink or DownLink
    # (capture side, direction) -> packet type to keep.
    # The client transmits uplink traffic, the server transmits downlink traffic.
    pkttype_of = {
        ('client', 'uplink'): TRANS,
        ('client', 'downlink'): RECV,
        ('server', 'downlink'): TRANS,
        ('server', 'uplink'): RECV,
    }
    wanted_pkttype = pkttype_of.get((terminal, direction))
    if wanted_pkttype is not None:
        df = df[df['sll.pkttype'] == wanted_pkttype]

    ### TCP or UDP
    protocol_label = {'udp': 'UDP', 'tcp': 'TCP'}.get(protocol)
    if protocol_label is not None:
        df = df[df['_ws.col.Protocol'] == protocol_label]
    return df

def to_utc8(ts):
    """
    Convert an epoch time into a readable format.
    Switch from UTC+0 into UTC+8.
    
    Args:
        ts (float): timestamp composed of datetimedec + microsecond (e.g., 1644051509.989306)
    Returns:
        (datetime.datetime): a readable, timezone-naive timestamp expressed in UTC+8
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; build an aware UTC
    # datetime instead and drop the tzinfo so the result stays naive as before.
    utc_time = dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc).replace(tzinfo=None)
    return utc_time + dt.timedelta(hours=8)

def str_to_datetime(ts):
    """
    Convert a timestamp string into datetime.datetime.
    The fractional part (milliseconds or microseconds) is optional.

    Args:
        ts (str): timestamp string (e.g., 2022-09-29 16:24:58.252615 or 2022-09-29 16:24:58)
    Returns:
        (datetime.datetime)
    Raises:
        ValueError: if the string matches neither format
    """
    # str(datetime) omits the fraction when microsecond == 0, so timestamps written
    # that way must be parsed without '%f' instead of raising ValueError.
    fmt = '%Y-%m-%d %H:%M:%S.%f' if '.' in ts else '%Y-%m-%d %H:%M:%S'
    return dt.datetime.strptime(ts, fmt)

def datetime_to_str(ts):
    """
    Render a datetime as text with a fixed six-digit microsecond field.

    Args:
        ts (datetime.datetime): datetime timestamp (e.g., datetime.datetime(2022, 9, 29, 16, 24, 58, 252615))
    Returns:
        (str): timestamp string (e.g., 2022-09-29 16:24:58.252615)
    """
    microsecond_format = '%Y-%m-%d %H:%M:%S.%f'
    return dt.datetime.strftime(ts, microsecond_format)

In [13]:
### Read csv file
# One capture file is processed per run: the active line selects the client-side capture,
# the commented line below selects the server-side capture of the same experiment round.
filename = "/home/wmnlab/Desktop/testspace/bandlock_analysis_0930/2022-09-29/_Bandlock_Udp/#01/sm05/data/client_pcap_BL_sm05_3210_3211_2022-09-29_16-24-57.csv"
# filename = "/home/wmnlab/Desktop/testspace/bandlock_analysis_0930/2022-09-29/_Bandlock_Udp/#01/sm05/data/server_pcap_DL_sm05_3211_2022-09-29_16-24-41.csv"
# dirpath (the "data" directory of the capture) is reused by later cells to locate "../analysis".
dirpath = os.path.dirname(filename)
print(filename)
print(dirpath)
# The file is parsed with '@' as the field separator.
df = pd.read_csv(filename, sep='@')
print(df)

### Convert frame.time into datetime
# df['frame.time'] = pd.to_datetime(df['frame.time'])
df['frame.time'] = pd.to_datetime(df['frame.time']).dt.tz_localize(None)  # to remove the time zone information while keeping the local time

### Filtering
# `filter` is the notebook's own helper (it shadows the built-in); the arguments must match
# the capture selected above -- swap to the commented call when the server capture is loaded.
df = filter(df, 'client', 'downlink', 'udp')
# df = filter(df, 'server', 'downlink', 'udp')

/home/wmnlab/Desktop/testspace/bandlock_analysis_0930/2022-09-29/_Bandlock_Udp/#01/sm05/data/client_pcap_BL_sm05_3210_3211_2022-09-29_16-24-57.csv
/home/wmnlab/Desktop/testspace/bandlock_analysis_0930/2022-09-29/_Bandlock_Udp/#01/sm05/data
       frame.number                           frame.time  frame.time_epoch  \
0                 1  Sep 29, 2022 16:24:58.069399000 CST      1.664440e+09   
1                 2  Sep 29, 2022 16:24:58.110963000 CST      1.664440e+09   
2                 3  Sep 29, 2022 16:24:58.110947000 CST      1.664440e+09   
3                 4  Sep 29, 2022 16:24:58.111297000 CST      1.664440e+09   
4                 5  Sep 29, 2022 16:24:58.111346000 CST      1.664440e+09   
...             ...                                  ...               ...   
56533         56534  Sep 29, 2022 16:29:44.650596000 CST      1.664440e+09   
56534         56535  Sep 29, 2022 16:29:44.654268000 CST      1.664440e+09   
56535         56536  Sep 29, 2022 16:29:44.657870000 CST  

In [15]:
### Original version
# df = df[df['data.data'] != np.nan]
# df = df[df['data.len'] != 0]
# df = df[df['data.len'] % Payload.LENGTH == 0]
### Modified version
# Keep only frames that carry our payload tag.
# na=False   : frames without a data.data value are dropped instead of putting NaN into the
#              boolean mask (which would make the row selection raise).
# regex=False: the tag is a literal hex string, not a pattern.
df = df[df['data.data'].str.contains(Payload.TAG, na=False, regex=False)]

### Extract the features that you need
df = df[['frame.time', 'frame.time_epoch', 'data.len', '_ws.col.Protocol', 'ip.src', 'ip.dst', 'data.data']]

### Reset index
# Later cells address rows by 0..n-1 labels, so the index must be rebuilt after filtering.
df = df.reset_index(drop=True)
print(df)

                      frame.time  frame.time_epoch  data.len _ws.col.Protocol  \
0     2022-09-29 16:24:58.252615      1.664440e+09     500.0              UDP   
1     2022-09-29 16:24:58.258921      1.664440e+09     250.0              UDP   
2     2022-09-29 16:24:58.266121      1.664440e+09     250.0              UDP   
3     2022-09-29 16:24:58.282106      1.664440e+09     250.0              UDP   
4     2022-09-29 16:24:58.286382      1.664440e+09     250.0              UDP   
...                          ...               ...       ...              ...   
27855 2022-09-29 16:29:44.627819      1.664440e+09     250.0              UDP   
27856 2022-09-29 16:29:44.638855      1.664440e+09     250.0              UDP   
27857 2022-09-29 16:29:44.650596      1.664440e+09     250.0              UDP   
27858 2022-09-29 16:29:44.657870      1.664440e+09     250.0              UDP   
27859 2022-09-29 16:29:44.670449      1.664440e+09     250.0              UDP   

               ip.src      

In [16]:
### Parse packet loss | Stage 1
# repackage: transport layer protocol may repackage multiple payloads into 1 frame (capture),
#            so data.len would be an integer multiple of the customized Payload.LENGTH.
# solution. => repkg_num
#
# Each frame gets four new text columns; frames carrying several payloads store one value
# per payload joined with '@':
#   repackage.num | sequence.number | payload.time (UTC+8) | payload.time_epoch

# Build every new column as a plain list first and attach them in one step
# (avoids writing strings cell-by-cell into NaN-initialized columns).
repkg_column, seq_column, ptime_column, ptime_epoch_column = [], [], [], []
for data_len, payload in tqdm(zip(df['data.len'], df['data.data']), total=len(df)):
    repkg_num = int(data_len // Payload.LENGTH)
    seqs, gen_times, gen_epochs = [], [], []
    for k in range(repkg_num):
        ofs = k * Payload.LENGTH * 2  # 1-Byte == 2-hex-digits
        datetimedec = int(payload[ofs + Payload.OFS_TIME[0] : ofs + Payload.OFS_TIME[1]], 16)
        microsec = int(payload[ofs + Payload.OFS_USEC[0] : ofs + Payload.OFS_USEC[1]], 16)
        seq = int(payload[ofs + Payload.OFS_SEQN[0] : ofs + Payload.OFS_SEQN[1]], 16)
        gen_epoch = datetimedec + microsec * 1e-6
        seqs.append(str(seq))
        # datetime_to_str always writes the ".ffffff" part; str(datetime) drops it when the
        # microsecond field is 0, which the fixed-format parser used in Stage 2 cannot read.
        gen_times.append(datetime_to_str(to_utc8(gen_epoch)))
        gen_epochs.append(str(gen_epoch))
    repkg_column.append(str(repkg_num))
    seq_column.append('@'.join(seqs))
    ptime_column.append('@'.join(gen_times))
    ptime_epoch_column.append('@'.join(gen_epochs))

df = df.assign(**{
    'repackage.num': repkg_column,
    'sequence.number': seq_column,
    'payload.time': ptime_column,
    'payload.time_epoch': ptime_epoch_column,
})
df = df.reindex(['frame.time', 'frame.time_epoch', 'data.len', 'repackage.num', 'sequence.number', 'payload.time', 'payload.time_epoch', '_ws.col.Protocol', 'ip.src', 'ip.dst', 'data.data'], axis=1)

print(df)
makedir(os.path.join(dirpath, "..", "analysis"))
# Swap to the commented line when the server-side capture was loaded.
df.to_csv(os.path.join(dirpath, "..", "analysis", "clt_dwnlnk_udp_packet_info.csv"), index=False)
# df.to_csv(os.path.join(dirpath, "..", "analysis", "srv_dwnlnk_udp_packet_info.csv"), index=False)

  0%|          | 0/27860 [00:00<?, ?it/s]  1%|▏         | 378/27860 [00:00<00:07, 3773.09it/s]  3%|▎         | 756/27860 [00:00<00:07, 3525.01it/s]  4%|▍         | 1185/27860 [00:00<00:06, 3858.13it/s]  6%|▌         | 1613/27860 [00:00<00:06, 4018.95it/s]  7%|▋         | 2043/27860 [00:00<00:06, 4118.04it/s]  9%|▉         | 2473/27860 [00:00<00:06, 4178.38it/s] 10%|█         | 2892/27860 [00:00<00:06, 3914.81it/s] 12%|█▏        | 3320/27860 [00:00<00:06, 4025.10it/s] 13%|█▎        | 3749/27860 [00:00<00:05, 4104.76it/s] 15%|█▍        | 4176/27860 [00:01<00:05, 4152.88it/s] 17%|█▋        | 4597/27860 [00:01<00:05, 4154.88it/s] 18%|█▊        | 5014/27860 [00:01<00:05, 3936.88it/s] 20%|█▉        | 5442/27860 [00:01<00:05, 4034.90it/s] 21%|██        | 5870/27860 [00:01<00:05, 4104.95it/s] 23%|██▎       | 6297/27860 [00:01<00:05, 4153.04it/s] 24%|██▍       | 6714/27860 [00:01<00:05, 3901.35it/s] 26%|██▌       | 7141/27860 [00:01<00:05, 4004.81it/s] 27%|██▋       | 7571/278

                      frame.time  frame.time_epoch  data.len repackage.num  \
0     2022-09-29 16:24:58.252615      1.664440e+09     500.0             2   
1     2022-09-29 16:24:58.258921      1.664440e+09     250.0             1   
2     2022-09-29 16:24:58.266121      1.664440e+09     250.0             1   
3     2022-09-29 16:24:58.282106      1.664440e+09     250.0             1   
4     2022-09-29 16:24:58.286382      1.664440e+09     250.0             1   
...                          ...               ...       ...           ...   
27855 2022-09-29 16:29:44.627819      1.664440e+09     250.0             1   
27856 2022-09-29 16:29:44.638855      1.664440e+09     250.0             1   
27857 2022-09-29 16:29:44.650596      1.664440e+09     250.0             1   
27858 2022-09-29 16:29:44.657870      1.664440e+09     250.0             1   
27859 2022-09-29 16:29:44.670449      1.664440e+09     250.0             1   

      sequence.number                                       pay

In [75]:
### Parse packet loss | Stage 2
# duplicate packets: the same payload data (same generating time & sequence number) may be
#                    captured at different arrival times. UDP should not retransmit, so only
#                    the first arrival of each sequence number is taken into account.
# solution. => seq_set
# !!! When running experiment, iPerf3 & tcpdump should always start / restart synchronously.

# Swap the commented lines (input here, output at the bottom) to process the client-side table.
# df = pd.read_csv(os.path.join(dirpath, "..", "analysis", "clt_dwnlnk_udp_packet_info.csv"), dtype=str)
df = pd.read_csv(os.path.join(dirpath, "..", "analysis", "srv_dwnlnk_udp_packet_info.csv"), dtype=str)
df['frame.time'] = pd.to_datetime(df['frame.time'])

timestamp_list = []  # one [seq, frame.time, frame.time_epoch, payload.time, payload.time_epoch] per unique seq
seq_set = set()      # sequence numbers already recorded
frame_rows = zip(df['frame.time'], df['frame.time_epoch'], df['sequence.number'], df['payload.time'], df['payload.time_epoch'])
for ftime, ftime_epoch, seq_field, ptime_field, ptime_epoch_field in tqdm(frame_rows, total=len(df)):
    # A frame may carry several payloads; their values are joined with '@'.
    seqs = [int(token) for token in seq_field.split('@')]
    ptimes = [str_to_datetime(token) for token in ptime_field.split('@')]
    ptime_epochs = [float(token) for token in ptime_epoch_field.split('@')]
    for seq, ptime, ptime_epoch in zip(seqs, ptimes, ptime_epochs):
        if seq in seq_set:
            continue  # duplicate arrival: keep the first one only
        seq_set.add(seq)
        timestamp_list.append([seq, ftime, ftime_epoch, ptime, ptime_epoch])

### Consider there are out-of-order packets
timestamp_list.sort(key=lambda record: record[0])

### Check the results
brief_header = ['sequence.number', 'frame.time', 'frame.time_epoch', 'payload.time', 'payload.time_epoch']
# with open(os.path.join(dirpath, "..", "analysis", "clt_dwnlnk_udp_packet_brief.csv"), "w", newline='') as fp:
with open(os.path.join(dirpath, "..", "analysis", "srv_dwnlnk_udp_packet_brief.csv"), "w", newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(brief_header)
    writer.writerows(timestamp_list)

100%|██████████| 28711/28711 [00:01<00:00, 16758.96it/s]


In [14]:
# Load the per-packet brief tables written by Stage 2:
#   rxdf = receiver side (client capture), txdf = transmitter side (server capture).
dirpath = "/home/wmnlab/Desktop/testspace/bandlock_analysis_0930/2022-09-29/_Bandlock_Udp/#01/sm05/data/"
analysis_dir = os.path.join(dirpath, "..", "analysis")
rxdf = pd.read_csv(os.path.join(analysis_dir, "clt_dwnlnk_udp_packet_brief.csv"))
txdf = pd.read_csv(os.path.join(analysis_dir, "srv_dwnlnk_udp_packet_brief.csv"))

# Both time columns are read back as text; turn them into datetimes.
for brief_table in (rxdf, txdf):
    for time_column in ('frame.time', 'payload.time'):
        brief_table[time_column] = pd.to_datetime(brief_table[time_column])

### get latency needs to apply da-shen's method to synchronize the data, so get packet loss first.
def get_latency_jitter(rxdf, txdf, out_path=None):
    """
    Compute the per-packet latency and the mean jitter of the received packets.

    latency = frame.time - payload.time (seconds, rounded to 6 digits); it is stored in a new
    'latency' column of rxdf (rxdf is modified in place) and the table is written to a csv file.
    jitter  = mean absolute difference between the latencies of consecutive rows.

    If a negative latency is found, a message and the offending row are printed and the jitter
    is averaged only over the rows before it.

    Args:
        rxdf (pandas.Dataframe): receiver-side table with datetime columns 'frame.time' and 'payload.time'
        txdf (pandas.Dataframe): transmitter-side table (currently unused)
        out_path (str)         : csv file to write; defaults to
                                 <dirpath>/../analysis/dwnlnk_udp_latency.csv (notebook-level `dirpath`)
    Returns:
        (float): mean jitter in seconds, rounded to 6 digits (0.0 if fewer than two usable rows)
    """
    ### !!! Downlink: arrival time (client, rxdf.frame.time) | payload generating time (client, rxdf.payload.time) | transmitted time (server, txdf.frame.time)
    ### !!! Uplink:   arrival time (server, rxdf.frame.time) | payload generating time (server, txdf.payload.time) | transmitted time (client, txdf.frame.time)
    ### !!! Needs to synchronize time between server & client before calculating latency & jitter
    ### calculate latency
    rxdf['latency'] = (rxdf['frame.time'] - rxdf['payload.time']).dt.total_seconds().round(6)
    ### calculate jitter
    # Work on positions, not index labels, so a non-default index cannot break the pairing.
    latency = rxdf['latency'].to_numpy()
    negative_rows = np.flatnonzero(latency < 0)
    if negative_rows.size:
        first_negative = int(negative_rows[0])
        print("Latency should not be negative!!! Force to terminate.")
        print(rxdf.iloc[first_negative])
        print()
        # Only the rows before the first invalid latency contribute to the jitter.
        latency = latency[:first_negative]
    latency_steps = np.abs(np.diff(latency))
    # Average over the differences actually used; no pairs (0 or 1 usable row) -> 0.0.
    jitter = round(float(latency_steps.mean()), 6) if latency_steps.size else 0.0
    ### output file
    if out_path is None:
        out_path = os.path.join(dirpath, "..", "analysis", "dwnlnk_udp_latency.csv")
    rxdf.to_csv(out_path, index=False)
    return jitter

def get_loss(rxdf, txdf):
    """
    Find lost packets from the gaps in the received sequence numbers and summarize them.

    rxdf is walked row by row; every sequence number missing between two consecutive rows
    is reported as lost. The arrival / generating time of a lost packet is estimated by linear
    interpolation between the rows around the gap. Packets lost before the first received
    packet have no earlier row to interpolate from, so their time fields are '-'.
    Assumes the rows of rxdf are sorted by sequence number without duplicates.

    Args:
        rxdf (pandas.Dataframe): receiver-side table with columns 'sequence.number',
                                 'frame.time_epoch' and 'payload.time_epoch'
        txdf (pandas.Dataframe): transmitter-side table (currently unused)
    Returns:
        (list) : one tuple per lost packet:
                 (sequence.number, frame.time, frame.time_epoch, payload.time, payload.time_epoch)
        (tuple): (total_packet_sent, total_packet_loss, packet_loss_rate (%), experiment_time (sec))
    """
    seq_numbers = [int(v) for v in rxdf['sequence.number']]
    frame_epochs = [float(v) for v in rxdf['frame.time_epoch']]
    payload_epochs = [float(v) for v in rxdf['payload.time_epoch']]

    _eseq = 1  # next expected sequence number (sequence number of the previous row + 1)
    timestamp_store = None  # the previously received packet
    loss_timestamp_list = []
    for i in tqdm(range(len(seq_numbers))):
        timestamp = (seq_numbers[i], frame_epochs[i], payload_epochs[i])  # the packet received now
        # Sequence numbers skipped since the previous row; handles N consecutive losses.
        lost_seqs = range(_eseq, timestamp[0])
        if len(lost_seqs) > 0:
            ### packet losses occur
            if timestamp_store is None:
                ### if the first-N packets are lost, we cannot estimate the loss timestamp, so we only record their sequence number.
                loss_timestamp_list.extend((lost_seq, '-', '-', '-', '-') for lost_seq in lost_seqs)
            else:
                # Evenly spaced points from the previous row to the current row:
                # (number of lost packets + 2) points including both ends, so strip head and tail.
                loss_linspace = np.linspace(timestamp_store, timestamp, len(lost_seqs) + 2)[1:-1]
                for lost_seq, item in zip(lost_seqs, loss_linspace):
                    ### !!! Downlink: estimated arrival time (client, rxdf.frame.time) | payload generating time (server, txdf.payload.time)
                    ### !!! Uplink:   transmitted time       (client, txdf.frame.time) | payload generating time (client, txdf.payload.time)
                    frame_epoch, payload_epoch = float(item[1]), float(item[2])
                    loss_timestamp_list.append((lost_seq, to_utc8(frame_epoch), frame_epoch, to_utc8(payload_epoch), payload_epoch))
        # Update information
        timestamp_store = timestamp
        _eseq = timestamp[0] + 1

    ### !!! Since we interrupt client-side first, the maximum sequence number of txdf is larger than rxdf.
    ### !!! We consider it not to be the packet loss coming from the cellular network condition, so we use (loss.num + len(rxdf)) as packet.sent.num
    total_loss = len(loss_timestamp_list)
    total_packet_sent = total_loss + len(seq_numbers)
    # Empty input: report zeros instead of dividing by zero / indexing an empty column.
    loss_rate = total_loss / total_packet_sent * 100 if total_packet_sent else 0.0  # ratio (%)
    exp_time = round(frame_epochs[-1] - frame_epochs[0], 6) if frame_epochs else 0.0
    return loss_timestamp_list, (total_packet_sent, total_loss, loss_rate, exp_time)

# Run both analyses on the brief tables: loss first, then latency / jitter.
loss_timestamp, loss_statistics = get_loss(rxdf, txdf)
jitter = get_latency_jitter(rxdf, txdf)

# Console summary; loss_statistics is ordered like the labels below.
print("jitter:           ", jitter)
statistic_labels = ("total_packet_sent:", "total_packet_loss:", "packet_loss_rate (%):", "experiment_time (sec):")
for label, value in zip(statistic_labels, loss_statistics):
    print(label, value)

### Output the results
# (file name, header row, data rows); the statistics tuple is a single data row.
analysis_outputs = (
    ("dwnlnk_udp_loss_timestamp.csv",
     ['sequence.number', 'frame.time', 'frame.time_epoch', 'payload.time', 'payload.time_epoch'],
     loss_timestamp),
    ("dwnlnk_udp_loss_statistics.csv",
     ['total_packet_sent', 'total_packet_loss', 'packet_loss_rate(%)', 'experiment_time(sec)'],
     [loss_statistics]),
)
for csv_name, header, rows in analysis_outputs:
    with open(os.path.join(dirpath, "..", "analysis", csv_name), "w", newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(header)
        writer.writerows(rows)

100%|██████████| 28543/28543 [00:00<00:00, 64396.86it/s]


jitter:            0.001964
total_packet_sent: 28642
total_packet_loss: 99
packet_loss_rate (%): 0.3456462537532295
experiment_time (sec): 286.417834
