In [1487]:
import csv
import ast
import json
import statistics

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

In [1488]:
date = "20240203"
date_with_dash = "2024-02-03"
port = "4201"
phone_time = "0322"
time = "1121"
sent_file_name = f"log_{date}_{time}_{port}_server"
received_file_name = f"log_{date}_{phone_time}_{port}_client"
path = f"/Volumes/MOLLY256/MOXA/{date_with_dash}/QUIC-300sec/sm00/#06/"

In [1489]:
sync_file_name = f"/Volumes/MOLLY256/MOXA/{date_with_dash}/QUIC-450sec/time_sync_sm00.json"
# sync_file = path + "raw/" + sync_file_name
with open(sync_file_name, 'r') as file:
    data = json.load(file)

# Extract values from the dictionary
values = list(data.values())
mean_diff = values[11] * 1000

### Transform to JSON & CSV file
Process the qlog file to json file & csv file.

In [1490]:
def QlogToJsonEntry(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Add commas between lines
    json_str = ",".join(lines)
    # Surround the entire string with square brackets to create a JSON array
    json_str = "[" + json_str + "]"
    # Load the JSON array
    json_entry = json.loads(json_str)
    
    return json_entry

def QlogToJson(json_entry, json_file_path):
    with open(json_file_path, 'w') as json_file:
        json.dump(json_entry, json_file, indent=2)

def JsonToCsv(json_entry, csv_file_path):
     # Open CSV file for writing
    with open(csv_file_path, 'w', newline='') as csv_file:
        # Create a CSV writer
        csv_writer = csv.writer(csv_file)

        # Write header row based on the keys of the second JSON object (assuming at least two objects are present)
        if len(json_entry) >= 2:
            header = list(json_entry[1].keys())
            csv_writer.writerow(header)

            # Write data rows starting from the second object
            for entry in json_entry[1:]:
                csv_writer.writerow(entry.values())

In [1491]:
# sender_side_file
sent_raw_path = path + "raw/" + sent_file_name
sent_qlog_file_path = sent_raw_path + ".qlog"
sent_json_file_path = sent_raw_path + ".json"
sent_csv_file_path = sent_raw_path + ".csv"
sent_json_entry = QlogToJsonEntry(sent_qlog_file_path)
QlogToJson(sent_json_entry, sent_json_file_path)
JsonToCsv(sent_json_entry, sent_csv_file_path)

In [1492]:
received_raw_path = path + "raw/" + received_file_name
received_qlog_file_path = received_raw_path + ".qlog"
received_json_file_path = received_raw_path + ".json"
received_csv_file_path = received_raw_path + ".csv"
received_json_entry = QlogToJsonEntry(received_qlog_file_path)
QlogToJson(received_json_entry, received_json_file_path)
JsonToCsv(received_json_entry, received_csv_file_path)

In [1493]:
sent_df = pd.read_csv(sent_csv_file_path)
received_df = pd.read_csv(received_csv_file_path)

Set time to UMT+8.

In [1494]:
def GetStartTime(json_data):
    # unit: ms
    refTime = json_data[0]["trace"]["common_fields"]["reference_time"]
    return refTime

def ProcessTime(df, reference_time):
    # Extract the "time" values from the DataFrame
    original_times = (df['time'].astype(float))

    # Calculate "epoch_time" and convert to timestamps
    epoch_times = (reference_time + original_times)
    timestamps = pd.to_datetime(epoch_times, unit='ms').dt.strftime('%Y-%m-%d %H:%M:%S.%f')

    df['epoch_time'] = epoch_times
    df['timestamp'] = timestamps

    return df

In [1495]:
# No matter downlink or uplink, the file time that need to change is client side.
if int(port)%2 == 0: # UL
    clientStartTime = GetStartTime(sent_json_entry)
    print(clientStartTime)
    serverStartTime = GetStartTime(received_json_entry)
    print(serverStartTime)

    senderRefTime = clientStartTime + mean_diff
    rcverRefTime = serverStartTime

else:   # DL
    clientStartTime = GetStartTime(received_json_entry)
    print(clientStartTime)
    serverStartTime = GetStartTime(sent_json_entry)
    print(serverStartTime)
    startTimeDiff = (clientStartTime - serverStartTime) + mean_diff

    senderRefTime = serverStartTime
    rcverRefTime = clientStartTime + mean_diff


1706930525951.65
1706930516053.0598


In [1496]:
sent_df = ProcessTime(sent_df, senderRefTime)
# Add 8 hours to both epoch times and timestamps to match UMT+8
# Also sync time with server
epoch_times_gmt8 = sent_df["epoch_time"] + 8 * 3600 * 1000
sent_df["epoch_time"] = epoch_times_gmt8
timestamps_gmt8 = pd.to_datetime(epoch_times_gmt8, unit='ms').dt.strftime('%Y-%m-%d %H:%M:%S.%f')
sent_df["timestamp"] = timestamps_gmt8

sent_df[-5:]

Unnamed: 0,time,name,data,epoch_time,timestamp
573564,300059.398715,recovery:metrics_updated,"{'smoothed_rtt': 25.815, 'latest_rtt': 25.2618...",1706960000000.0,2024-02-03 11:26:56.112458
573565,300059.40123,recovery:loss_timer_updated,{'event_type': 'cancelled'},1706960000000.0,2024-02-03 11:26:56.112461
573566,300059.402983,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:26:56.112462
573567,300059.410531,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:26:56.112470
573568,300059.429244,transport:connection_closed,"{'owner': 'remote', 'application_code': 0, 're...",1706960000000.0,2024-02-03 11:26:56.112488


In [1497]:
received_df = ProcessTime(received_df, rcverRefTime)
# if the sender is server, then it is no need to calculate time difference
epoch_times_gmt8 = received_df["epoch_time"] + 8 * 3600 * 1000
received_df["epoch_time"] = epoch_times_gmt8
timestamps_gmt8 = pd.to_datetime(epoch_times_gmt8, unit='ms').dt.strftime('%Y-%m-%d %H:%M:%S.%f')
received_df["timestamp"] = timestamps_gmt8

received_df[-5:]

Unnamed: 0,time,name,data,epoch_time,timestamp
201278,300106.17426,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:26:56.101173
201279,300106.202229,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:26:56.101201
201280,300106.261604,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'dcid': '0d...",1706960000000.0,2024-02-03 11:26:56.101261
201281,300107.572802,transport:connection_closed,"{'owner': 'local', 'application_code': 0, 'rea...",1706960000000.0,2024-02-03 11:26:56.102572
201282,300107.660094,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'dcid': '0d...",1706960000000.0,2024-02-03 11:26:56.102659


Parse the data.

In [1498]:
# sender side data
metrics_all_rows = sent_df[(sent_df['name'] == 'recovery:metrics_updated') & (sent_df['data'].str.contains("'bytes_in_flight':"))]
metrics_sent_rows = sent_df[(sent_df['name'] == 'recovery:metrics_updated') & (sent_df['data'].str.contains("{'bytes_in_flight':"))]
metrics_ack_rows = sent_df[(sent_df['name'] == 'recovery:metrics_updated') & (sent_df['data'].str.contains("'latest_rtt':"))]
total_sent_rows = sent_df[(sent_df['name'] == 'transport:packet_sent')]
pk_sent_rows = sent_df[(sent_df['name'] == 'transport:packet_sent') & (sent_df['data'].str.contains("'frame_type': 'stream'"))]
rcv_ack_rows = sent_df[(sent_df['name'] == 'transport:packet_received') & (sent_df['data'].str.contains("'frame_type': 'ack'")) & (sent_df['data'].str.contains("'packet_type': '1RTT'"))]
lost_rows = sent_df[sent_df['name'] == 'recovery:packet_lost']

# Get the count of rows
metrics_all_cnt = len(metrics_all_rows)
metrics_c_cnt = len(metrics_sent_rows)
metrics_ack_cnt = len(metrics_ack_rows)
total_sent_cnt = len(total_sent_rows)
pk_sent_cnt = len(pk_sent_rows)
rcv_ack_cnt = len(rcv_ack_rows)
lost_cnt = len(lost_rows)

print("packet_sent: ", pk_sent_cnt, metrics_c_cnt)
print("ack: ", rcv_ack_cnt, metrics_ack_cnt)
print(metrics_all_cnt, metrics_c_cnt, metrics_ack_cnt, pk_sent_cnt, rcv_ack_cnt, lost_cnt)

packet_sent:  148959 148972
ack:  42081 42048
191020 148972 42048 148959 42081 67


In [1499]:
pk_rcv_rows = received_df[(received_df['name'] == "transport:packet_received") & (received_df['data'].str.contains("'frame_type': 'stream'"))]
pk_rcv_rows = pk_rcv_rows.reset_index(drop=True)
print(len(pk_rcv_rows))
pk_rcv_rows[:5]

148938


Unnamed: 0,time,name,data,epoch_time,timestamp
0,95.740104,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.090739
1,102.013281,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.097012
2,108.214271,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.103213
3,108.24151,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.103240
4,108.253125,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.103252


## Deal with sender side data
Concat `transport:packet_sent` & `recovery:metrics_updated`.

In [1500]:
metrics_sent_csv_file_path = path + "middle/" + f"sent_metrics_{time}_{port}.csv"
metrics_sent_rows.to_csv(metrics_sent_csv_file_path, index=False)
pk_sent_csv_file_path = path + "middle/" + f"pk_sent_{time}_{port}.csv"
pk_sent_rows.to_csv(pk_sent_csv_file_path, index=False)

In [1501]:
def insert(df, idx, new_row):
    df1 = df.iloc[:idx, :]
    df2 = df.iloc[idx:, :]
    df_new = pd.concat([df1, new_row, df2], ignore_index=True)
    return df_new

In [1502]:
metrics_sent_rows = metrics_sent_rows.reset_index(drop=True)
pk_sent_rows = pk_sent_rows.reset_index(drop=True)
print(metrics_sent_rows[:5])
print(pk_sent_rows[:5])


        time                      name  \
0   2.227411  recovery:metrics_updated   
1   2.228884  recovery:metrics_updated   
2  32.114858  recovery:metrics_updated   
3  32.450625  recovery:metrics_updated   
4  34.173221  recovery:metrics_updated   

                                                data    epoch_time  \
0  {'bytes_in_flight': 1161, 'packets_in_flight': 2}  1.706959e+12   
1  {'bytes_in_flight': 1252, 'packets_in_flight': 3}  1.706959e+12   
2   {'bytes_in_flight': 307, 'packets_in_flight': 1}  1.706959e+12   
3   {'bytes_in_flight': 578, 'packets_in_flight': 2}  1.706959e+12   
4   {'bytes_in_flight': 851, 'packets_in_flight': 3}  1.706959e+12   

                    timestamp  
0  2024-02-03 11:21:56.055287  
1  2024-02-03 11:21:56.055288  
2  2024-02-03 11:21:56.085174  
3  2024-02-03 11:21:56.085510  
4  2024-02-03 11:21:56.087233  
        time                   name  \
0  32.443277  transport:packet_sent   
1  34.163407  transport:packet_sent   
2  36.042576  tra

In [1503]:
ori_recover_c_len = len(metrics_sent_rows)
for i in range(pk_sent_cnt):
    if(i >= len(metrics_sent_rows)):
        data = metrics_sent_rows.iloc[i-1]['data']
        new_row_data = {'time': [pk_sent_rows.iloc[i]['time']], 'name':['recovery:metrics_updated'], 'data': [data]}
        new_row = pd.DataFrame(new_row_data)
        metrics_sent_rows = pd.concat([metrics_sent_rows, new_row], ignore_index=True)
        continue
    time_diff = metrics_sent_rows.iloc[i]['time'] - pk_sent_rows.iloc[i]['time']
    # print(i, time_diff)
    # time_diff >= 1: not the matching metrics_update
    while time_diff >= 1:
        data = metrics_sent_rows.iloc[i-1]['data']
        new_row_data = {'time': [pk_sent_rows.iloc[i]['time']], 'name':['recovery:metrics_updated'], 'data': [data]}
        new_row = pd.DataFrame(new_row_data)
        # print(new_row)
        metrics_sent_rows = insert(metrics_sent_rows, i, new_row)
        time_diff = metrics_sent_rows.iloc[i]['time'] - pk_sent_rows.iloc[i]['time']
    # time_diff < 0: missing metrics_update
    while time_diff < 0:
        # print(i, time_diff_list)
        metrics_sent_rows.drop(index=metrics_sent_rows.index[i], inplace=True)
        time_diff = metrics_sent_rows.iloc[i]['time'] - pk_sent_rows.iloc[i]['time']

    

# if len(metrics_sent_rows) < pk_sent_cnt:
#     d = pk_sent_cnt - len(metrics_sent_rows)
# data = metrics_sent_rows.iloc[len(metrics_sent_rows)-1]['data']

# for i in range(d):
#     last_row_data = {'time': [pk_sent_rows.iloc[len(metrics_sent_rows)-1]['time']], 'name':['recovery:metrics_updated'], 'data': [data]}
#     new_row_df = pd.DataFrame(last_row_data)
#     metrics_sent_rows = pd.concat([metrics_sent_rows, new_row], ignore_index=True)

print(ori_recover_c_len, len(metrics_sent_rows))


148972 148959


In [1504]:
metrics_sent_rows = metrics_sent_rows.reset_index(drop=True)
pk_sent_rows = pk_sent_rows.reset_index(drop=True)
print(len(metrics_sent_rows), len(pk_sent_rows))

# check whether there's still mismatch exist.
time_diff_list = metrics_sent_rows['time'] - pk_sent_rows['time']
mismatch_indices = time_diff_list[(time_diff_list >= 1) | (time_diff_list < 0)].index
if len(mismatch_indices) == 0:
    print("All Matched!")
else:
    print(mismatch_indices)


148959 148959
All Matched!


In [1505]:
# extract bytes_in_flight & packets_in_flight
metrics_sent_rows['bytes_in_flight'] = None
metrics_sent_rows['packets_in_flight'] = None

# Use ast.literal_eval to safely evaluate the string and extract 'bytes_in_flight' and 'packets_in_flight'
metrics_sent_rows[['bytes_in_flight', 'packets_in_flight']] = metrics_sent_rows['data'].apply(
    lambda x: pd.Series(ast.literal_eval(x)) if isinstance(x, str) else pd.Series([None, None]))

metrics_sent_rows[:5]

Unnamed: 0,time,name,data,epoch_time,timestamp,bytes_in_flight,packets_in_flight
0,32.450625,recovery:metrics_updated,"{'bytes_in_flight': 578, 'packets_in_flight': 2}",1706959000000.0,2024-02-03 11:21:56.085510,578.0,2.0
1,34.173221,recovery:metrics_updated,"{'bytes_in_flight': 851, 'packets_in_flight': 3}",1706959000000.0,2024-02-03 11:21:56.087233,851.0,3.0
2,36.046066,recovery:metrics_updated,"{'bytes_in_flight': 1124, 'packets_in_flight': 4}",1706959000000.0,2024-02-03 11:21:56.089105,1124.0,4.0
3,38.122324,recovery:metrics_updated,"{'bytes_in_flight': 1397, 'packets_in_flight': 5}",1706959000000.0,2024-02-03 11:21:56.091182,1397.0,5.0
4,40.136013,recovery:metrics_updated,"{'bytes_in_flight': 1670, 'packets_in_flight': 6}",1706959000000.0,2024-02-03 11:21:56.093195,1670.0,6.0


In [1506]:
# Add bytes_in_flight & packets_in_flight to pk_sent_rows
pk_sent_rows['bytes_in_flight'] = metrics_sent_rows['bytes_in_flight']
pk_sent_rows['packets_in_flight'] = metrics_sent_rows['packets_in_flight']

pk_sent_rows[:5]

Unnamed: 0,time,name,data,epoch_time,timestamp,bytes_in_flight,packets_in_flight
0,32.443277,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.085503,578.0,2.0
1,34.163407,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.087223,851.0,3.0
2,36.042576,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.089102,1124.0,4.0
3,38.120828,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.091180,1397.0,5.0
4,40.132247,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.093192,1670.0,6.0


Concat `transport:packet_received` & `recovery:metrics_updated`.

In [1507]:
metrics_ack_csv_file_path = path + "middle/" + f"ack_metrics_{time}_{port}.csv" 
metrics_ack_rows.to_csv(metrics_ack_csv_file_path, index=False)
rcv_ack_csv_file_path = path + "middle/" + f"rcv_ack_{time}_{port}.csv"
rcv_ack_rows.to_csv(rcv_ack_csv_file_path, index=False)

In [1508]:
metrics_ack_rows = metrics_ack_rows.reset_index(drop=True)
rcv_ack_rows = rcv_ack_rows.reset_index(drop=True)
initial_ack_metrics = metrics_ack_rows.iloc[[0]]
metrics_ack_rows.drop(index=metrics_ack_rows.index[0], inplace=True)
metrics_ack_rows = metrics_ack_rows.reset_index(drop=True)

In [1509]:
metrics_ack_rows[:3]

Unnamed: 0,time,name,data,epoch_time,timestamp
0,31.872711,recovery:metrics_updated,"{'min_rtt': 29.619015, 'smoothed_rtt': 29.6190...",1706959000000.0,2024-02-03 11:21:56.084932
1,32.067919,recovery:metrics_updated,"{'smoothed_rtt': 29.62, 'latest_rtt': 29.63140...",1706959000000.0,2024-02-03 11:21:56.085127
2,64.093067,recovery:metrics_updated,"{'smoothed_rtt': 29.85, 'latest_rtt': 31.46769...",1706959000000.0,2024-02-03 11:21:56.117152


In [1510]:
rcv_ack_rows[:3]

Unnamed: 0,time,name,data,epoch_time,timestamp
0,32.069466,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.085129
1,64.093868,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.117153
2,76.800677,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.129860


In [1511]:
print(len(metrics_ack_rows), len(rcv_ack_rows))
for i in range(rcv_ack_cnt):
    if(i >= len(metrics_ack_rows)):
        data = metrics_ack_rows.iloc[i-1]['data']
        new_row_data = {'time': [rcv_ack_rows.iloc[i]['time']], 'name':['recovery:metrics_updated'], 'data': [data]}
        new_row = pd.DataFrame(new_row_data)
        metrics_ack_rows = pd.concat([metrics_ack_rows, new_row], ignore_index=True)
        continue
    time_diff = metrics_ack_rows.iloc[i]['time'] - rcv_ack_rows.iloc[i]['time']
    # time_diff >= 1: not the matching metrics_update
    while time_diff > 0:
        # print("> 0:", i, time_diff)
        if i == 0:
            data = initial_ack_metrics.iloc[0]['data']
        else:
            data = metrics_ack_rows.iloc[i-1]['data']
        new_row_data = {'time': [rcv_ack_rows.iloc[i]['time']], 'name':['recovery:metrics_updated'], 'data': [data]}
        new_row = pd.DataFrame(new_row_data)
        metrics_ack_rows = insert(metrics_ack_rows, i, new_row)
        time_diff = metrics_ack_rows.iloc[i]['time'] - rcv_ack_rows.iloc[i]['time']
    # time_diff < 0: missing metrics_update
    while time_diff <= -1:
        # print("<= -1:", i, time_diff)
        metrics_ack_rows.drop(index=metrics_ack_rows.index[i], inplace=True)
        time_diff = metrics_ack_rows.iloc[i]['time'] - rcv_ack_rows.iloc[i]['time']
print(len(metrics_ack_rows), len(rcv_ack_rows))

42047 42081
42081 42081


In [1512]:
metrics_ack_rows = metrics_ack_rows.reset_index(drop=True)
rcv_ack_rows = rcv_ack_rows.reset_index(drop=True)

# check whether there's still mismatch exist.
time_diff_list = metrics_ack_rows['time'] - rcv_ack_rows['time']
mismatch_indices = time_diff_list[(time_diff_list <= -1) | (time_diff_list > 0)].index
if len(mismatch_indices) == 0:
    print("All Matched!")
else:
    print(mismatch_indices)

Index([8149, 13305, 34956], dtype='int64')


In [1513]:
ack_json_list = []
## Add the initial_ack_metrics for temporary
print(initial_ack_metrics)
metrics_ack_rows = pd.concat([initial_ack_metrics, metrics_ack_rows], axis=0).reset_index(drop=True)
for i in range(len(metrics_ack_rows)):
    s = metrics_ack_rows.iloc[i]['data'].replace("\'", "\"")
    json_object = json.loads(s)
    ack_json_list.append(json_object)

metrics_ack_df = pd.DataFrame(ack_json_list)
# Fill missing values in each row with the previous row's values
metrics_ack_df = metrics_ack_df.ffill(axis=0)

## drop initial_ack_metrics
metrics_ack_rows.drop(index=metrics_ack_rows.index[0], inplace=True)
metrics_ack_rows = metrics_ack_rows.reset_index(drop=True)
metrics_ack_df.drop(index=metrics_ack_df.index[0], inplace=True)
metrics_ack_df = metrics_ack_df.reset_index(drop=True)
metrics_ack_df[:5]

       time                      name  \
0  2.224154  recovery:metrics_updated   

                                                data    epoch_time  \
0  {'min_rtt': 0, 'smoothed_rtt': 0, 'latest_rtt'...  1.706959e+12   

                    timestamp  
0  2024-02-03 11:21:56.055283  


Unnamed: 0,min_rtt,smoothed_rtt,latest_rtt,rtt_variance,congestion_window,bytes_in_flight,packets_in_flight
0,29.619015,29.619015,29.619015,14.809507,40064.0,815,2.0
1,29.619015,29.85,31.467699,8.793,40064.0,4368,16.0
2,28.760304,29.713,28.760304,6.867,40064.0,3822,14.0
3,22.774934,28.845,22.774934,6.884,40064.0,3003,11.0
4,16.706207,27.327,16.706207,8.197,40064.0,2184,8.0


In [1514]:
metrics_ack_rows = pd.concat([metrics_ack_rows, metrics_ack_df], axis=1).reset_index(drop=True)
# since we have parse out all the information in data, we can drop the data cl=olumn
metrics_ack_rows = metrics_ack_rows.drop(columns=['data'])
metrics_ack_rows[:5]

Unnamed: 0,time,name,epoch_time,timestamp,min_rtt,smoothed_rtt,latest_rtt,rtt_variance,congestion_window,bytes_in_flight,packets_in_flight
0,31.872711,recovery:metrics_updated,1706959000000.0,2024-02-03 11:21:56.084932,29.619015,29.619015,29.619015,14.809507,40064.0,815,2.0
1,64.093067,recovery:metrics_updated,1706959000000.0,2024-02-03 11:21:56.117152,29.619015,29.85,31.467699,8.793,40064.0,4368,16.0
2,76.797772,recovery:metrics_updated,1706959000000.0,2024-02-03 11:21:56.129857,28.760304,29.713,28.760304,6.867,40064.0,3822,14.0
3,76.804359,recovery:metrics_updated,1706959000000.0,2024-02-03 11:21:56.129864,22.774934,28.845,22.774934,6.884,40064.0,3003,11.0
4,76.806917,recovery:metrics_updated,1706959000000.0,2024-02-03 11:21:56.129866,16.706207,27.327,16.706207,8.197,40064.0,2184,8.0


In [1515]:
# Check whehter the length is equal before concating metrics into rcv_ack_rows
print(len(rcv_ack_rows), len(metrics_ack_df))

42081 42081


In [1516]:
rcv_ack_rows = pd.concat([rcv_ack_rows, metrics_ack_df], axis=1)
rcv_ack_rows = rcv_ack_rows.reset_index(drop=True)

print(len(rcv_ack_rows), len(metrics_ack_df))
rcv_ack_rows[-5:]

42081 42081


Unnamed: 0,time,name,data,epoch_time,timestamp,min_rtt,smoothed_rtt,latest_rtt,rtt_variance,congestion_window,bytes_in_flight,packets_in_flight
42076,300036.851795,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:26:56.089911,11.507035,28.498,24.498575,7.696,32142.0,3275,11.0
42077,300036.854124,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:26:56.089913,11.507035,27.259,18.589037,8.249,32142.0,2450,8.0
42078,300048.88982,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:26:56.101949,11.507035,26.918,24.536086,6.867,32142.0,1625,5.0
42079,300048.893131,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:26:56.101952,11.507035,25.895,18.735444,7.195,32142.0,800,2.0
42080,300059.402983,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:26:56.112462,11.507035,25.815,25.261895,5.554,32142.0,0,2.0


Mapping the ACK ranges

In [1517]:
acked_ranges_series = rcv_ack_rows['data']
acked_ranges_list = []
for i in range(len(acked_ranges_series)):
    s = acked_ranges_series.iloc[i]
    data_dict = json.loads(s.replace("\'", "\""))
    # Extract 'acked_ranges' from all frames
    acked_ranges = [range_entry for frame in data_dict['frames'] if 'acked_ranges' in frame for range_entry in frame['acked_ranges']]
    acked_ranges_list.append(acked_ranges)

acked_ranges_df = pd.DataFrame({"acked_ranges": acked_ranges_list})
acked_ranges_df[:5]

Unnamed: 0,acked_ranges
0,[[0]]
1,"[[0, 2]]"
2,"[[0, 10]]"
3,"[[0, 13]]"
4,"[[0, 16]]"


In [1518]:
rcv_ack_rows = pd.concat([rcv_ack_rows, acked_ranges_df], axis=1)
rcv_ack_rows = rcv_ack_rows.reset_index(drop=True)

rcv_ack_rows[:5]

Unnamed: 0,time,name,data,epoch_time,timestamp,min_rtt,smoothed_rtt,latest_rtt,rtt_variance,congestion_window,bytes_in_flight,packets_in_flight,acked_ranges
0,32.069466,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.085129,29.619015,29.619015,29.619015,14.809507,40064.0,815,2.0,[[0]]
1,64.093868,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.117153,29.619015,29.85,31.467699,8.793,40064.0,4368,16.0,"[[0, 2]]"
2,76.800677,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.129860,28.760304,29.713,28.760304,6.867,40064.0,3822,14.0,"[[0, 10]]"
3,76.80491,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.129864,22.774934,28.845,22.774934,6.884,40064.0,3003,11.0,"[[0, 13]]"
4,76.80739,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.129867,16.706207,27.327,16.706207,8.197,40064.0,2184,8.0,"[[0, 16]]"


In [1519]:
# parse out the packet_number & offset & length
pk_sent_series =  pk_sent_rows['data']
pk_num_list = []
offset_list = []
length_list = []
for i in range(len(pk_sent_series)):
    s = pk_sent_series.iloc[i]
    data_dict = json.loads(s.replace("\'", "\""))
    packet_number = data_dict['header']['packet_number']
    # Initialize offset to None in case 'frame_type': 'stream' is not found
    offset = None
    # Iterate through frames to find 'offset' for 'frame_type': 'stream'
    for frame in data_dict.get('frames', []):
        if frame.get('frame_type') == 'stream':
            offset = frame.get('offset')
            length = frame.get('length')
            break  # Stop iterating once 'offset' is found
    
    pk_num_list.append(packet_number)
    offset_list.append(offset)
    length_list.append(length)

pk_num_df = pd.DataFrame({"packet_number": pk_num_list, "offset": offset_list, "length": length_list})
pk_num_df[:5]

Unnamed: 0,packet_number,offset,length
0,2,0,250
1,3,250,250
2,4,500,250
3,5,750,250
4,6,1000,250


In [1520]:
pk_sent_rows = pd.concat([pk_sent_rows, pk_num_df], axis=1)
pk_sent_rows = pk_sent_rows.reset_index(drop=True)

pk_sent_rows[:5]

Unnamed: 0,time,name,data,epoch_time,timestamp,bytes_in_flight,packets_in_flight,packet_number,offset,length
0,32.443277,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.085503,578.0,2.0,2,0,250
1,34.163407,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.087223,851.0,3.0,3,250,250
2,36.042576,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.089102,1124.0,4.0,4,500,250
3,38.120828,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.091180,1397.0,5.0,5,750,250
4,40.132247,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.093192,1670.0,6.0,6,1000,250


In [1521]:
pk_sent_rows['smoothed_rtt'] = np.nan
pk_sent_rows['latest_rtt'] = np.nan
pk_sent_rows['rtt_variance'] = np.nan
pk_sent_rows['congestion_window'] = np.nan

pk_sent_rows[:5]

Unnamed: 0,time,name,data,epoch_time,timestamp,bytes_in_flight,packets_in_flight,packet_number,offset,length,smoothed_rtt,latest_rtt,rtt_variance,congestion_window
0,32.443277,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.085503,578.0,2.0,2,0,250,,,,
1,34.163407,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.087223,851.0,3.0,3,250,250,,,,
2,36.042576,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.089102,1124.0,4.0,4,500,250,,,,
3,38.120828,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.091180,1397.0,5.0,5,750,250,,,,
4,40.132247,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.093192,1670.0,6.0,6,1000,250,,,,


In [1522]:
def update_pk_sent_rows(row):
    acked_ranges = row['acked_ranges']
    smoothed_rtt = row['smoothed_rtt']
    latest_rtt = row['latest_rtt']
    rtt_variance = row['rtt_variance']
    congestion_window = row['congestion_window']

    for ack_range in acked_ranges:
        start_packet, end_packet = ack_range[0], ack_range[-1]
        existing_packets = set(pk_sent_rows['packet_number'])
        packet_numbers_to_update = set(range(start_packet, end_packet + 1)).intersection(existing_packets)

        mask = pk_sent_rows['packet_number'].isin(packet_numbers_to_update)
        pk_sent_rows.loc[mask, 'smoothed_rtt'] = pk_sent_rows.loc[mask, 'smoothed_rtt'].fillna(smoothed_rtt)
        pk_sent_rows.loc[mask, 'latest_rtt'] = pk_sent_rows.loc[mask, 'latest_rtt'].fillna(latest_rtt)
        pk_sent_rows.loc[mask, 'congestion_window'] = pk_sent_rows.loc[mask, 'congestion_window'].fillna(congestion_window)
        pk_sent_rows.loc[mask, 'rtt_variance'] = pk_sent_rows.loc[mask, 'rtt_variance'].fillna(rtt_variance)

# Apply the custom update function to each row in rcv_ack_rows
rcv_ack_rows.apply(update_pk_sent_rows, axis=1)

# Display the updated pk_sent_rows
pk_sent_rows[:5]

Unnamed: 0,time,name,data,epoch_time,timestamp,bytes_in_flight,packets_in_flight,packet_number,offset,length,smoothed_rtt,latest_rtt,rtt_variance,congestion_window
0,32.443277,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.085503,578.0,2.0,2,0,250,29.85,31.467699,8.793,40064.0
1,34.163407,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.087223,851.0,3.0,3,250,250,29.713,28.760304,6.867,40064.0
2,36.042576,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.089102,1124.0,4.0,4,500,250,29.713,28.760304,6.867,40064.0
3,38.120828,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.091180,1397.0,5.0,5,750,250,29.713,28.760304,6.867,40064.0
4,40.132247,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.093192,1670.0,6.0,6,1000,250,29.713,28.760304,6.867,40064.0


Identify lost packets

In [1523]:
# Use ast.literal_eval to safely evaluate the string and extract 'packet_number'
lost_rows['packet_number'] = lost_rows['data'].apply(lambda x: ast.literal_eval(x)['header']['packet_number'] if isinstance(x, str) else None)
lost_rows['trigger'] = lost_rows['data'].apply(lambda x: ast.literal_eval(x)['trigger'] if isinstance(x, str) else None)
lost_rows[:5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lost_rows['packet_number'] = lost_rows['data'].apply(lambda x: ast.literal_eval(x)['header']['packet_number'] if isinstance(x, str) else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lost_rows['trigger'] = lost_rows['data'].apply(lambda x: ast.literal_eval(x)['trigger'] if isinstance(x, str) else None)


Unnamed: 0,time,name,data,epoch_time,timestamp,packet_number,trigger
22792,11838.02594,recovery:packet_lost,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:22:07.891085,5900,time_threshold
74382,38354.949676,recovery:packet_lost,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:22:34.408009,19074,time_threshold
82710,42693.760191,recovery:packet_lost,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:22:38.746820,21235,time_threshold
82719,42747.229921,recovery:packet_lost,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:22:38.800289,21260,time_threshold
360451,186335.165921,recovery:packet_lost,"{'header': {'packet_type': '1RTT', 'packet_num...",1706960000000.0,2024-02-03 11:25:02.388225,92804,reordering_threshold


In [1524]:
lost_pk_csv_file_path = path + "middle/" + f"lost_pk_{time}_{port}.csv"
lost_rows.to_csv(lost_pk_csv_file_path, index=False)

In [1525]:
## set to True if the packet is lost
pk_sent_rows['packet_lost'] = False

# Iterate through rows and set 'packet_lost' to True where 'packet_number' values match
for _, lost_row in lost_rows.iterrows():
    packet_number = lost_row['packet_number']
    
    # Check if 'packet_number' exists in pk_sent_rows
    if packet_number in pk_sent_rows['packet_number'].values:
        pk_sent_rows.loc[pk_sent_rows['packet_number'] == packet_number, 'packet_lost'] = True

pk_sent_rows[19340:19345]

Unnamed: 0,time,name,data,epoch_time,timestamp,bytes_in_flight,packets_in_flight,packet_number,offset,length,smoothed_rtt,latest_rtt,rtt_variance,congestion_window,packet_lost
19340,38738.077609,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:22:34.791137,5225.0,19.0,19356,4838250,250,29.008,22.924163,6.704,28720.0,False
19341,38740.128885,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:22:34.793188,5500.0,20.0,19357,4838500,250,29.008,22.924163,6.704,28720.0,False
19342,38742.124926,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:22:34.795184,5775.0,21.0,19358,4838750,250,29.008,22.924163,6.704,28720.0,False
19343,38744.010541,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:22:34.797070,3850.0,14.0,19359,4839000,250,27.522,17.120025,7.999,28720.0,False
19344,38746.115186,transport:packet_sent,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:22:34.799175,4125.0,15.0,19360,4839250,250,27.522,17.120025,7.999,28720.0,False


In [1526]:
cols = ['time', 'epoch_time', 'timestamp', 'name', 'packet_number', 'offset', 'length', 'bytes_in_flight', 'packets_in_flight', 'smoothed_rtt', 'latest_rtt', 'rtt_variance', 'congestion_window', 'packet_lost', 'data']
processed_df = pk_sent_rows[cols]
processed_df[:5]

Unnamed: 0,time,epoch_time,timestamp,name,packet_number,offset,length,bytes_in_flight,packets_in_flight,smoothed_rtt,latest_rtt,rtt_variance,congestion_window,packet_lost,data
0,32.443277,1706959000000.0,2024-02-03 11:21:56.085503,transport:packet_sent,2,0,250,578.0,2.0,29.85,31.467699,8.793,40064.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."
1,34.163407,1706959000000.0,2024-02-03 11:21:56.087223,transport:packet_sent,3,250,250,851.0,3.0,29.713,28.760304,6.867,40064.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."
2,36.042576,1706959000000.0,2024-02-03 11:21:56.089102,transport:packet_sent,4,500,250,1124.0,4.0,29.713,28.760304,6.867,40064.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."
3,38.120828,1706959000000.0,2024-02-03 11:21:56.091180,transport:packet_sent,5,750,250,1397.0,5.0,29.713,28.760304,6.867,40064.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."
4,40.132247,1706959000000.0,2024-02-03 11:21:56.093192,transport:packet_sent,6,1000,250,1670.0,6.0,29.713,28.760304,6.867,40064.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."


In [1527]:
csv_file_path = path + "data/" + f"processed_sent_{time}_{port}.csv"
processed_df.to_csv(csv_file_path, sep='@', index=False)

In [1528]:
weird_length_list =[]
for i in range(len(processed_df)):
    if processed_df.iloc[i]['length'] != 250:
        weird_length_list.append(processed_df.iloc[i])

weird_length_df = pd.DataFrame(weird_length_list)

In [1529]:
print(len(weird_length_df))
weird_length_df[:5]

337


Unnamed: 0,time,epoch_time,timestamp,name,packet_number,offset,length,bytes_in_flight,packets_in_flight,smoothed_rtt,latest_rtt,rtt_variance,congestion_window,packet_lost,data
2501,5036.065482,1706959000000.0,2024-02-03 11:22:01.089125,transport:packet_sent,2511,625250,500,8775.0,31.0,64.01,63.81458,15.659,58615.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."
6915,13878.509406,1706959000000.0,2024-02-03 11:22:09.931569,transport:packet_sent,6928,1728750,1250,1550.0,2.0,22.459,16.562239,4.889,58615.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."
6917,13878.658381,1706959000000.0,2024-02-03 11:22:09.931718,transport:packet_sent,6930,1730250,750,2600.0,4.0,22.459,16.562239,4.889,58615.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."
15487,31027.998748,1706959000000.0,2024-02-03 11:22:27.081058,transport:packet_sent,15502,3873250,1250,2925.0,7.0,23.25,23.65796,2.043,58615.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."
17148,34358.780039,1706959000000.0,2024-02-03 11:22:30.411839,transport:packet_sent,17163,4289500,1250,3207.0,8.0,23.462,22.422372,3.088,58615.0,False,"{'header': {'packet_type': '1RTT', 'packet_num..."


In [1530]:
# Sum up the 'length' column
total_length = processed_df['length'].sum()

print(f'Total Length: {total_length}')

Total Length: 37524250


## Receiver side data

In [1531]:
pk_rcv_df = pk_rcv_rows.reset_index(drop=True)
pk_rcv_df[:5]

Unnamed: 0,time,name,data,epoch_time,timestamp
0,95.740104,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.090739
1,102.013281,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.097012
2,108.214271,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.103213
3,108.24151,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.103240
4,108.253125,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.103252


In [1532]:
pk_rcv_series =  pk_rcv_df['data']
pk_rcv_num_list = []
offset_rcv_list = []
length_rcv_list = []
for i in range(len(pk_rcv_series)):
    s = pk_rcv_series.iloc[i]
    data_dict = json.loads(s.replace("\'", "\""))
    packet_number = data_dict['header']['packet_number']
    # Initialize offset to None in case 'frame_type': 'stream' is not found
    offset = None
    # Iterate through frames to find 'offset' for 'frame_type': 'stream'
    for frame in data_dict.get('frames', []):
        if frame.get('frame_type') == 'stream':
            offset = frame.get('offset')
            length = frame.get('length')
            break  # Stop iterating once 'offset' is found
    
    pk_rcv_num_list.append(packet_number)
    offset_rcv_list.append(offset)
    length_rcv_list.append(length)

pk_rcv_df['packet_number'] = pk_rcv_num_list
pk_rcv_df['offset'] = offset_rcv_list
pk_rcv_df['length'] = length_rcv_list

pk_rcv_df[:5]

Unnamed: 0,time,name,data,epoch_time,timestamp,packet_number,offset,length
0,95.740104,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.090739,2,0,250
1,102.013281,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.097012,3,250,250
2,108.214271,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.103213,4,500,250
3,108.24151,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.103240,5,750,250
4,108.253125,transport:packet_received,"{'header': {'packet_type': '1RTT', 'packet_num...",1706959000000.0,2024-02-03 11:21:56.103252,6,1000,250


In [1533]:
cols = ['time', 'epoch_time', 'timestamp', 'name', 'packet_number', 'offset', 'length', 'data']
processed_rcv_df = pk_rcv_df[cols]
processed_rcv_df[:5]


Unnamed: 0,time,epoch_time,timestamp,name,packet_number,offset,length,data
0,95.740104,1706959000000.0,2024-02-03 11:21:56.090739,transport:packet_received,2,0,250,"{'header': {'packet_type': '1RTT', 'packet_num..."
1,102.013281,1706959000000.0,2024-02-03 11:21:56.097012,transport:packet_received,3,250,250,"{'header': {'packet_type': '1RTT', 'packet_num..."
2,108.214271,1706959000000.0,2024-02-03 11:21:56.103213,transport:packet_received,4,500,250,"{'header': {'packet_type': '1RTT', 'packet_num..."
3,108.24151,1706959000000.0,2024-02-03 11:21:56.103240,transport:packet_received,5,750,250,"{'header': {'packet_type': '1RTT', 'packet_num..."
4,108.253125,1706959000000.0,2024-02-03 11:21:56.103252,transport:packet_received,6,1000,250,"{'header': {'packet_type': '1RTT', 'packet_num..."


In [1534]:
csv_file_path = path + "data/" + f"processed_rcv_{time}_{port}.csv"
processed_rcv_df.to_csv(csv_file_path, sep='@')

In [None]:
# weird_length_list =[]
# for i in range(len(pk_rcv_df)):
#     if pk_rcv_df.iloc[i]['length'] != 250:
#         weird_length_list.append(pk_rcv_df.iloc[i])

# weird_length_df = pd.DataFrame(weird_length_list)

In [None]:
# print(len(weird_length_df))
# weird_length_df[:5]