In [1]:
import pandas as pd
import json

def _to_json_safe(value):
    """Convert one spreadsheet cell into a value that serialises as strict JSON.

    Missing scalars (None, NaN, NaT, pd.NA) become None so they are written as
    ``null``; values exposing ``isoformat`` (timestamps, dates, times) become
    ISO-8601 strings; everything else is returned unchanged.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if hasattr(value, 'isoformat'):
        return value.isoformat()
    return value


def xls_to_json(file_path, output_path):
    """Convert an Excel workbook into a JSON file holding one object per row.

    The workbook is read with the default ``pd.read_excel`` settings. Each row
    becomes a dict keyed by column name, and the list of rows is written to
    ``output_path`` as UTF-8, pretty-printed JSON.

    Args:
        file_path: Path of the Excel file to read.
        output_path: Path of the JSON file to create (overwritten if present).

    Returns:
        None. The result is the file written at ``output_path``.
    """
    df = pd.read_excel(file_path)

    # json.dump would emit the non-standard token NaN for missing cells and
    # raise TypeError on pandas timestamps, so normalise every cell first.
    data = [
        {column: _to_json_safe(value) for column, value in row.items()}
        for row in df.to_dict(orient='records')
    ]

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

# Example usage: convert the raw BGP-leak spreadsheet to JSON next to it.
file_path = '/Users/hugo/Projects/NLBGP/BGPAgent/raw_data/bgpleak.xls'
output_path = '/Users/hugo/Projects/NLBGP/BGPAgent/raw_data/bgpleak.json'
xls_to_json(file_path=file_path, output_path=output_path)

In [4]:
import json
from collections import defaultdict

def process_as_path(as_path):
    """Collapse consecutive duplicate entries of an AS path and join them with "-".

    Args:
        as_path: Whitespace-separated AS path, e.g. ``"174 174 3356"``. Missing
            values (None / NaN) are treated as an empty path, and numeric
            values (a one-entry path stored as a number) are converted to text.

    Returns:
        The entries joined by "-", with each run of identical neighbours
        reduced to a single entry; non-adjacent repeats are kept. An empty or
        missing path yields the empty string.
    """
    # NaN is the only value that is not equal to itself.
    if as_path is None or as_path != as_path:
        return ''
    # A numeric column with gaps is loaded as floats; 65000.0 must read "65000".
    if isinstance(as_path, float) and as_path.is_integer():
        as_path = int(as_path)

    # split() with no argument also discards leading/trailing whitespace.
    hops = str(as_path).split()

    # Pair every entry with its predecessor (None for the first) and keep it
    # only when it differs from that predecessor.
    collapsed = [hop for previous, hop in zip([None] + hops, hops) if hop != previous]

    return '-'.join(collapsed)

def process_and_separate_json(input_path, output_directory):
    """Normalise each record's ``as_path`` and write the records grouped by path length.

    The input JSON file must contain a list of objects. Every object that has
    an ``as_path`` key gets that value rewritten by ``process_as_path`` and is
    appended to the bucket for its number of "-"-separated entries. Objects
    without the key are skipped. Each bucket is written to
    ``<output_directory>/as_path_length_<N>.json`` and summary counts are
    printed.

    Args:
        input_path: Path of the JSON file to read.
        output_directory: Directory for the per-length files; it is created
            (including parents) when it does not exist yet.

    Returns:
        None. Results are the files written and the printed summary.
    """
    # Function-scope import so this definition does not rely on imports made
    # in another notebook cell.
    import os

    with open(input_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    length_to_data = defaultdict(list)

    # Rewrite as_path in place and bucket the object by its entry count.
    for obj in data:
        if 'as_path' in obj:
            obj['as_path'] = process_as_path(obj['as_path'])
            # ''.split('-') yields [''], so an empty path has to be counted
            # as 0 entries explicitly instead of 1.
            as_path_length = len(obj['as_path'].split('-')) if obj['as_path'] else 0
            length_to_data[as_path_length].append(obj)

    # open(..., 'w') does not create missing directories.
    os.makedirs(output_directory, exist_ok=True)

    # One output file per distinct length.
    for length, objects in length_to_data.items():
        output_path = f"{output_directory}/as_path_length_{length}.json"
        with open(output_path, 'w', encoding='utf-8') as json_file:
            json.dump(objects, json_file, ensure_ascii=False, indent=4)

    # Per-length object counts, in first-seen order.
    for length, objects in length_to_data.items():
        print(f"Number of objects with as_path length {length}: {len(objects)}")

    # Number of distinct lengths encountered.
    print(f"Number of different as_path lengths: {len(length_to_data)}")

    # Total objects in the input, including any without an as_path key.
    print(f"Number of objects: {len(data)}")

# Example usage
# Replace with the path of your input JSON file.
input_path = '/Users/hugo/Projects/NLBGP/BGPAgent/raw_data/bgpleak.json'
# Replace with the directory that should receive the per-length files.
output_directory = '/Users/hugo/Projects/NLBGP/BGPAgent/filtered_data/bgpleak_different_length'
process_and_separate_json(input_path=input_path, output_directory=output_directory)

Number of objects with as_path length 11: 157
Number of objects with as_path length 12: 138
Number of objects with as_path length 9: 130
Number of objects with as_path length 10: 163
Number of objects with as_path length 14: 51
Number of objects with as_path length 16: 14
Number of objects with as_path length 7: 84
Number of objects with as_path length 8: 106
Number of objects with as_path length 13: 90
Number of objects with as_path length 15: 21
Number of objects with as_path length 19: 2
Number of objects with as_path length 6: 54
Number of objects with as_path length 17: 2
Number of objects with as_path length 18: 2
Number of different as_path lengths: 14
Number of objects: 1014
