### 读取数据

In [None]:
import os
import json

def process_file(file_path, label):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. A line that is not
    valid JSON is reported on stdout and skipped.

    :param file_path: Path of the ``.jsonl`` file to read (decoded as UTF-8).
    :param label: Label associated with the file. NOTE: it is accepted but
        not written into the records.
    :return: List of parsed records; an empty list when the file cannot be
        opened or read.
    """
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Blank lines carry no record and are not decode errors.
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON in line: {e}")
    except IOError as e:
        print(f"Error opening file: {file_path}, {e}")
        # Return a list (never None) so callers can always extend/iterate.
        return []
    return data

def process_directory(directory):
    """Load every ``.jsonl`` file under ``directory`` into the module-level stores.

    The tree is walked with ``os.walk``; the base name of the folder that
    contains a file is used as that file's label. Records are appended to the
    module-level list ``data`` and to ``all_data[label]``.

    :param directory: Root directory to scan recursively.
    :return: None
    """
    for subdir, _dirs, files in os.walk(directory):
        label = os.path.basename(subdir)
        for file in files:
            if not file.endswith('.jsonl'):
                continue
            file_path = os.path.join(subdir, file)
            # Tolerate a None result (unreadable file) instead of crashing.
            data_partial = process_file(file_path, label) or []
            data.extend(data_partial)
            # Accumulate per label: several files may share one folder, and
            # overwriting would keep only the last file's records.
            all_data.setdefault(label, []).extend(data_partial)

# Root folder that is scanned for .jsonl files.
directory_path = 'dataset'

# Module-level stores filled by process_directory():
#   all_data -- records keyed by label (containing folder name)
#   data     -- all records in one flat list
all_data = {}
data = []

process_directory(directory_path)


In [None]:
# Show which labels were collected while loading the dataset.
print(all_data.keys())

In [None]:
# Number of records stored under each label.
for key in all_data:
    print(len(all_data[key]))

### 均匀划分

In [None]:
import json
import os
from collections import defaultdict
from sklearn.model_selection import StratifiedShuffleSplit

# `data` is the flat list of records built by process_directory() in the loading cell.

def stratified_group_split(data, n_splits=10):
    """Partition ``data`` into ``n_splits`` disjoint, language-stratified groups.

    The test folds of a shuffled ``StratifiedKFold`` are used as the groups,
    so every record lands in exactly one group and each group keeps roughly
    the overall language proportions. (Independent shuffle-split draws would
    overlap and leave some records out.)

    :param data: Sequence of dict records, each with a ``'language'`` key.
    :param n_splits: Number of groups to create.
    :return: ``defaultdict(list)`` mapping group index (0-based) to the list
        of records in that group.
    """
    # Local import: the file's import cell only brings in StratifiedShuffleSplit.
    from sklearn.model_selection import StratifiedKFold

    labels = [d['language'] for d in data]
    # Only the number of samples matters for the splitter, not the features.
    features = list(range(len(data)))

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

    grouped_data = defaultdict(list)
    for group_index, (_, test_index) in enumerate(skf.split(features, labels)):
        grouped_data[group_index].extend(data[i] for i in test_index)

    return grouped_data

def save_group_data(data, index, directory):
    """Write one group to ``<directory>/group_<index>.json``.

    :param data: JSON-serialisable content of the group.
    :param index: Group identifier used in the file name.
    :param directory: Output directory; created if it does not exist.
    :return: None
    """
    os.makedirs(directory, exist_ok=True)

    group_file = os.path.join(directory, f'group_{index}.json')
    with open(group_file, 'w', encoding='utf-8') as f:
        # Dump the argument, not whatever global happens to be named group_data.
        json.dump(data, f, indent=4)

def count_languages(grouped_data):
    """Print a per-language item tally for every group.

    :param grouped_data: Mapping of group index to a list of dict records.
        Records without a ``'language'`` key are counted as ``'Unknown'``.
    :return: None
    """
    for group_index in grouped_data:
        tally = defaultdict(int)
        for record in grouped_data[group_index]:
            tally[record.get('language', 'Unknown')] += 1

        print(f"Group {group_index}:")
        # Languages are listed in order of first appearance within the group.
        for language in tally:
            count = tally[language]
            print(f"    {language}: {count}")

# Split the flat record list into groups (default n_splits=10).
grouped_data = stratified_group_split(data)

# Print the per-language item count of every group.
count_languages(grouped_data)

# Write each group to lans/splits/group_<index>.json.
for index, group_data in grouped_data.items():
    save_group_data(group_data, index, 'lans/splits')


### 不均匀划分

In [None]:
# Per-label record counts.
for key in all_data:
    value = all_data[key]
    print(f'{key}: {len(value)}')

In [None]:
import numpy as np

In [None]:

# Labels in all_data's insertion order, and the record count of each label
# in that same order.
keys = list(all_data)
all_data_num = [len(all_data[key]) for key in keys]

def generate_balanced_matrix(rows, cols, row_sum, col_sums):
    """Randomly fill a ``rows x cols`` integer matrix with given column totals.

    Each column ``c`` is filled until it sums to ``col_sums[c]``, while no row
    total is allowed to exceed ``row_sum + 1``. Values are placed in random
    chunks drawn from ``np.random``.

    :param rows: Number of rows.
    :param cols: Number of columns.
    :param row_sum: Target row total; a row may hold at most ``row_sum + 1``.
    :param col_sums: Required total of each column.
    :return: The filled matrix, or ``None`` when ``sum(col_sums)`` lies
        outside ``[rows * row_sum, rows * (row_sum + 1)]``.
    """
    capacity = row_sum + 1  # upper bound on any single row's total
    grand_total = sum(col_sums)
    if not (rows * row_sum <= grand_total <= rows * capacity):
        return None

    matrix = np.zeros((rows, cols), dtype=int)

    for col in range(cols):
        remaining = col_sums[col]

        while remaining > 0:
            # Rows that can still take at least one more item.
            open_rows = np.flatnonzero(matrix.sum(axis=1) < capacity)
            if open_rows.size == 0:
                break

            row = np.random.choice(open_rows)
            headroom = capacity - np.sum(matrix[row])
            chunk = np.random.randint(1, min(remaining, headroom) + 1)
            matrix[row, col] += chunk
            remaining -= chunk

    return matrix

def redistribute_values_evenly(matrix, redistribute_limit=50, threshold=30):
    """Flatten peaks in each column by moving single units from max to min.

    For every column, one unit at a time is moved from the largest entry to
    the smallest entry while the largest entry exceeds the column mean by
    more than ``threshold``. Column totals are preserved.

    The matrix is modified **in place** and also returned.

    :param matrix: 2-D numpy integer array to adjust.
    :param redistribute_limit: Maximum number of unit moves per column.
    :param threshold: How far above the column mean the largest entry may be
        before a unit is moved away from it.
    :return: The same ``matrix`` object, after adjustment.
    """
    _, cols = matrix.shape

    for col in range(cols):
        for _ in range(redistribute_limit):
            max_row = np.argmax(matrix[:, col])
            avg_value = np.mean(matrix[:, col])

            if matrix[max_row, col] <= avg_value + threshold:
                # Moves keep the mean fixed and only lower the maximum, so
                # once the peak is within tolerance nothing further changes.
                break

            min_row = np.argmin(matrix[:, col])
            matrix[max_row, col] -= 1
            matrix[min_row, col] += 1

    return matrix

# Matrix dimensions: one row per group, one column per label.
rows = 10
cols = len(all_data_num)
total_count = sum(all_data_num)
row_sum = total_count // rows  # target number of items per group

balanced_matrix = generate_balanced_matrix(rows, cols, row_sum, all_data_num)

if balanced_matrix is None:
    print("cannot generate balanced matrix")
else:
    redistributed_matrix = redistribute_values_evenly(balanced_matrix)

    # Per-group allocation and group size.
    for i, row in enumerate(redistributed_matrix):
        group_data = dict(zip(keys, row))
        group_total = sum(row)
        print(f"Group {i+1}: {group_data}, Number: {group_total}")

    # Per-label totals across all groups.
    total_outputs = dict.fromkeys(keys, 0)
    for row in redistributed_matrix:
        for lang, value in zip(keys, row):
            total_outputs[lang] += value

    print("\n")
    for lang, total in total_outputs.items():
        print(f"{lang}: {total}")

In [None]:
import os
import json
import copy
def divide_data_according_to_matrix(data, matrix, output_dir='lans/splits_uneven'):
    """Write one JSON file per matrix row holding that row's share of ``data``.

    Row ``i`` of ``matrix`` gives, per key of ``data`` (in the dictionary's
    iteration order), how many items go into group ``i``. Items are taken
    consecutively from the front of each list, so groups do not overlap.
    ``data`` itself is left unmodified.

    :param data: A dictionary containing the data to be divided
        (key -> list of items).
    :param matrix: A 2D numpy array containing the number of items to be
        placed in each group.
    :param output_dir: Directory that receives ``group_<i>.json``; created if
        it does not exist.
    :return: None
    """
    os.makedirs(output_dir, exist_ok=True)

    # Next unread position per key. Slicing by offset avoids deep-copying the
    # dataset and re-copying the remainder of every list for each row.
    offsets = dict.fromkeys(data, 0)

    for i, row in enumerate(matrix):
        group = {}
        for j, (key, value_list) in enumerate(data.items()):
            start = offsets[key]
            stop = start + int(row[j])
            group[key] = value_list[start:stop]
            offsets[key] = stop

        group_file = os.path.join(output_dir, f'group_{i}.json')
        with open(group_file, 'w', encoding='utf-8') as f:
            json.dump(group, f, indent=4)

In [None]:
# Record count per label before the uneven split is written.
for key in all_data:
    print(len(all_data[key]))

In [None]:
# Write the uneven groups: row i of balanced_matrix gives group i's item count per label.
divide_data_according_to_matrix(all_data, balanced_matrix)

In [None]:
# Record count per label after the uneven split was written.
for key in all_data:
    print(len(all_data[key]))

In [None]:
# Show the allocation matrix (printing all_data itself would dump every record).
print(balanced_matrix)

In [None]:
# Per-label record counts.
for key in all_data:
    value = all_data[key]
    print(f'{key}: {len(value)}')

### 独享标签划分

In [None]:
# Number of records available per language (hard-coded).
data_counts = {
    "go": 167288,
    "java": 164923,
    "javascript": 58025,
    "php": 241241,
    "python": 251820,
    "ruby": 24927,
}

sum_data = sum(data_counts.values())

# Everything except javascript, which is given to a single subset only.
total_data = sum_data - data_counts["javascript"]

# Target subset size.
# NOTE(review): the size is a tenth of the data but only 9 subsets are
# created below -- confirm whether the remaining tenth is held out on purpose.
data_per_set = sum_data // 10

# Share of each non-javascript language among the non-javascript records.
language_ratios = {
    lang: count / total_data
    for lang, count in data_counts.items()
    if lang != "javascript"
}

subsets = {f"subset_{i}": {} for i in range(9)}

for subset_name, allocation in subsets.items():
    holds_javascript = "subset_0" in subset_name
    if holds_javascript:
        # subset_0 takes all javascript records; the other languages share
        # what is left of its budget.
        budget = data_per_set - data_counts["javascript"]
    else:
        budget = data_per_set

    for lang, ratio in language_ratios.items():
        allocation[lang] = int(budget * ratio)

    if holds_javascript:
        allocation["javascript"] = data_counts["javascript"]

for subset_name, subset_data in subsets.items():
    print(f"{subset_name}: {subset_data}")


In [None]:
import os
import json

def divide_data_according_to_subsets(data, subsets, output_dir='lans/splits_special_tags'):
    """Divide ``data`` according to the per-subset item counts in ``subsets``.

    For each subset (in iteration order) and each key it names, the next
    ``num_items`` unread items of ``data[key]`` are assigned to that subset's
    group, so groups do not overlap. Keys a subset does not name get an empty
    list. Each group is also written to ``<output_dir>/group_<i>.json``.
    ``data`` itself is left unmodified.

    :param data: Dictionary containing lists of data for each key.
    :param subsets: Dictionary mapping a subset name to ``{key: num_items}``.
    :param output_dir: Directory that receives the JSON files; created if it
        does not exist.
    :return: A list of dictionaries, each dictionary represents a divided
        group of data.
    :raises KeyError: If a subset names a key that is not in ``data``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Next unread position per key. Tracking offsets instead of reassigning
    # data[key] keeps the caller's dictionary intact across calls.
    offsets = dict.fromkeys(data, 0)

    divided_groups = []
    for subset_data in subsets.values():
        group = {key: [] for key in data}
        for key, num_items in subset_data.items():
            start = offsets[key]
            stop = start + num_items
            group[key] = data[key][start:stop]
            offsets[key] = stop
        divided_groups.append(group)

    for i, group in enumerate(divided_groups):
        group_file = os.path.join(output_dir, f'group_{i}.json')
        with open(group_file, 'w', encoding='utf-8') as f:
            json.dump(group, f, indent=4)

    return divided_groups


In [None]:
# Per-label record counts before the exclusive-label split.
for key in all_data:
    value = all_data[key]
    print(f'{key}: {len(value)}')

In [None]:
# Divide the data according to the per-subset allocation computed above.
divide_data_according_to_subsets(all_data, subsets)