In [1]:
import random
import httpx
import msgpack
import threading
import time
import os
import argparse
import json
import scipy
import numpy as np
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [2]:
!pip install jsonlines

Defaulting to user installation because normal site-packages is not writeable
Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [2]:
# Inference endpoints: same host and path, one distinct port per endpoint
# (presumably one model server per port -- confirm against the server setup).
_INFERENCE_URL = 'http://0.0.0.0:{port}/inference'

gpt_2_api = _INFERENCE_URL.format(port=6006)
gpt_neo_api = _INFERENCE_URL.format(port=6007)
gpt_J_api = _INFERENCE_URL.format(port=6008)
llama_api = _INFERENCE_URL.format(port=6009)

In [7]:
import json
import glob

def merge_jsonl(input_files, output_file):
    """
    Concatenate several JSONL files into one JSONL file.

    :param input_files: iterable of JSONL file paths, merged in the given order
    :param output_file: path of the merged file (overwritten if it exists)
    :return: None
    """
    combined_data = []
    for file in input_files:
        # explicit UTF-8 so the result does not depend on the platform's locale
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # blank or whitespace-only lines would make json.loads raise
                if not line:
                    continue
                combined_data.append(json.loads(line))

    # Every input is fully read before the output is opened, so output_file
    # may itself be one of input_files without losing data.
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in combined_data:
            f.write(json.dumps(item) + '\n')

# Merge every JSONL file under ./SeqXGPT-Bench into a single input file.
# glob returns paths in arbitrary, filesystem-dependent order, so sort them
# to make the record order of the merged file reproducible across runs.
input_files = sorted(glob.glob('./SeqXGPT-Bench/*.jsonl'))
output_file = 'input.jsonl'

merge_jsonl(input_files, output_file)

In [3]:
def access_api(text, api_url, do_generate=False):
    """
    Send one msgpack-encoded request to an inference endpoint.

    :param text: input text
    :param api_url: URL of the inference endpoint
    :param do_generate: forwarded to the server as the "do_generate" field
    :return: the msgpack-decoded response body on HTTP 200, otherwise None
    """
    post_data = {
        "text": text,
        "do_generate": do_generate,
    }
    # timeout=None on the client disables timeouts for every request made
    # through it, so it does not need repeating on the individual post().
    with httpx.Client(timeout=None) as client:
        # Raw bytes belong in `content=`; `data=` is meant for form fields and
        # httpx emits a DeprecationWarning when bytes are passed through it.
        prediction = client.post(api_url, content=msgpack.packb(post_data))
    if prediction.status_code != 200:
        return None
    return msgpack.unpackb(prediction.content)

In [4]:
def get_features(input_file, output_file, model_names, model_apis):
    """
    get [losses, begin_idx_list, ll_tokens_list, label_int, label] based on raw lines

    Each record of ``input_file`` is sent to every endpoint in ``model_apis``.
    The three values returned per endpoint are appended to the record's
    ``losses``, ``begin_idx_list`` and ``ll_tokens_list`` (continuing any lists
    the record already carries) and the record is written to ``output_file``
    as one JSON line. A record is skipped when unpacking an endpoint's reply
    raises TypeError (e.g. ``access_api`` returned None).

    :param input_file: JSONL file; every record needs 'text' and 'label'
    :param output_file: JSONL file to write; may be the same path as input_file
    :param model_names: not used by this function; kept for call compatibility
    :param model_apis: endpoint URLs, queried in order for every record
    :return: None
    :raises KeyError: if a record's label is not a key of the label table
    """

    en_labels = {
        'gpt2': 0,
        'gptneo': 1,
        'gptj': 1,
        'llama': 2,
        'gpt3re': 3,
        'gpt3sum': 3,
        'human': 4,
        'alpaca': None,
        'dolly': None,
    }

    # Read with the same encoding the output is written in; skip blank lines,
    # which json.loads would reject.
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = [json.loads(line) for line in f if line.strip()]

    print('input file:{}, length:{}'.format(input_file, len(lines)))

    # When input and output are the same path, opening it with 'w' would
    # truncate the only copy of the data before the (long) API loop finishes.
    # Write to a sibling temp file and swap it in only after a complete pass,
    # so an interruption leaves the original file untouched.
    in_place = os.path.abspath(input_file) == os.path.abspath(output_file)
    write_path = output_file + '.tmp' if in_place else output_file

    with open(write_path, 'w', encoding='utf-8') as f:
        for data in tqdm(lines):
            line = data['text']
            label = data['label']

            # continue the per-model lists from an earlier pass, if present
            losses = data.get('losses', [])
            begin_idx_list = data.get('begin_idx_list', [])
            ll_tokens_list = data.get('ll_tokens_list', [])

            label_int = en_labels[label]

            error_flag = False
            for api in model_apis:
                try:
                    # access_api returns None on a non-200 reply; unpacking
                    # None raises TypeError, which marks the sample as failed
                    loss, begin_word_idx, ll_tokens = access_api(line, api)
                except TypeError:
                    print("return NoneType, probably gpu OOM, discard this sample")
                    error_flag = True
                    break
                losses.append(loss)
                begin_idx_list.append(begin_word_idx)
                ll_tokens_list.append(ll_tokens)
            # if oom, discard this sample
            if error_flag:
                continue

            result = {
                'losses': losses,
                'begin_idx_list': begin_idx_list,
                'll_tokens_list': ll_tokens_list,
                'label_int': label_int,
                'label': label,
                'text': line
            }

            f.write(json.dumps(result, ensure_ascii=False) + '\n')

    if in_place:
        # atomic swap: the finished temp file replaces the original
        os.replace(write_path, output_file)

In [5]:
# First pass: query the GPT-2 and GPT-Neo endpoints for every merged sample.
get_features(
    input_file='input.jsonl',
    output_file='output_all.jsonl',
    model_names=['gpt_2', 'gpt_neo'],
    model_apis=[gpt_2_api, gpt_neo_api],
)

input file:small_input.jsonl, length:24


100%|██████████| 24/24 [00:12<00:00,  1.94it/s]


In [None]:
# Second pass: re-read the first-pass output and write it back to the same
# path, this time querying the GPT-J and LLaMA endpoints.
get_features(
    input_file='output_all.jsonl',
    output_file='output_all.jsonl',
    model_names=['gpt_J', 'llama'],
    model_apis=[gpt_J_api, llama_api],
)

In [4]:
import jsonlines

def read_jsonl(file_path):
    """Load every record of the JSONL file at ``file_path`` into a list."""
    with jsonlines.open(file_path, mode='r') as source:
        records = [record for record in source]
    return records
    
def write_jsonl(file_path, data):
    """
    Write every item of ``data`` to ``file_path`` in JSONL form.

    The file is opened in 'w' mode, so existing content is replaced.
    """
    with jsonlines.open(file_path, 'w') as writer:
        writer.write_all(data)


input_jsonl = read_jsonl('input.jsonl')
output_jsonl = read_jsonl('output_all.jsonl')

# get_features skips samples whose API call failed, so output_all.jsonl can
# hold fewer rows than input.jsonl; pairing rows by position with zip() would
# then attach prompt_len values to the wrong samples. Join on 'text' instead
# (if the same text occurs more than once, the last occurrence wins).
prompt_len_by_text = {item['text']: item.get('prompt_len', 0) for item in input_jsonl}

for item in output_jsonl:
    # 0 when the source row has no prompt_len or no matching row exists
    item['prompt_len'] = prompt_len_by_text.get(item['text'], 0)

# NOTE: the space in this file name is kept as-is because the following cell
# reads the file back under exactly this name.
write_jsonl('output_ prompt_merged.jsonl',output_jsonl)


In [2]:
# Build the binary-label dataset: every label other than 'human' becomes 'AI'
import jsonlines

def read_jsonl(file_path):
    """Return the records stored in the JSONL file ``file_path`` as a list."""
    with jsonlines.open(file_path, mode='r') as handle:
        rows = []
        for row in handle:
            rows.append(row)
    return rows
    
def write_jsonl(file_path, data):
    """Write the items of ``data`` to ``file_path`` in JSONL form, opened in 'w' mode."""
    with jsonlines.open(file_path, mode='w') as sink:
        sink.write_all(data)

binary_data = read_jsonl('output_ prompt_merged.jsonl')

# Collapse the label set to two classes: 'human' stays, anything else is 'AI'.
for record in binary_data:
    record['label'] = 'human' if record['label'] == 'human' else 'AI'

write_jsonl('output_binary_merged.jsonl', binary_data)