In [14]:
import json
import re
import random
from tqdm import tqdm
from typing import Optional, Dict, Sequence, List
# import argparse

def extract_last_num(text: str) -> float:
    text = re.sub(r"(\d),(\d)", "\g<1>\g<2>", text)  # 处理形如 123,456
    res = re.findall(r"(\d+(\.\d+)?)", text)  # 匹配 123456.789
    if len(res) > 0:
        num_str = res[-1][0]
        return float(num_str)
    else:
        return 0.0
    
def check(key, truth, predict):
    if key in ['cycle', 'connectivity', 'hamilton', 'substructure', 'bipartite']:
        if '###' in predict:
            if 'yes' in truth.lower() and 'yes' in predict.split('###')[-1].lower():
                # correct_samples[key].append(v)
                return True
            elif 'no' in truth.lower() and 'no' in predict.split('###')[-1].lower():
                return True
            return False
        else:
            matches = re.findall(r'(yes|no)', predict, flags=re.IGNORECASE)
            if matches:
                last_match = matches[-1].lower()
                if last_match == 'yes' and 'yes' in truth.lower():
                    return True
                elif last_match == 'no' and 'no' in truth.lower():
                    return True
                return False
            else:
                return False
                      
    elif key in ['flow', 'shortest', 'triplet']:
      
        t_num = extract_last_num(truth)
        p_num = extract_last_num(predict.split('###')[-1])
        if abs(t_num - p_num) < 1e-2:
            return True
        return False
                
    elif key == 'topology':
        
        if '###' in predict:
            pre = predict.split('###')[-1].strip(' ')
            truth = truth.split('###')[-1].strip(' ')
            if truth in pre or pre in truth:
                return True
            return False
        else:
            truth = truth.split('###')[-1].split(',')
            for t in truth:
                if t in predict or t.strip(' ') in predict:
                    return True
            return False
    

In [15]:
with open('/cpfs/user/chennuo/CN/Graph_RFT_Data/gpt4data/gpt4_gsm8knlg_sample3_output.json') as f:
    datas = f.readlines()
print(len(datas))

tasks = ['cycle', 'connectivity', 'hamilton', 'substructure', 'bipartite', 'flow', 'shortest', 'triplet', 'topology']
correct_samples  = {task:[] for task in tasks}
all_samples  = {task:[] for task in tasks}
temp = 0
select_samples = []
math_samples = []
for data in datas:
    
    data = json.loads(data)
        
    task = data['task']
    
    if task in ['4','5', '6', '7', '8','9']:
        temp += 1
        math_samples.append(data)
        continue
    if check(task, data['response'], data['dv3_response']):
        if data['query'] not in  correct_samples[task]:
            correct_samples[task].append(data['query'])
            select_samples.append(data)
            
    if data['query'] not in all_samples[task]:
        all_samples[task].append(data['query'])
        
# with open('/cpfs/user/chennuo/CN/Graph_RFT_Data/gpt4data/gpt4_generate_nlg.json', 'w' ) as writer:
#     for sample in select_samples:
#         writer.write(json.dumps(sample, ensure_ascii=False) + '\n')


# with open('/cpfs/user/chennuo/CN/Graph_RFT_Data/gpt4data/gpt4_generate_math.json', 'w' ) as writer:
#     for sample in math_samples:
#         writer.write(json.dumps(sample, ensure_ascii=False) + '\n')

67324


In [26]:
for key, value in all_samples.items():
    print(key)
    print(len(value))

cycle
3000
connectivity
3000
hamilton
3000
substructure
1130
bipartite
2991
flow
3000
shortest
2999
triplet
3000
topology
2973


In [32]:
len(select_samples)

16127

In [16]:
for key, value in correct_samples.items():
    print(key)
    total = all_samples[key]
    print(len(value)/len(total))

# all_samples

cycle
0.938
connectivity
0.9086666666666666
hamilton
0.7383333333333333
substructure
0.827433628318584
bipartite
0.675025075225677
flow
0.07533333333333334
shortest
0.49783261087029007
triplet
0.9186666666666666
topology
0.30507904473595693


In [30]:
correct_samples['triplet'][-1]

'Find the maximum sum of the weights of three interconnected nodes. In an undirected graph, [i, k] means that node i has the weight k. (i,j) means that node i and node j are connected with an undirected edge. Given a graph, you need to output the maximum sum of the weights of three interconnected nodes. \nQ: The nodes are numbered from 0 to 13, weights of nodes are: [0, 1] [1, 5] [2, 6] [3, 10] [4, 8] [5, 3] [6, 10] [7, 5] [8, 3] [9, 6] [10, 9] [11, 9] [12, 7] [13, 6], and the edges are: (1, 7) (1, 3) (3, 7) (4, 5) (8, 9). What is the maximum sum of the weights of three nodes?'

In [2]:


import json
with open('/cpfs/user/chennuo/CN/Graph_RFT_Data/gpt4data/gpt4_generate_nlg.json') as f:
    datas = f.readlines()
refine_datas = []
temp = 0
for data in datas:
    data = json.loads(data)
    new_data = dict()
    new_data['query'] = data['query']
    new_data['task'] = data['task']
    new_data['response'] = data['response']
    response = data['dv3_response'].split('A:')[-1].split('\n\n')[-1]
    if '###' not in response:
        response +=  data['response'] +'.'
    if 'print' in response or 'return' in response or 'def' in response:
        temp += 1
        continue
    new_data['CoT_response'] = response
    refine_datas.append(new_data)

with open('/cpfs/user/chennuo/CN/Graph_RFT_Data/gpt4data/graph_source_data_v1.json', 'w' ) as writer:
    for sample in refine_datas:
        writer.write(json.dumps(sample, ensure_ascii=False) + '\n')

In [24]:
import random
random.sample(refine_datas, 10)[0]

{'query': 'Determine whether or not a graph is bipartite. In a directed graph, (i->j) means that node i and node j are connected with an directed edge from node i to node j. Given a graph, you need to output Yes or No, indicating whether the graph is bipartite. \nQ: The nodes are numbered from 0 to 5, and the edges are: (0->2) (0->1) (1->2) (1->5) (2->5) (3->4) (4->5). Is this graph bipartite?',
 'task': 'bipartite',
 'response': '### No',
 'CoT_response': "Let us try this method on the given graph. We can start with node 0 and assign it to set A. Then, we assign its neighbors, node 1 and node 2, to set B. Next, we assign node 1's neighbors, node 5, to set A, and node 2's neighbor, node 5, to set A as well. \nSo far, we have not encountered any contradiction, and we have assigned nodes 0, 1, 2, and 5 to different sets. \nNow, we move on to node 3, which has not been assigned yet. We can assign it to either set, but let us choose set A for convenience. Then, we assign its neighbor, node

In [26]:

PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}


with open('/cpfs/user/chennuo/CN/Graph-Reasoning-LLM/datasets/data/graph_v1_dsformat.json', 'w' ) as writer:
    for sample in refine_datas:
        new_sample = dict()
        new_sample['prompt'] = PROMPT_DICT["prompt_no_input"].format(instruction=sample['query'])
        new_sample['chosen'] = sample['CoT_response']
        new_sample['reject'] = 'I do not know'
        writer.write(json.dumps(new_sample, ensure_ascii=False) + '\n')

In [28]:
import re

text = "This is a sample string with YES and no, as well as YES."

# Use regular expressions to find the last 'yes' or 'no' (case-insensitive)
matches = re.findall(r'(yes|no)', text, flags=re.IGNORECASE)

if matches:
    last_match = matches[-1].lower()
    if last_match == 'yes':
        print('yes')
    elif last_match == 'no':
        print('no')
else:
    print("No 'yes' or 'no' found in the text.")


yes


In [None]:
import os
import json
import random

# Define the paths
input_folder = '/cpfs/user/chennuo/CN/Graph-Reasoning-LLM/datasets/train_set'
output_folder = '/cpfs/user/chennuo/CN/Graph-Reasoning-LLM/datasets/train_set_shuffle'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Get the list of JSON files in the input folder
json_files = [file for file in os.listdir(input_folder) if file.endswith('.json')]

# Iterate over each JSON file
for file in json_files:
    # Read the JSON file
    with open(os.path.join(input_folder, file)) as f:
        datas = f.readlines()
    
    data = [json.loads(item) for item in datas]
    # Assign sample IDs
    max_length = len(data)
    for i, sample in enumerate(data):
        sample['sample_id'] = i
    
    # Shuffle the samples
    random.shuffle(data)
    
    # Write the shuffled data to a new JSON file in the output folder
    output_file = os.path.join(output_folder, file)
    with open(output_file, 'w') as f:
        # json.dump(data, f)
        for new_sample in data:
            f.write(json.dumps(new_sample, ensure_ascii=False) + '\n')


In [2]:
str(5)

'5'