In [4]:
import difflib
import os 
import json

def compare_code(original: str, fixed: str) -> str:
    """
    对比两段代码的差异，生成统一的差异格式（Unified Diff）
    
    参数:
        original (str): 原始代码
        fixed (str): 修改后的代码
    
    返回:
        str: 差异报告文本
    """
    # 将代码按行分割，保留换行符
    original_lines = original.splitlines(keepends=True)
    fixed_lines = fixed.splitlines(keepends=True)
    
    # 生成差异报告
    diff = difflib.unified_diff(
        original_lines,
        fixed_lines,
        fromfile='original',
        tofile='fixed',
        lineterm='' 
    )
    return ''.join(diff)


def parse_diff(diff_text: str) -> list:
    """
    解析差异文本，提取修改信息
    
    返回格式示例：
    [
        {
            'type': 'modified',
            'old_line': 1,
            'new_line': 1,
            'content': '-def hello():\n+def greet():'
        },
        {
            'type': 'added',
            'old_line': None,
            'new_line': 3,
            'content': '+    print("More changes")'
        }
    ]
    """
    changes = []
    current_block = None
    
    for line in diff_text.split('\n'):
        if line.startswith('@@'):
            parts = line.split(' ')
            old_range = parts[1][1:].split(',')  # 去掉"-"号
            new_range = parts[2][1:].split(',')  # 去掉"+"号
            current_block = {
                'old_start': int(old_range[0]),
                'new_start': int(new_range[0]),
                'old_lines': int(old_range[1]) if len(old_range) > 1 else 1,
                'new_lines': int(new_range[1]) if len(new_range) > 1 else 1,
                'changes': []
            }
        elif current_block and (line.startswith('-') or line.startswith('+')):
            # 记录具体修改
            change_type = 'deleted' if line.startswith('-') else 'added'
            content = line
            changes.append({
                'type': change_type,
                'content': content,
                'old_line': current_block['old_start'] if change_type == 'deleted' else None,
                'new_line': current_block['new_start'] if change_type == 'added' else None
            })
            # 更新行号计数器（简化逻辑，实际需更精确处理）
            if change_type == 'deleted':
                current_block['old_start'] += 1
            else:
                current_block['new_start'] += 1
    
    return changes


if __name__ == "__main__":
    with open('/home/casit205/code/SoftwareDefectDetection/Annotation/results/split2_output_deepseekv3_huoshan_20250208005327.json','r') as f:
        data = json.load(f)
    item = data[0]
    original_code = item['question']
    fixed_code = item['fixed_code']
    

    diff_result = compare_code(original_code, fixed_code)
    print(diff_result)
    changes = parse_diff(diff_result)
    print(changes)

--- original+++ fixed@@ -1,6 +1,6 @@ @org.junit.Test
-public void calculateChangeInPriceSameNumbersShouldReturnZero() throws java.lang.Exception {
+public void calculateChangeInPricePriceWasSmallerShouldReturnNegativeValue() throws java.lang.Exception {
     ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator calculator = new ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator();
-    double result = calculator.calculateChangeInPrice(java.math.BigDecimal.ONE, java.math.BigDecimal.ONE);
-    assertEquals(0, result, 0);
+    double result = calculator.calculateChangeInPrice(new java.math.BigDecimal(3), new java.math.BigDecimal(2));
+    assertEquals(0.5, result, ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculatorTest.ACCEPTABLE_DOUBLE_ACCURACY);
 }
[]


In [None]:
import difflib
import os
import json

def compare_code(original: str, fixed: str) -> str:
    original_lines = original.splitlines(keepends=True)
    fixed_lines = fixed.splitlines(keepends=True)
    
    # 生成差异时强制添加换行符分隔文件头和块头
    diff = difflib.unified_diff(
        original_lines,
        fixed_lines,
        fromfile='original',
        tofile='fixed',
        lineterm='\n'  # 强制使用换行符分隔行
    )
    return ''.join(diff)

def parse_diff(diff_text: str) -> list:
    """解析差异文本为结构化数据"""
    changes = []
    current_block = None

    for line in diff_text.split('\n'):
        if line.startswith('@@'):
            parts = line.split(' ')
            old_part = parts[1][1:].split(',')  # 去掉开头的 "-"
            new_part = parts[2][1:].split(',')  # 去掉开头的 "+"
            current_block = {
                'old_start': int(old_part[0]),
                'new_start': int(new_part[0]),
                'old_line_counter': int(old_part[0]),
                'new_line_counter': int(new_part[0]),
                'changes_in_block': []
            }
        elif current_block and (line.startswith('-') or line.startswith('+')):
            # 记录修改类型和内容
            change_type = 'deleted' if line.startswith('-') else 'added'
            content = line
            old_line = current_block['old_line_counter'] if change_type == 'deleted' else None
            new_line = current_block['new_line_counter'] if change_type == 'added' else None

            changes.append({
                'type': change_type,
                'content': content,
                'old_line': old_line,
                'new_line': new_line
            })

            if change_type == 'deleted':
                current_block['old_line_counter'] += 1
            else:
                current_block['new_line_counter'] += 1

    return changes


if __name__ == "__main__":
    with open('./small_sample_output_dir/With_Original_Code/Task3/split0_output_deepseek-chat_voted_five_models_with_code.json','r') as f:
        data = json.load(f)
    item = data[0]
    original_code = item['question']
    fixed_code = item['fixed_code']     
    
    # 生成差异文本
    diff_result = compare_code(original_code, fixed_code)
    print("原始差异文本：\n", diff_result)

    # 解析差异为结构化数据
    structured_changes = parse_diff(diff_result)
    print("\n结构化解析结果：")
    for change in structured_changes:
        print(change)

原始差异文本：
 --- original
+++ fixed
@@ -1,6 +1,6 @@
 @org.junit.Test
-public void calculateChangeInPriceSameNumbersShouldReturnZero() throws java.lang.Exception {
+public void calculateChangeInPricePriceWasSmallerShouldReturnNegativeValue() throws java.lang.Exception {
     ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator calculator = new ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator();
-    double result = calculator.calculateChangeInPrice(java.math.BigDecimal.ONE, java.math.BigDecimal.ONE);
-    assertEquals(0, result, 0);
+    double result = calculator.calculateChangeInPrice(new java.math.BigDecimal(3), new java.math.BigDecimal(2));
+    assertEquals(0.5, result, ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculatorTest.ACCEPTABLE_DOUBLE_ACCURACY);
 }

结构化解析结果：
{'type': 'deleted', 'content': '-public void calculateChangeInPriceSameNumbersShouldReturnZero() throws java.lang.Exception {', 'old_line': 1, 'new_line': None}
{'type': 'added', 'content': '+pu

In [None]:
import difflib
import os
import json
import re

def compare_code(original: str, fixed: str) -> str:
    original_lines = original.splitlines(keepends=True)
    fixed_lines = fixed.splitlines(keepends=True)
    # 生成差异时强制添加换行符分隔文件头和块头
    diff = difflib.unified_diff(
        original_lines,
        fixed_lines,
        fromfile='original',
        tofile='fixed',
        lineterm='\n'  # 强制使用换行符分隔行
    )
    return ''.join(diff)

def parse_diff(diff_text: str) -> list:
    """解析差异文本为结构化数据"""
    changes = []
    current_block = None

    for line in diff_text.split('\n'):
        if line.startswith('@@'):
            # 解析差异块的行号范围（例如 @@ -1,2 +1,3 @@）
            parts = line.split(' ')
            old_part = parts[1][1:].split(',')  # 去掉开头的 "-"
            new_part = parts[2][1:].split(',')  # 去掉开头的 "+"
            current_block = {
                'old_start': int(old_part[0]),
                'new_start': int(new_part[0]),
                'old_line_counter': int(old_part[0]),
                'new_line_counter': int(new_part[0]),
                'changes_in_block': []
            }
        elif line.startswith(' '):
            if current_block:
                current_block['old_line_counter'] += 1
                current_block['new_line_counter'] += 1
        elif current_block and (line.startswith('-') or line.startswith('+')):
            # 记录修改类型和内容
            change_type = 'deleted' if line.startswith('-') else 'added'
            content = line
            old_line = current_block['old_line_counter'] if change_type == 'deleted' else None
            new_line = current_block['new_line_counter'] if change_type == 'added' else None

            changes.append({
                'type': change_type,
                'content': content,
                'old_line': old_line,
                'new_line': new_line
            })

            if change_type == 'deleted':
                current_block['old_line_counter'] += 1
            else:
                current_block['new_line_counter'] += 1

    return changes

def extract_code(model_response: str, model_name: str) -> str:
    """从模型回复中提取修复后的代码"""
    if model_response == None:
        return ""
    if model_name == "A":
        # 去除 Markdown 代码块标记
        code_block = model_response.replace("```cpp", "").replace("```", "")
        # 解析 JSON 数据
        try:
            data = json.loads(code_block)
            return data[0]['repair_code']
        except (json.JSONDecodeError, IndexError, KeyError):
            print(f"解析模型 {model_name} 的回复失败")
            return ""
    else:
        # 解析 JSON 数据
        try:
            data = json.loads(model_response)
            return data[0]['repair_code']
        except (json.JSONDecodeError, IndexError, KeyError):
            print(f"解析模型 {model_name} 的回复失败")
            return ""

def validate_fixes(original_diff: list, original_code: str, model_responses: dict) -> dict:
    """验证模型的修复结果"""
    results = {}
    for model_name, response in model_responses.items():
        fixed_code = extract_code(response, model_name)
        if not fixed_code:
            results[model_name] = {"status": "failed", "message": "无法提取修复后的代码"}
            continue
        new_diff_text = compare_code(original_code, fixed_code)
        new_diff = parse_diff(new_diff_text)

        fixed_issues = []
        remaining_issues = []

        original_diff_lines = {(item['old_line'], item['content']) for item in original_diff if item['type'] == 'deleted'}
        new_diff_deleted_lines = {(item['old_line'], item['content']) for item in new_diff if item['type'] == 'deleted'}

        for item in original_diff:
          if item['type'] == 'deleted':
            if (item['old_line'], item['content']) not in new_diff_deleted_lines:
              fixed_issues.append(item)
            else:
              remaining_issues.append(item)

        results[model_name] = {
            "status": "success",
            "fixed_issues": fixed_issues,
            "remaining_issues": remaining_issues,
            "new_diff": new_diff_text
        }
    return results

if __name__ == "__main__":
    with open('./small_sample_output_dir/With_Original_Code/Task3/split0_output_deepseek-chat_voted_five_models_with_code.json', 'r') as f:
        data = json.load(f)
    item = data[0]
    original_code = item['question']
    fixed_code = item['fixed_code']

    # 原始差异文本
    original_diff_text = compare_code(original_code, fixed_code)
    original_diff = parse_diff(original_diff_text)

    # 模型回复 (你需要将这里替换成你的模型回复数据)
    model_responses = item['repaired_code']

    # 验证修复结果
    validation_results = validate_fixes(original_diff, original_code, model_responses)

    # 输出结果
    for model_name, result in validation_results.items():
        print(f"模型 {model_name}:")
        if result["status"] == "failed":
            print(f"  状态: 失败 - {result['message']}")
        else:
            print(f"  状态: 成功")
            print(f"  修复的问题:")
            if result["fixed_issues"]:
                for issue in result["fixed_issues"]:
                    print(f"    - 行号: {issue['old_line']}, 内容: {issue['content']}")
            else:
                print("    - 无")
            print(f"  未修复的问题:")
            if result["remaining_issues"]:
                for issue in result["remaining_issues"]:
                    print(f"    - 行号: {issue['old_line']}, 内容: {issue['content']}")
            else:
                print("    - 无")
            print(f"  新的差异:\n{result['new_diff']}")
        print("-" * 20)

解析模型 C 的回复失败
解析模型 D 的回复失败
模型 A:
  状态: 失败 - 无法提取修复后的代码
--------------------
模型 B:
  状态: 失败 - 无法提取修复后的代码
--------------------
模型 C:
  状态: 失败 - 无法提取修复后的代码
--------------------
模型 D:
  状态: 失败 - 无法提取修复后的代码
--------------------
模型 E:
  状态: 失败 - 无法提取修复后的代码
--------------------


In [None]:
import difflib
import json
from typing import Dict, List

def normalize_code(code: str) -> str:
    """统一代码格式以便比较"""
    # 1. 去除多余空白和换行
    code = '\n'.join([line.strip() for line in code.splitlines() if line.strip()])
    # 2. 统一字符串引号（可选）
    # code = code.replace("'", "\"")  # 如果修复代码可能修改引号风格
    
    return code

def extract_repaired_codes(repaired_entry) -> List[str]:
      """解析不同格式的repaired_code"""
      codes = []
      
      print(f"Type of repaired_entry: {type(repaired_entry)}")
      print(f"Value of repaired_entry: {repaired_entry}")
      
      if isinstance(repaired_entry, list):
          # 处理JSON数组格式（如 [{'repair_code': ...}, ...]）
          for item in repaired_entry:
              if 'repair_code' in item:
                  code = item['repair_code'].strip()
                  if code.startswith('```'):  # 去除代码块标记
                      code = '\n'.join(code.split('\n')[1:-1])
                  codes.append(code)
      elif isinstance(repaired_entry, str):
          # 处理纯字符串格式
          if repaired_entry.startswith('['):  # 可能是未解析的JSON字符串
              try:
                  codes.extend(extract_repaired_codes(json.loads(repaired_entry)))
              except Exception as e:
                  print(f"Exception during JSON parsing: {e}")
                  codes.append(repaired_entry)
          else:
              codes.append(repaired_entry)
      elif isinstance(repaired_entry, dict):
          # 处理字典格式
          if 'repair_code' in repaired_entry:
              code = repaired_entry['repair_code'].strip()
              if code.startswith('```'):
                  code = '\n'.join(code.split('\n')[1:-1])
              codes.append(code)
      else:
          print("传入的repaired_entry数据类型不符合预期。")
      
      print(f"Extracted codes: {codes}")
      
      return codes

def validate_repair(original: str, repaired: str, fixed: str) -> Dict:
    """验证修复完整性"""
    # 标准化代码
    original_norm = normalize_code(original)
    repaired_norm = normalize_code(repaired)
    fixed_norm = normalize_code(fixed)
    
    # 比较差异
    diff_fixed = list(difflib.unified_diff(
        original_norm.splitlines(),
        fixed_norm.splitlines(),
        fromfile='original',
        tofile='fixed'
    ))
    
    diff_repaired = list(difflib.unified_diff(
        original_norm.splitlines(),
        repaired_norm.splitlines(),
        fromfile='original',
        tofile='repaired'
    ))
    
    # 关键指标
    required_changes = [line for line in diff_fixed if line.startswith('+') or line.startswith('-')]
    actual_changes = [line for line in diff_repaired if line.startswith('+') or line.startswith('-')]
    
    # 计算修复覆盖率
    covered = 0
    missing_changes = []
    for req in required_changes:
        if req in actual_changes:
            covered += 1
        else:
            missing_changes.append(req)
    
    return {
        "coverage": covered / len(required_changes) if required_changes else 1.0,
        "missing_changes": missing_changes,
        "extra_changes": [chg for chg in actual_changes if chg not in required_changes]
    }

def process_item(item: Dict) -> Dict:
    """处理单个数据条目"""
    results = []
    
    # 提取所有修复方案
    repaired_codes = extract_repaired_codes(item.get('repaired_code', []))
    
    for i, repaired_code in enumerate(repaired_codes):
        result = {
            "repair_id": f"repair_{i+1}",
            "validation": validate_repair(
                item['question'],
                repaired_code,
                item['fixed_code']
            )
        }
        results.append(result)
    
    return {
        "question_id": item.get("source", "unknown"),
        "repair_results": results
    }

if __name__ == "__main__":
    with open('./small_sample_output_dir/With_Original_Code/Task3/split0_output_deepseek-chat_voted_five_models_with_code.json') as f:
        data = json.load(f)
    
    report = []
    for item in data:
        report.append(process_item(item))
    
    # 保存检测报告
    with open('repair_validation_report.json', 'w') as f:
        json.dump(report, f, indent=2)

Type of repaired_entry: <class 'dict'>
Value of repaired_entry: {'A': None, 'B': None, 'C': '[\n    {\n        \'repair_code\': \'\'\'@org.junit.Test\npublic void calculateChangeInPriceSameNumbersShouldReturnZero() throws java.lang.Exception {\n    ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator calculator = new ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator();\n    \n    // 验证输入参数\n    java.math.BigDecimal price1 = java.math.BigDecimal.ONE;\n    java.math.BigDecimal price2 = java.math.BigDecimal.ONE;\n    \n    if (price1 == null || price2 == null || \n        price1.compareTo(java.math.BigDecimal.ZERO) <= 0 || \n        price2.compareTo(java.math.BigDecimal.ZERO) <= 0) {\n        throw new IllegalArgumentException("价格必须为正数");\n    }\n    \n    double result = calculator.calculateChangeInPrice(price1, price2);\n    assertEquals(0, result, 0);\n}\'\'\',\n        \'repair_method\': \'添加了输入参数的有效性验证，确保价格为正数且不为空，防止非法输入导致的计算错误。\'\n    }\n]', 'D': "```json\n[\n    

In [None]:
import ast

json_str = r"""[\n    {\n        'repair_code': '@org.junit.Test\\n' +\n                       'public void calculateChangeInPriceSameNumbersShouldReturnZero() throws java.lang.Exception {\\n' +\n                       '    ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator calculator = new ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator();\\n' +\n                       '    double result = calculator.calculateChangeInPrice(java.math.BigDecimal.ONE, java.math.BigDecimal.ONE);\\n' +\n                       '    assertEquals(0, result, 0.0001);\\n' + // 修复了浮点数比较的精度问题\n                       '}',\n        'repair_method': '修改了 assertEquals 的第三个参数，增加了容差范围 0.0001，以解决浮点数精度比较问题。'\n    }\n]"""
# 使用 ast.literal_eval 将字符串转换为列表
data = ast.literal_eval(json_str)
print(data[0]['repair_code'])

SyntaxError: unexpected character after line continuation character (<unknown>, line 1)

In [None]:
import ast
import json
import re
import difflib
def before_process_data(data):
    cleaned_string = re.sub(r'//.*', '', data)
    cleaned_string = re.sub(r"\s*\'\s*\+\s*\'\s*", '', cleaned_string)
    cleaned_string = re.sub(r"\s*\"\s*\+\s*\"\s*", '', cleaned_string)
    
    cleaned_string = cleaned_string.replace("```json", "").replace("```", "").strip().replace("'", '"').replace('```javascript', '').replace('```python', '').replace('```cpp', '').replace('\'', '\"')
    
    # print(cleaned_string)
    return cleaned_string
    # return text

def normalize_code(code: str) -> str:
    """
    代码标准化处理，移除无意义的空行和多余的空格。
    """
    return '\n'.join(line.strip() for line in code.splitlines() if line.strip())

def calculate_code_diff(original_code: str, repaired_code: str) -> dict:
    """
    使用 difflib 计算代码差异，并抽取添加和删除的代码行。
    """
    # print(original_code)
    # print(repaired_code)
    # 标准化代码（去掉多余的空白行）
    original_norm = normalize_code(original_code)
    repaired_norm = normalize_code(repaired_code)

    # 使用 difflib 获取统一格式的差异
    diff = list(
        difflib.unified_diff(
            original_norm.splitlines(),
            repaired_norm.splitlines(),
            lineterm=""
        )
    )
    added_lines = [line[1:] for line in diff if line.startswith('+') and not line.startswith('+++')]
    # removed_lines = [line[1:] for line in diff if line.startswith('-') and not line.startswith('---')]

    return {
        "added_lines": added_lines,
        # "removed_lines": removed_lines,
        "diff_details": diff
    }

def process_item(index: int,item: dict) -> dict:
    """处理单个数据条目"""
    # print(item['repaired_code'])
    
    fixed_code = item['fixed_code']
    original_code = item['question']
    repair_results = []
    back_format = {
                "repair_id": f"repair_{index+1}",
                "model":{
                    'A':{
                        'added_lines_repaired':'',
                        'added_lines_fixed':'',
                        'added_lines_repaired_details':'',
                        'added_lines_fixed_details':''
                    },
                    'B':{
                        'added_lines_repaired':'',
                        'added_lines_fixed':'',
                        'added_lines_repaired_details':'',
                        'added_lines_fixed_details':''
                    },
                    'C':{
                        'added_lines_repaired':'',
                        'added_lines_fixed':'',
                        'added_lines_repaired_details':'',
                        'added_lines_fixed_details':''
                    },
                    'D':{
                        'added_lines_repaired':'',
                        'added_lines_fixed':'',
                        'added_lines_repaired_details':'',
                        'added_lines_fixed_details':''
                    },
                    'E':{
                        'added_lines_repaired':'',
                        'added_lines_fixed':'',
                        'added_lines_repaired_details':'',
                        'added_lines_fixed_details':''
                    },
                    # 'E':{
                    #     'added_lines_repaired':len(repaired_diff["added_lines"]),
                    #     'added_lines_fixed':len(fixed_diff["added_lines"]),
                    #     'added_lines_repaired_details':repaired_diff["added_lines"],
                    #     'added_lines_fixed_details':fixed_diff["added_lines"]
                    # },
                    'original_content':item
                }
        }
    if item['repaired_code'] is  None:
        return ''
    for index, (key, value) in enumerate(item['repaired_code'].items()):
        # print(key,value)
        fixed_diff = calculate_code_diff(original_code, fixed_code)
        
        if value is None:
            back_format['model'][key]['added_lines_repaired'] = ''
            back_format['model'][key]['added_lines_fixed'] = len(fixed_diff["added_lines"])
            back_format['model'][key]['added_lines_repaired_details'] = ''
            back_format['model'][key]['added_lines_fixed_details'] = fixed_diff["added_lines"]
        else:
            print(key)
            try:    
                repaired_code = ast.literal_eval(before_process_data(value))
            except Exception as e:
                # repaired_code = None
                print(e)
                print(before_process_data(value))
                print(value)
                # repaired_code = value
            print(repaired_code)
            repaired_diff = calculate_code_diff(original_code, repaired_code[0]['repair_code'])
            back_format['model'][key]['added_lines_repaired'] = len(repaired_diff["added_lines"])
            print(back_format['model'][key]['added_lines_repaired'] )
            back_format['model'][key]['added_lines_fixed'] = len(fixed_diff["added_lines"])
            back_format['model'][key]['added_lines_repaired_details'] = repaired_diff["added_lines"]
            back_format['model'][key]['added_lines_fixed_details'] = fixed_diff["added_lines"]
            # break
            print(back_format)
        # for model in back_format['model']:
        #     answer_content = repaired_code[index]
        #     repaired_diff = calculate_code_diff(original_code, answer_content)
        #     fixed_diff = calculate_code_diff(original_code, fixed_code)
    print(back_format)
    repair_results.append(back_format)
    return repair_results


if __name__ == "__main__":
    with open('/home/casit205/code/SoftwareDefectDetection/Annotation/results/split2_output_gpt--4o_with_fixed_code.json', 'r') as f:
        data = json.load(f)

    report = []
    report.append(process_item(0,data[1]))
    # for index,item in enumerate(data):
    #     report.append(process_item(index,item))
    #     break
    
    with open('repair_validation_report.json', 'w') as f:
        json.dump(report, f, indent=4,ensure_ascii=False)
    # 输出结果
    # print(json.dumps(report, indent=2, ensure_ascii=False))



# with open('./small_sample_output_dir/With_Original_Code/Task3/split0_output_deepseek-chat_voted_five_models_with_code.json', 'r') as f:
#     data = json.load(f)

# reparied_codes_results = []
# for item in data:
#     repaired_codes = item['repaired_code']
#     for key, code in repaired_codes.items():
#         if code is None:
#             reparied_codes_results.append('')
#         else:
#             try:
#                 repaired_code = ast.literal_eval(before_process_data(code))
#                 reparied_codes_results.append(repaired_code[0]['repair_code'])
#             except Exception as e:
#                 print(key)
#                 print(code)
#                 print(e)
#     fixed_code = item['fixed_code']
#     original_code = item['question']
#     # print(reparied_codes_results)
#     break



AttributeError: 'str' object has no attribute 'items'

In [9]:
report

['']

In [28]:
print(calculate_code_diff(original_code,repaired_code[0]['repair_code']))

@org.junit.Test
public void calculateChangeInPriceSameNumbersShouldReturnZero() throws java.lang.Exception {
    ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator calculator = new ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator();
    double result = calculator.calculateChangeInPrice(java.math.BigDecimal.ONE, java.math.BigDecimal.ONE);
    assertEquals(0, result, 0);
}
@org.junit.Test
public void calculateChangeInPriceSameNumbersShouldReturnZero() throws java.lang.Exception {
ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator calculator = new ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator();
double result = calculator.calculateChangeInPrice(java.math.BigDecimal.ONE, java.math.BigDecimal.ONE);
assertEquals(0, result, 0.0001);
}
{'added_lines': ['assertEquals(0, result, 0.0001);'], 'removed_lines': ['assertEquals(0, result, 0);'], 'diff_details': ['--- ', '+++ ', '@@ -2,5 +2,5 @@', ' public void calculateChangeInPriceSameNumbersShouldReturn

In [15]:

import re
import ast

# 输入的字符串
input_str = '''[ { "repair_code": "@org.junit.Test\n" "public void calculateChangeInPriceSameNumbersShouldReturnZero() throws java.lang.Exception {\n" " ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator calculator = new ru.unn.agile.ElasticityOfDemand.ElasticityOfDemandCalculator();\n" " double result = calculator.calculateChangeInPrice(java.math.BigDecimal.ONE, java.math.BigDecimal.ONE);\n" " assertEquals(0, result, 0.0001);\n" + // 修复了浮点数比较的精度问题 "}", "repair_method": "修改了 assertEquals 的第三个参数，增加了容差范围 0.0001，以解决浮点数精度比较问题。" } ]'''

# 使用正则表达式给缺失逗号的地方补充逗号
fixed_str = re.sub(r'(?<=")\s*(?=")', ', ', input_str)

# 检查修复后的字符串是否可以被安全地解析为 JSON 或 Python 字典列表
try:
    json_like_obj = ast.literal_eval(fixed_str)
    print("修正后可解析的对象是：", json_like_obj)
except ValueError as e:
    print("字符串解析失败：", e)

SyntaxError: EOL while scanning string literal (<unknown>, line 1)

In [None]:
# 2402131647
import ast
import json
import re
import difflib
def before_process_data(data):
    # cleaned_string = re.sub(r'//.*', '', data)
    # cleaned_string = re.sub(r"\s*\'\s*\+\s*\'\s*", '', cleaned_string)
    # cleaned_string = re.sub(r"\s*\"\s*\+\s*\"\s*", '', cleaned_string)

    match = re.search(r"['\"]repair_code['\"]:(.*?)['\"]repair_method['\"]", data, re.DOTALL)
    try:
        if match:
            between_code = match.group(1)
            # print(between_code)
        else:
            print("未找到 repair_code 和 repair_method 之间的代码")
    
        cleaned_string = between_code.replace("```json", "").replace('```,', '').replace('```java', '').replace("''',", "").replace('```javascript', '').replace('```python', '').replace('```cpp', '').replace('\'', '\"').replace('\"', '').replace('\\n', '\n').replace('```,', '')
    except:
        # print(data)
        return "error"
        breakpoint
    # print(cleaned_string)
    # print(cleaned_string)
    return cleaned_string
    # return text

def normalize_code(code: str) -> str:
    """
    代码标准化处理，移除无意义的空行和多余的空格。
    """
    return '\n'.join(line.strip() for line in code.splitlines() if line.strip())

def calculate_code_diff(original_code: str, repaired_code: str) -> dict:
    """
    使用 difflib 计算代码差异，并抽取添加和删除的代码行。
    """
    # print(original_code)
    # print(repaired_code)

    original_norm = normalize_code(original_code)
    repaired_norm = normalize_code(repaired_code)

    diff = list(
        difflib.unified_diff(
            original_norm.splitlines(),
            repaired_norm.splitlines(),
            lineterm=""
        )
    )
    added_lines = [line[1:] for line in diff if line.startswith('+') and not line.startswith('+++')]
    # removed_lines = [line[1:] for line in diff if line.startswith('-') and not line.startswith('---')]

    return {
        "added_lines": added_lines,
        # "removed_lines": removed_lines,
        "diff_details": diff
    }

def process_item(index: int,item: dict) -> dict:
    """处理单个数据条目"""
    # print(item['repaired_code'])
    
    fixed_code = item['fixed_code']
    original_code = item['question']
    repair_results = []
    back_format = {
                "repair_id": f"repair_{index+1}",
                # "model": item['llm_model_third_step'],
                'results':{
                    'added_lines_repaired':'',
                    'added_lines_fixed':'',
                    'added_lines_repaired_details':'',
                    'added_lines_fixed_details':''
                    },
                    
                'original_content':item
            }
    fixed_diff = calculate_code_diff(original_code, fixed_code)
    if item['repaired_code'] is  None:
        back_format['results']['added_lines_repaired'] = 0
        back_format['results']['added_lines_fixed'] = len(fixed_diff["added_lines"])
        back_format['results']['added_lines_repaired_details'] = ''
        back_format['results']['added_lines_fixed_details'] = fixed_diff["added_lines"]
    value = item['repaired_code']
        # print(key,value)
    
    
    if value is None:
        back_format['results']['added_lines_repaired'] = 0
        back_format['results']['added_lines_fixed'] = len(fixed_diff["added_lines"])
        back_format['results']['added_lines_repaired_details'] = ''
        back_format['results']['added_lines_fixed_details'] = fixed_diff["added_lines"]
    else:
        # try:    
        #     repaired_code = ast.literal_eval(before_process_data(value))
        # except Exception as e:
        #     # repaired_code = None
        #     print(e)
        #     return 'error'
        #     # print(before_process_data(value))
        #     # print(value)
        #     # repaired_code = value
        # # print(repaired_code)
        # repaired_diff = calculate_code_diff(original_code, repaired_code[0]['repair_code'])
        formatted_code = before_process_data(value)
        if formatted_code != 'error':
            repaired_diff = calculate_code_diff(original_code, formatted_code)
            back_format['results']['added_lines_repaired'] = len(repaired_diff["added_lines"])
            # print(back_format['results']['added_lines_repaired'] )
            back_format['results']['added_lines_fixed'] = len(fixed_diff["added_lines"])
            back_format['results']['added_lines_repaired_details'] = repaired_diff["added_lines"]
            back_format['results']['added_lines_fixed_details'] = fixed_diff["added_lines"]
        else:
            print(index)
            return 'error'
        # break
        # print(back_format)
        # for model in back_format['model']:
        #     answer_content = repaired_code[index]
        #     repaired_diff = calculate_code_diff(original_code, answer_content)
        #     fixed_diff = calculate_code_diff(original_code, fixed_code)
    # print(back_format)
    repair_results.append(back_format)
    return repair_results


if __name__ == "__main__":
    with open('/home/casit205/code/SoftwareDefectDetection/Annotation/results/split2_output_gpt--4o_all.json', 'r') as f:
        data = json.load(f)

    # report = []
    # report.append(process_item(0,data[3]))
    report = []
    error_count = 0
    errors = []
    for index,item in enumerate(data):
        result = process_item(index,item)
        if result=='error':
            error_count += 1
            errors.append(item)
        else:
            report.append(result)
        # break
    
    with open('repair_validation_report.json', 'w') as f:
        json.dump(report, f, indent=4,ensure_ascii=False)
    # 输出结果
    # print(json.dumps(report, indent=2, ensure_ascii=False))



# with open('./small_sample_output_dir/With_Original_Code/Task3/split0_output_deepseek-chat_voted_five_models_with_code.json', 'r') as f:
#     data = json.load(f)

# reparied_codes_results = []
# for item in data:
#     repaired_codes = item['repaired_code']
#     for key, code in repaired_codes.items():
#         if code is None:
#             reparied_codes_results.append('')
#         else:
#             try:
#                 repaired_code = ast.literal_eval(before_process_data(code))
#                 reparied_codes_results.append(repaired_code[0]['repair_code'])
#             except Exception as e:
#                 print(key)
#                 print(code)
#                 print(e)
#     fixed_code = item['fixed_code']
#     original_code = item['question']
#     # print(reparied_codes_results)
#     break



In [8]:
errors

[{'question': 'private void loadGlobalSettings(android.database.sqlite.SQLiteDatabase db) {\n    android.database.sqlite.SQLiteStatement stmt = null;\n    try {\n        stmt = db.compileStatement(("INSERT OR IGNORE INTO global(name,value)" + " VALUES(?,?);"));\n        loadBooleanSetting(stmt, Settings.Global.AIRPLANE_MODE_ON, R.bool.def_airplane_mode_on);\n        loadStringSetting(stmt, Settings.Global.AIRPLANE_MODE_RADIOS, R.string.def_airplane_mode_radios);\n        loadStringSetting(stmt, Settings.Global.AIRPLANE_MODE_TOGGLEABLE_RADIOS, R.string.airplane_mode_toggleable_radios);\n        loadBooleanSetting(stmt, Settings.Global.ASSISTED_GPS_ENABLED, R.bool.assisted_gps_enabled);\n        loadBooleanSetting(stmt, Settings.Global.AUTO_TIME, R.bool.def_auto_time);\n        loadBooleanSetting(stmt, Settings.Global.AUTO_TIME_ZONE, R.bool.def_auto_time_zone);\n        loadSetting(stmt, Settings.Global.STAY_ON_WHILE_PLUGGED_IN, (("1".equals(android.os.SystemProperties.get("ro.kernel.qem

In [9]:
with open('gpt-4o-errors.json','w') as f:
    json.dump(errors,f,indent=4,ensure_ascii=False)

In [12]:
report[0]

[{'repair_id': 'repair_1',
  'results': {'added_lines_repaired': 0,
   'added_lines_fixed': 9,
   'added_lines_repaired_details': '',
   'added_lines_fixed_details': ['public java.lang.Object instantiateItem(android.view.ViewGroup container, final int position) {',
    'final uk.co.senab.photoview.PhotoViewAttacher attacher = new uk.co.senab.photoview.PhotoViewAttacher(imageView);',
    'attachers.put(position, attacher);',
    'if ((listener) != null) {',
    'attacher.setOnPhotoTapListener(listener);',
    '}',
    'android.util.Log.i(be.ugent.zeus.hydra.viewpager.ImagePagerAdapter.TAG, ("Loaded image #" + position));',
    'attacher.update();',
    'android.util.Log.i(be.ugent.zeus.hydra.viewpager.ImagePagerAdapter.TAG, "Image error");']},
  'original_content': {'question': '@java.lang.Override\npublic java.lang.Object instantiateItem(android.view.ViewGroup container, int position) {\n    android.widget.FrameLayout layout = new android.widget.FrameLayout(this.context);\n    layout.s

In [12]:
data[206]

{'question': '#include <bits/stdc++.h>\r\n\r\nusing namespace std;\r\n\r\n#define FAST ios_base::sync_with_stdio(false); cin.tie(NULL); cout.tie(NULL);\r\n#define ll long long\r\n#define ull unsigned long long\r\n#define pi 2*acos(0.0)\r\n#define readl(v,n) for(ll i=0;i<n;i++) {ll val; cin>>val; v.pb(val);}\r\n#define readi(v,n) for(int i=0;i<n;i++) {int val; cin>>val; v.pb(val);}\r\n#define srt(v) sort(v.begin(), v.end());\r\n#define rsrt(v) sort(v.rbegin(), v.rend());\r\n#define MIN(v) *min_element(v.begin(), v.end())\r\n#define MAX(v) *max_element(v.begin(), v.end())\r\n#define sz(x) ((ll) (x).size())\r\n#define all(x) (x).begin(), (x).end()\r\n#define rep(i, a, b) for(ll i = (a); i < (b); i++)\r\n#define rep2(i, a, b) for(ll i = (a); i <= (b); i++)\r\n#define vll vector <ll>\r\n#define vii vector <int>\r\n#define pii pair <int, int>\r\n#define pll pair <ll, ll>\r\n#define M 1000007\r\n#define MOD 1000000007\r\n#define pb push_back\r\n#define mp make_pair\r\n#define ff first\r\n#def

In [3]:
error_count,len(report)

(0, 500)

In [10]:
errors

[]

In [15]:
for item in report:
    if item=="":
        error_count+=1
error_count

75

In [11]:
errors[0]

IndexError: list index out of range

In [74]:
if __name__ == "__main__":
    with open('/home/casit205/code/SoftwareDefectDetection/Annotation/results/split2_random_model_sample_output_with_fixed_code.json', 'r') as f:
        data = json.load(f)

    report = []
    report.append(process_item(0,errors[0]))
    
    # 输出结果
    print(json.dumps(report, indent=2, ensure_ascii=False))

[
    {
        "repair_code":
        cpp
#include <set>
#include <map>
#include <ctime>
#include <queue>
#include <cmath>
#include <stack>
#include <bitset>
#include <vector>
#include <cstdio>
#include <sstream>
#include <cstring>
#include <cstdlib>
#include <iostream>
#include <algorithm>
#define eps 1e-8
using namespace std;

typedef long long ll;
typedef pair<int, int> P;
const int mod = 268435456;
const int N = 1e6 + 10;


string jia(string a, string b) {
    if (a.empty() || a == "0")
        return b;
    else if (b.empty() || b == "0")
        return a;

    string sum = "";
    int la = a.length();
    int lb = b.length();
    int len = max(la, lb);
    reverse(a.begin(), a.end());
    reverse(b.begin(), b.end());

    int carry = 0;
    for (int i = 0; i < len; ++i) {
        int da = (i < la) ? (a[i] - "0") : 0;
        int db = (i < lb) ? (b[i] - "0") : 0;
        int tmp = da + db + carry;
        carry = tmp / 2;
        sum += (char)("0" + tmp % 2);
    }
    if (carry)

In [22]:
repaired_added_lines = report[0][0]['results']['added_lines_repaired_details']
fixed_added_lines = report[0][0]['results']['added_lines_fixed_details']

item1 = repaired_added_lines[0]
item1

'int main() {'

In [13]:
import re


def get_tokens(code):
    token_patterns = [
        (r'[a-zA-Z_]\w*', 'IDENTIFIER_OR_KEYWORD'), # 标识符或关键字
        (r'\d+', 'NUMBER'),                        # 数字
        (r'[{}()\[\];]', 'DELIMITER'),             # 界符
        (r'[+\-*/=]', 'OPERATOR'),                 # 操作符
        (r'\s+', None),                            # 空白符 (忽略)
    ]

    token_regex = '|'.join(f'({pattern})' for pattern, _ in token_patterns)

    matches = re.finditer(token_regex, code)

    tokens = []
    for match in matches:
        for i, (pattern, token_type) in enumerate(token_patterns):
            if match.group(i + 1):  # 匹配某一个分组
                if token_type:     # 忽略 None 类型的 token
                    tokens.append(match.group())
                break
    return tokens



for element in report:
    # repaired_added_lines = element[0]['results']['added_lines_repaired_details']
    # fixed_added_lines = element[0]['results']['added_lines_fixed_details']
    results = []
    print(element[0]['results']['added_lines_repaired_details'])
    if element[0]['results']['added_lines_repaired_details'] is  None:
        element[0]['results']['added_lines_repaired_details_tokens'] = ''
        element[0]['results']['added_lines_fixed_details_tokens'] = ''
        continue

    for item in element[0]['results']['added_lines_repaired_details']:
        tokens = get_tokens(item)
        results.append(tokens)
    element[0]['results']['added_lines_repaired_details_tokens'] = results
    results = []
    for item in element[0]['results']['added_lines_fixed_details']:
        tokens = get_tokens(item)
    results.append(tokens)
    element[0]['results']['added_lines_fixed_details_tokens'] = results



['scanf(%100099s, s + 1);  // 增加输入长度限制，防止缓冲区溢出', 'if (s[i] == () ++sl;', 'if (s[i] == #) la = i;', 'if (sl < n / 2) {printf(-1); return 0;}', 'if (s[i] == #)', 'if (i != la) printf(1', ');', 'else printf(%d', ', sl * 2 - n + 1);']

['if(a <= 0 || b <= 0) {', 'cout << Invalid input;', 'return 0;', '}', 'vector<int> arr(a);', 'cout<<0;', 'cout<<0;', '} else {']
['int n, m, tab[200002], x; // 将数组大小调整为200002，避免越界', 'int bs(int pocz, int kon, int x)', 'while (pocz <= kon) { // 使用迭代替代递归，避免栈溢出风险', 'int mid = (pocz + kon) / 2;', 'if (tab[mid] <= x && tab[mid + 1] > x) return mid;', 'if (tab[mid] <= x) pocz = mid + 1;', 'else kon = mid - 1;', '}', 'return -1; // 返回一个安全值，避免未找到时的行为错误', 'scanf(%d%d, &n, &m);', 'for (int a = 1; a <= n; a++) scanf(%d, &tab[a]);', 'sort(tab + 1, tab + n + 1);', 'tab[n + 1] = 1000000010;', 'for (int a = 1; a <= m; a++) {', 'scanf(%d, &x);', 'printf(%d , bs(1, n, x));']
['while (in.hasNextLine()) {', 'if (line.isEmpty()) {', 'continue;', '}', 'if (operator == +) {', '

In [28]:
report[1][0]

{'repair_id': 'repair_2',
 'model': 'gpt-4o',
 'results': {'added_lines_repaired': 34,
  'added_lines_fixed': 7,
  'added_lines_repaired_details': ['#include<limits>',
   'int n, b, d, a;',
   'int sum = 0;',
   '// 验证输入是否正确以及范围',
   'while (!(cin >> n) || n < 1 || n > numeric_limits<int>::max()) {',
   'cout << 请输入一个有效的整数n: ;',
   'cin.clear();',
   'cin.ignore(numeric_limits<streamsize>::max(),',
   ');',
   '}',
   'while (!(cin >> b) || b < 1 || b > numeric_limits<int>::max()) {',
   'cout << 请输入一个有效的整数b: ;',
   'cin.clear();',
   'cin.ignore(numeric_limits<streamsize>::max(),',
   ');',
   '}',
   'while (!(cin >> d) || d < 1 || d > numeric_limits<int>::max()) {',
   'cout << 请输入一个有效的整数d: ;',
   'cin.clear();',
   'cin.ignore(numeric_limits<streamsize>::max(),',
   ');',
   '}',
   'n = min(n, b);',
   'for (int i = 1; i <= n; i++)',
   'while (!(cin >> a) || a < 0 || a > numeric_limits<int>::max()) {',
   'cout << 请输入一个有效的整数a: ;',
   'cin.clear();',
   'cin.ignore(numeric_limits<

In [14]:
import numpy as np
def calculate_repeat_rate(tokens1, tokens2):
    intersection = set(tokens1) & set(tokens2)
    return len(intersection)

average_nums = []
difference_scores = []
for element in report:
    # print(element[0]['results']['added_lines_repaired_details_tokens'])
    # print(element[0]['results']['added_lines_fixed_details_tokens'])
    if element[0]['results']['added_lines_repaired_details_tokens'] == [] or element[0]['results']['added_lines_fixed_details_tokens'] == []:
        
        average_nums.append(0)
        difference_scores.append(0.5*(element[0]['results']['added_lines_repaired'] - element[0]['results']['added_lines_fixed']))
        continue
    repeat_matrix = np.zeros((len(element[0]['results']['added_lines_fixed_details_tokens']),len(element[0]['results']['added_lines_repaired_details_tokens'])))
    for index1,item1 in enumerate(element[0]['results']['added_lines_fixed_details_tokens']):
        for index2,item2 in enumerate(element[0]['results']['added_lines_repaired_details_tokens']):
            repeat_rate = calculate_repeat_rate(item1, item2)
            length = len(item2)
            if length == 0:
                repeat_matrix[index1,index2] = 0
            else:
                repeat_matrix[index1,index2] = repeat_rate/length
    difference_score = 0.5*(element[0]['results']['added_lines_repaired'] - element[0]['results']['added_lines_fixed'])
    # print(repeat_matrix)
    try:
        repeat_matrix_max = np.max(repeat_matrix,axis=1)
    except:
        print(element)
        print(repeat_matrix)
    # print(repeat_matrix_max)
    average_nums.append(repeat_matrix_max.mean())
    difference_scores.append(difference_score)
average_nums,difference_scores


([0,
  np.float64(1.0),
  0,
  np.float64(0.5),
  np.float64(0.6666666666666666),
  np.float64(0.5),
  np.float64(0.875),
  np.float64(0.3333333333333333),
  np.float64(1.0),
  np.float64(0.6),
  np.float64(1.0),
  np.float64(0.5714285714285714),
  np.float64(0.75),
  np.float64(0.3333333333333333),
  np.float64(1.0),
  np.float64(0.5),
  np.float64(0.0),
  np.float64(1.0),
  np.float64(1.0),
  np.float64(0.875),
  0,
  np.float64(0.5714285714285714),
  0,
  0,
  np.float64(0.6666666666666666),
  0,
  np.float64(0.6666666666666666),
  np.float64(1.0),
  np.float64(1.0),
  np.float64(0.6153846153846154),
  np.float64(1.0),
  np.float64(0.6666666666666666),
  np.float64(1.0),
  np.float64(0.3333333333333333),
  np.float64(0.75),
  np.float64(1.0),
  np.float64(0.0),
  np.float64(0.4444444444444444),
  np.float64(1.0),
  np.float64(0.75),
  0,
  0,
  np.float64(1.0),
  np.float64(1.0),
  0,
  np.float64(0.8),
  np.float64(0.6923076923076923),
  np.float64(1.0),
  np.float64(0.428571428571

In [15]:
sum(average_nums)/500

np.float64(0.60623918643448)

In [16]:
sum(difference_scores)/500

7.593