In [12]:
!pip install jieba 

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m948.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314458 sha256=ea6654d2d4e54036533bcad551493f3a272fb04274cdd10ff2ec901727444420
  Stored in directory: /home/casit205/.cache/pip/wheels/7d/74/cf/08c94db4b784e2c1ef675a600b7b5b281fd25240dcb954ee7e
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1


In [13]:
import nltk
import jieba
import numpy as np
from collections import Counter
import math
import re
import os
import json

In [19]:
# Fetch the 'punkt_tab' tokenizer data only when nltk cannot find it locally
# (the same find-then-download pattern this notebook uses for 'punkt'), so
# re-running the notebook does not trigger another download attempt.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/casit205/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
# Download the 'punkt' tokenizer data only if nltk cannot locate it locally;
# the LookupError raised by the lookup is what triggers the download.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/casit205/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [15]:
def get_words(file_path):
    """Load a JSON result file and split its records into three parallel lists.

    The file is expected to hold a JSON array of objects, each providing the
    keys ``function_description``, ``vulnerability_analysis`` and
    ``repaired_code``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A tuple ``(function_description, vulnerability_analysis,
        repaired_code)`` of lists; each list holds the corresponding field of
        every record, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If a record lacks one of the three expected keys.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # locale's preferred encoding, which can fail or mis-decode non-ASCII
    # (e.g. Chinese) text on platforms whose default is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    function_description = [item['function_description'] for item in data]
    vulnerability_analysis = [item['vulnerability_analysis'] for item in data]
    repaired_code = [item['repaired_code'] for item in data]
    return function_description, vulnerability_analysis, repaired_code


In [24]:
def is_chinese(char):
    """Return True when ``char`` falls within U+4E00..U+9FA5, bounds included."""
    lower_bound, upper_bound = '\u4e00', '\u9fa5'
    if char < lower_bound:
        return False
    return char <= upper_bound

def mixed_tokenize(text):
    """Tokenize text that mixes Chinese and non-Chinese content.

    Runs of characters in U+4E00..U+9FA5 are segmented with ``jieba.lcut``;
    every other non-empty fragment goes through ``nltk.word_tokenize``.
    Tokens are returned in their original order as a single list.
    """
    result = []
    # The capturing group makes re.split keep the Chinese runs in its output,
    # so the fragments alternate between non-Chinese and Chinese text.
    for fragment in re.split(r'([\u4e00-\u9fa5]+)', text):
        if not fragment:
            continue  # re.split yields empty strings at the boundaries
        has_chinese = re.search(r'[\u4e00-\u9fa5]', fragment) is not None
        segmenter = jieba.lcut if has_chinese else nltk.word_tokenize
        result.extend(segmenter(fragment))
    return result

def calculate_entropy(text):
    """Return the base-2 entropy of the token distribution of ``text``.

    The text is split with ``mixed_tokenize``; each distinct token's relative
    frequency ``p`` contributes ``-p * log2(p)`` to the result. Text that
    yields no tokens gives 0.
    """
    words = mixed_tokenize(text)
    word_total = len(words)
    frequencies = Counter(words)

    result = 0
    for occurrences in frequencies.values():
        p = occurrences / word_total
        result = result - p * math.log2(p)
    return result

def calculate_average_entropy(responses):
    """Return ``(mean, std)`` of the per-response entropies.

    Each response is converted with ``str`` before its entropy is computed;
    the mean and standard deviation come from ``np.mean`` and ``np.std``
    called with their default settings.
    """
    scores = [calculate_entropy(as_text) for as_text in map(str, responses)]
    mean_entropy = np.mean(scores)
    entropy_spread = np.std(scores)
    return mean_entropy, entropy_spread

In [None]:
# Driver: load one model-output file and report, for each of its three text
# fields, the mean and standard deviation of the per-record token entropy.
if __name__ == '__main__':
    # English rendering of the author's note in the string below: the higher
    # the entropy, the greater the information content and uncertainty, the
    # lower the repetition and the higher the lexical diversity; the larger
    # the standard deviation of the entropy, the more the responses differ
    # from each other, i.e. the lower their consistency.
    """
        熵值越高，说明信息量越大，信息越不确定，信息的不确定性越大，重复性越低，词汇多样性越高。
        熵值标准差越大，说明回复的差异性越大。一致性越低。
    """
    # Read the three parallel field lists from the (hard-coded, relative) results file.
    function_description, vulnerability_analysis, repaired_code = get_words('./small_sample_output_dir/split0_output_deepseek-coder.json')

    # One (mean, std) pair per field.
    avg_entropy_fd, std_entropy_fd = calculate_average_entropy(function_description)
    avg_entropy_va, std_entropy_va = calculate_average_entropy(vulnerability_analysis)
    avg_entropy_rc, std_entropy_rc = calculate_average_entropy(repaired_code)
    
    # Printed labels: 平均熵 = mean entropy, 标准差 = standard deviation.
    print('function_description: 平均熵 =', avg_entropy_fd, ', 标准差 =', std_entropy_fd)
    print('vulnerability_analysis: 平均熵 =', avg_entropy_va, ', 标准差 =', std_entropy_va)
    print('repaired_code: 平均熵 =', avg_entropy_rc, ', 标准差 =', std_entropy_rc) 

function_description: 平均熵 = 3.4890293518505495 , 标准差 = 0.38095940555415836
vulnerability_analysis: 平均熵 = 5.003193624166888 , 标准差 = 1.0044439479472367
repaired_code: 平均熵 = 5.506753289207661 , 标准差 = 1.0981017160249487
