生成的候选文本应该是如下结构：<br/>
text = [
    ['word11' ... 'word1n'],
    ['word21' ... 'word2m'],
    ...
]里面可以包含特殊符号'\<start\>', '\<end\>', '\<pad\>'<br/>
现在只考虑了一个输入只生成一个候选句子的情况，每个候选句子只有一个参考句子<br/>
每个评测指标都是一个生成句子对应一个参考句子的指标值<br/>
evaluation返回的是所有生成句子的评测指标的总平均值

In [1]:
import string
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import json
import os
import numpy as np
from torch.nn.utils.rnn import pad_sequence


def load_word_vectors(filename):
    word_vectors = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word_vectors[word] = vector
    return word_vectors


# 加载词向量
word_vectors = load_word_vectors('test_captions.json_word_vectors.txt')  # 替换为您的词向量文件路径


def text_to_vectors(text, word_vectors):
    # 转换为小写并移除标点
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    # 获取词向量的维度
    vector_dim = len(next(iter(word_vectors.values())))

    # 创建开始符和结束符的词向量
    start_vector = np.ones(vector_dim)
    end_vector = np.full(vector_dim, 2)

    # 将开始符、词向量、结束符合并
    vectors = [start_vector] + [word_vectors.get(word, np.zeros(vector_dim)) for word in words] + [end_vector]
    return np.array(vectors), text


class ImageTextDataset(Dataset):
    def __init__(self, json_file_path, image_folder_path, word_vectors):
        """
            Args:
               json_file_path (string): JSON文件的路径，包含图片名称和对应的描述。
               image_folder_path (string): 包含图片的文件夹路径。
        """
        with open(json_file_path, 'r', encoding='utf-8') as file:
            self.descriptions = json.load(file)
        self.image_folder_path = image_folder_path
        self.word_vectors = word_vectors

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        image_name, description = list(self.descriptions.items())[idx]
        image_path = os.path.join(self.image_folder_path, image_name)

        # 加载图像并调整尺寸
        image = Image.open(image_path)
        image = image.resize((750, 1101))  # 将图像大小调整为 750x1101

        # 转换图像为numpy数组，然后转为Tensor
        image = np.array(image)
        if image.shape[2] == 4:  # 检查是否有alpha通道
            image = image[..., :3]  # 仅保留RGB通道
        image = torch.from_numpy(image).permute(2, 0, 1)  # 转换为CHW格式

        # 处理文本描述
        description_vectors, description = text_to_vectors(description, self.word_vectors)

        return {'image': image, 'description_vectors': description_vectors, 'description': description}


def collate_fn(batch):
    images = [item['image'] for item in batch]
    description = [item['description'] for item in batch]
    descriptions = [torch.tensor(item['description_vectors'], dtype=torch.float32) for item in batch]
    descriptions_padded = pad_sequence(descriptions, batch_first=True, padding_value=0)
    images = torch.stack(images, dim=0)  # 现在images是Tensor列表
    return {'image': images, 'description_vectors': descriptions_padded, 'description': description}


# 使用示例
json_file_path = 'deepfashion-multimodal/test_captions.json'  # 替换为您的JSON文件路径
image_folder_path = 'deepfashion-multimodal\images'  # 替换为您的图片文件夹路径
dataset = ImageTextDataset(json_file_path, image_folder_path, word_vectors)
dataloader = DataLoader(dataset, batch_size=4, collate_fn=collate_fn)

METEOR

函数代码

In [2]:
from nltk.translate import meteor_score
import torch
import numpy as np

def filter_useless_words(sent, filterd_words):
    return [w for w in sent if w not in filterd_words]

def evaluate_with_meteor(data_loader, model, config):
    '''
    data_loader : 数据集 {'image': image, 'description_vectors': description_vectors, 'description': description}
    model : 模型
    config : 有关数据集的信息，
            config.captions_per_image 每个图片包含的描述数 
            config.beam_k 用于束搜索（beam search）的参数，表示每个时间步保留的候选文本的数量。
            config.max_len 生成文本的最大长度。
    '''
    # model.eval()
    cands = []  # 存储候选文本
    refs = []   # 存储参考文本
    filterd_words = ['<start>', '<end>', '<pad>']
    # cpi = config.captions_per_image
    # device = next(model.parameters()).device

    for i, batch in enumerate(data_loader):
        with torch.no_grad():
            # 生成候选文本
            texts = [des.split() for des in batch['description']] # 这是调试时用的，用参考文本先代替候选文本
            # texts = model.generate_by_beamsearch(imgs.to(device), config.beam_k, config.max_len+2)
            # 候选文本
            for text in texts:
                cands.append(filter_useless_words(text, filterd_words))
            # 参考文本
            for ref in batch['description']:
                refs.append(filter_useless_words(ref.split(), filterd_words))

    # multiple_refs = []
    # for idx in range(len(refs)):
    #     multiple_refs.append(refs[(idx//cpi)*cpi : (idx//cpi)*cpi+cpi])

    # 计算 METEOR 分数
    meteor_score_value = compute_meteor_score(refs, cands)
    model.train()
    return meteor_score_value

def compute_meteor_score(references, candidates):
    # 计算 METEOR 分数
    scores = [
        meteor_score.meteor_score([ref], cand)
        for ref, cand in zip(references, candidates)
    ]
    return np.mean(scores)


evaluate_with_meteor(dataloader, None, None)

AttributeError: 'NoneType' object has no attribute 'train'

ROUGE-L 返回F值<br/>
要 conda install rouge 或 pip install rouge

In [4]:
from rouge import Rouge 

hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to he     lp students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you     saw on cnn student news"

reference = ["this page includes the show transcript use the transcript to help students with reading comprehension and"  ,   "vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teac" ,   "her or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests",     "students ' knowledge of even ts in the news"]

rouge = Rouge(metrics=['rouge-l'])
scores = [rouge.get_scores(hypothesis, ref) for ref in reference]
r = [s[0]['rouge-l']['p'] for s in scores]

p = scores[:][0]['rouge-l']['p']
f = scores[:][0]['rouge-l']['f']
print(scores)
print(r)
print(p)
print(f)

TypeError: list indices must be integers or slices, not str

In [10]:
from rouge import Rouge

# 生成的文本（hypothesis）
hypothesis = "This is a generated sentence."

# 多个参考文本（reference）组成的列表
references = [
    "This is a reference sentence 1.",
    "This is a reference sentence 2.",
    "This is a reference sentence 3."
]

# 初始化 Rouge 对象
rouge = Rouge()

# 计算 ROUGE 指标
scores = rouge.get_scores(hypothesis, references)

# 输出结果
print(scores)


AttributeError: 'list' object has no attribute 'split'

In [21]:
from rouge import Rouge

def list_to_string(cand, ref):
    return ' '.join(cand), ' '.join(ref)

cand = ['A', 'B', 'C', 'D']
muti_ref = [['aa', 'sd', 'we', 'we'], ['A', 'BC', 'D', 'tt']]

rouge = Rouge(metrics=['rouge-l'])

a = [[' '.join(ref)] for ref in muti_ref]
rouge_L = [rouge.get_scores(' '.join(cand), ' '.join(ref)) for ref in muti_ref]
print(rouge_L)
print(a)

[[{'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}], [{'rouge-l': {'r': 0.5, 'p': 0.5, 'f': 0.4999999950000001}}]]
[['aa sd we we'], ['A BC D tt']]


函数代码

In [3]:
from rouge import Rouge
import torch
import numpy as np



def filter_useless_words(sent, filterd_words):
    return [w for w in sent if w not in filterd_words]


def list_to_string(cand, ref):
    return ' '.join(cand), ' '.join(ref)


def evaluate_with_rougeL(data_loader, model, config):
    '''
    data_loader : 数据集 {'image': image, 'description_vectors': description_vectors, 'description': description}
    model : 模型
    config : 有关数据集的信息，
            config.captions_per_image 每个图片包含的描述数 
            config.beam_k 用于束搜索（beam search）的参数，表示每个时间步保留的候选文本的数量。
            config.max_len 生成文本的最大长度。
    '''
    # model.eval()
    cands = []  # 存储候选文本
    refs = []   # 存储参考文本
    filterd_words = ['<start>', '<end>', '<pad>']
    # cpi = config.captions_per_image
    # device = next(model.parameters()).device

    for i, batch in enumerate(data_loader):
        with torch.no_grad():
            # 生成候选文本
            texts = [des.split() for des in batch['description']] # 这是调试时用的，用参考文本先代替候选文本
            # texts = model.generate_by_beamsearch(imgs.to(device), config.beam_k, config.max_len+2)
            # 候选文本
            for text in texts:
                cands.append(filter_useless_words(text, filterd_words))
            # 参考文本
            for ref in batch['description']:
                refs.append(filter_useless_words(ref.split(), filterd_words))

    # 计算 ROUGE-L 分数
    rouge_score_value = compute_rouge_score(refs, cands)
    model.train()
    return rouge_score_value

def compute_rouge_score(references, candidates):
    # 计算 ROUGE-L 分数
    rouge = Rouge(metrics=['rouge-l'])
    scores = []
    for cand, ref in zip(candidates, references):
        # 保证 cand 和 ref 都得是字符串才行
        rouge_L = [rouge.get_scores(' '.join(cand), ' '.join(ref))]
        # 取每个 ROUGE-L 的平均值？
        f = np.mean([rl[0]['rouge-l']['f'] for rl in rouge_L])
        scores.append(f)
    return np.mean(scores)


evaluate_with_rougeL(dataloader, None, None)

AttributeError: 'NoneType' object has no attribute 'train'

ciderD <br/>
要 pip install git+https://github.com/michelecafagna26/cider.git#egg=cidereval

感觉这个库计算的数值不大正常，迟点换pycoco的试一试

In [15]:
from cidereval import cider, ciderD

# refs and preds are lists of strings, the method will re-format them for you
# 生成的文本（hypothesis）
hypothesis =[
    ["This is a generated sentence."],
    ["This is a generated sentence."],
    ["This is a generated sentence."]
]

# 多个参考文本（reference）组成的列表
references = [
    ["This is adsad reference sentence 1."],
    ["Thiadsds isadas a abd sentence 2."],
    ["Thisdas is a rcance sentence 3."]
]


a = [ciderD(predictions=hy, references=re, df='coco-val') for hy, re in zip(hypothesis,references)]
#cider_scores is a dict-like object with "avg_score" and "scores"

print(a)

[{'avg_score': 1.132545730129139e-05, 'scores': array([1.13254573e-05])}, {'avg_score': 3.002962163221202e-05, 'scores': array([3.00296216e-05])}, {'avg_score': 1.918020994573542e-05, 'scores': array([1.91802099e-05])}]


In [19]:
from cidereval import cider, ciderD

# refs and preds are lists of strings, the method will re-format them for you

references = [
    "This is reference 1.",
    "This is reference 2.",
    "This is reference 3."
]

predictions = [
    "This is a prediction for reference 1.",
    "This is a prediction for reference 2.",
    "This is a prediction for reference 3."
]

a = [ciderD(predictions=predictions, references=references, df='coco-val')]
#cider_scores is a dict-like object with "avg_score" and "scores"

print(a)

[{'avg_score': 0.030670514815346384, 'scores': array([0.035809  , 0.02742222, 0.02878032])}]


In [22]:
from cidereval import cider, ciderD

# refs and preds are lists of strings, the method will re-format them for you

references = [
    "This is reference 1.",
    "This is reference 2.",
    "This is reference 3."
]

predictions = [
    "This is a prediction for reference 3."
]

a = [ciderD(predictions=predictions * 3, references=references, df='coco-val')]
b = [cider(predictions=predictions * 3, references=references)]
#cider_scores is a dict-like object with "avg_score" and "scores"

print(a)
print(b)

[{'avg_score': 0.00959344110232026, 'scores': array([0.        , 0.        , 0.02878032])}]
[{'avg_score': 0.015816910404604298, 'scores': array([0.        , 0.        , 0.04745073])}]


函数代码

In [7]:
from cidereval import ciderD
import torch
import numpy as np

def filter_useless_words(sent, filterd_words):
    return [w for w in sent if w not in filterd_words]

def list_to_string(cands, refs):
    return [' '.join(cand) for cand in cands], [' '.join(ref) for ref in refs]


def evaluate_with_ciderD(data_loader, model, config):
    '''
    data_loader : 数据集 {'image': image, 'description_vectors': description_vectors, 'description': description}
    model : 模型
    config : 有关数据集的信息，
            config.captions_per_image 每个图片包含的描述数 
            config.beam_k 用于束搜索（beam search）的参数，表示每个时间步保留的候选文本的数量。
            config.max_len 生成文本的最大长度。
    '''
    # model.eval()
    cands = []  # 存储候选文本
    refs = []   # 存储参考文本
    filterd_words = ['<start>', '<end>', '<pad>']
    # cpi = config.captions_per_image
    # device = next(model.parameters()).device

    for i, batch in enumerate(data_loader):
        with torch.no_grad():
            # 生成候选文本
            texts = [des.split() for des in batch['description']] # 这是调试时用的，用参考文本先代替候选文本
            # texts = model.generate_by_beamsearch(imgs.to(device), config.beam_k, config.max_len+2)
            # 候选文本
            for text in texts:
                cands.append(filter_useless_words(text, filterd_words))
            # 参考文本
            for ref in batch['description']:
                refs.append(filter_useless_words(ref.split(), filterd_words))

    # multiple_refs = []
    # for idx in range(len(refs)):
    #     multiple_refs.append(refs[(idx//cpi)*cpi : (idx//cpi)*cpi+cpi])

    # 计算 CIDEr-D 分数
    cider_score_value = compute_cider_score(refs, cands)
    model.train()
    return cider_score_value


def compute_cider_score(references, candidates):
    # 计算 CIDEr-D 分数
    scores = []
    cands, refs = list_to_string(references, candidates)
    # for cand, ref in zip(candidates, references):
    #     # 将 cand 重复n遍，使其长度等于 muti_ref
    #     cand = [' '.join(cand)] * len(muti_ref)
    #     muti_ref = [' '.join(ref) for ref in muti_ref]
    #     # 取 CIDEr-D 的平均值？
    #     cD = ciderD(cand, muti_ref, df='coco-val')
    #     scores.append(cD['avg_score'])
    cD = ciderD(cands, refs, df = 'coco-val')
    return cD['avg_score']


evaluate_with_ciderD(dataloader, None, None)

{'avg_score': 5.826569362267109e-11, 'scores': array([2.85749833e-13, 4.94224528e-17, 2.25441072e-37, ...,
       7.80027756e-18, 0.00000000e+00, 1.43228523e-20])} {'avg_score': 0.0, 'scores': array([0., 0., 0., ..., 0., 0., 0.])}


AttributeError: 'NoneType' object has no attribute 'train'

SPICE <br/>
用pycoco的库<br/>
pip install pycocoevalcap

In [23]:
# Collect all references from dataset as references: dict
# Collect all captions generated by model as captions: dict

references = {
    "1": ["this is a tree", "this is an apple"],
    "2": ["a man is sitting", "a man in the street"],
}

captions = {
    "1": ["this is a big tree"],
    "2": ["a man is sitting"],
}

# Save them as correct json files
import json

new_cap = []
for k, v in captions.items():
    new_cap.append({'image_id': k, 'caption': v[0]})

new_ref = {'images': [], 'annotations': []}
for k, refs in references.items():
    new_ref['images'].append({'id': k})
    for ref in refs:
        new_ref['annotations'].append({'image_id': k, 'id': k, 'caption': ref})

with open('references.json', 'w') as fgts:
    json.dump(new_ref, fgts)
with open('captions.json', 'w') as fres:
    json.dump(new_cap, fres)

In [None]:

import torch
import numpy as np
import json


def filter_useless_words(sent, filterd_words):
    return [w for w in sent if w not in filterd_words]


def save_json(cand, ref):
    new_cap = []
    for k, v in captions.items():
        new_cap.append({'image_id': k, 'caption': v[0]})

    new_ref = {'images': [], 'annotations': []}
    for k, refs in references.items():
        new_ref['images'].append({'id': k})
        for ref in refs:
            new_ref['annotations'].append({'image_id': k, 'id': k, 'caption': ref})

    with open('references.json', 'w') as fgts:
        json.dump(new_ref, fgts)
    with open('captions.json', 'w') as fres:
        json.dump(new_cap, fres)


def evaluate_with_meteor(data_loader, model, config):
    model.eval()
    cands = {}  # 存储候选文本
    refs = {}   # 存储参考文本
    filterd_words = set({model.vocab['<start>'], model.vocab['<end>'], model.vocab['<pad>']})
    cpi = config.captions_per_image
    device = next(model.parameters()).device

    for i, (imgs, caps, caplens) in enumerate(data_loader):
        with torch.no_grad():
            # 生成参考文本
            texts = model.generate_by_beamsearch(imgs.to(device), config.beam_k, config.max_len+2)
            # 候选文本
            cands[f'{i}'] = [filter_useless_words(text, filterd_words) for text in texts]
            # 参考文本
            refs[f'{i}'] = [filter_useless_words(cap, filterd_words) for cap in caps.tolist()]

    multiple_refs = {}
    for idx in range(len(refs)):
        multiple_refs.append(refs[(idx//cpi)*cpi : (idx//cpi)*cpi+cpi])

    # 计算 CIDEr-D 分数
    spice_score_value = compute_spice_score(multiple_refs, cands)
    model.train()
    return spice_score_value


def compute_spice_score(references, candidates):
