In [11]:
import json
import os
import math
import sys
import numpy as np
import torch
from tqdm import tqdm
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from collections import Counter


def compute_distinct(genereated_responses):
    intra_dist_1 = []
    intra_dist_2 = []
    inter_dist_1 = []
    inter_dist_2 = []
    
    for gen_resps in generated_responses:
        inta_dist1 = []
        inta_dist2 = []
        
        unigrams_all = Counter()
        bigrams_all = Counter()
        n_unigrams = 0
        n_bigrams = 0
        for gen_resp in gen_resps:
            if len(gen_resp) == 0:
                continue
                
            unigrams = Counter([tuple(gen_resp[i:i+1]) for i in range(len(gen_resp)-1)])
            inta_dist1.append(len(unigrams)/len(gen_resp))
            
            if len(gen_resp) > 1:
                bigrams = Counter([tuple(gen_resp[i:i+2]) for i in range(len(gen_resp)-1)])
                inta_dist2.append(len(bigrams)/(len(gen_resp)-1))
            
            unigrams_all.update([tuple(gen_resp[i:i+1]) for i in range(len(gen_resp)-1)])
            bigrams_all.update([tuple(gen_resp[i:i+2]) for i in range(len(gen_resp)-1)])
            
            n_unigrams += len(gen_resp) 
            if len(gen_resp) > 1:
                n_bigrams += (len(gen_resp)-1)
        
        intra_dist_1.append(np.mean(inta_dist1))
        intra_dist_2.append(np.mean(inta_dist2))
        
        inter_dist_1.append(len(unigrams_all)/n_unigrams)
        
        if n_bigrams > 0:
            inter_dist_2.append(len(bigrams_all)/n_bigrams)
    
    return np.mean(intra_dist_1), np.mean(intra_dist_2), np.mean(inter_dist_1), np.mean(inter_dist_2)
        

def compute_PPL(model, tokenizer, generated_responses):
    perplexity_scores = []
    for gen_resps in tqdm(generated_responses):
        for gen_resp in gen_resps:
            gen_resp = ' '.join(gen_resp)
            try:
                input_ids = torch.tensor(tokenizer.encode(gen_resp)).unsqueeze(0) 
                input_ids = input_ids.cuda()
                with torch.no_grad():
                    outputs = model(input_ids, labels=input_ids)
                loss, logits = outputs[:2]
                perplexity_scores.append(math.exp(loss))
            except:
                pass
                #print(gen_resp)
                #print('PPL computation error!')
    
    return np.mean(perplexity_scores)

def compute_ASL(generated_responses):
    asl = []
    for gen_resps in generated_responses:
        for resp in gen_resps:
            asl.append(len(resp))
    
    return np.mean(asl)

def compute_TTR(generated_responses):
    all_tokens = []
    for gen_resp in generated_responses:
        all_tokens.extend(gen_resp)
    
    return len(set(all_tokens)) / len(all_tokens)

def compute_bleu(generated_responses, ground_truth):
    
    recall_bleus = []
    precision_bleus = []
    for index in range(len(ground_truth)):
        bleu_scores = []
        for gen_resp in generated_responses[index]:
            try:
                bleu_scores.append(sentence_bleu([ground_truth[index]], gen_resp, smoothing_function=SmoothingFunction().method7, weights=[1./3, 1./3, 1./3]))
            except:
                bleu_scores.append(0.0)
        
        recall_bleus.append(np.max(bleu_scores))
        precision_bleus.append(np.mean(bleu_scores))
    
    
    return np.mean(recall_bleus), np.mean(precision_bleus)

def count_short_responses(generated_responses):
    counts = [0, 0]
    for gen_resps in generated_responses:
        for gen_resp in gen_resps:
            if len(gen_resp) == 0:
                counts[0] += 1
            elif len(gen_resp) == 1:     
                counts[1] += 1
    
    return counts

def load_DialogWAE_results(f_obj):
    output = []
    o_dict = {'generated_sentences': []}
    lines = f_obj.readlines()
    for line in tqdm(lines):
        #print(line)
        if line.startswith('Target >>') == True:
            line = line.rstrip('</s>')
            o_dict['next sentence'] = line[line.index('>>')+2:].strip()
        elif line.startswith('Sample') == True:     
            line = line.rstrip('</s>')
            o_dict['generated_sentences'].append(line[line.index('>>')+2:].strip())
        elif line.strip() == '':
            assert len(o_dict['generated_sentences']) == 10 and o_dict['next sentence'] != '', f'Error! {o_dict}'
            output.append(o_dict)
            o_dict = {'generated_sentences': []}
    
    return output

def compute_token_usage(predicted_tokens, generated_responses):
    
    response_w_tokens = 0
    tokens_found_counts = []
    for q_index in range(len(predicted_tokens)):
        for r_index in range(len(predicted_tokens[q_index])):
            bow = predicted_tokens[q_index][r_index]
            response = generated_responses[q_index][r_index]
            tokens_found = 0
            for token in bow:
                if token in response:
                    tokens_found += 1
            
            if tokens_found > 0:
                response_w_tokens += 1
                
            tokens_found_counts.append(tokens_found)
    
    return (response_w_tokens / len(tokens_found_counts)) * 100, np.mean(tokens_found_counts)
            
            
        
    

os.chdir('/collection/ka2khan/thesis/Cond_Text_Gen')
print(os.getcwd())

output_files = {
    #'TACGAN End-to-End': 'outputs/DailyDialog_t5-large_gtk_w_keywords_TACGAN_EndtoEnd_output.json',
    #'TACGAN wo ENC Dec Backprop': 'outputs/DailyDialog_TACGAN_wo_EncDec_Backprop_test_output_12.json',
    #'T5 wo Keywords': 'outputs/DailyDialog_t5_wo_keywords_test_output_11.json',
    #'DialogWAE': 'outputs/DailyDialog_DialogWAE_results.json',
    #'S10': 'outputs/S10_output.json',
    #'DialogWAE_DailyDialog': 'outputs/DialogWAE_DailyDialog.txt',
    #'DialogWAE_SWDA': 'outputs/DialogWAE_SWDA.txt',
    'DailyDialog': 'outputs/S6_DailyDialog.json',
    'SWDA': 'outputs/S6_SWDA.json',
    'T5-DailyDialog': 'outputs/T5_DailyDialog.json',
    'T5-SWDA': 'outputs/T5_SWDA.json',
    }

for exp_name, file_path in output_files.items():
    predicted_tokens = []
    generated_responses = []
    ground_truth = []
    if exp_name == 'DialogWAE_DailyDialog' or exp_name == 'DialogWAE_SWDA':
        f_obj = open(file_path)
        output = load_DialogWAE_results(f_obj)
        for item in tqdm(output):
            ground_truth.append(word_tokenize(item['next sentence']))
            
            gen_resps = []
            for index, gen_resp in enumerate(item['generated_sentences']):
                gen_resps.append(word_tokenize(gen_resp))
                
            generated_responses.append(gen_resps)
    else:
        with open(file_path) as f_obj:
            output = json.load(f_obj)
        
        for item in output:
            predicted_tokens.append(item['predicted_sent_bow'])
            ground_truth.append(item['next sentence'])
            gen_resps = []
            for index, gen_resp in enumerate(item['generated_sentences']):
                #index = gen_resp.rfind('.')
                #if index != -1:
                #    en_resp = gen_resp[:index]
                gen_resps.append(word_tokenize(gen_resp))
                
            generated_responses.append(gen_resps)


    avg_recall_bleu, avg_precision_bleu = compute_bleu(generated_responses, ground_truth)
    
    first_responses = []
    for gen_resps in generated_responses:
        first_responses.append(gen_resps[0])

    
    assert len(generated_responses) == len(first_responses)
        
    ttr = compute_TTR(first_responses)
    asl = compute_ASL(generated_responses)
    
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
    config = GPT2Config.from_pretrained('gpt2-large')
    model = GPT2LMHeadModel.from_pretrained('gpt2-large', config=config)
    model.cuda()
    model.eval()
    
    ppl = compute_PPL(model, tokenizer, generated_responses)
    
    if exp_name in ['DailyDialog', 'SWDA']:
        token_percent, tokens_avg = compute_token_usage(predicted_tokens, generated_responses)
        
        print(f'Responses generated using at least one token: {token_percent:.2f}')
        print(f'Avg. token usage: {tokens_avg}')
    
    counts = count_short_responses(generated_responses)
    
    intra_dist_1, intra_dist_2, inter_dist_1, inter_dist_2 = compute_distinct(generated_responses)
    
    print(f'Experiment: {exp_name}')
    print(f'Avg Recall Bleu: {avg_recall_bleu:.2f}, Avg Precision Bleu: {avg_precision_bleu:.2f}')
    print(f'TTR: {ttr * 100}')
    print(f'ASL: {asl}')
    print(f'PPL: {ppl}')
    print(f'Short responses count 0 words: {counts[0]}, 1 words: {counts[1]}')
    print(f'Intra Dist-1: {intra_dist_1:.2f}, Intra Dist-2: {intra_dist_2:.2f}, Inter Dist-1: {inter_dist_1:.2f}, Inter Dist-2: {inter_dist_2:.2f}')
    
    print()    
    
    

/collection/ka2khan/thesis/Cond_Text_Gen


100%|██████████| 5566/5566 [22:07<00:00,  4.19it/s]


Responses generated using at least one token: 99.71
Avg. token usage: 4.762738052461373
Experiment: DailyDialog
Avg Recall Bleu: 0.09, Avg Precision Bleu: 0.05
TTR: 1.1042088165731057
ASL: 13.05298239310097
PPL: 127.06401101180019
Short responses count 0 words: 0, 1 words: 0
Intra Dist-1: 0.82, Intra Dist-2: 0.98, Inter Dist-1: 0.52, Inter Dist-2: 0.89



100%|██████████| 5046/5046 [20:12<00:00,  4.16it/s]


Responses generated using at least one token: 99.92
Avg. token usage: 5.184839476813318
Experiment: SWDA
Avg Recall Bleu: 0.08, Avg Precision Bleu: 0.02
TTR: 0.45499198593661133
ASL: 10.904558065794689
PPL: 183.07041754111867
Short responses count 0 words: 0, 1 words: 0
Intra Dist-1: 0.74, Intra Dist-2: 0.91, Inter Dist-1: 0.48, Inter Dist-2: 0.84



100%|██████████| 5566/5566 [22:13<00:00,  4.18it/s]


Experiment: T5-DailyDialog
Avg Recall Bleu: 0.08, Avg Precision Bleu: 0.04
TTR: 7.660672400312745
ASL: 11.499730506647502
PPL: 436.09244729570787
Short responses count 0 words: 0, 1 words: 42
Intra Dist-1: 0.84, Intra Dist-2: 0.99, Inter Dist-1: 0.50, Inter Dist-2: 0.84



100%|██████████| 5046/5046 [19:52<00:00,  4.23it/s]


Experiment: T5-SWDA
Avg Recall Bleu: 0.07, Avg Precision Bleu: 0.02
TTR: 7.316185076334886
ASL: 7.548612762584225
PPL: 2721.637323190426
Short responses count 0 words: 21, 1 words: 13227
Intra Dist-1: 0.57, Intra Dist-2: 0.98, Inter Dist-1: 0.52, Inter Dist-2: 0.89



In [1]:
import json
import os
import math
import numpy as np
import torch
from tqdm import tqdm
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from collections import Counter


def compute_distinct(genereated_responses):
    intra_dist_1 = []
    intra_dist_2 = []
    inter_dist_1 = []
    inter_dist_2 = []
    
    for gen_resps in generated_responses:
        inta_dist1 = []
        inta_dist2 = []
        
        unigrams_all = Counter()
        bigrams_all = Counter()
        n_unigrams = 0
        n_bigrams = 0
        for gen_resp in gen_resps:
            if len(gen_resp) == 0:
                continue
                
            unigrams = Counter([tuple(gen_resp[i:i+1]) for i in range(len(gen_resp)-1)])
            inta_dist1.append(len(unigrams)/len(gen_resp))
            
            if len(gen_resp) > 1:
                bigrams = Counter([tuple(gen_resp[i:i+2]) for i in range(len(gen_resp)-1)])
                inta_dist2.append(len(bigrams)/(len(gen_resp)-1))
            
            unigrams_all.update([tuple(gen_resp[i:i+1]) for i in range(len(gen_resp)-1)])
            bigrams_all.update([tuple(gen_resp[i:i+2]) for i in range(len(gen_resp)-1)])
            
            n_unigrams += len(gen_resp) 
            if len(gen_resp) > 1:
                n_bigrams += (len(gen_resp)-1)
        
        intra_dist_1.append(np.mean(inta_dist1))
        intra_dist_2.append(np.mean(inta_dist2))
        
        inter_dist_1.append(len(unigrams_all)/n_unigrams)
        
        if n_bigrams > 0:
            inter_dist_2.append(len(bigrams_all)/n_bigrams)
    
    return np.mean(intra_dist_1), np.mean(intra_dist_2), np.mean(inter_dist_1), np.mean(inter_dist_2)
        

def compute_PPL(model, tokenizer, generated_responses):
    perplexity_scores = []
    for gen_resps in tqdm(generated_responses):
        for gen_resp in gen_resps:
            gen_resp = ' '.join(gen_resp)
            try:
                input_ids = torch.tensor(tokenizer.encode(gen_resp)).unsqueeze(0) 
                input_ids = input_ids.cuda()
                with torch.no_grad():
                    outputs = model(input_ids, labels=input_ids)
                loss, logits = outputs[:2]
                perplexity_scores.append(math.exp(loss))
            except:
                pass
                #print(gen_resp)
                #print('PPL computation error!')
    
    return np.mean(perplexity_scores)

def compute_avg_PPL(model, tokenizer, generated_responses):
    perplexity_scores = []
    for gen_resp in tqdm(generated_responses):
        try:
            input_ids = torch.tensor(tokenizer.encode(gen_resp)).unsqueeze(0) 
            input_ids = input_ids.cuda()
            with torch.no_grad():
                outputs = model(input_ids, labels=input_ids)
            loss, logits = outputs[:2]
            perplexity_scores.append(math.exp(loss))
        except:
            pass
            #print(gen_resp)
            #print('PPL computation error!')
    
    return np.mean(perplexity_scores)

def compute_ASL(generated_responses):
    asl = []
    for gen_resps in generated_responses:
        for resp in gen_resps:
            asl.append(len(resp))
    
    return np.mean(asl)

def compute_TTR(generated_responses):
    all_tokens = []
    for gen_resp in generated_responses:
        all_tokens.extend(gen_resp)
    
    return len(set(all_tokens)) / len(all_tokens)

def compute_bleu(generated_responses, ground_truth):
    
    recall_bleus = []
    precision_bleus = []
    for index in range(len(ground_truth)):
        bleu_scores = []
        for gen_resp in generated_responses[index]:
            try:
                bleu_scores.append(sentence_bleu([ground_truth[index]], gen_resp, smoothing_function=SmoothingFunction().method7, weights=[1./3, 1./3, 1./3]))
            except:
                bleu_scores.append(0.0)
        
        recall_bleus.append(np.max(bleu_scores))
        precision_bleus.append(np.mean(bleu_scores))
    
    
    return np.mean(recall_bleus), np.mean(precision_bleus)

def count_short_responses(generated_responses):
    counts = [0, 0]
    for gen_resps in generated_responses:
        for gen_resp in gen_resps:
            if len(gen_resp) == 0:
                counts[0] += 1
            elif len(gen_resp) == 1:     
                counts[1] += 1
    
    return counts
    
    

os.chdir('/collection/ka2khan/thesis/Cond_Text_Gen')
print(os.getcwd())

output_files = {
    'VAE-AM-multiturn': 'outputs/multiturn_dialog_vae_gan_mse_multi.txt',
}


for exp_name, file_path in output_files.items():
    f_obj = open(file_path)
    generated_responses = []
    for line in f_obj:
        generated_responses.append(line)


    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
    config = GPT2Config.from_pretrained('gpt2-large')
    model = GPT2LMHeadModel.from_pretrained('gpt2-large', config=config)
    model.cuda()
    model.eval()
    
    ppl = compute_avg_PPL(model, tokenizer, generated_responses)
    
    print(f'PPL: {ppl}')
    
    

/collection/ka2khan/thesis/Cond_Text_Gen


100%|██████████| 44660/44660 [17:53<00:00, 41.60it/s]

PPL: 3457.175266843699



