# Analyzing Results

In [1]:
import os
import time
import glob
import random
import struct
import csv
from collections import namedtuple
import json, re, shutil, sys
import collections, itertools
import numpy as np

import data
from data import Vocab
import nltk
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Vocabulary token used as the out-of-vocabulary marker: a word is treated as
# OOV when vocab.word2id maps it to the same id as this token.
UNKNOWN_TOKEN = '[UNK]'

def get_file_names(path,root):
    """Derive the example id and its companion file paths from a summary path.

    The example id is the portion of the file name (directory stripped) that
    precedes the first underscore. The article and reference files are
    located under ``root``/reference and named after that id.

    Returns:
        Tuple ``(example_id, article_file, ref_file)``.
    """
    file_name = os.path.basename(path)
    example_id = file_name.partition('_')[0]
    return (
        example_id,
        os.path.join(root, f"reference/{example_id}_article.txt"),
        os.path.join(root, f"reference/{example_id}_reference.txt"),
    )
    
    
def read_contents(file):
    """Return the entire text content of ``file``.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the default locale encoding of the machine running the notebook.
    NOTE(review): assumes the decode outputs were written as UTF-8 — confirm.

    Args:
        file: Path of the text file to read.

    Returns:
        The file content as a single string.
    """
    with open(file, 'r', encoding='utf-8') as f:
        return f.read()

def get_article_oov(article, vocab):
    """Return the out-of-vocabulary tokens of an article.

    A token counts as OOV when ``vocab.word2id`` maps it to the same id as
    ``UNKNOWN_TOKEN``.

    Args:
        article: Article text made of whitespace-separated tokens.
        vocab: Object exposing ``word2id(word)``.

    Returns:
        List of OOV tokens in article order, one entry per occurrence.
    """
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    # split() without an argument splits on any run of whitespace, so repeated
    # spaces do not yield empty-string "tokens" and a line break does not glue
    # the last token of one line to the first token of the next.
    return [w for w in article.split() if vocab.word2id(w) == unk_id]

def get_novel_words(abstract, vocab, article_oovs):
    """Split the OOV tokens of a summary into novel and copied words.

    Only tokens that ``vocab.word2id`` maps to the id of ``UNKNOWN_TOKEN`` are
    considered. Such a token is "copied" when it also appears in
    ``article_oovs`` and "new" otherwise.

    Args:
        abstract: Summary text made of whitespace-separated tokens.
        vocab: Object exposing ``word2id(word)``.
        article_oovs: Iterable of OOV tokens found in the source article.

    Returns:
        Tuple ``(new_words, copy_words)``; both lists keep summary order and
        contain one entry per occurrence.
    """
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    # Build the lookup set once instead of scanning the list for every token.
    source_oovs = set(article_oovs)
    new_words = []
    copy_words = []
    # split() without an argument avoids empty tokens from repeated spaces and
    # tokens fused across line breaks.
    for w in abstract.split():
        if vocab.word2id(w) != unk_id:
            continue
        if w in source_oovs:
            copy_words.append(w)
        else:
            new_words.append(w)
    return new_words, copy_words


def summary_repetition(abstract,count=1):
    """Find the 2-, 3- and 4-grams that occur at least ``count`` times.

    Args:
        abstract: Summary text made of whitespace-separated tokens.
        count: Minimum number of occurrences for an n-gram to be reported.

    Returns:
        Tuple ``(bi_grams, tri_grams, four_grams)``. Each element is a list of
        ``(ngram_tuple, occurrences)`` pairs in order of first appearance.
    """
    # split() without an argument avoids empty tokens from repeated spaces and
    # tokens fused across line breaks, both of which distort n-gram counts.
    words = abstract.split()
    repeated = []
    for n in (2, 3, 4):
        # Zipping n staggered views of the token list yields every window of
        # n consecutive tokens; it is empty when there are fewer than n tokens.
        windows = zip(*(words[offset:] for offset in range(n)))
        freq = collections.Counter(windows)
        repeated.append([(gram, seen) for gram, seen in freq.items() if seen >= count])
    return tuple(repeated)


def get_freq_dist(text,n):
    """Count the n-grams of ``text``.

    Args:
        text: Text made of whitespace-separated tokens.
        n: Size of the n-grams to count.

    Returns:
        ``nltk.FreqDist`` keyed by n-gram tuples.
    """
    # split() without an argument avoids empty tokens from repeated spaces and
    # tokens fused across line breaks.
    words = text.split()
    # FreqDist consumes the n-gram iterator directly; no intermediate list needed.
    return nltk.FreqDist(nltk.ngrams(words, n))
    

def log_details(message,log):
    """Print ``message`` only when ``log`` is truthy."""
    if not log:
        return
    print(message)

def get_analysis_data(input_df,model,vocab,log=False,
                      base_dir='/home/ubuntu/W266/final_0/W266_Final'):
    """Collect per-example OOV and repetition counts for one model's summaries.

    Every file under ``{base_dir}/{model}/saved/decode/generated`` is treated
    as one generated summary. Its article is located with ``get_file_names``.
    For each example, one row is produced with the number of article OOV
    tokens, generated (novel) OOV tokens, copied OOV tokens, and the number of
    distinct 2-/3-/4-grams that occur at least twice in the summary.

    Args:
        input_df: Existing DataFrame to extend, or ``None`` to start fresh.
        model: Model directory name; also stored in the ``model`` column.
        vocab: Vocabulary object passed through to the OOV helpers.
        log: When truthy, print per-example details.
        base_dir: Root directory containing the per-model folders.

    Returns:
        DataFrame with one row per summary file. When ``input_df`` is given,
        the new rows are appended to a copy of it with a fresh index.
    """
    path = f'{base_dir}/{model}/saved/decode'
    column_list = ['model','Article OOV','Generated','Copied','2-Gram(repeat)','3-Gram(repeat)','4-Gram(repeat)']
    banner = '============================================================='

    # Named `rows` rather than `data` so the imported `data` module is not shadowed.
    rows = []
    # glob order is arbitrary; sorting keeps the row order reproducible.
    for f in sorted(glob.glob(os.path.join(path, "generated/*"))):
        log_details(banner,log)
        example_id,article_f,_ = get_file_names(f,path)
        log_details(f'EXAMPLE: {example_id}',log)
        summary = read_contents(f)
        article = read_contents(article_f)

        article_oovs = get_article_oov(article,vocab)
        generated_words,copied_words = get_novel_words(summary, vocab, article_oovs)
        # Only n-grams seen at least twice count as repetition.
        bi,tri,four = summary_repetition(summary,2)

        rows.append([model,
                     len(article_oovs),
                     len(generated_words),
                     len(copied_words),
                     len(bi),
                     len(tri),
                     len(four)])

        log_details(f'[OOV       : {len(article_oovs)}] : {article_oovs}',log)
        log_details(f'[GENERATED : {len(generated_words)}] : {generated_words}',log)
        log_details(f'[COPIED    : {len(copied_words)}] : {copied_words}',log)
        log_details(f'[REPEAT]    :  Bi - {len(bi)} ;; Tri - {len(tri)} ;; Four - {len(four)}',log)
        log_details(banner,log)

    df = pd.DataFrame(rows,columns=column_list)

    if input_df is None:
        return df
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    return pd.concat([input_df, df], ignore_index=True)


def _combined_freq_dist(texts, n):
    """Sum the n-gram counts of ``texts``, counting each text on its own."""
    total = nltk.FreqDist()
    for text in texts:
        total.update(get_freq_dist(text, n))
    return total


def get_top_repetitions(model, base_dir='/home/ubuntu/W266/final_0/W266_Final'):
    """Build corpus-level n-gram frequency distributions for one model.

    Every file under ``{base_dir}/{model}/saved/decode/generated`` is read as
    a generated summary, together with the reference file that
    ``get_file_names`` associates with it.

    N-grams are counted per document and the counts are then summed. Joining
    all documents into one string first would glue the last word of one file
    to the first word of the next and would count n-grams that span two
    unrelated documents.

    Args:
        model: Model directory name.
        base_dir: Root directory containing the per-model folders.

    Returns:
        Tuple ``(summary_bi, summary_tri, summary_four, ref_bi, ref_tri,
        ref_four)`` of ``nltk.FreqDist`` objects.
    """
    path = f'{base_dir}/{model}/saved/decode'
    summaries = []
    references = []
    # glob order is arbitrary; sorting keeps tie ordering in the results stable.
    for f in sorted(glob.glob(os.path.join(path, "generated/*"))):
        _, _, reference_f = get_file_names(f,path)
        summaries.append(read_contents(f))
        references.append(read_contents(reference_f))

    summary_bi, summary_tri, summary_four = (
        _combined_freq_dist(summaries, n) for n in (2, 3, 4)
    )
    ref_bi, ref_tri, ref_four = (
        _combined_freq_dist(references, n) for n in (2, 3, 4)
    )

    return summary_bi,summary_tri,summary_four,ref_bi,ref_tri,ref_four


In [3]:
# Load the vocabulary from the processed vocab file. 50000 is Vocab's second
# argument (presumably the maximum vocabulary size — confirm in data.Vocab).
vocab = Vocab('/home/ubuntu/W266/final_0/W266_Final/data/final_processed/vocab',50000)

NameError: name 'Vocab' is not defined

# Baseline Model

In [None]:
# Run directory analysed in this section; the analysis functions interpolate
# it into the decode path.
model = 'model_3'

In [None]:
# Per-example OOV / repetition counts for this run, starting from no prior
# DataFrame and with per-example logging off.
df_baseline = get_analysis_data(None,model,vocab,False)

In [None]:
# Preview the first rows of the per-example counts.
df_baseline.head()

In [None]:
# Bar chart of the mean of every numeric column. numeric_only=True skips the
# string 'model' column, which makes DataFrame.mean raise on pandas >= 2.0.
df_baseline.mean(numeric_only=True).plot(kind='bar')#discuss-1b

In [None]:
# 2-/3-/4-gram frequency distributions for the generated summaries (sfd_*)
# and for the reference summaries (rfd_*).
sfd_2,sfd_3,sfd_4,rfd_2,rfd_3,rfd_4 = get_top_repetitions(model)

# Pointer-Generator Network

In [None]:
# Run directory analysed in this section; rebinding `model` switches the
# cells below to this run.
model = 'model_4'

In [None]:
# Per-example OOV / repetition counts for this run. Uses the `model` variable
# set in the previous cell rather than repeating the literal, so this cell and
# the get_top_repetitions(model) cell below cannot drift apart.
df_pointgen = get_analysis_data(None,model,vocab,False)

In [None]:
# Preview the first rows of the per-example counts.
df_pointgen.head()

In [None]:
# (rows, columns): one row per generated summary file that was analysed.
df_pointgen.shape

In [None]:
# Bar chart of the mean of every numeric column. numeric_only=True skips the
# string 'model' column, which makes DataFrame.mean raise on pandas >= 2.0.
df_pointgen.mean(numeric_only=True).plot(kind='bar') #discuss-1a

In [None]:
# 2-/3-/4-gram frequency distributions for the generated summaries (sfd_*)
# and for the reference summaries (rfd_*); overwrites the values from the
# earlier get_top_repetitions cell.
sfd_2,sfd_3,sfd_4,rfd_2,rfd_3,rfd_4 = get_top_repetitions(model)

In [None]:
# Five most frequent bigrams in the generated summaries (raw counts).
sfd_2.plot(5,cumulative=False) #discuss-2a

In [None]:
# Five most frequent bigrams in the reference summaries (raw counts).
rfd_2.plot(5,cumulative=False)#discuss-2b

In [None]:
# Five most frequent trigrams in the generated summaries (raw counts).
sfd_3.plot(5,cumulative=False) #discuss-3a

In [None]:
# Five most frequent trigrams in the reference summaries (raw counts).
rfd_3.plot(5,cumulative=False)#discuss-3b

# Combined Analysis