In [None]:
from typing import List
import pickle
import pandas as pd
import numpy as np

def read_results(tasks: List[str], data_sources: List[str], mask_ratios: List[float], context_type,):
    """Load pickled answer files and average their metrics.

    One answer file is read per (task, data source, mask ratio) combination.
    The task ``'continue_conversation'`` is special: it always reads the
    ``conversation`` files and ignores ``data_sources``. For the ``'arxiv'``
    source with ``context_type == "Random-phrase"`` the files are read from
    the ``arxiv_buggy/`` sub-directory.

    Args:
        tasks: Task names used in the answer file names.
        data_sources: Any of ``'news'``, ``'conversation'``, ``'arxiv'``.
        mask_ratios: Ratios interpolated verbatim into the file names.
        context_type: Key selecting one entry of each loaded object's
            ``metrics`` mapping.

    Returns:
        A ``pandas.Series`` holding the mean over all loaded files of
        bleu, meteor, rouge1, rouge2, rougeL and the three bertscore_* values.

    Raises:
        ValueError: If ``data_sources`` contains an unsupported name
            (previously this surfaced as an UnboundLocalError or silently
            re-used the previous path).
        KeyError: If ``context_type`` is missing from a file's metrics or an
            expected metric column is absent.
    """
    base_dir = '/vol/research/lyc/llm_memorize'

    all_ans_paths = []
    for task in tasks:
        if task == 'continue_conversation':
            # Conversation continuation only exists for the conversation data.
            for mask_ratio in mask_ratios:
                all_ans_paths.append(f'{base_dir}/answer_{task}_conversation_{mask_ratio}.pkl')
            continue

        for data_source in data_sources:
            if data_source == 'news' or data_source == 'conversation':
                subdir = ''
            elif data_source == 'arxiv':
                subdir = 'arxiv_buggy/' if context_type == "Random-phrase" else ''
            else:
                raise ValueError(
                    f"Unsupported data source {data_source!r}; "
                    "expected 'news', 'conversation' or 'arxiv'"
                )
            for mask_ratio in mask_ratios:
                all_ans_paths.append(f'{base_dir}/{subdir}answer_{task}_{data_source}_{mask_ratio}.pkl')

    results = []
    for ans_path in all_ans_paths:
        print(ans_path)

        # NOTE: pickle.load can execute arbitrary code; only read trusted files.
        with open(ans_path, 'rb') as f:
            ans = pickle.load(f)

        r = ans.metrics[context_type]
        if 'precision' in r:
            # Collapse the 'precision' / 'recall' / 'f1' entries to their means.
            r['bertscore_precision'] = np.mean(r['precision'])
            r['bertscore_recall'] = np.mean(r['recall'])
            r['bertscore_f1'] = np.mean(r['f1'])
        results.append(r)

    df = pd.DataFrame(results)
    df = df[['bleu', 'meteor', 'rouge1', 'rouge2', 'rougeL', 'bertscore_precision', 'bertscore_recall', 'bertscore_f1']]
    avg_results = df.mean()

    return avg_results

def main(context_type):
    """Print the averaged metrics for each mask ratio, then the combined table.

    For every mask ratio, `read_results` is called over the summarisation,
    qa and continue_conversation tasks on the news and arxiv sources. Each
    per-ratio result is printed, and finally a frame with one row per ratio
    is printed. Nothing is returned.
    """
    shared_kwargs = dict(
        tasks=['summarisation', 'qa', 'continue_conversation'],
        data_sources=['news', 'arxiv'],
        context_type=context_type,
    )

    per_ratio = {}
    for ratio in (0.2, 0.35, 0.5, 0.65, 0.8):
        ratio_means = read_results(mask_ratios=[ratio], **shared_kwargs)
        print(ratio_means)
        per_ratio[ratio] = ratio_means

    summary = pd.DataFrame.from_dict(per_ratio, orient='index')
    print(summary)


In [None]:
# Print the per-mask-ratio averaged metrics for the 'self-info-phrase' context type.
main('self-info-phrase')

In [None]:
# Same report for the 'Random-phrase' context type (read_results takes arxiv files from arxiv_buggy/ for it).
main('Random-phrase')

In [None]:
# Same report for the 'no2-phrase' context type.
main('no2-phrase')

In [None]:
def main(context_type):
    """Build a (Method, Task)-indexed metric table, one group per mask ratio.

    For each mask ratio the method label is ``"SC-<ratio>"``. Each task
    contributes one row of averaged metrics from `read_results`, and an extra
    ``(method, 'avg')`` row holds the column-wise mean of that ratio's task
    rows. The 'conversation' task is read as the 'continue_conversation'
    task. Intermediate tables are printed as they are built.

    Args:
        context_type: Metrics key forwarded to `read_results`.

    Returns:
        The concatenation of all per-ratio tables.
    """
    tasks = ['summarisation', 'qa', 'conversation']
    data_sources = ['news', 'arxiv',]
    mask_ratios = [0.2, 0.35, 0.5, 0.65, 0.8]

    all_results = []
    for ratio in mask_ratios:
        ratio_results = []
        method = f"SC-{ratio}"
        for task in tasks:
            if task == 'conversation':
                avg_results = read_results(tasks=['continue_conversation'], data_sources=['conversation'], mask_ratios=[ratio], context_type=context_type)
            else:
                avg_results = read_results(tasks=[task], data_sources=data_sources, mask_ratios=[ratio], context_type=context_type)
            # Add the index labels to the Series, then pivot it into a one-row frame.
            avg_results['Method'] = method
            avg_results['Task'] = task
            avg_r = avg_results.to_frame().T.set_index(['Method', 'Task'])
            print(ratio, '----', task)
            print(avg_r)
            ratio_results.append(avg_r)
        print(ratio, '*****')
        df = pd.concat(ratio_results)
        print(df)
        # The mean is evaluated before the new row is inserted, so it covers the task rows only.
        df.loc[(method, 'avg'), :] = df.mean()
        print(df, "^^^^^^^^^^")
        all_results.append(df)

    df = pd.concat(all_results, axis=0)
    print(df)
    return df

In [None]:
# Per-ratio, per-task metric table (Method labels "SC-<ratio>") for the 'self-info-phrase' context type.
df = main('self-info-phrase')

In [None]:
import os

In [None]:
# Save the table as CSV. NOTE(review): absolute user-specific path — will not exist on other machines.
df.to_csv('/user/HS502/yl02706/LLMs_Memorize/self-info-phrase.csv')

In [None]:
def main(context_type):
    """Build a (Method, Task)-indexed metric table labelled "Original".

    Each task contributes one row of metrics averaged by `read_results` over
    all five mask ratios at once. The 'conversation' task is read as the
    'continue_conversation' task. A final ``("Original", 'avg')`` row holds
    the column-wise mean of the task rows. Intermediate tables are printed.

    Args:
        context_type: Metrics key forwarded to `read_results`.

    Returns:
        The table with one row per task plus the 'avg' row.
    """
    tasks = ['summarisation', 'qa', 'conversation']
    data_sources = ['news', 'arxiv']
    mask_ratios = [0.2, 0.35, 0.5, 0.65, 0.8]

    all_results = []
    method = "Original"
    for task in tasks:
        if task == 'conversation':
            avg_results = read_results(tasks=['continue_conversation'], data_sources=['conversation'], mask_ratios=mask_ratios, context_type=context_type)
        else:
            avg_results = read_results(tasks=[task], data_sources=data_sources, mask_ratios=mask_ratios, context_type=context_type)
        # Add the index labels to the Series, then pivot it into a one-row frame.
        avg_results['Method'] = method
        avg_results['Task'] = task
        avg_r = avg_results.to_frame().T.set_index(['Method', 'Task'])
        print(avg_r)
        all_results.append(avg_r)
    df = pd.concat(all_results)
    print(df)
    # The mean is evaluated before the new row is inserted, so it covers the task rows only.
    df.loc[(method, 'avg'), :] = df.mean()
    print(df, "^^^^^^^^^^")

    print(df)
    return df

In [None]:
# Metric table for the 'no2-phrase' context type; this main() labels its rows with Method "Original".
df2 = main('no2-phrase')

In [None]:
# Inspect df before the row drop in the next cell.
df

In [None]:
# Remove the ('Original', 'avg') row from df if it is present. Rebinding with
# errors='ignore' keeps the cell re-runnable: the in-place form raised KeyError
# whenever the row was absent (e.g. on a second run).
# NOTE(review): the main() that builds df labels rows "SC-<ratio>", so this row
# may never exist in df — confirm which frame was meant.
df = df.drop(index=('Original', 'avg'), errors='ignore')

In [None]:
# Inspect df after the drop.
df

In [None]:
# Preview the combined table: df2 rows stacked above df rows.
pd.concat([df2, df])

In [None]:
# Per-metric difference between the ('Original', 'avg') row of df2 and the ('SC-0.2', 'avg') row of df.
df2.loc[('Original', 'avg')] - df.loc[('SC-0.2', 'avg')]

In [None]:
def _drop_leading_zero(number, digits):
    """Format `number` with `digits` decimals, omitting only a zero integer part.

    '0.123' -> '.123' and '-0.05' -> '-.05'; anything else (e.g. '1.000') is
    returned unchanged. Blindly slicing off the first character lost the sign
    of negative gaps and a digit of values >= 1.
    """
    text = f"{number:.{digits}f}"
    if text.startswith("0."):
        return text[1:]
    if text.startswith("-0."):
        return "-" + text[2:]
    return text


# Rewrite every cell of df as "<score> (<gap>)", where gap is the matching
# ('Original', task) row of df2 minus the df row. Gaps use 3 decimals for the
# bert* columns and 2 decimals otherwise.
# NOTE: df is rebound to a table of strings, so this cell cannot be re-run
# without rebuilding df first.
new_sc = []
for index, line in df.iterrows():
    task = index[1]
    origin_r = df2.loc[('Original', task)]
    gap = origin_r - line
    new_line = {}
    for key, value in line.items():
        gap_digits = 3 if 'bert' in key else 2
        new_line[key] = f"{_drop_leading_zero(value, 3)} ({_drop_leading_zero(gap[key], gap_digits)})"
    new_sc.append(pd.Series(new_line, name=index))
df = pd.concat(new_sc, axis=1).T
print(df)

In [None]:
# Rewrite every cell of df2 as a 3-decimal string without a zero integer part
# ('0.123' -> '.123'). Only a leading "0" is removed, so values >= 1 keep their
# integer digit and negative values keep their sign.
# NOTE: df2 is rebound to a table of strings, so this cell cannot be re-run
# without rebuilding df2 first.
new_sc = []
for index, line in df2.iterrows():
    new_line = {}
    for key, value in line.items():
        text = f"{value:.3f}"
        if text.startswith("0."):
            text = text[1:]
        elif text.startswith("-0."):
            text = "-" + text[2:]
        new_line[key] = text
    new_sc.append(pd.Series(new_line, name=index))
df2 = pd.concat(new_sc, axis=1).T
print(df2)

In [None]:
# Preview the formatted tables stacked together: df2 rows first, then df rows.
pd.concat([df2, df])

In [None]:
# Emit the stacked table as LaTeX source.
print(pd.concat([df2, df]).to_latex())

In [None]:
def main(context_type):
    """Build a (Method, Ratio)-indexed table of metrics averaged over all tasks.

    For each mask ratio, `read_results` averages over the summarisation, qa,
    reconstruction and continue_conversation tasks on the news and arxiv
    sources. The 'Method' index level is set to `context_type`. Each row and
    the final table are printed.

    Args:
        context_type: Metrics key forwarded to `read_results`; also used as
            the 'Method' label.

    Returns:
        The table with one row per mask ratio.
    """
    tasks = ['summarisation', 'qa', 'reconstruction', 'continue_conversation']
    data_sources = ['news', 'arxiv']
    mask_ratios = [0.2, 0.35, 0.5, 0.65, 0.8]

    all_results = []
    for mask in mask_ratios:
        avg_results = read_results(tasks=tasks, data_sources=data_sources, mask_ratios=[mask], context_type=context_type)
        # Add the index labels to the Series, then pivot it into a one-row frame.
        avg_results['Method'] = context_type
        avg_results['Ratio'] = mask
        avg_r = avg_results.to_frame().T.set_index(['Method', 'Ratio'])
        print(avg_r)
        all_results.append(avg_r)
    df = pd.concat(all_results)

    print(df)
    return df

In [None]:
# Per-ratio table (averaged over all tasks) for the 'Random-phrase' context type.
main('Random-phrase')

In [None]:
# Per-ratio tables for the two context types compared in the plot below:
# df holds 'Random-phrase', df2 holds 'self-info-phrase'.
df = main('Random-phrase')
df2 = main('self-info-phrase')

In [None]:
import matplotlib.pyplot as plt

# Move the ('Method', 'Ratio') index levels into columns so 'Ratio' can be the x axis.
df2 = df2.reset_index()
df = df.reset_index()

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4), dpi=120)

# One panel per metric: (column to plot, panel title).
panels = [('bleu', 'BLEU'), ('rouge1', 'ROUGE1'), ('bertscore_f1', 'BERTScore')]
for ax, (metric, title) in zip(axes, panels):
    df2.plot(y=metric, x='Ratio', ax=ax, marker='^', label='Selective Context')
    df.plot(y=metric, x='Ratio', ax=ax, marker='+', label='Random')
    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_xticks([0.2, 0.35, 0.5, 0.65, 0.8])

# A single x-axis caption shared by the three panels.
fig.text(0.52, -0.03, 'Context reduction ratio', ha='center', fontsize=14)
plt.tight_layout()

In [None]:
def task_wise(context_type):
    """Build a (Task, Ratio)-indexed table of averaged metrics.

    For every mask ratio and every task, `read_results` is called for that
    single task on the news and arxiv sources. Each row and the final table
    are printed.

    Args:
        context_type: Metrics key forwarded to `read_results`.

    Returns:
        The table with one row per (task, ratio) pair, ordered ratio-major.
    """
    tasks = ['summarisation', 'qa', 'reconstruction', 'continue_conversation']
    data_sources = ['news', 'arxiv']
    mask_ratios = [0.2, 0.35, 0.5, 0.65, 0.8]

    all_results = []
    for mask in mask_ratios:
        for task in tasks:
            avg_results = read_results(tasks=[task], data_sources=data_sources, mask_ratios=[mask], context_type=context_type)
            # Add the index labels to the Series, then pivot it into a one-row frame.
            avg_results['Task'] = task
            avg_results['Ratio'] = mask
            avg_r = avg_results.to_frame().T.set_index(['Task', 'Ratio'])
            print(avg_r)
            all_results.append(avg_r)

    df = pd.concat(all_results)
    print(df)
    return df

In [None]:
def data_wise(context_type):
    """Build a (Data, Ratio)-indexed table of averaged metrics.

    For every mask ratio and every data source, `read_results` averages over
    the summarisation, qa and reconstruction tasks; the 'conversation' source
    is read through the 'continue_conversation' task instead. Each row and
    the final table are printed.

    Args:
        context_type: Metrics key forwarded to `read_results`.

    Returns:
        The table with one row per (data source, ratio) pair, ordered
        ratio-major.
    """
    tasks = ['summarisation', 'qa', 'reconstruction',]
    data_sources = ['news', 'arxiv', 'conversation']
    mask_ratios = [0.2, 0.35, 0.5, 0.65, 0.8]

    all_results = []
    for mask in mask_ratios:
        for data_source in data_sources:
            if data_source == 'conversation':
                avg_results = read_results(tasks=['continue_conversation'], data_sources=[data_source], mask_ratios=[mask], context_type=context_type)
            else:
                avg_results = read_results(tasks=tasks, data_sources=[data_source], mask_ratios=[mask], context_type=context_type)
            # Add the index labels to the Series, then pivot it into a one-row frame.
            avg_results['Data'] = data_source
            avg_results['Ratio'] = mask
            avg_r = avg_results.to_frame().T.set_index(['Data', 'Ratio'])
            print(avg_r)
            all_results.append(avg_r)

    df = pd.concat(all_results)
    print(df)
    return df

In [None]:
# Per-dataset metric table for the 'self-info-phrase' context type (printed by data_wise and shown as the cell result).
data_wise('self-info-phrase')
# task_wise('self-info-phrase')

In [None]:
import matplotlib.pyplot as plt

# Per-task curves of BLEU / ROUGE1 / BERTScore against the mask ratio.
df = task_wise('self-info-phrase')
df = df.reset_index()

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4), dpi=120)

# Legend label for each raw task name.
display_names = {
    'continue_conversation': 'Conversation',
    'reconstruction': 'Reconstruction',
    'summarisation': 'Summarisation',
    'qa': 'QA',
}

markers = {
    'Conversation': '+',
    'Reconstruction': 's',
    'Summarisation': 'v',
    'QA': '^',
}

colors = {
    'Conversation': 'salmon',
    'Reconstruction': 'y',
    'Summarisation': 'grey',
    'QA': 'violet',
}

# One panel per metric: (column to plot, panel title).
panels = [('bleu', 'BLEU'), ('rouge1', 'ROUGE1'), ('bertscore_f1', 'BERTScore')]

for task_name, group in df.groupby('Task'):
    task_name = display_names.get(task_name, task_name)
    for ax, (metric, _) in zip(axes, panels):
        group.plot(y=metric, x='Ratio', ax=ax, marker=markers[task_name], label=task_name, color=colors[task_name])

for ax, (_, title) in zip(axes, panels):
    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_xticks([0.2, 0.35, 0.5, 0.65, 0.8])

# A single x-axis caption shared by the three panels.
fig.text(0.52, -0.03, 'Context reduction ratio', ha='center', fontsize=14)
plt.tight_layout()

In [None]:
import matplotlib.pyplot as plt

# Per-dataset curves of BLEU / ROUGE1 / BERTScore against the mask ratio.
df = data_wise('self-info-phrase')
df = df.reset_index()

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4), dpi=120)

# Legend label for the data sources that are renamed; 'arxiv' is shown as-is.
display_names = {'news': 'BBC', 'conversation': 'ShareGPT'}

markers = {
    'arxiv': '+',
    'BBC': 's',
    'ShareGPT': 'v',
}

colors = {
    'arxiv': 'salmon',
    'BBC': 'y',
    'ShareGPT': 'violet',
}

# One panel per metric: (column to plot, panel title).
panels = [('bleu', 'BLEU'), ('rouge1', 'ROUGE1'), ('bertscore_f1', 'BERTScore')]

for data_source, group in df.groupby('Data'):
    data_source = display_names.get(data_source, data_source)
    for ax, (metric, _) in zip(axes, panels):
        group.plot(y=metric, x='Ratio', ax=ax, marker=markers[data_source], label=data_source, color=colors[data_source])

for ax, (_, title) in zip(axes, panels):
    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_xticks([0.2, 0.35, 0.5, 0.65, 0.8])

# A single x-axis caption shared by the three panels.
fig.text(0.52, -0.03, 'Context reduction ratio', ha='center', fontsize=14)
plt.tight_layout()

In [None]:
# Load a pickled context manager into `data`.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
# NOTE(review): this reads the *news* manager, yet later cells iterate
# `article.context` as (role, utterance) pairs and save `data` under the
# conversation path — confirm this is the intended input file.
with open('/vol/research/lyc/llm_memorize/news/NewsContextManager_sent.pkl', 'rb') as f:
    data = pickle.load(f)

In [None]:
# Inspect units[1] of the first article (read_lexical_units asserts this slot is the 'phrase' level).
print(data.articles[0].units[1])

In [None]:
import pickle

In [None]:
# Attach a chat-style prompt to every article: each (role, utterance) pair
# except the last becomes a "<role>: <utterance>" line, followed by a trailing
# 'gpt: ' cue. The article objects are modified in place and also collected
# in `articles`.
articles = []
for article in data.articles:
    turns = [f"{role}: {utterance}\n" for role, utterance in article.context[:-1]]
    article.prompt = ''.join(turns) + 'gpt: '
    articles.append(article)

In [None]:
# Text entries of units[1] for the first article.
articles[0].units[1].text

In [None]:
# Self-information scores stored alongside those text entries.
articles[0].units[1].self_info

In [None]:
# Store the collected list (the same article objects, now carrying `prompt`) back on the manager.
data.articles = articles

In [None]:
# Spot-check the prompt generated for the third article.
print(data.articles[2].prompt)

In [None]:
# Persist `data` (articles now carry `prompt`) as the conversation context manager.
# NOTE(review): this overwrites the file at this path; `data` was loaded from the
# news pickle in the visible cells — confirm that is what should be written here.
with open('/vol/research/lyc/llm_memorize/conversation/ConversationContextManager_sent.pkl', 'wb') as f:
    pickle.dump(data, f)

In [None]:
from transformers import GPT2Tokenizer

In [None]:
# Load the pretrained 'gpt2' tokenizer. NOTE(review): `tokenizer` is not referenced in the cells visible here.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
import numpy as np

# Length of the `text` container at each of the three unit slots, per article.
# read_lexical_units asserts that slots 0 / 1 / 2 are the 'sent' / 'phrase' /
# 'token' levels.
n_sent = [len(article.units[0].text) for article in data.articles]
n_phrase = [len(article.units[1].text) for article in data.articles]
n_token = [len(article.units[2].text) for article in data.articles]

# Mean count per article at each level.
print(np.mean(n_sent), np.mean(n_phrase), np.mean(n_token))

In [None]:
# Text entries of units[1] for the third article.
data.articles[2].units[1].text

In [None]:
def read_lexical_units(article, mask_level = 'phrase'):
    r"""Render one article's lexical units as LaTeX ``\colorize`` lines.

    Every unit with a score other than 100 is emitted as
    ``\colorize{<normalized score>}{<text>}``, where the score is rescaled to
    0-100 between the smallest score and the largest score that is not 100.
    Units whose score is not above the median of all scores are additionally
    wrapped in ``\sdelete{...}``. Units whose score equals 100 are emitted
    verbatim and always kept.

    Args:
        article: Object whose ``units[0..2]`` expose ``unit_type``, ``text``
            and ``self_info`` for the 'sent', 'phrase' and 'token' levels.
        mask_level: One of ``'sent'``, ``'phrase'``, ``'token'``.

    Returns:
        Two newline-joined listings separated by three newlines: all units,
        then only the kept (non-deleted) ones.

    Raises:
        ValueError: If ``mask_level`` is not a supported level (previously an
            UnboundLocalError).
        AssertionError: If the selected unit group reports a different
            ``unit_type``.
    """
    level_index = {'sent': 0, 'phrase': 1, 'token': 2}
    if mask_level not in level_index:
        raise ValueError(
            f"mask_level must be one of {sorted(level_index)}, got {mask_level!r}"
        )
    lexical_units = article.units[level_index[mask_level]]
    assert lexical_units.unit_type == mask_level

    tokens = lexical_units.text
    self_info = lexical_units.self_info
    # Scores equal to 100 are excluded from the upper bound of the colour scale.
    scored = [s for s in self_info if s != 100]

    if scored:
        max_score = max(scored)
        min_score = min(self_info)
        mid = np.percentile(self_info, 50)
        span = max_score - min_score
    else:
        # Nothing to normalise: every unit takes the verbatim branch below.
        min_score = span = mid = 0

    lines = []
    highlighted = []
    for token, score in zip(tokens, self_info):
        if score == 100:
            lines.append(token)
            highlighted.append(token)
            continue
        # A zero span (all scores equal) would divide by zero; use 0.0 then.
        normalized_score = ((score - min_score) / span) * 100 if span else 0.0
        if score > mid:
            line = f"\\colorize{{{normalized_score}}}{{{token}}}"
            highlighted.append(line)
        else:
            struck = f"\\sdelete{{{token}}}"
            line = f"\\colorize{{{normalized_score}}}{{{struck}}}"
        lines.append(line)

    return '\n'.join(lines) + '\n\n\n' + '\n'.join(highlighted)

In [None]:
# Print the LaTeX rendering of the third article at the phrase level.
print(read_lexical_units(data.articles[2], mask_level = 'phrase'))