In [None]:
import os
import json
import datetime
import pandas as pd
import numpy as np
from summary.doc_summary import DocSummary
from summary.doc_summary import summaryMetrics
from summary.doc_summary import NLGMetrics

In [None]:
# Load ./data/bill_sum_data.csv into df_data, then display the frame's
# (rows, columns) shape together with its column labels.
df_data = pd.read_csv("./data/bill_sum_data.csv")
df_data.shape,df_data.columns


In [None]:
# Token-count cut-offs read by get_summary() to choose a summarisation strategy.
# They must be defined (not commented out): get_summary() looks them up as
# module-level names and fails with NameError otherwise.
threshold_small_medium = 1500   # fewer tokens than this -> summary_short()
threshold_medium_long = 3000    # fewer tokens than this -> summary_medium()
threshold_length_limit = 4500   # fewer tokens than this -> summary_long(); otherwise not processed

In [None]:
# Keep only the long documents: rows whose text_len is at least 10000.
# .copy() detaches the selection so later column assignments do not touch df_data.
is_long_doc = df_data["text_len"] >= 10000
df_long = df_data.loc[is_long_doc].copy()
df_long.shape


In [None]:
# Remove the "Unnamed: 0" column (presumably an index column written by an
# earlier to_csv -- confirm against the CSV). `columns=` already names the axis,
# so no `axis` argument is needed; errors="ignore" keeps the cell safe to re-run
# after the column has already been dropped.
df_long = df_long.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Renumber the rows 0..n-1 after filtering (the old index is discarded).
df_long = df_long.reset_index(drop=True)
# Preview the first rows only, as the other cells do with head(), instead of
# rendering every long document in the output.
df_long.head()

In [None]:
def get_tokens(text):
    """Return the number of tokens in ``text``.

    A DocSummary is built for the text and the count is taken from its
    ``llm.get_num_tokens`` method. A new DocSummary is created on every call.
    """
    summarizer = DocSummary(text=text)
    token_count = summarizer.llm.get_num_tokens(text)
    return token_count

In [None]:
# Token count of every document; get_tokens is handed to apply directly.
df_long["num_tokens"] = df_long["text"].apply(get_tokens)


In [None]:
def get_summary_long(text):
    """Summarise ``text`` three ways and return the results as one flat tuple.

    The runs, in order, are ``summary_long()`` with its default clustering,
    ``summary_long(clustering_type="agglomerative")`` and ``summary_medium()``.
    Elements 0 and 1 of each run's result are kept, giving a 6-tuple laid out
    as (run1[0], run1[1], run2[0], run2[1], run3[0], run3[1]). Callers store
    these as a summary column followed by a *_resp_time column per run.
    """
    summarizer = DocSummary(text=text)
    runs = (
        summarizer.summary_long(),
        summarizer.summary_long(clustering_type="agglomerative"),
        summarizer.summary_medium(),
    )
    # Flatten the first two elements of every run, preserving run order.
    return tuple(value for run in runs for value in (run[0], run[1]))
    

In [None]:
# Smoke-test the three summarisers on the first document before the full run.
first_label = df_long.index[0]
text = df_long.loc[first_label, 'text']
results = get_summary_long(text)

In [None]:
# Show the six values get_summary_long returned for the first document.
results

In [None]:
# Num tokens > 2000, so the long-document summarisation paths are used here.
# TODO: reduce the chunk sizes in the DocSummary class.
# TODO: experiment with the number of clusters.
# get_summary_long returns six values per row; pd.Series spreads them over the
# six new columns below, in order.
long_summary_columns = [
    'summary_kmeans', 'kmeans_resp_time',
    'summary_agglomerative', 'agglomerative_resp_time',
    'summary_mapreduce', 'mapreduce_resp_time',
]
df_long[long_summary_columns] = df_long.apply(
    lambda row: pd.Series(get_summary_long(row['text'])),
    axis=1,
)





In [None]:
# Save the long documents together with their generated summaries and timings.
df_long.to_csv("./data/long_doc_output.csv")

Metrics

In [None]:
def get_metrics(candidate,reference):
    """Score a candidate summary against a reference summary.

    Builds a summaryMetrics object for the pair and returns a flat 12-tuple:
    the nine values from ``get_rouge_score()`` (ROUGE-1, ROUGE-2 and ROUGE-L,
    each as precision, recall, F) followed by the three values from
    ``get_bert_score()`` (precision, recall, F).
    """
    scorer = summaryMetrics(summary_text=candidate, reference_text=reference)
    (
        r1_p, r1_r, r1_f,
        r2_p, r2_r, r2_f,
        rl_p, rl_r, rl_f,
    ) = scorer.get_rouge_score()
    b_p, b_r, b_f = scorer.get_bert_score()
    rouge_part = (r1_p, r1_r, r1_f, r2_p, r2_r, r2_f, rl_p, rl_r, rl_f)
    bert_part = (b_p, b_r, b_f)
    return rouge_part + bert_part

In [None]:
# ROUGE and BERT scores of the k-means summaries against the reference `summary`.
# NOTE(review): 'k_means_rouge_1_f' and 'kmeans_rouge_3_f' break the naming
# pattern (the latter receives get_metrics' ROUGE-2 F value). They are kept
# unchanged because the ROUGE plotting cell selects these exact labels.
kmeans_metric_columns = [
    'kmeans_rouge_1_p', 'kmeans_rouge_1_r', 'k_means_rouge_1_f',
    'kmeans_rouge_2_p', 'kmeans_rouge_2_r', 'kmeans_rouge_3_f',
    'kmeans_rouge_l_p', 'kmeans_rouge_l_r', 'kmeans_rouge_l_f',
    'kmeans_bert_p', 'kmeans_bert_r', 'kmeans_bert_f',
]
df_long[kmeans_metric_columns] = df_long.apply(
    lambda row: pd.Series(get_metrics(row['summary_kmeans'], row['summary'])),
    axis=1,
)

In [None]:
# ROUGE and BERT scores of the agglomerative summaries against the reference `summary`.
# NOTE(review): 'agglomerative_rouge_3_f' receives get_metrics' ROUGE-2 F value;
# the label is kept because the ROUGE plotting cell selects it by this name.
agglomerative_metric_columns = [
    'agglomerative_rouge_1_p', 'agglomerative_rouge_1_r', 'agglomerative_rouge_1_f',
    'agglomerative_rouge_2_p', 'agglomerative_rouge_2_r', 'agglomerative_rouge_3_f',
    'agglomerative_rouge_l_p', 'agglomerative_rouge_l_r', 'agglomerative_rouge_l_f',
    'agglomerative_bert_p', 'agglomerative_bert_r', 'agglomerative_bert_f',
]
df_long[agglomerative_metric_columns] = df_long.apply(
    lambda row: pd.Series(get_metrics(row['summary_agglomerative'], row['summary'])),
    axis=1,
)

In [None]:
# ROUGE and BERT scores of the map-reduce summaries against the reference `summary`.
# The ROUGE-2 precision label is spelled 'mapreduce_rouge_2_p' (it was previously
# misspelled 'mpreduce_rouge_2_p'), matching every other mapreduce_* column.
# NOTE(review): 'mapreduce_rouge_3_f' receives get_metrics' ROUGE-2 F value; the
# label is kept because the ROUGE plotting cell selects it by this name.
mapreduce_metric_columns = [
    'mapreduce_rouge_1_p', 'mapreduce_rouge_1_r', 'mapreduce_rouge_1_f',
    'mapreduce_rouge_2_p', 'mapreduce_rouge_2_r', 'mapreduce_rouge_3_f',
    'mapreduce_rouge_l_p', 'mapreduce_rouge_l_r', 'mapreduce_rouge_l_f',
    'mapreduce_bert_p', 'mapreduce_bert_r', 'mapreduce_bert_f',
]
df_long[mapreduce_metric_columns] = df_long.apply(
    lambda row: pd.Series(get_metrics(row['summary_mapreduce'], row['summary'])),
    axis=1,
)

In [None]:
# Check the frame's (rows, columns) shape after the metric columns were added.
df_long.shape

In [None]:
# Save the long documents with their summaries, timings and metric columns.
df_long.to_csv("./data/long_doc_metrics.csv")

In [None]:
# List the column labels now present on df_long (used by the plots below).
df_long.columns

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# 2x2 grid of bar charts. Each panel shows the column means of df_long for the
# three summarisation strategies (k-means, agglomerative, map-reduce):
# BERT precision, BERT recall, BERT F, and response time.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))

# One entry per panel: (columns to average, panel title, y-axis label).
panels = [
    (["kmeans_bert_p", "agglomerative_bert_p", "mapreduce_bert_p"],
     'Bert P Scores', 'Scores'),
    (["kmeans_bert_r", "agglomerative_bert_r", "mapreduce_bert_r"],
     'Bert R scores', 'Scores'),
    (["kmeans_bert_f", "agglomerative_bert_f", "mapreduce_bert_f"],
     'Bert F scores', 'Scores'),
    (["kmeans_resp_time", "agglomerative_resp_time", "mapreduce_resp_time"],
     'Response Time', 'Response time(sec)'),
]

# axes.flat walks the grid row by row: [0][0], [0][1], [1][0], [1][1].
for ax, (selected_cols, title, ylabel) in zip(axes.flat, panels):
    df_long[selected_cols].mean().plot(kind='bar', ax=ax, edgecolor='black')
    ax.set_title(title)
    ax.set_ylabel(ylabel)

# Adjust spacing so titles and tick labels do not overlap, then render.
plt.tight_layout()
plt.show()


In [None]:
# ROUGE F-score plots: one row of three bar charts (ROUGE-1, ROUGE-2, ROUGE-L),
# each showing the column means of df_long for the three summarisation strategies.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(8, 6))

# One entry per panel: (columns to average, panel title).
# The *_rouge_3_f columns are the ones filled with get_metrics' ROUGE-2 F value,
# hence the 'Rouge 2 F scores' title.
panels = [
    (["k_means_rouge_1_f", "agglomerative_rouge_1_f", "mapreduce_rouge_1_f"],
     'Rouge 1 F Scores'),
    (["kmeans_rouge_3_f", "agglomerative_rouge_3_f", "mapreduce_rouge_3_f"],
     'Rouge 2 F scores'),
    (["kmeans_rouge_l_f", "agglomerative_rouge_l_f", "mapreduce_rouge_l_f"],
     'Rouge l F scores'),
]

for ax, (selected_cols, title) in zip(axes, panels):
    df_long[selected_cols].mean().plot(kind='bar', ax=ax, edgecolor='black')
    ax.set_title(title)
    ax.set_ylabel('Scores')

# Adjust spacing so titles and tick labels do not overlap, then render.
plt.tight_layout()
plt.show()


In [None]:
def get_summary(text):
    """Summarise ``text`` with the DocSummary strategy matching its token count.

    Reads the module-level names ``threshold_small_medium``,
    ``threshold_medium_long`` and ``threshold_length_limit``; they must be
    defined before this function is called.

    Returns the result of ``summary_short()``, ``summary_medium()`` or
    ``summary_long()`` -- whichever threshold the token count first falls
    below -- or ``("too long to process", "")`` when the count is not below
    ``threshold_length_limit``.
    """
    summarizer = DocSummary(config_file="./config/config.json", text=text)
    token_count = summarizer.llm.get_num_tokens(text)

    # Guard clauses, checked from the smallest threshold upwards.
    if token_count < threshold_small_medium:
        return summarizer.summary_short()
    if token_count < threshold_medium_long:
        return summarizer.summary_medium()
    if token_count < threshold_length_limit:
        return summarizer.summary_long()
    return "too long to process", ""

In [None]:
# Summarise every document in df_data; get_summary returns two values per row,
# which pd.Series spreads over the two new columns.
aoai_columns = ['aoai_summary', 'response_time']
df_data[aoai_columns] = df_data.apply(
    lambda row: pd.Series(get_summary(row['text'])),
    axis=1,
)

In [None]:
# Preview the first rows, including the new aoai_summary / response_time columns.
df_data.head()

In [None]:
# Persist the per-document summaries to "./data/output_data.csv", the file the
# evaluation section reloads. Writing to "./data/long_doc_output.csv" here
# would overwrite the long-document results saved earlier from df_long.
df_data.to_csv("./data/output_data.csv")

Summary Evaluation - Rouge & Bert scores

In [None]:
# Reload the saved summaries from ./data/output_data.csv (this rebinds df_data)
# and preview the first rows.
df_data = pd.read_csv("./data/output_data.csv")
df_data.head()

In [None]:
# ROUGE and BERT scores of the aoai_summary column against the reference `summary`.
# NOTE(review): 'rouge_3_f' receives get_metrics' ROUGE-2 F value; the label is
# left unchanged here in case later cells select it by this name -- confirm.
aoai_metric_columns = [
    'rouge_1_p', 'rouge_1_r', 'rouge_1_f',
    'rouge_2_p', 'rouge_2_r', 'rouge_3_f',
    'rouge_l_p', 'rouge_l_r', 'rouge_l_f',
    'bert_p', 'bert_r', 'bert_f',
]
df_data[aoai_metric_columns] = df_data.apply(
    lambda row: pd.Series(get_metrics(row['aoai_summary'], row['summary'])),
    axis=1,
)