In [None]:
import logging
import os
import json
import numpy as np
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# from langchain.chat_models import AzureChatOpenAI
# from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.schema import HumanMessage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.output_parsers import OutputFixingParser,StructuredOutputParser,ResponseSchema
import datetime
import pandas as pd



In [None]:
from summary.doc_summary import DocSummary
from summary.doc_summary import summaryMetrics
from summary.doc_summary import NLGMetrics

Scientific papers

In [None]:
# Load the scientific-papers table from the local CSV snapshot
# (the Appendix cells at the end of this notebook write this file).
import pandas as pd

df_papers = pd.read_csv("./data/scientific-papers.csv")

In [None]:
# Inspect the size and the column names of the loaded papers table.
df_papers.shape, df_papers.columns

In [None]:
# Keep only the papers whose token count lies in the half-open range [20000, 30000).
token_counts = df_papers["num_tokens"]
df_selected = df_papers[(token_counts >= 20000) & (token_counts < 30000)]

In [None]:
# Check how many papers survived the token-count filter.
df_selected.shape,df_selected.columns

In [None]:
import time
def get_summary_kmeans(text):
    """Summarize ``text`` with ``DocSummary.summary_long`` using its default settings.

    Returns whatever ``summary_long()`` returns. Elsewhere in this notebook the
    result is indexed as ``[0]`` (summary) and ``[1]`` (response time).
    NOTE(review): the default clustering is presumably k-means; confirm in DocSummary.
    """
    return DocSummary(text=text).summary_long()

def get_summary_agglo(text):
    """Summarize ``text`` with ``DocSummary.summary_long`` using agglomerative clustering.

    Returns whatever ``summary_long(clustering_type="agglomerative")`` returns.
    Elsewhere in this notebook the result is indexed as ``[0]`` (summary) and
    ``[1]`` (response time).
    """
    return DocSummary(text=text).summary_long(clustering_type="agglomerative")

def get_summary_mapreduce(text):
    """Summarize ``text`` with ``DocSummary.summary_medium``.

    Returns whatever ``summary_medium()`` returns. Elsewhere in this notebook the
    result is indexed as ``[0]`` (summary) and ``[1]`` (response time).
    NOTE(review): presumably the map-reduce strategy; confirm in DocSummary.
    """
    return DocSummary(text=text).summary_medium()
   


In [None]:
# Accumulates one result dict per processed paper; re-running this cell empties it.
summary_list = []

In [None]:
def process_results(summary_kmeans,summary_agglomerative,summary_mapreduce):
    """Flatten the three per-method results into a single record.

    Each argument is an indexable result whose element ``[0]`` is stored as the
    method's summary and whose element ``[1]`` is stored as its response time.

    Returns:
        dict with the keys ``sum_<method>`` and ``resp_time_<method>`` for
        ``kmeans``, ``agglomerative`` and ``mapreduce``, in that order.
    """
    return {
        "sum_kmeans": summary_kmeans[0],
        "resp_time_kmeans": summary_kmeans[1],
        "sum_agglomerative": summary_agglomerative[0],
        "resp_time_agglomerative": summary_agglomerative[1],
        "sum_mapreduce": summary_mapreduce[0],
        "resp_time_mapreduce": summary_mapreduce[1],
    }

In [None]:
# Choose up to 10 rows of df_selected at random; the values are positional
# indices that later cells use with df_selected.iloc.
import random

SAMPLE_SIZE = 10
SAMPLE_SEED = 42  # fixed seed so a fresh "Restart & Run All" picks the same papers

# .iloc positions run from 0 to n_rows - 1, so position 0 must be eligible too.
# The sample size is capped so that a selection with fewer than SAMPLE_SIZE rows
# yields a smaller sample instead of raising ValueError.
n_selected_rows = df_selected.shape[0]
random_numbers = random.Random(SAMPLE_SEED).sample(
    range(n_selected_rows), min(SAMPLE_SIZE, n_selected_rows)
)

print(random_numbers)


In [None]:
# Summarize each sampled paper with all three strategies and collect the results.
# NOTE(review): only random_numbers[6:] is processed (the first six sampled
# positions are skipped) -- confirm this is intended and not a leftover from
# resuming an interrupted run.
for n in random_numbers[6:]:
    print(n)
    paper = df_selected.iloc[n]
    text = paper["article"]
    num_tokens = paper["num_tokens"]
    print(num_tokens)

    # Each helper's result is indexed as [0] summary / [1] response time by process_results.
    summary_kmeans = get_summary_kmeans(text)
    summary_agglomerative = get_summary_agglo(text)
    summary_mapreduce = get_summary_mapreduce(text)

    summary_dict = process_results(summary_kmeans,summary_agglomerative,summary_mapreduce)
    # Attach the source fields so the metrics section can work from the saved CSV alone.
    summary_dict.update({
        "article": text,
        "abstract": paper["abstract"],
        "section_names": paper["section_names"],
        "num_tokens": num_tokens,
    })
    summary_list.append(summary_dict)



In [None]:
# Build one row per processed paper from the accumulated result dicts.
df = pd.DataFrame(summary_list)

In [None]:
# Persist the generated summaries; the "Get metrics" section reloads this file.
df.to_csv("./data/paper_summary_gpt35_201k.csv",index=False)

Get metrics

In [None]:
# Reload the saved summaries so the metrics can be computed without regenerating them.
df = pd.read_csv("./data/paper_summary_gpt35_201k.csv")

In [None]:
# Work on a copy without the "article" and "section_names" columns (the metric
# cells below read only the summaries and the abstract), then show what is left.
df_data = df.drop(columns=["article","section_names"])
df_data.shape,df_data.columns

In [None]:
# Calculate ROUGE and BERTScore metrics
def get_metrics(candidate,reference):
    """Score ``candidate`` against ``reference`` with ``summaryMetrics``.

    Returns:
        A 12-tuple in this order: ROUGE-1 (p, r, f), ROUGE-2 (p, r, f),
        ROUGE-L (p, r, f), BERTScore (p, r, f).
    """
    scorer = summaryMetrics(summary_text=candidate,reference_text=reference)
    # get_rouge_score() must yield exactly nine values; unpacking enforces that.
    (r1_p, r1_r, r1_f,
     r2_p, r2_r, r2_f,
     rl_p, rl_r, rl_f) = scorer.get_rouge_score()
    b_p, b_r, b_f = scorer.get_bert_score()
    return (
        r1_p, r1_r, r1_f,
        r2_p, r2_r, r2_f,
        rl_p, rl_r, rl_f,
        b_p, b_r, b_f,
    )

In [None]:
# ROUGE-1/2/L and BERTScore (precision, recall, F) of the k-means summary
# against the abstract, one column per value returned by get_metrics.
# Names follow a uniform kmeans_<metric>_<p|r|f> pattern: the third value is
# stored as kmeans_rouge_1_f (not k_means_...), and the sixth value, which is
# the ROUGE-2 F score, is stored as kmeans_rouge_2_f (not ..._rouge_3_f).
kmeans_metric_cols = [
    'kmeans_rouge_1_p', 'kmeans_rouge_1_r', 'kmeans_rouge_1_f',
    'kmeans_rouge_2_p', 'kmeans_rouge_2_r', 'kmeans_rouge_2_f',
    'kmeans_rouge_l_p', 'kmeans_rouge_l_r', 'kmeans_rouge_l_f',
    'kmeans_bert_p', 'kmeans_bert_r', 'kmeans_bert_f',
]
df_data[kmeans_metric_cols] = df_data.apply(
    lambda row: pd.Series(get_metrics(row['sum_kmeans'], row['abstract'])),
    axis=1,
)

In [None]:
# Confirm that the new metric columns were added.
df_data.columns

In [None]:
# ROUGE-1/2/L and BERTScore (precision, recall, F) of the agglomerative summary
# against the abstract, one column per value returned by get_metrics.
# The sixth value is the ROUGE-2 F score, so it is stored as
# agglomerative_rouge_2_f (not ..._rouge_3_f).
agglomerative_metric_cols = [
    'agglomerative_rouge_1_p', 'agglomerative_rouge_1_r', 'agglomerative_rouge_1_f',
    'agglomerative_rouge_2_p', 'agglomerative_rouge_2_r', 'agglomerative_rouge_2_f',
    'agglomerative_rouge_l_p', 'agglomerative_rouge_l_r', 'agglomerative_rouge_l_f',
    'agglomerative_bert_p', 'agglomerative_bert_r', 'agglomerative_bert_f',
]
df_data[agglomerative_metric_cols] = df_data.apply(
    lambda row: pd.Series(get_metrics(row['sum_agglomerative'], row['abstract'])),
    axis=1,
)

In [None]:
# ROUGE-1/2/L and BERTScore (precision, recall, F) of the map-reduce summary
# against the abstract, one column per value returned by get_metrics.
# The sixth value is the ROUGE-2 F score, so it is stored as
# mapreduce_rouge_2_f (not ..._rouge_3_f).
mapreduce_metric_cols = [
    'mapreduce_rouge_1_p', 'mapreduce_rouge_1_r', 'mapreduce_rouge_1_f',
    'mapreduce_rouge_2_p', 'mapreduce_rouge_2_r', 'mapreduce_rouge_2_f',
    'mapreduce_rouge_l_p', 'mapreduce_rouge_l_r', 'mapreduce_rouge_l_f',
    'mapreduce_bert_p', 'mapreduce_bert_r', 'mapreduce_bert_f',
]


def _mapreduce_metrics(row):
    """Score one row's map-reduce summary, or return all-NaN when it has none."""
    summary = row['sum_mapreduce']
    # After the CSV round-trip an empty summary is read back as NaN rather than
    # '', so require a non-empty string instead of only comparing against ''.
    if isinstance(summary, str) and summary != '':
        return pd.Series(get_metrics(summary, row['abstract']))
    # A fixed-length NaN row keeps the result rectangular even if every summary
    # is missing; the dropna cell further down removes such rows.
    return pd.Series([float("nan")] * len(mapreduce_metric_cols))


df_data[mapreduce_metric_cols] = df_data.apply(_mapreduce_metrics, axis=1)

In [None]:
# Remove every row that has a missing value in any column; this modifies
# df_data in place. Then show the remaining shape.
df_data.dropna(inplace=True)
df_data.shape

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Side-by-side bar charts of the mean BERTScore precision (left) and recall
# (right) for the three summarization methods.
selected_cols_p = ["kmeans_bert_p", "agglomerative_bert_p", "mapreduce_bert_p"]
selected_cols_r = ["kmeans_bert_r", "agglomerative_bert_r", "mapreduce_bert_r"]

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# (axis, columns to average, panel title, y-axis limits) for each panel.
bert_panels = [
    (axes[0], selected_cols_p, 'Bert P Scores', (0.82, 0.86)),
    (axes[1], selected_cols_r, 'Bert R Scores', (0.75, 0.82)),
]
for panel_ax, panel_cols, panel_title, (y_low, y_high) in bert_panels:
    df_data[panel_cols].mean().plot(kind='bar', ax=panel_ax, edgecolor='black')
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('Scores')
    # The y-range is zoomed in; bars outside these limits are clipped.
    panel_ax.set_ylim(y_low, y_high)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Single-panel scatter plot of response time against document length,
# with one series per summarization method.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(5,5))

selected_cols = ["resp_time_kmeans","resp_time_agglomerative","resp_time_mapreduce"]
series_labels = ["response_time_kmeans", "response_time_agglomerative", "response_time_mapreduce"]

for time_col, series_label in zip(selected_cols, series_labels):
    axes.scatter(df_data['num_tokens'], df_data[time_col], label=series_label)

axes.set_xlabel('num tokens')
axes.set_ylabel('response time')
axes.set_title('Response time')
axes.legend()
# Fixed y-range; points above 180 are not shown.
axes.set_ylim(0, 180)
axes.grid(True)

plt.tight_layout()
plt.show()



NLG Metrics

In [None]:
def nlg_metrics(doc,summary):
    """Score ``summary`` against ``doc`` with ``NLGMetrics``.

    Returns:
        A 4-tuple ``(coherence, consistency, fluency, relevance)`` read from the
        mapping returned by ``NLGMetrics(doc, summary).get_nlg_metrics()``.
    """
    scores = NLGMetrics(doc,summary).get_nlg_metrics()
    return tuple(
        scores[dimension]
        for dimension in ('coherence', 'consistency', 'fluency', 'relevance')
    )

Note: The full articles exceed the permissible token limit for the NLG metrics, so the abstract is used as the source document instead of the original article.

In [None]:
# NLG scores (coherence, consistency, fluency, relevance) of the k-means summary,
# evaluated against the abstract.
# All four columns use the same kmeans_ prefix (previously 'k_means_coherence'
# and 'k_meansrelevance' did not match 'kmeans_consistency' / 'kmeans_fluency').
kmeans_nlg_cols = [
    'kmeans_coherence', 'kmeans_consistency', 'kmeans_fluency', 'kmeans_relevance',
]
df_data[kmeans_nlg_cols] = df_data.apply(
    lambda row: pd.Series(nlg_metrics(row['abstract'], row['sum_kmeans'])),
    axis=1,
)

In [None]:
# NLG scores (coherence, consistency, fluency, relevance) of the agglomerative
# summary, evaluated against the abstract.
agglomerative_nlg_cols = [
    'agglomerative_coherence',
    'agglomerative_consistency',
    'agglomerative_fluency',
    'agglomerative_relevance',
]
df_data[agglomerative_nlg_cols] = df_data.apply(
    lambda row: pd.Series(nlg_metrics(row['abstract'], row['sum_agglomerative'])),
    axis=1,
)

In [None]:
# NLG scores (coherence, consistency, fluency, relevance) of the map-reduce
# summary, evaluated against the abstract.
mapreduce_nlg_cols = [
    'mapreduce_coherence',
    'mapreduce_consistency',
    'mapreduce_fluency',
    'mapreduce_relevance',
]
df_data[mapreduce_nlg_cols] = df_data.apply(
    lambda row: pd.Series(nlg_metrics(row['abstract'], row['sum_mapreduce'])),
    axis=1,
)

Appendix

In [None]:
# Install the `datasets` package into the kernel's environment; only the
# Appendix cells below import it. NOTE(review): the version is not pinned.
%pip install datasets

In [None]:
from datasets import load_dataset

# Print all the available datasets
from huggingface_hub import list_datasets


In [None]:
# Load the "train" split of the 'arxiv' configuration of the 'scientific_papers' dataset.
papers_ds = load_dataset('scientific_papers','arxiv',split="train")

In [None]:
# Annotate every paper record with the token count of its article text.
papers_list = []

for paper in papers_ds:
    # NOTE(review): `llm` is not defined in any visible cell of this notebook;
    # it must already exist in the kernel (an object exposing get_num_tokens)
    # or this loop raises NameError on a fresh run.
    paper["num_tokens"] = llm.get_num_tokens(paper["article"])
    papers_list.append(paper)

In [None]:
# Turn the annotated records into a table and check its size and columns.
df_raw = pd.DataFrame(papers_list)
df_raw.shape,df_raw.columns

In [None]:
# Save the table as the CSV snapshot that the "Scientific papers" section reads.
df_raw.to_csv("./data/scientific-papers.csv",index=False)