In [None]:
import logging
import os
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# from langchain.chat_models import AzureChatOpenAI
# from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.schema import HumanMessage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.output_parsers import OutputFixingParser,StructuredOutputParser,ResponseSchema
import datetime



In [None]:
from summary.doc_summary import DocSummary
from summary.doc_summary import summaryMetrics
from summary.doc_summary import NLGMetrics

Scientific papers

In [None]:
# Load the scientific-papers data from the local CSV export
# (the Appendix cells at the end of this notebook write this file).
import pandas as pd

df_papers = pd.read_csv("./data/scientific-papers.csv")

In [None]:
# Sanity check: (rows, columns) and the column names of the loaded frame.
df_papers.shape, df_papers.columns

# Medium Documents (tokens between 3.5K and 20K)

In [None]:
# Keep only "medium" papers: at least 3500 tokens and fewer than 20000.
token_counts = df_papers["num_tokens"]
is_medium = (token_counts >= 3500) & (token_counts < 20000)
df_selected = df_papers[is_medium]

In [None]:
# Sanity check: how many papers survived the token-count filter, and their columns.
df_selected.shape,df_selected.columns

In [None]:
import time
def get_summary_medium(text):
    """Summarize ``text`` with DocSummary's medium-document routine.

    Args:
        text: The document text handed to ``DocSummary(text=...)``.

    Returns:
        Whatever ``DocSummary.summary_medium()`` returns. ``process_results``
        in this notebook indexes that value at ``[0]`` and ``[1]``.
    """
    return DocSummary(text=text).summary_medium()

   


In [None]:
# Accumulator for the per-paper result dicts appended by the summarization loop.
# Re-run this cell before re-running that loop, otherwise records are duplicated.
summary_list = []

In [None]:
def process_results(summary_medium):
    """Turn the indexable result of a summarization call into a labelled dict.

    Args:
        summary_medium: An indexable value with at least two elements.
            Element 0 is stored under ``"summary_medium"`` and element 1
            under ``"response_time_summary_medium"``.

    Returns:
        dict: A new dictionary holding exactly those two keys.
    """
    return {
        "summary_medium": summary_medium[0],
        "response_time_summary_medium": summary_medium[1],
    }

In [None]:
# Choose data points randomly
import random

# Draw up to SAMPLE_SIZE distinct row positions from df_selected.
# Positions start at 0 so the first row is eligible for .iloc[] lookup, and the
# sample size is capped at the number of rows because random.sample raises
# ValueError when asked for more items than the population holds.
SAMPLE_SIZE = 50
n_rows = df_selected.shape[0]
random_numbers = random.sample(range(n_rows), min(SAMPLE_SIZE, n_rows))

print(random_numbers)


In [None]:
# Summarize every sampled paper and collect one result record per paper.
for n in random_numbers:
    print(n)
    # Look the row up once instead of re-indexing df_selected for each field.
    row = df_selected.iloc[n]
    print(row["num_tokens"])
    summary_medium = get_summary_medium(row["article"])
    summary_dict = process_results(summary_medium)
    # Carry the source fields along so the metrics cells can compare against them.
    for column in ("article", "abstract", "section_names", "num_tokens"):
        summary_dict[column] = row[column]
    summary_list.append(summary_dict)



In [None]:
# Build one frame from the collected result dicts (one row per summarized paper).
df = pd.DataFrame(summary_list)

In [None]:
# Sanity check: (rows, columns) of the results frame.
df.shape

In [None]:
# Persist the summaries; the "Get metrics" section reloads this same file.
df.to_csv("./data/paper_summary_medium_gpt354k.csv",index=False)

Get metrics

In [None]:
# Reload the saved summaries so the metrics cells can run without re-summarizing.
df_data= pd.read_csv("./data/paper_summary_medium_gpt354k.csv")
df_data.shape,df_data.columns

In [None]:
# Column names of the reloaded frame (already shown by the previous cell).
df_data.columns

In [None]:
# calculate metrics
def get_metrics(candidate,reference):
    """Score a candidate summary against a reference text.

    Args:
        candidate: Passed to ``summaryMetrics`` as ``summary_text``.
        reference: Passed to ``summaryMetrics`` as ``reference_text``.

    Returns:
        tuple: Twelve values: the nine values unpacked from
        ``get_rouge_score()`` followed by the three values unpacked from
        ``get_bert_score()``, in the order those methods return them.
        (Judging by the local names these are p/r/f for ROUGE-1, ROUGE-2,
        ROUGE-L and BERT; confirm against ``summaryMetrics``.)
    """
    scorer = summaryMetrics(summary_text=candidate,reference_text=reference)
    (r1_p, r1_r, r1_f,
     r2_p, r2_r, r2_f,
     rl_p, rl_r, rl_f) = scorer.get_rouge_score()
    b_p, b_r, b_f = scorer.get_bert_score()
    rouge_scores = (r1_p, r1_r, r1_f, r2_p, r2_r, r2_f, rl_p, rl_r, rl_f)
    bert_scores = (b_p, b_r, b_f)
    return rouge_scores + bert_scores

In [None]:
# Column names for the twelve scores returned by get_metrics, in return order.
# The sixth value is the ROUGE-2 F score, so its column is named rouge_2_f.
metric_columns = [
    'summary_medium_rouge_1_p', 'summary_medium_rouge_1_r', 'summary_medium_rouge_1_f',
    'summary_medium_rouge_2_p', 'summary_medium_rouge_2_r', 'summary_medium_rouge_2_f',
    'summary_medium_rouge_l_p', 'summary_medium_rouge_l_r', 'summary_medium_rouge_l_f',
    'summary_medium_bert_p', 'summary_medium_bert_r', 'summary_medium_bert_f',
]
# Score each generated summary against the paper's abstract, one row at a time.
df_data[metric_columns] = df_data.apply(lambda row: pd.Series(get_metrics(row['summary_medium'],row['abstract'])), axis=1)

In [None]:
# Sanity check on the frame that now carries the metric columns. Uses df_data
# throughout: `df` only exists if the summarization cells were run in this session.
df_data.columns,df_data.shape

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Single-panel figure (1 row, 1 column)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4,4))

# Bar chart of the column means of the bert_p and bert_r scores
selected_cols_p = ["summary_medium_bert_p","summary_medium_bert_r"]
df_data[selected_cols_p].mean().plot(kind='bar', ax=axes, edgecolor='black')
axes.set_title('Bert Scores')
axes.set_ylabel('Scores')
# Fixed, zoomed y-range; bars are clipped to 0.7-0.88.
axes.set_ylim(0.7, 0.88)
axes.grid(True)

# Adjust layout
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Single-panel figure: response time plotted against document length in tokens
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4,4))

axes.scatter(
    df_data['num_tokens'],
    df_data["response_time_summary_medium"],
    label="response_time_summary_medium",
)

axes.set_xlabel('num tokens')
axes.set_ylabel('response time(sec)')
axes.set_title('Response time')
axes.legend()
# Fixed y-range of 4-30; points outside it are not shown.
axes.set_ylim(4,30)
axes.grid(True)

# Adjust layout
plt.tight_layout()
plt.show()



NLG Metrics

In [None]:
def nlg_metrics(doc,summary):
    """Fetch four NLG quality scores for a summary of a document.

    Args:
        doc: First positional argument passed to ``NLGMetrics``.
        summary: Second positional argument passed to ``NLGMetrics``.

    Returns:
        tuple: The ``'coherence'``, ``'consistency'``, ``'fluency'`` and
        ``'relevance'`` entries of ``NLGMetrics.get_nlg_metrics()``, in that
        order.
    """
    scores = NLGMetrics(doc,summary).get_nlg_metrics()
    return tuple(scores[name] for name in ('coherence','consistency','fluency','relevance'))

Note: The length of the document exceeds the permissible token limit for medium documents. Hence, the abstract is used instead of the entire article.

In [None]:
# Score every summary against its abstract and store the four NLG scores,
# in the order nlg_metrics returns them.
nlg_columns = ['coherence','consistency','fluency','relevance']
nlg_scores = df_data.apply(lambda row: pd.Series(nlg_metrics(row['abstract'],row['summary_medium'])), axis=1)
df_data[nlg_columns] = nlg_scores

In [None]:
# Sanity check: the frame should now include the four NLG score columns.
df_data.columns,df_data.shape

In [None]:
# Persist summaries plus all metrics. index=False matches the other exports in
# this notebook and avoids writing the row index as an extra unnamed column.
df_data.to_csv("./data/summary_medium_all_metrics.csv",index=False)

Appendix

In [None]:
%pip install datasets

In [None]:
from datasets import load_dataset

# Print all the available datasets
from huggingface_hub import list_datasets


In [None]:
# Load the "train" split of the 'arxiv' configuration of the 'scientific_papers' dataset.
papers_ds = load_dataset('scientific_papers','arxiv',split="train")

In [None]:
# Annotate every paper record with the token count of its article text.
# NOTE(review): `llm` is not defined in any cell visible in this notebook (the
# chat-model imports at the top are commented out). It must be created before
# this cell runs, otherwise the loop raises NameError.
papers_list = []

for d in papers_ds :
    d["num_tokens"] = llm.get_num_tokens(d["article"])
    papers_list.append(d)

In [None]:
# One row per paper record, including the added num_tokens field.
df_raw = pd.DataFrame(papers_list)
df_raw.shape,df_raw.columns

In [None]:
# Write the CSV that the "Scientific papers" section at the top of this notebook reads.
df_raw.to_csv("./data/scientific-papers.csv",index=False)