In [None]:
import altair as alt
from langchain_openai import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

from dsp_ai_eval import PROJECT_DIR, logging

# Sentence-embedding model; later cells call `model.encode(...)` on the
# documents, the per-chunk summaries and the final summary.
model = SentenceTransformer('all-miniLM-L6-v2')

# Set pandas' `display.width` option to 1000.
pd.set_option('display.width', 1000)

# Read the API key from the environment rather than hard-coding it;
# `os.environ.get` returns None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
# Model name and temperature handed to `ChatOpenAI` when the LLM is built.
GPT_MODEL = 'gpt-3.5-turbo'
TEMP = 0
# NOTE(review): RQ is not referenced by any cell visible in this notebook section.
RQ = "How does technology diffusion impact UK growth and productivity?"

# NOTE(review): n_samples is not referenced by any cell visible here - confirm it is still needed.
n_samples = 10

In [None]:
# Load the scite parquet file and keep only the rows whose `category` is 'main'.
scite_abstracts = pd.read_parquet(PROJECT_DIR / "inputs/data/embeddings/scite_embeddings.parquet")

scite_subset = scite_abstracts[scite_abstracts['category']=='main']

# One "TITLE: ... ABSTRACT: ..." string per row, in row order. Walking the two
# columns in lockstep replaces the index loop, which did two positional
# `.iloc[i]` row lookups per row and grew the list with `append`.
titles_and_abstracts = [
    f"TITLE: {title}. ABSTRACT: {abstract}"
    for title, abstract in zip(scite_subset['title'], scite_subset['abstract'])
]
    
# Chat model shared by the map and reduce steps.
llm = ChatOpenAI(
    temperature=TEMP,
    model_name=GPT_MODEL,
    openai_api_key=OPENAI_API_KEY,
)

# Map step prompt: applied to each chunk, which is substituted for `{text}`.
# Written as explicit line literals so the trailing space after "themes" and
# the blank lines are visible rather than hidden in a triple-quoted block.
map_template = (
    'The following is the title and abstract of an academic paper in the format "TITLE: <title>. ABSTRACT: <abstract>":\n'
    '\n'
    '"{text}"\n'
    '\n'
    'Based on this title and abstract, please identify the main themes \n'
    '\n'
    'SUMMARY:'
)

map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(prompt=map_prompt, llm=llm)

# Reduce step prompt: the mapped summaries are substituted for `{docs}`.
reduce_template = (
    "The following is set of summaries:\n"
    "{docs}\n"
    "Take these and distill it into a final, consolidated summary of the main themes.\n"
    "Helpful Answer:"
)
reduce_prompt = PromptTemplate.from_template(reduce_template)

# LLM chain that executes the reduce prompt (nothing is run yet; this only
# builds the chain object).
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

# Joins a list of documents into one string, places it in the prompt's
# `docs` variable and passes it to `reduce_chain`.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="docs",
    llm_chain=reduce_chain,
)

# Combines the mapped documents, reducing iteratively when needed:
#   - `combine_documents_chain` is the final chain that gets called;
#   - `collapse_documents_chain` is used when the documents exceed the context
#     available to the stuff chain (the same chain is reused for both roles);
#   - `token_max` is the maximum number of tokens to group documents into.
reduce_documents_chain = ReduceDocumentsChain(
    token_max=4000,
    collapse_documents_chain=combine_documents_chain,
    combine_documents_chain=combine_documents_chain,
)

# Full pipeline: map `map_chain` over every document, then combine the results
# with `reduce_documents_chain`. Each document is injected into the map
# prompt's `text` variable, and the per-document map outputs are returned
# alongside the final text because `return_intermediate_steps` is on.
map_reduce_chain = MapReduceDocumentsChain(
    document_variable_name="text",
    return_intermediate_steps=True,
    reduce_documents_chain=reduce_documents_chain,
    llm_chain=map_chain,
)

In [None]:
# Splitter that measures chunk length in tiktoken tokens: target chunk size of
# 1000 tokens, no overlap between consecutive chunks.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_overlap=0, chunk_size=1000)

# Wrap every title/abstract string in Document objects. A long text may be
# split into several chunks, so `split_docs` can hold more entries than
# `titles_and_abstracts`.
split_docs = text_splitter.create_documents(titles_and_abstracts)

In [None]:
# Run the map-reduce summarisation over every chunk. The chain's LLM is a
# `ChatOpenAI` model, so this cell makes API calls.
# Later cells read `output["intermediate_steps"]` (the per-chunk map results,
# available because the chain was built with `return_intermediate_steps=True`)
# and `output["output_text"]` (the final consolidated summary).
output = map_reduce_chain.invoke(split_docs)
output

In [None]:
# Per-chunk summaries returned by the map step.
summaries = output["intermediate_steps"]

# Source texts to embed.
# NOTE(review): these come from the `title_abstract` column, whereas the chain
# was fed the "TITLE: ... ABSTRACT: ..." strings built earlier - confirm the
# two line up row for row.
documents = scite_subset['title_abstract'].to_list()

# Embed the final summary, the per-chunk summaries and the source texts with
# the sentence-transformer model.
final_summary_embedding = model.encode(output["output_text"])
summary_embeddings = model.encode(summaries)
document_embeddings = model.encode(documents)

In [None]:
# Stack all embeddings row-wise for a joint projection: document rows first,
# then the per-chunk summary rows, then the final summary as the last row.
all_embeddings = np.vstack((document_embeddings, summary_embeddings, final_summary_embedding))

# Reduce dimensions to 2D using PCA. `random_state` is pinned so that a
# randomized SVD solver, if scikit-learn selects one, gives the same layout on
# every run.
pca = PCA(n_components=2, random_state=0)
reduced_embeddings = pca.fit_transform(all_embeddings)

# Split back into separate arrays. The summary block is sized by the number of
# summaries, not by `len(documents) * 2`: the two counts differ whenever the
# text splitter produced more than one chunk for a document, and the old bound
# then dropped the surplus summary rows.
n_documents = len(documents)
n_summaries = len(summaries)
reduced_doc_embeddings = reduced_embeddings[:n_documents, :]
reduced_summary_embeddings = reduced_embeddings[n_documents:n_documents + n_summaries, :]
reduced_final_embedding = reduced_embeddings[-1, :]

In [None]:
summaries = output["intermediate_steps"]
final_summary = output["output_text"]
n_documents = len(documents)
n_summaries = len(summaries)

# Create a DataFrame for Altair: one row per projected point, in the same
# order the embeddings were stacked (documents, summaries, final summary).
df = pd.DataFrame(reduced_embeddings, columns=['PCA1', 'PCA2'])
df['Text'] = documents + summaries + [final_summary]
df['Type'] = ['Title-abstract'] * n_documents + ['Summary'] * n_summaries + ['Metasummary']

# The connecting lines pair document i with summary i, which is only valid
# when every document produced exactly one summary. Previously a mismatch was
# sliced away silently (`len(documents) * 2`) and drew lines between unrelated
# points; fail loudly instead.
if n_summaries != n_documents:
    raise ValueError(
        f"Cannot pair documents with summaries: {n_documents} documents but "
        f"{n_summaries} summaries (a document was probably split into several chunks)."
    )

# Create a DataFrame for the lines: start = document point, end = its summary.
summary_rows = slice(n_documents, n_documents + n_summaries)
lines_df = pd.DataFrame({
    'PCA1_start': reduced_embeddings[:n_documents, 0],
    'PCA2_start': reduced_embeddings[:n_documents, 1],
    'PCA1_end': reduced_embeddings[summary_rows, 0],
    'PCA2_end': reduced_embeddings[summary_rows, 1],
    'Text': documents  # or summaries if you prefer
})

# Scatter layer: one circle per document, summary and metasummary, coloured
# by `Type`, with the text shown on hover; pan/zoom enabled.
points = (
    alt.Chart(df)
    .mark_circle(size=60)
    .encode(
        x='PCA1',
        y='PCA2',
        color='Type',
        tooltip=['Text', 'Type'],
    )
    .interactive()
    .properties(width=800, height=600)
)

# Create the lines chart: one segment from each document point to its summary
# point. `mark_rule` is the mark that draws a segment from (x, y) to (x2, y2);
# `mark_line` does not use `x2`/`y2`, so it joined the start points into one
# path instead of drawing the pairs. The axis titles are set explicitly so
# that, when layered with the points chart, the shared axes read PCA1/PCA2
# rather than a merged "PCA1, PCA1_start".
lines = alt.Chart(lines_df).mark_rule().encode(
    x=alt.X('PCA1_start', title='PCA1'),
    y=alt.Y('PCA2_start', title='PCA2'),
    x2='PCA1_end',
    y2='PCA2_end',
    tooltip=['Text']
).properties(
    width=800,
    height=600,
)

# Overlay the two charts on shared axes: points first, connecting lines on top.
chart = alt.layer(points, lines)

# Leave the chart as the cell's last expression so the notebook renders it.
chart