In [1]:
from dotenv import load_dotenv
load_dotenv("../wawr_ingestion.env")

import json
from typing import List
import pandas as pd
from jinja2 import Template
from sklearn.metrics.pairwise import cosine_similarity

from aisyng.wawr.context import WAWRContext
from aisyng.wawr.models.models_factory import WAWRGraphElementTypes
from aisyng.base.llms.base import LLMProvider, LLMName
from aisyng.base.models.graph import GraphNode
from aisyng.base.utils import read_json
from aisyng.wawr.models.payload import WAWRAnswerWithReferencesFromNodes, nodes_to_reference_prompt_part
from aisyng.base.models.payload import AnswerWithReferencesFromNodes



In [2]:
# Reporting window: passed as from_date / to_date when the abstracts are fetched below.
FROM_DATE = '01 May 2024'
TO_DATE = '01 Jun 2024'
# Model used for provider lookup and for the theme-aggregation, allocation and summarisation queries.
LLM_NAME = LLMName.OPENAI_GPT_4o
# Shared project context: gives access to persistence, the embedding pool and the LLM providers.
wawr_context: WAWRContext = WAWRContext.create_default()
# Key used to look up the embeddings table in the context's embedding pool.
EMBEDDING_KEY = "text-embedding-3-small-128"
# CSV with the final theme list, loaded before the allocation step.
THEMES_FINAL_FILE_NAME = '../data/2024_jun/themes_final.csv'


In [7]:
# Fetch the Abstract-type nodes of the reporting window together with their embeddings
# from the EMBEDDING_KEY table; the next cell reads every result as (node, embedding).
abstracts_and_embeddings = wawr_context.get_persistence().sqli.get_nodes_with_embeddings(
    embeddings_table=wawr_context.embedding_pool.get_table(EMBEDDING_KEY),
    from_date=FROM_DATE,
    to_date=TO_DATE,
    type_ids=[WAWRGraphElementTypes.Abstract]
)

In [8]:
# Split the fetched results into two parallel lists: element 0 is the abstract node,
# element 1 is its embedding.
abstracts = list(map(lambda pair: pair[0], abstracts_and_embeddings))
abstract_embeddings = list(map(lambda pair: pair[1], abstracts_and_embeddings))

In [16]:
def abstract_theme_preprocess_fn(abstract: GraphNode) -> str:
    """Build the prompt that asks for the main theme of a single abstract.

    Args:
        abstract: node whose ``title`` and ``text`` attributes are embedded in the prompt.

    Returns:
        The prompt text, ending with the instruction to answer with the theme name only.
    """
    prompt_parts = [
        f"Identify the main theme in the following abstract with title '{abstract.title}':",
        f"\n'{abstract.text}'",
        "\n\n. Answer with the theme name and only that, in less than 10 words.",
    ]
    return "".join(prompt_parts)

def common_theme_trials_preprocess_fn(paper_themes: List[str]) -> str:
    """Build the prompt that asks for the top themes of a list of per-paper themes.

    Args:
        paper_themes: per-paper theme strings; the list is embedded via ``str()``.

    Returns:
        The prompt text: the instruction, a newline, then the list representation.
    """
    instruction = (
        'Extract the top main themes in the following list of keywords, maximum 20 themes, '
        'in json format like this [{"theme":..., "description":...}]:'
    )
    return instruction + "\n" + str(paper_themes)


def get_themes_for_abstracts(abstracts: List[GraphNode]) -> List[str]:
    """Query the LLM for the theme of every abstract.

    The provider is looked up by LLM_NAME, while the query itself is issued with
    the OPENAI_GPT_35_TURBO model.

    Args:
        abstracts: abstract nodes; each one is turned into a prompt by
            ``abstract_theme_preprocess_fn``.

    Returns:
        The first element of every result returned by ``query_model_threaded``.
    """
    print(f"Getting themes for each of {len(abstracts)} abstracts")
    provider = wawr_context.llm_providers.get_by_model_name(LLM_NAME)
    responses = provider.query_model_threaded(
        model=LLMName.OPENAI_GPT_35_TURBO,
        data=abstracts,
        preprocess_fn=abstract_theme_preprocess_fn,
    )
    per_abstract = []
    for response in responses:
        # Only the first element of each result is kept.
        per_abstract.append(response[0])
    return per_abstract

def get_common_themes_from_abstract_themes(abstract_themes: List[str], trials: int) -> List[str]:
    """Derive a common theme list from per-abstract themes in two LLM steps.

    Step 1 submits the same theme list ``trials`` times (temperature 0.5) to get
    several independent theme extractions. Step 2 asks the model to merge those
    trials into a single result.

    Args:
        abstract_themes: per-abstract theme strings.
        trials: number of independent extraction runs to merge.

    Returns:
        Whatever ``query_model_validate_json`` returns for the merge query.
        NOTE(review): the output displayed for this call in the notebook starts
        with ``([{``, which looks like a tuple rather than ``List[str]`` —
        confirm the annotation.
    """
    # Step 1: the same input repeated once per trial.
    repeated_input = [abstract_themes for _ in range(trials)]
    trial_results = wawr_context.llm_providers.get_by_model_name(LLM_NAME).query_model_threaded(
        model=LLM_NAME,
        preprocess_fn=common_theme_trials_preprocess_fn,
        data=repeated_input,
        temperature=0.5,
    )
    trial_outputs = []
    for result in trial_results:
        trial_outputs.append(result[0])

    # Step 2: merge the trials into one theme list.
    merge_query = (
        f"Here are the results of {trials} extracting themes from a set of paper abstracts:\n '{trial_outputs}'\n "
        "Create a single result, in the same format, with at most 20 themes, as an average of these trials. If one of the themes appears to be "
        'a subtheme of another, add a field "parent": mentioning the parent theme name: '
    )
    merged = wawr_context.llm_providers.get_by_model_name(LLM_NAME).query_model_validate_json(
        model=LLM_NAME,
        query=merge_query,
    )
    return merged

In [10]:
# Extract a theme for every fetched abstract (runs one threaded LLM batch over `abstracts`).
per_abstract_themes = get_themes_for_abstracts(abstracts)

Getting themes for each of 1565 abstracts


In [17]:
# Consolidate the per-abstract themes into a shared theme list, merging 5 extraction trials.
level1_themes = get_common_themes_from_abstract_themes(abstract_themes=per_abstract_themes, trials=5)
level1_themes

([{'theme': 'Efficiency and Performance Optimization',
   'description': 'Improving the efficiency and performance of AI models, including large language models, neural networks, and recommendation systems through techniques such as quantization, fine-tuning, and optimization.'},
  {'theme': 'Adversarial Robustness and Defense',
   'description': 'Developing methods to detect and defend against adversarial attacks on AI systems, ensuring robustness and security.'},
  {'theme': 'Personalization and Adaptability',
   'description': 'Enhancing AI systems to provide personalized feedback and adapt to individual user needs and contexts, particularly in education and customer service.'},
  {'theme': 'Explainability and Interpretability',
   'description': 'Creating models and systems that are transparent, interpretable, and understandable to human users.'},
  {'theme': 'AI Integration in Education',
   'description': 'Applying AI technologies to improve educational outcomes, including automa

In [40]:
#themes_gpt35 = [t[0] for t in themes]
#with open("../data/2024_jun/themes_gpt35.json", "w") as file:
#    file.write(json.dumps(themes_gpt35))

In [77]:
#pd.DataFrame.from_dict(main_themes[0]).to_csv(themes_final_file_name)

In [88]:
# Load the final themes and give every theme its own, initially empty, list of papers.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like a
# saved index — confirm whether the CSV should be read with index_col=0.
themes_final = pd.read_csv(THEMES_FINAL_FILE_NAME)
themes_final['papers'] = [[] for _ in range(len(themes_final))]
# One dict per theme; these records are what the allocation and summary steps work on.
themes_final_dict = themes_final.to_dict(orient="records")
themes_final

Unnamed: 0,theme,description,parent,papers
0,Architecture and Training,Improved internal architecture or training met...,,[]
1,Robustness,Enhancing the robustness of large language mod...,,[]
2,Personalization and Adaptability,Customizing language models to individual user...,,[]
3,Bias and Fairness,Addressing and mitigating biases in language m...,,[]
4,Multimodal Large Language Models,Integrating and processing multiple types of d...,,[]
5,Explainability and Interpretability,Making language model decisions and outputs mo...,,[]
6,Human-AI Collaboration,Facilitating effective collaboration between h...,,[]
7,Ethical Implications,Exploring the ethical considerations and socie...,,[]
8,Data Augmentation and Generation,Using language models to generate and augment ...,,[]
9,Creative Applications,Exploring the use of large language models in ...,,[]


In [89]:
def allocation_preprocess_fn(abstract: GraphNode) -> str:
    """Build the prompt that asks the LLM to classify one abstract into themes.

    Reads the module-level ``theme_strings`` at call time, so that name must be
    defined before this function is called.

    Args:
        abstract: node whose ``title`` and ``text`` attributes are embedded in the prompt.

    Returns:
        The classification prompt, requesting a JSON list of one or two theme names.
    """
    paper_part = f'Given the paper abstract with title "{abstract.title}":"{abstract.text}"\n'
    themes_part = f'Classify it into one or two of the following themes: {list(theme_strings)}\n\n'
    rules_part = (
        'Output 2 themes if and only if the paper is clearly relevant for both, otherwise output just the dominant theme. '
        'Answer with the exact beginning of the text of the theme up to the colon symbol, in json format, like "["Performance of Large Language Models", "Bias and Fairness"]" and only that.'
    )
    return paper_part + themes_part + rules_part
# "<theme>: <description>" for every theme; read as a global by allocation_preprocess_fn,
# so it must be defined before the query below runs.
theme_strings = themes_final.apply(lambda x: x['theme']+': '+x['description'], axis=1)
# Classify every abstract into one or two themes (JSON output requested, 20-way parallelism).
allocations = wawr_context.llm_providers.get_by_model_name(LLM_NAME).query_model_threaded(
    model=LLM_NAME,
    preprocess_fn=allocation_preprocess_fn ,
    data=abstracts,
    temperature=0.1,
    json=True,
    parallelism=20
)

In [90]:
# Attach every abstract to the theme(s) named in its allocation answer.

# Start from empty paper lists so that re-running this cell does not append duplicates.
# clear() (rather than rebinding) keeps the list objects that were created with the themes.
for theme in themes_final_dict:
    theme['papers'].clear()

for i, allocation in enumerate(allocations):
    answer = allocation[0]
    # Normalise the answer: a missing answer names no theme and a bare string names
    # exactly one (iterating a string would otherwise compare single characters).
    if answer is None:
        chosen_themes = []
    elif isinstance(answer, str):
        chosen_themes = [answer]
    else:
        chosen_themes = list(answer)

    found = False
    for theme in themes_final_dict:
        # Substring match: the model is asked to answer with the beginning of the
        # "<theme>: <description>" string, so the theme name is contained in the answer.
        if any(isinstance(name, str) and theme['theme'] in name for name in chosen_themes):
            theme['papers'].append(abstracts[i])
            found = True
    if not found:
        # Print the whole answer: indexing its first element fails when the answer is empty.
        print(f'Not found: {answer}')

In [91]:
# Print every theme with its description and paper count, followed by the grand total.
# The accumulator is deliberately not called `sum`: that would shadow the builtin for
# every cell executed afterwards in this kernel.
# A paper allocated to two themes is counted once per theme in the total.
total_papers = 0
for i, theme in enumerate(themes_final_dict):
    paper_count = len(theme["papers"])
    print(i, ". ", theme["theme"])
    print(theme["description"])
    print(paper_count, " papers")
    total_papers += paper_count
    # Uncomment to list the titles of the papers of each theme:
    # for a in theme['papers']:
    #     print(f"\t{a.title}")
    print("\n")
print(total_papers)

0 .  Architecture and Training
Improved internal architecture or training methods for large language models
378  papers


1 .  Robustness
Enhancing the robustness of large language models against various attacks
132  papers


2 .  Personalization and Adaptability
Customizing language models to individual user needs and contexts, improving user experience and interaction
114  papers


3 .  Bias and Fairness
Addressing and mitigating biases in language models to ensure fair and equitable outputs across different demographics and use cases
75  papers


4 .  Multimodal Large Language Models
Integrating and processing multiple types of data (e.g., text, images, audio) to enhance the capabilities and applications of language models
238  papers


5 .  Explainability and Interpretability
Making language model decisions and outputs more understandable and transparent to users and stakeholders
146  papers


6 .  Human-AI Collaboration
Facilitating effective collaboration between humans and AI sy

In [85]:
# Spot check: titles of the papers allocated to the theme at index 15.
[p.title for p in themes_final_dict[15]['papers']]

['ChatGPT in Data Visualization Education: A Student Perspective',
 'Extracting chemical food safety hazards from the scientific literature\n  automatically using large language models',
 'Inferring State Machine from the Protocol Implementation via Large\n  Langeuage Model',
 'CookingSense: A Culinary Knowledgebase with Multidisciplinary Assertions',
 'ChatBI: Towards Natural Language to Complex Business Intelligence SQL',
 'Long-Term Human Trajectory Prediction using 3D Dynamic Scene Graphs',
 'Investigating Automatic Scoring and Feedback using Large Language Models',
 'WIBA: What Is Being Argued? A Comprehensive Approach to Argument Mining',
 'Sim-Grasp: Learning 6-DOF Grasp Policies for Cluttered Environments\n  Using a Synthetic Benchmark',
 'The Role of Model Architecture and Scale in Predicting Molecular\n  Properties: Insights from Fine-Tuning RoBERTa, BART, and LLaMA',
 'CACTUS: Chemistry Agent Connecting Tool-Usage to Science',
 'Automating the Analysis of Public Saliency and

In [115]:
class WAWRSummarisationFromNodes(AnswerWithReferencesFromNodes):
    """Builds a theme-summary prompt from a set of reference abstracts.

    The prompt is a Jinja2 template with two placeholders: ``references_as_text``
    (the rendered reference abstracts) and ``question`` (the topic to summarise).
    """

    # Jinja2 template; only the two {{ ... }} placeholders are substituted by get_prompt().
    prompt_template: str = (
        "This is a set of reference abstracts of research papers:\n"
        "{{ references_as_text }}\n\n"
        "Based on the knowledge from the references above, create a detailed summary on the following topic: "
        '"{{ question }}".\n'
        "Highlight the themes that are addressed by multiple papers, describe each theme thoroughly, summarise challenges and achievements across all theme papers, and provide at least 2 references for each theme."
        "Format the text "
        "using html tags, with h4 for the themes, and h3 with class=collapsible for the title, ready to insert as-is into a html page. "
        "Quote the abstracts above in your answer by their index, in [1][2] format. "
        "Do not list references, only use numbers in your answer to refer to the facts. "
        "Write in concise style, in British English, but be very thorough, take a deep breath, and take into account "
        "all relevant references. "
        "Do not write references or bibliography at the end. "
        "Do not write references, only insert indexes towards given references."
    )

    def nodes_to_references_prompt_part(self, **kwargs):
        """Render ``self.nodes`` as the references section of the prompt.

        Extra keyword arguments are accepted but not used.
        """
        references_text = nodes_to_reference_prompt_part(self.nodes)
        return references_text

    def get_prompt(self):
        """Return the full prompt: the template rendered with references and ``self.ask``."""
        template = Template(self.prompt_template)
        render_args = {
            "references_as_text": self.nodes_to_references_prompt_part(),
            "question": self.ask,
        }
        return template.render(**render_args)

In [116]:
def summary_preprocess_fn(theme) -> str:
    """Build the summarisation prompt for one theme record.

    Args:
        theme: mapping with the keys ``'theme'``, ``'description'`` and ``'papers'``.

    Returns:
        The prompt produced by ``WAWRSummarisationFromNodes.get_prompt`` for the
        theme's papers, with "<theme>: <description>" as the topic.
    """
    topic = f"{theme['theme']}: {theme['description']}"
    return WAWRSummarisationFromNodes(
        context=wawr_context,
        llm_name=LLM_NAME,
        nodes=theme['papers'],
        ask=topic,
    ).get_prompt()

# Generate one HTML summary per theme; the results are later paired with
# themes_final_dict by position (zip), so they are expected in the same order.
answers = wawr_context.llm_providers.get_by_model_name(LLM_NAME).query_model_threaded(
    model=LLM_NAME,
    preprocess_fn=summary_preprocess_fn,
    data=themes_final_dict,
    temperature=0.1
)

In [117]:
# Import from the public IPython.display module: importing display/HTML from
# IPython.core.display is deprecated (the warning was echoed in this cell's output).
from IPython.display import display, HTML
# Preview the raw summary generated for the first theme.
display(HTML(answers[0][0]))

  from IPython.core.display import display, HTML


In [120]:
# Turn the numeric citations of every summary into links to the cited arXiv abstracts.
answers_ref = []
for theme_number, (theme, answer) in enumerate(zip(themes_final_dict, answers), start=1):
    linked_html = answer[0]
    for position, paper in enumerate(theme['papers']):
        # The citation "[position+1]" is replaced by a link labelled
        # "[theme_number.position]"; the dot keeps inserted labels from being
        # matched again by later replacements.
        # NOTE(review): the label uses the 0-based position while the matched
        # citation is 1-based — confirm this is intended.
        link = f'<a href="http://arxiv.org/abs/{paper.id}" target="_blank">[{theme_number}.{position}]</a>'
        linked_html = linked_html.replace(f"[{position+1}]", link)
    answers_ref.append(linked_html)

In [123]:
# Spot check: render the linked summary of the theme at index 13.
display(HTML(answers_ref[13]))

In [122]:
# Save every linked summary to its own HTML file, named after the theme's position.
for file_number, summary_html in enumerate(answers_ref):
    target_path = f"../data/2024_jun/{file_number}.html"
    with open(target_path, "w") as out_file:
        out_file.write(summary_html)

In [124]:
# Join all linked summaries, separated by newlines, into a single report file.
report = "\n".join(answers_ref)
with open("../data/2024_jun/report.html", "w") as report_file:
    report_file.write(report)

In [3]:
from aisyng.base.models.payload import SolvedExternallyTopicSolver
from aisyng.wawr.models.models_factory import create_topic_solver_relationship, create_topic_node
from aisyng.wawr.workers import init_topic_solving
from datetime import datetime
from aisyng.wawr.models.payload import DirectSimilarityTopicSolver
# Reuse the notebook-level constant instead of repeating the literal, so that the
# embedding key used here cannot drift from the one used to fetch the abstracts.
embedding_key = EMBEDDING_KEY

# Topic node that will carry the monthly report.
topic_node = create_topic_node(ask="Large Language Models Research in May 2024", source_id="remove")
# Solver for a topic whose answer is supplied from outside: the report HTML built by
# this notebook is passed to solve() in a later cell.
topic_solver = SolvedExternallyTopicSolver(
        from_date=datetime(2024, 5, 1),
        to_date=datetime(2024, 6, 1),
        model="gpt-4o",
        embedding_key=embedding_key,
        limit=0,
        llm_name = LLM_NAME
    )
# Register the topic and its solver; the returned node is filled in and persisted later.
topic_solver_node = init_topic_solving(topic_node=topic_node, context=wawr_context, topic_solver=topic_solver)

In [4]:
# Show the id of the newly created topic-solver node.
topic_solver_node.id

'951e0831-399f-4d73-bd79-0bf7f487aff4'

In [21]:
# Load the report HTML from disk.
with open("../data/2024_jun/report.html", "r") as file:
    html_content = file.read()
# Hand the externally produced HTML to the solver held in the node's meta as the answer.
topic_solver_node.meta.solve(ask="Large Language Models Research in May 2024", ask_embedding=None, context=wawr_context, answer=html_content)
# NOTE(review): the answer is read back from `topic_solver`, while solve() was called on
# `topic_solver_node.meta` — presumably the same object; confirm.
topic_solver_node.text = topic_solver.answer
# Persist (merge) the solved node.
wawr_context.get_persistence().persist(objects_merge=[topic_solver_node])


In [9]:
# Inspect the answer stored on the solver.
topic_solver.answer

'<style>\n        .inpost-collapsible {\n            cursor: hand;\n            padding: 10px;\n            width: 100%;\n            border: none;\n            text-align: justify;\n            outline: none;\n            font-size: 18px;\n        }\n        .inpost-content {\n            padding: 0 18px;\n            display: none;\n            overflow: hidden;\n            background-color: #f9f9f9;\n        }\n        .inpost-disclaimer {\n            font-style:italic;\n        }\n</style>\n\n<h1>Large Language Models Research in May 2024</h1>\n<p>\n     In May 2024, we added to <a href="https://wawr.ai" target="_blank">WAWR</a>\'s knowledge base approximately 1,500 paper abstracts on language models published on <a href="https://arxiv.org/" target="_blank">arXiv</a> during the month.\n    This article provides a concise overview of the main topics and subtopics derived from these abstracts, designed to be read in 15 - 30 minutes.\n    For more information on the limitations and 