In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from typing import List
import pandas as pd
from sklearn.model_selection import train_test_split


from src.mle import utils as mle_utils
from src import tags, constants


In [3]:
index_name = "ai-papers"

path_data_eval_qs = mle_utils.path_data_raw / "eval-questions"

In [4]:
"""

The following pdf document is a paper about artificial intelligence.
1. Examine the paper and get a view of the overall structure, most important ideas, innovations and base work
2. For each section, skip references or contributors,  summarize most important ideas, the overall summary should take into account topics like:
    * Main goal or objective of the paper and team invoved
    * Novelties introduced by the paper
    * Model architecture
    * Training procedure, resources and time
    * Hiperatarmeter tunning and ablation studies
    * Datasets used to train and evaluate
    * Infraestructure, number of GPUs, etc., CO2 signature
    * Results, comparisons, benchmarks and results
    * Limitations and risks
5. Finally, based on the prior analysis sumarization, write down at least 10 interesting question and their related answers and source
    * Groud truth: Extract the anwswer as close as its original statement as posible
    * Source: The section where the anwswer is obtained, title and number
The set of 10 questions should give a complete overview of the paper, anwsering the topics identifed inthe summarization step
At least 2 questions for each paper should be complex questions whose anwsers spans across different sections
Skip questions from references or contributors

Provide the output as python code as follows:
```
Overall structure summary and most important ideas as string
List of questions in a dictionary form like this:
list_of_questions: List =         
        "paper": "TimesFM",
        "question": "What is TimesFM and what problem does it aim to solve?",
        "ground_truth": "TimeSFM is a decoder-only foundation model for time-series forecasting, designed to provide accurate zero-shot forecasts across various domains without the need for task-specific supervised training.",
        "source": "Abstract, Introduction"
```

"""

'\n\nThe following pdf document is a paper about artificial intelligence.\n1. Examine the paper and get a view of the overall structure, most important ideas, innovations and base work\n2. For each section, skip references or contributors,  summarize most important ideas, the overall summary should take into account topics like:\n    * Main goal or objective of the paper and team invoved\n    * Novelties introduced by the paper\n    * Model architecture\n    * Training procedure, resources and time\n    * Hiperatarmeter tunning and ablation studies\n    * Datasets used to train and evaluate\n    * Infraestructure, number of GPUs, etc., CO2 signature\n    * Results, comparisons, benchmarks and results\n    * Limitations and risks\n5. Finally, based on the prior analysis sumarization, write down at least 10 interesting question and their related answers and source\n    * Groud truth: Extract the anwswer as close as its original statement as posible\n    * Source: The section where the an

In [5]:
# Overall structure summary and most important ideas
overall_summary = """
The paper introduces TimesFM, a decoder-only foundation model designed for time-series forecasting. Inspired by advances in NLP foundation models, the team sought to develop a time-series model that offers zero-shot forecasting capabilities across diverse datasets without task-specific training. TimesFM is trained on a mix of synthetic and real-world datasets, including Google Trends and Wiki Pageviews, and employs a patched-decoder attention architecture for long-horizon forecasting. The model demonstrates competitive results compared to state-of-the-art supervised models, highlighting its generalization abilities across unseen datasets. The paper presents the model's architecture, training procedures, datasets, ablation studies, and empirical evaluations on various benchmarks, concluding that TimesFM achieves high accuracy with fewer computational resources.
"""

# List of questions in dictionary form
list_of_questions = [
    {
        "paper": "TimesFM",
        "question": "What is the primary goal of the TimesFM model?",
        "ground_truth": "The primary goal of TimesFM is to provide accurate zero-shot forecasting across a variety of domains without the need for task-specific supervised training.",
        "source": "Abstract, Introduction"
    },
    {
        "paper": "TimesFM",
        "question": "What novelties does the TimesFM introduce in time-series forecasting?",
        "ground_truth": "The main novelties include a patched-decoder style attention model, efficient pre-training on a large corpus of synthetic and real-world time-series data, and the ability to perform zero-shot forecasting across different domains and granularities.",
        "source": "Abstract, Introduction, Model Architecture"
    },
    {
        "paper": "TimesFM",
        "question": "What datasets were used to train TimesFM, and how were they prepared?",
        "ground_truth": "TimesFM was trained on real-world datasets like Google Trends, Wiki Pageviews, the M4 dataset, Electricity, Traffic datasets, and synthetic datasets generated using ARMA processes, seasonal patterns, trends, and step functions.",
        "source": "Pretraining Details, Section 5"
    },
    {
        "paper": "TimesFM",
        "question": "What is the core architecture of TimesFM?",
        "ground_truth": "TimesFM employs a decoder-only architecture with patched input tokens. It processes the input time-series into patches and uses residual blocks and stacked transformer layers with multi-head causal self-attention.",
        "source": "Model Architecture, Section 4"
    },
    {
        "paper": "TimesFM",
        "question": "How does TimesFM handle long-horizon forecasts, and why is it important?",
        "ground_truth": "TimesFM uses longer output patches during decoding to make predictions over extended horizons, reducing the number of autoregressive steps needed. This is important for improving accuracy in long-horizon forecasting tasks.",
        "source": "Model Architecture, Section 4"
    },
    {
        "paper": "TimesFM",
        "question": "What was the computational infrastructure used for training TimesFM?",
        "ground_truth": "TimesFM was trained on a pretraining corpus of around 100B timepoints using a patched-decoder attention architecture. The model has around 200M parameters.",
        "source": "Pretraining Details, Section 5"
    },
    {
        "paper": "TimesFM",
        "question": "What hyperparameters were crucial in training the TimesFM model, and how were they tuned?",
        "ground_truth": "Important hyperparameters included the input and output patch lengths, model dimension (1280), number of transformer layers (20), and number of heads (16). A cosine decay learning rate schedule with a peak of 5e-4 was used.",
        "source": "Training, Pretraining Details, Section 5"
    },
    {
        "paper": "TimesFM",
        "question": "How does TimesFM's performance compare to other state-of-the-art models in zero-shot forecasting?",
        "ground_truth": "TimesFM performs close to or surpasses state-of-the-art supervised models in zero-shot settings across various datasets like Monash, Darts, and ETT. It outperforms llmtime and other deep learning models on several benchmarks.",
        "source": "Empirical Results, Section 6.1"
    },
    {
        "paper": "TimesFM",
        "question": "What are the key limitations of TimesFM?",
        "ground_truth": "The key limitations of TimesFM include its smaller parameter size and pretraining data size compared to large language models. Additionally, fine-tuning and few-shot capabilities were not extensively explored in this paper.",
        "source": "Conclusion, Section 7"
    },
    {
        "paper": "TimesFM", 
        "question": "What risks or potential issues did the authors identify with TimesFM?",
        "ground_truth": "The authors note challenges related to the lack of vast amounts of publicly available time-series data, making it harder to generalize across more diverse datasets. Further work is needed to understand how the model handles out-of-distribution data.",
        "source": "Introduction, Conclusion"
    },
    {
        "paper": "TimesFM",
        "question": "How does TimesFM handle varying time granularities and context lengths during inference?",
        "ground_truth": "TimesFM can adapt to varying time granularities and context lengths by using a masked patching strategy during training, allowing the model to handle different temporal resolutions and history lengths at inference time.",
        "source": "Model Architecture, Section 4"
    },
    {
        "paper": "TimesFM",
        "question": "What impact does the input patch length have on TimesFM's performance?",
        "ground_truth": "A larger input patch length, like 32, improves performance by balancing efficiency and accuracy. Too small a patch length leads to inefficient training, while too large a patch length shifts the model towards an encoder-decoder structure, reducing its flexibility.",
        "source": "Ablation Studies, Section 6.2"
    },
    {
        "paper": "TimesFM",
        "question": "How does the output patch length affect long-horizon forecasting in TimesFM?",
        "ground_truth": "Longer output patches (e.g., 128) enable fewer autoregressive steps during long-horizon forecasting, which improves the model's ability to make accurate predictions over extended periods.",
        "source": "Ablation Studies, Section 6.2"
    } 

]

df_timesfm = pd.DataFrame(list_of_questions)
df_timesfm

Unnamed: 0,paper,question,ground_truth,source
0,TimesFM,What is the primary goal of the TimesFM model?,The primary goal of TimesFM is to provide accu...,"Abstract, Introduction"
1,TimesFM,What novelties does the TimesFM introduce in t...,The main novelties include a patched-decoder s...,"Abstract, Introduction, Model Architecture"
2,TimesFM,"What datasets were used to train TimesFM, and ...",TimesFM was trained on real-world datasets lik...,"Pretraining Details, Section 5"
3,TimesFM,What is the core architecture of TimesFM?,TimesFM employs a decoder-only architecture wi...,"Model Architecture, Section 4"
4,TimesFM,How does TimesFM handle long-horizon forecasts...,TimesFM uses longer output patches during deco...,"Model Architecture, Section 4"
5,TimesFM,What was the computational infrastructure used...,TimesFM was trained on a pretraining corpus of...,"Pretraining Details, Section 5"
6,TimesFM,What hyperparameters were crucial in training ...,Important hyperparameters included the input a...,"Training, Pretraining Details, Section 5"
7,TimesFM,How does TimesFM's performance compare to othe...,TimesFM performs close to or surpasses state-o...,"Empirical Results, Section 6.1"
8,TimesFM,What are the key limitations of TimesFM?,The key limitations of TimesFM include its sma...,"Conclusion, Section 7"
9,TimesFM,What risks or potential issues did the authors...,The authors note challenges related to the lac...,"Introduction, Conclusion"


In [6]:
overall_structure_summary = """
The paper "Mamba: Linear-Time Sequence Modeling with Selective State Spaces" introduces a novel approach to sequence modeling. The goal is to overcome the limitations of the Transformer architecture, which dominates current deep learning applications but suffers from inefficiencies in modeling long sequences. The authors propose a new type of model called the Selective State Space Model (SSM), integrated into a simplified architecture named Mamba. This model aims to address both the computational inefficiency of Transformers and the weaknesses of structured state-space models (SSMs), which have previously struggled with discrete data like text.

Key innovations include the introduction of selective state space dynamics, where the model parameters change based on input data, enabling the model to remember or forget information dynamically. This results in a flexible model that can process sequences of arbitrary length efficiently. The model uses a hardware-aware parallel algorithm that ensures fast processing on modern GPUs, leading to linear time complexity for long sequence processing.

The paper explores the architecture, hardware optimization, empirical performance on multiple domains (e.g., language, audio, genomics), and scalability. It also provides benchmarks against state-of-the-art models like Transformers, showing that Mamba can achieve better or comparable performance with fewer computational resources, even on sequences exceeding one million tokens. Furthermore, the model demonstrates faster inference times and higher throughput, making it suitable for practical applications.
"""

list_of_questions = [
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "What is the main goal of the Mamba paper?",
        "ground_truth": "The main goal is to overcome the computational inefficiency of Transformer models when processing long sequences and to improve on the weaknesses of structured state space models (SSMs) in handling discrete data like text.",
        "source": "Abstract, Introduction"
    },
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "What is the Selective State Space Model (SSM) and how does it work?",
        "ground_truth": "Selective SSMs allow the model parameters to change dynamically based on the input, enabling the model to remember or forget information depending on the data. This improves the model's ability to selectively propagate relevant information across sequences.",
        "source": "Abstract, Section 1, 3.2"
    },
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "How does the Mamba architecture differ from Transformers?",
        "ground_truth": "Mamba does not use attention or MLP blocks like Transformers. Instead, it relies on selective state space models, enabling linear-time scaling in sequence length and eliminating the need to store the entire context during inference.",
        "source": "Abstract, Section 1, 3.4"
    },
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "What datasets were used to train and evaluate the Mamba model?",
        "ground_truth": "Mamba was trained and evaluated on synthetic tasks (copying and induction heads), language (The Pile), audio (YouTubeMix), and DNA sequences (HG38 genome).",
        "source": "Section 4.1-4.3"
    },
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "How does Mamba perform compared to Transformers in terms of inference speed?",
        "ground_truth": "Mamba achieves 5× higher inference throughput than Transformers of similar size due to its recurrent architecture and lack of a need for caching previous elements.",
        "source": "Section 4.5, Figure 8"
    },
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "What is the hardware optimization that enables Mamba to scale efficiently on long sequences?",
        "ground_truth": "Mamba uses a hardware-aware parallel algorithm that avoids materializing the expanded state in GPU memory, enabling efficient processing of long sequences by leveraging memory hierarchies.",
        "source": "Abstract, Section 3.3"
    },
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "How does Mamba handle long-context sequences and why is this important?",
        "ground_truth": "Mamba improves performance on tasks with long context sequences, up to million-length tokens. Its selection mechanism allows the model to filter out irrelevant information, making it suitable for long-range dependencies like in DNA and language.",
        "source": "Section 4.3, Section 4.1.2"
    },
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "What were the results of the ablation studies on the Mamba model architecture?",
        "ground_truth": "Ablation studies show that selective SSMs (S6) significantly outperform other SSM variants and architectures, highlighting the importance of the selection mechanism for sequence modeling.",
        "source": "Section 4.6"
    },
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "What is the environmental impact of training the Mamba model in terms of computational resources?",
        "ground_truth": "Mamba is designed to be computationally efficient, reducing the FLOPs required for long-sequence tasks. Although specific CO2 emissions are not detailed, the model’s linear-time complexity and efficient hardware usage imply reduced environmental impact compared to Transformer models.",
        "source": "Section 4.5"
    },
    {
        "paper": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
        "question": "What are the main limitations and risks associated with the Mamba model?",
        "ground_truth": "While Mamba shows strong performance in various domains, the paper does not explicitly explore risks such as potential biases in pretraining datasets or limitations in handling very short sequences.",
        "source": "Section 5"
    }
]


df_mamba = pd.DataFrame(list_of_questions)
df_mamba

Unnamed: 0,paper,question,ground_truth,source
0,Mamba: Linear-Time Sequence Modeling with Sele...,What is the main goal of the Mamba paper?,The main goal is to overcome the computational...,"Abstract, Introduction"
1,Mamba: Linear-Time Sequence Modeling with Sele...,What is the Selective State Space Model (SSM) ...,Selective SSMs allow the model parameters to c...,"Abstract, Section 1, 3.2"
2,Mamba: Linear-Time Sequence Modeling with Sele...,How does the Mamba architecture differ from Tr...,Mamba does not use attention or MLP blocks lik...,"Abstract, Section 1, 3.4"
3,Mamba: Linear-Time Sequence Modeling with Sele...,What datasets were used to train and evaluate ...,Mamba was trained and evaluated on synthetic t...,Section 4.1-4.3
4,Mamba: Linear-Time Sequence Modeling with Sele...,How does Mamba perform compared to Transformer...,Mamba achieves 5× higher inference throughput ...,"Section 4.5, Figure 8"
5,Mamba: Linear-Time Sequence Modeling with Sele...,What is the hardware optimization that enables...,Mamba uses a hardware-aware parallel algorithm...,"Abstract, Section 3.3"
6,Mamba: Linear-Time Sequence Modeling with Sele...,How does Mamba handle long-context sequences a...,Mamba improves performance on tasks with long ...,"Section 4.3, Section 4.1.2"
7,Mamba: Linear-Time Sequence Modeling with Sele...,What were the results of the ablation studies ...,Ablation studies show that selective SSMs (S6)...,Section 4.6
8,Mamba: Linear-Time Sequence Modeling with Sele...,What is the environmental impact of training t...,Mamba is designed to be computationally effici...,Section 4.5
9,Mamba: Linear-Time Sequence Modeling with Sele...,What are the main limitations and risks associ...,While Mamba shows strong performance in variou...,Section 5


In [7]:
# Overall structure summary and most important ideas as string
overall_structure_summary = """
The paper "Lag-Llama: Towards Foundation Models for Probabilistic Time Series Forecasting" presents a new approach to univariate probabilistic time series forecasting by introducing a foundation model called Lag-Llama. This model is based on a decoder-only transformer architecture and uses lag features and covariates to handle time series data from various domains. The goal is to achieve strong zero-shot generalization performance and state-of-the-art results without relying on task-specific models.

Key innovations include:
- The use of lag features as covariates in a decoder-only transformer architecture.
- Pretraining on a large corpus of time series data across several domains.
- Strong zero-shot performance on unseen datasets and state-of-the-art performance after fine-tuning on smaller datasets.
- Robust few-shot adaptation capabilities, making the model highly adaptable to various levels of historical data.

The paper explores the scalability of the model and the diversity of the pretraining corpus. It compares Lag-Llama's performance against several statistical and deep learning models, showing that Lag-Llama outperforms competitors, especially after fine-tuning.

The infrastructure used for training involves a single Nvidia Tesla-P100 GPU, with 100 hyperparameter configurations tested during training. The datasets used are from diverse domains such as energy, economics, and transportation. The model's performance is evaluated using the Continuous Ranked Probability Score (CRPS).

Limitations of the paper include the model's focus on univariate data, leaving multivariate extensions and more complex distribution heads for future work. Furthermore, the study suggests the need for large-scale time series datasets and highlights the challenges in adapting foundation models for time series data.
"""

# List of questions and answers in dictionary form
list_of_questions = [
    {
        "paper": "Lag-Llama",
        "question": "What is the main objective of Lag-Llama?",
        "ground_truth": "The main objective of Lag-Llama is to develop a general-purpose foundation model for univariate probabilistic time series forecasting, with strong zero-shot performance and adaptability across different domains, without task-specific training.",
        "source": "Abstract"
    },
    {
        "paper": "Lag-Llama",
        "question": "What are the novelties introduced by the Lag-Llama model?",
        "ground_truth": "The novelties include the use of a decoder-only transformer architecture with lag features as covariates and its capability to perform zero-shot forecasting across various time series domains.",
        "source": "Abstract, Section 1: Introduction"
    },
    {
        "paper": "Lag-Llama",
        "question": "What architecture is used in Lag-Llama, and how does it process the time series data?",
        "ground_truth": "Lag-Llama is based on a decoder-only transformer architecture. It tokenizes time series data using lagged features and covariates, and passes them through multiple layers of transformers with pre-normalization and Rotary Positional Encoding (RoPE).",
        "source": "Section 4: Lag-Llama Architecture"
    },
    {
        "paper": "Lag-Llama",
        "question": "How was the Lag-Llama model pretrained, and what was the training setup?",
        "ground_truth": "The model was pretrained on a large corpus of 27 time series datasets across six domains, comprising around 352 million data windows (tokens). The training batch size was 256, with a learning rate of 10^-4, and early stopping based on validation loss.",
        "source": "Section 5.1: Datasets, Section 5.3: Hyperparameter Search and Model Training Setups"
    },
    {
        "paper": "Lag-Llama",
        "question": "What is the choice of the distribution head used in Lag-Llama?",
        "ground_truth": "The model uses a Student's t-distribution head, which outputs the parameters of the distribution, including degrees of freedom, mean, and scale. The choice was made to keep the model simple and effective for training.",
        "source": "Section 4.3: Choice of Distribution Head"
    },
    {
        "paper": "Lag-Llama",
        "question": "What datasets were used to train and evaluate the Lag-Llama model?",
        "ground_truth": "The model was trained on 27 datasets across domains like energy, economics, transportation, nature, and air quality. Datasets were selected to test the model's generalization capabilities across diverse prediction horizons.",
        "source": "Section 5.1: Datasets"
    },
    {
        "paper": "Lag-Llama",
        "question": "What infrastructure was used for the Lag-Llama model's training, and what resources were required?",
        "ground_truth": "Lag-Llama was trained on a single Nvidia Tesla-P100 GPU with 12 GB of memory, 4 CPU cores, and 24 GB of RAM. The training involved 100 hyperparameter configurations and large-scale pretraining.",
        "source": "Section 5.3: Hyperparameter Search and Model Training Setups"
    },
    {
        "paper": "Lag-Llama",
        "question": "How does Lag-Llama perform compared to other state-of-the-art models?",
        "ground_truth": "Lag-Llama outperforms various statistical and deep learning models, particularly in zero-shot and fine-tuning scenarios. After fine-tuning, it consistently achieves state-of-the-art performance across diverse datasets.",
        "source": "Section 6: Results"
    },
    {
        "paper": "Lag-Llama",
        "question": "What are the limitations of the Lag-Llama model?",
        "ground_truth": "The limitations include its focus on univariate time series, with multivariate extensions and more complex distribution heads left for future work. The need for large-scale time series datasets is also highlighted.",
        "source": "Section 8: Discussion"
    },
    {
        "paper": "Lag-Llama",
        "question": "How does the Lag-Llama model adapt in few-shot scenarios with limited historical data?",
        "ground_truth": "Lag-Llama demonstrates strong few-shot adaptation capabilities, showing increasing performance as more historical data becomes available. It outperforms baseline models even when limited history is available.",
        "source": "Section 6.2: Few-Shot Adaptation Performance on Unseen Data"
    }
]



df_lagllama = pd.DataFrame(list_of_questions)
df_lagllama

Unnamed: 0,paper,question,ground_truth,source
0,Lag-Llama,What is the main objective of Lag-Llama?,The main objective of Lag-Llama is to develop ...,Abstract
1,Lag-Llama,What are the novelties introduced by the Lag-L...,The novelties include the use of a decoder-onl...,"Abstract, Section 1: Introduction"
2,Lag-Llama,"What architecture is used in Lag-Llama, and ho...",Lag-Llama is based on a decoder-only transform...,Section 4: Lag-Llama Architecture
3,Lag-Llama,"How was the Lag-Llama model pretrained, and wh...",The model was pretrained on a large corpus of ...,"Section 5.1: Datasets, Section 5.3: Hyperparam..."
4,Lag-Llama,What is the choice of the distribution head us...,The model uses a Student's t-distribution head...,Section 4.3: Choice of Distribution Head
5,Lag-Llama,What datasets were used to train and evaluate ...,The model was trained on 27 datasets across do...,Section 5.1: Datasets
6,Lag-Llama,What infrastructure was used for the Lag-Llama...,Lag-Llama was trained on a single Nvidia Tesla...,Section 5.3: Hyperparameter Search and Model T...
7,Lag-Llama,How does Lag-Llama perform compared to other s...,Lag-Llama outperforms various statistical and ...,Section 6: Results
8,Lag-Llama,What are the limitations of the Lag-Llama model?,The limitations include its focus on univariat...,Section 8: Discussion
9,Lag-Llama,How does the Lag-Llama model adapt in few-shot...,Lag-Llama demonstrates strong few-shot adaptat...,Section 6.2: Few-Shot Adaptation Performance o...


In [8]:
Overall_structure_summary = """
The paper "ColPali: Efficient Document Retrieval with Vision Language Models" introduces ColPali, a novel retrieval system designed to optimize document retrieval through both visual and textual cues. It seeks to address inefficiencies in modern retrieval systems, particularly in handling visually rich documents that combine text with images, tables, and other visual elements. 

The paper is structured as follows:
1. **Abstract and Introduction**: The main goal is to improve retrieval in visually rich documents by combining text and vision-language models. The work highlights the importance of using both text and visual elements for better document matching, with ColPali being introduced as an efficient alternative.
2. **Problem Formulation and Related Work**: The paper contrasts ColPali's design with existing retrieval methods, emphasizing the importance of late interaction mechanisms for processing multi-modal documents.
3. **ViDoRe Benchmark**: Introduces the Visual Document Retrieval Benchmark (ViDoRe), used to test ColPali’s performance. This section outlines a comprehensive dataset covering various document types, domains, and languages.
4. **Architecture**: Describes the ColPali model architecture, which integrates visual embeddings with text embeddings using a late interaction mechanism.
5. **Training Procedure**: Discusses how the ColPali model was trained, including datasets, hyperparameters, and infrastructure used.
6. **Results and Comparisons**: ColPali outperforms baseline models on several benchmarks in both performance and latency, especially in visually complex tasks like tables and figures.
7. **Ablation Studies**: Investigates various design choices, such as the number of image patches or vision component fine-tuning, and their impact on performance.
8. **Conclusions**: The paper concludes by highlighting ColPali’s superior performance, flexibility, and future avenues for combining retrieval with visual question answering.
9. **Limitations and Ethical Considerations**: Discusses potential biases and the need for more diverse document types and languages in future studies.

Key Innovations:
- A late interaction model architecture specifically designed for visually rich document retrieval.
- ViDoRe benchmark, which evaluates retrieval across diverse document formats and languages.
- Superior indexing speed and low query latency, allowing fast and accurate document retrieval.

Datasets and Infrastructure:
- The paper mentions training ColPali on 127,460 query-page pairs.
- Uses an 8-GPU setup with LoRA adapters for efficiency.
- Estimates a carbon footprint of around 15 kg CO2 eq. for the experiments.

Results:
- ColPali outperforms state-of-the-art methods on the ViDoRe benchmark, particularly in tasks involving infographics and tables.
- Demonstrates faster indexing and querying times compared to traditional methods.
- Significant improvements in complex document retrieval tasks, especially with late interaction mechanisms.

Limitations:
- Focus on PDF documents with less emphasis on web or handwritten content.
- Generalization to lower-resource languages remains untested.

"""

list_of_questions: List = [
    {
        "paper": "ColPali",
        "question": "What problem does ColPali aim to solve in document retrieval?",
        "ground_truth": "ColPali addresses the inefficiencies in current document retrieval systems, particularly in handling visually rich documents that contain text, tables, images, and layouts, which are often overlooked by modern text-centric systems.",
        "source": "Abstract, Introduction"
    },
    {
        "paper": "ColPali",
        "question": "What is the main architectural innovation introduced by ColPali?",
        "ground_truth": "ColPali leverages Vision Language Models to index documents based on visual features and integrates a late interaction mechanism to enhance query matching.",
        "source": "Section 2, Problem Formulation & Related Work"
    },
    {
        "paper": "ColPali",
        "question": "What is the ViDoRe benchmark, and why was it introduced?",
        "ground_truth": "ViDoRe is a benchmark specifically designed to evaluate document retrieval systems on visually rich documents, considering both textual and visual elements across multiple domains and languages.",
        "source": "Section 3, The ViDoRe Benchmark"
    },
    {
        "paper": "ColPali",
        "question": "What datasets were used to train the ColPali model?",
        "ground_truth": "The model was trained on 127,460 query-page pairs, with 63% coming from openly available academic datasets and 37% from synthetic datasets composed of web-crawled PDFs and VLM-generated pseudo-questions.",
        "source": "Section 4.2, Model training"
    },
    {
        "paper": "ColPali",
        "question": "What training infrastructure was used for ColPali?",
        "ground_truth": "ColPali was trained on an 8-GPU setup with data parallelism, LoRA adapters (r=32), and paged_adamw_8bit optimizer.",
        "source": "Section 4.2, Model training"
    },
    {
        "paper": "ColPali",
        "question": "What were the main performance results of ColPali on the ViDoRe benchmark?",
        "ground_truth": "ColPali outperformed all evaluated models on the ViDoRe benchmark, particularly excelling in visually complex tasks like InfographicVQA, ArxivQA, and TabFQuAD, with significantly higher NDCG@5 scores.",
        "source": "Section 5.1, Results"
    },
    {
        "paper": "ColPali",
        "question": "How does ColPali achieve faster indexing and querying times compared to other systems?",
        "ground_truth": "ColPali directly encodes pages from their image representation, bypassing time-consuming steps like layout detection, OCR, and chunking, resulting in faster indexing times.",
        "source": "Section 5.2, Latencies & Memory Footprint"
    },
    {
        "paper": "ColPali",
        "question": "What ablation studies were conducted, and what were the findings inColPali paper?",
        "ground_truth": "Ablation studies on patch number and query augmentation tokens showed trade-offs between performance and memory usage, with 1024 patches yielding the best results. Query augmentation tokens had a marginal impact on English but improved performance in French tasks.",
        "source": "Section 6, Ablation study"
    },
    {
        "paper": "ColPali",
        "question": "What are the main limitations of the ColPali model?",
        "ground_truth": "ColPali primarily focuses on PDF-type documents and high-resource languages, with limited evaluation on web screenshots or low-resource languages.",
        "source": "Section 7, Limitations"
    },
    {
        "paper": "ColPali",
        "question": "What environmental impact did the training of ColPali have?",
        "ground_truth": "The experiments consumed 1405 Mi250x GPU hours, resulting in an estimated carbon footprint of around 15kg CO2 eq, using low-carbon nuclear energy.",
        "source": "Section 7, Ethical Considerations"
    }
]


df_colpali = pd.DataFrame(list_of_questions)
df_colpali


Unnamed: 0,paper,question,ground_truth,source
0,ColPali,What problem does ColPali aim to solve in docu...,ColPali addresses the inefficiencies in curren...,"Abstract, Introduction"
1,ColPali,What is the main architectural innovation intr...,ColPali leverages Vision Language Models to in...,"Section 2, Problem Formulation & Related Work"
2,ColPali,"What is the ViDoRe benchmark, and why was it i...",ViDoRe is a benchmark specifically designed to...,"Section 3, The ViDoRe Benchmark"
3,ColPali,What datasets were used to train the ColPali m...,"The model was trained on 127,460 query-page pa...","Section 4.2, Model training"
4,ColPali,What training infrastructure was used for ColP...,ColPali was trained on an 8-GPU setup with dat...,"Section 4.2, Model training"
5,ColPali,What were the main performance results of ColP...,ColPali outperformed all evaluated models on t...,"Section 5.1, Results"
6,ColPali,How does ColPali achieve faster indexing and q...,ColPali directly encodes pages from their imag...,"Section 5.2, Latencies & Memory Footprint"
7,ColPali,"What ablation studies were conducted, and what...",Ablation studies on patch number and query aug...,"Section 6, Ablation study"
8,ColPali,What are the main limitations of the ColPali m...,ColPali primarily focuses on PDF-type document...,"Section 7, Limitations"
9,ColPali,What environmental impact did the training of ...,The experiments consumed 1405 Mi250x GPU hours...,"Section 7, Ethical Considerations"


In [9]:
Overall_structure_summary = """
The paper "PaliGemma: A versatile 3B VLM for transfer" introduces PaliGemma, a versatile vision-language model (VLM) with 3 billion parameters. The primary goal of this work is to provide a flexible and transferable model for a wide range of vision-language tasks, including image captioning, visual question answering (VQA), segmentation, and many more. The model architecture combines a vision encoder (SigLIP-So400m) and a language decoder (Gemma-2B). The paper evaluates PaliGemma on almost 40 tasks and highlights its performance across a broad spectrum of domains and benchmarks.

The paper is structured as follows:
1. **Introduction**: The paper introduces PaliGemma, explaining its foundation in the PaLI and Gemma models. The objective is to create a model that is both powerful and transferable, able to perform well on standard VLM tasks and specialized areas like remote sensing and video captioning.
2. **Related Work**: This section discusses prior work in VLMs, including notable models like CLIP, ALIGN, and PaLI, and how they paved the way for PaliGemma.
3. **Model Architecture**: The PaliGemma model consists of three main components: the SigLIP vision encoder, the Gemma-2B language model, and a linear projection layer connecting the vision and language models. The architecture supports various multimodal tasks, leveraging a simple image+text input and generating text outputs for different applications.
4. **Training Procedure**: The training process follows three main stages: unimodal pretraining (using existing off-the-shelf models), multimodal pretraining (combining vision and language models), and a resolution increase phase to enable the model to process higher-resolution images.
5. **Results**: PaliGemma is tested on over 30 benchmarks, including tasks like image captioning, visual question answering, and segmentation. The model demonstrates state-of-the-art performance on tasks requiring higher resolution and specialized vision-language capabilities.
6. **Ablation Studies**: This section explores various model design choices, such as the effectiveness of freezing components during pretraining and the impact of higher resolution on performance.
7. **Transferability**: The paper examines the model's transferability across tasks, highlighting its flexibility with minimal fine-tuning.
8. **Conclusion**: PaliGemma is presented as a robust, open, and flexible VLM, offering strong performance across a wide range of vision-language tasks, with the potential for future research in instruction tuning and further specialized applications.

Key Innovations:
- Combines the SigLIP vision encoder and the Gemma-2B language model into a powerful yet efficient VLM.
- Introduces multimodal pretraining strategies and increases image resolution during training for high-resolution tasks.
- Outperforms larger models on many benchmarks while being smaller in parameter count.

Datasets and Infrastructure:
- The training is conducted on TPUv5e with 256 cores, with the training phase taking about 3 days for Stage 1 and 15 hours for Stage 2.
- The model sees 1 billion multimodal examples during pretraining and is further fine-tuned on specialized tasks.

Results:
- PaliGemma achieves strong results on benchmarks like COCO Captions, VQA v2, and Remote Sensing VQA, demonstrating that smaller models can achieve performance on par with much larger VLMs.

Limitations:
- The model lacks instruction tuning, which could improve zero-shot generalization.
- Potential biases in the pretraining data due to the use of publicly available datasets.
"""

list_of_questions: List = [
    {
        "paper": "PaliGemma",
        "question": "What is PaliGemma and what problem does it aim to solve?",
        "ground_truth": "PaliGemma is an open Vision-Language Model (VLM) that combines the SigLIP-So400m vision encoder and the Gemma-2B language model, aiming to provide a versatile base model for a wide range of vision-language tasks with efficient transferability.",
        "source": "Introduction"
    },
    {
        "paper": "PaliGemma",
        "question": "What are the key components of PaliGemma’s architecture?",
        "ground_truth": "PaliGemma consists of three components: the SigLIP image encoder, the Gemma-2B language decoder, and a linear projection layer that connects the vision and language models.",
        "source": "Model Architecture"
    },
    {
        "paper": "PaliGemma",
        "question": "How does PaliGemma handle different image resolutions during training and evaluation?",
        "ground_truth": "PaliGemma undergoes Stage 2 training to increase the image resolution from 224px to 448px and 896px, enhancing its performance on tasks requiring high-resolution inputs like detection and segmentation.",
        "source": "Training Procedure, Section 3.2.3"
    },
    {
        "paper": "PaliGemma",
        "question": "What datasets were used to train and evaluate PaliGemma?",
        "ground_truth": "PaliGemma was trained on a broad mixture of multimodal tasks, and it was evaluated on over 30 benchmarks, including tasks like COCO Captions, VQA, and Remote Sensing VQA.",
        "source": "Training Procedure, Section 3.2; Results, Section 4"
    },
    {
        "paper": "PaliGemma",
        "question": "How does PaliGemma perform compared to larger VLM models?",
        "ground_truth": "PaliGemma, with less than 3B parameters, achieves performance on par with much larger models like PaLI-X (55B) and PaLM-E (562B), particularly on benchmarks like ScienceQA and VQA v2.",
        "source": "Introduction, Section 1; Results, Section 4"
    },
    {
        "paper": "PaliGemma",
        "question": "What pretraining stages does PaliGemma undergo, and what is the purpose of each stage?",
        "ground_truth": "PaliGemma undergoes three stages: unimodal pretraining using off-the-shelf models, multimodal pretraining for task alignment, and a resolution increase stage to improve high-resolution task performance.",
        "source": "Training Procedure, Section 3.2"
    },
    {
        "paper": "PaliGemma",
        "question": "What infrastructure was used to train PaliGemma, and how long did the training take?",
        "ground_truth": "PaliGemma was trained on Cloud TPUv5e, with Stage 1 pretraining taking 3 days and Stage 2 lasting 15 hours.",
        "source": "Training Infrastructure, Section 3.2.6"
    },
    {
        "paper": "PaliGemma",
        "question": "What ablation studies were conducted, and what were the findings in PaliGemma paper?",
        "ground_truth": "Ablation studies showed that freezing the vision encoder during pretraining degraded performance on spatial tasks, and increasing image resolution significantly boosted performance on tasks requiring high-resolution input.",
        "source": "Ablation Studies, Section 5.1; Section 5.7"
    },
    {
        "paper": "PaliGemma",
        "question": "What are PaliGemma's main limitations?",
        "ground_truth": "The main limitations include the lack of instruction tuning, which affects zero-shot generalization, and potential biases in the training data derived from publicly available datasets.",
        "source": "Limitations, Section 8"
    },
    {
        "paper": "PaliGemma",
        "question": "How does PaliGemma achieve transferability across different tasks?",
        "ground_truth": "PaliGemma uses a flexible fine-tuning approach and transferability recipe, where a few hyperparameters like resolution, learning rate, and epochs are adapted for different tasks, allowing it to transfer effectively across a wide range of benchmarks.",
        "source": "Transferability, Section 6"
    }
]

df_paligemma = pd.DataFrame(list_of_questions)
df_paligemma

Unnamed: 0,paper,question,ground_truth,source
0,PaliGemma,What is PaliGemma and what problem does it aim...,PaliGemma is an open Vision-Language Model (VL...,Introduction
1,PaliGemma,What are the key components of PaliGemma’s arc...,PaliGemma consists of three components: the Si...,Model Architecture
2,PaliGemma,How does PaliGemma handle different image reso...,PaliGemma undergoes Stage 2 training to increa...,"Training Procedure, Section 3.2.3"
3,PaliGemma,What datasets were used to train and evaluate ...,PaliGemma was trained on a broad mixture of mu...,"Training Procedure, Section 3.2; Results, Sect..."
4,PaliGemma,How does PaliGemma perform compared to larger ...,"PaliGemma, with less than 3B parameters, achie...","Introduction, Section 1; Results, Section 4"
5,PaliGemma,What pretraining stages does PaliGemma undergo...,PaliGemma undergoes three stages: unimodal pre...,"Training Procedure, Section 3.2"
6,PaliGemma,What infrastructure was used to train PaliGemm...,"PaliGemma was trained on Cloud TPUv5e, with St...","Training Infrastructure, Section 3.2.6"
7,PaliGemma,"What ablation studies were conducted, and what...",Ablation studies showed that freezing the visi...,"Ablation Studies, Section 5.1; Section 5.7"
8,PaliGemma,What are PaliGemma's main limitations?,The main limitations include the lack of instr...,"Limitations, Section 8"
9,PaliGemma,How does PaliGemma achieve transferability acr...,PaliGemma uses a flexible fine-tuning approach...,"Transferability, Section 6"


In [10]:
Overall_structure_summary = """
The paper titled "Llama 3: Herd of Models" presents Llama 3, a new set of foundation models designed for language tasks and multimodal AI tasks, developed by Meta's AI team. Llama 3 models natively support multilinguality, reasoning, coding, and tool use, with a flagship model boasting 405B parameters and a context window of up to 128K tokens. The primary goal of this paper is to introduce Llama 3 and evaluate its performance across a variety of benchmarks, showing that it performs comparably with top models like GPT-4.

The paper structure is as follows:
1. **Introduction**: Llama 3's core contribution is its scale and optimization in data, multilinguality, coding, and reasoning tasks. The model outperforms many competitors, demonstrating significant improvements over its predecessors, Llama and Llama 2.
2. **Model Architecture**: Llama 3 adopts a dense Transformer architecture with some modifications such as grouped query attention (GQA) and a larger vocabulary to improve efficiency, multilinguality, and long-context capabilities. The flagship model has 405B parameters and supports 128K tokens in context.
3. **Training Procedure**: The training process for Llama 3 involved large-scale pretraining on 15.6T tokens, with an emphasis on multilingual and diverse data sources. The models were trained on Meta's AI infrastructure using up to 16K H100 GPUs.
4. **Scaling Laws**: Extensive use of scaling laws guided the development and optimization of Llama 3, ensuring a compute-optimal balance between model size and training data.
5. **Results and Benchmarks**: Llama 3 performs competitively across multiple benchmarks like MMLU, GSM8K, HumanEval, and ARC Challenge, achieving state-of-the-art results in several categories. The model also excels in multilingual tasks.
6. **Ablation Studies**: Various experiments were conducted to analyze the effects of different hyperparameters, including the impact of increased sequence length, GQA, and long-context performance.
7. **Infrastructure**: Llama 3's pretraining was performed on Meta’s AI supercluster using 16K GPUs. The compute budget was 3.8x10^25 FLOPs, and the model was trained on 15.6T tokens. Efficient parallelism strategies like 4D parallelism were used to optimize memory and speed.
8. **Limitations and Future Directions**: The paper highlights limitations, such as challenges in scaling beyond 405B parameters, potential biases in pretraining data, and areas for future research including instruction tuning and better multimodal integration.

Key Innovations:
- **Grouped Query Attention (GQA)**: Improved inference speed and memory efficiency.
- **Long-context Handling**: Extended to 128K tokens, enhancing reasoning and document-processing tasks.
- **Multimodal Extensions**: Initial efforts to integrate image, video, and speech capabilities.

Datasets and Infrastructure:
- Llama 3 was trained on 15.6 trillion tokens of text data and required significant computational resources, involving up to 16,000 H100 GPUs for training.

Results:
- Llama 3 achieves state-of-the-art performance in several benchmarks, outperforming its predecessors and other models with comparable parameter sizes. It also demonstrates strong generalization across languages.

Limitations:
- Further tuning is needed for better zero-shot performance and instruction-following capabilities. Moreover, biases in pretraining data could influence model behavior.
"""

list_of_questions: List = [
    {
        "paper": "Llama 3",
        "question": "What is Llama 3 and what is the primary goal of the paper?",
        "ground_truth": "Llama 3 is a herd of language models designed to support multilinguality, reasoning, coding, and tool use. The goal of the paper is to present Llama 3 and demonstrate its performance across benchmarks, showing comparable quality to GPT-4.",
        "source": "Introduction, Section 1"
    },
    {
        "paper": "Llama 3",
        "question": "What are the main components of Llama 3’s model architecture?",
        "ground_truth": "Llama 3 adopts a dense Transformer architecture with 405B parameters. The architecture includes grouped query attention (GQA) for improved inference speed and larger vocabulary for better token efficiency.",
        "source": "Model Architecture, Section 3.2"
    },
    {
        "paper": "Llama 3",
        "question": "How was Llama 3 trained, and what infrastructure was used?",
        "ground_truth": "Llama 3 was trained on a corpus of 15.6T tokens using up to 16K H100 GPUs with a compute budget of 3.8×10^25 FLOPs. Training took place on Meta’s AI production clusters with optimized parallelism strategies.",
        "source": "Training Procedure, Section 3.3"
    },
    {
        "paper": "Llama 3",
        "question": "What datasets were used for pretraining Llama 3, and how was data quality ensured?",
        "ground_truth": "The pretraining dataset for Llama 3 includes 15.6T tokens from a mixture of multilingual, reasoning, mathematical, and code data. Data quality was ensured through aggressive de-duplication and filtering, particularly for web-based content.",
        "source": "Pre-Training Data, Section 3.1"
    },
    {
        "paper": "Llama 3",
        "question": "What performance benchmarks were used to evaluate Llama 3?",
        "ground_truth": "Llama 3 was evaluated on benchmarks such as MMLU, GSM8K, HumanEval, ARC Challenge, and multilingual tasks. The flagship model achieved state-of-the-art performance in several benchmarks.",
        "source": "Results, Section 4"
    },
    {
        "paper": "Llama 3",
        "question": "How does the Llama 3 model handle long-context processing?",
        "ground_truth": "Llama 3 is designed to handle long contexts, supporting up to 128K tokens. This feature is crucial for tasks that require understanding or generating long documents, improving performance on 'needle in a haystack' tasks.",
        "source": "Long Context Pre-Training, Section 3.4.2"
    },
    {
        "paper": "Llama 3",
        "question": "What is Grouped Query Attention (GQA) and why is it significant in Llama 3?",
        "ground_truth": "GQA is an attention mechanism used in Llama 3 with 8 key-value heads, which improves inference speed and reduces memory requirements, making the model more efficient during decoding.",
        "source": "Model Architecture, Section 3.2"
    },
    {
        "paper": "Llama 3",
        "question": "What strategies were used to scale the Llama 3 model efficiently?",
        "ground_truth": "Llama 3 uses 4D parallelism, combining tensor, pipeline, context, and data parallelism. This approach ensures efficient distribution of computation across thousands of GPUs, optimizing memory usage and performance.",
        "source": "Infrastructure, Section 3.3"
    },
    {
        "paper": "Llama 3",
        "question": "What ablation studies were conducted on Llama 3 and what were the findings?",
        "ground_truth": "Ablation studies focused on hyperparameters like sequence length and GQA. Findings showed that longer sequences and the use of GQA significantly improved model performance on tasks requiring reasoning and long-context understanding.",
        "source": "Ablation Studies, Section 5"
    },
    {
        "paper": "Llama 3",
        "question": "What are the limitations of Llama 3, and what future directions are suggested?",
        "ground_truth": "The limitations of Llama 3 include challenges with instruction tuning and potential biases in pretraining data. Future work includes improving multimodal integration and better instruction-following capabilities.",
        "source": "Limitations and Future Directions, Section 6"
    }
]

df_llama3herd = pd.DataFrame(list_of_questions)
df_llama3herd


Unnamed: 0,paper,question,ground_truth,source
0,Llama 3,What is Llama 3 and what is the primary goal o...,Llama 3 is a herd of language models designed ...,"Introduction, Section 1"
1,Llama 3,What are the main components of Llama 3’s mode...,Llama 3 adopts a dense Transformer architectur...,"Model Architecture, Section 3.2"
2,Llama 3,"How was Llama 3 trained, and what infrastructu...",Llama 3 was trained on a corpus of 15.6T token...,"Training Procedure, Section 3.3"
3,Llama 3,What datasets were used for pretraining Llama ...,The pretraining dataset for Llama 3 includes 1...,"Pre-Training Data, Section 3.1"
4,Llama 3,What performance benchmarks were used to evalu...,Llama 3 was evaluated on benchmarks such as MM...,"Results, Section 4"
5,Llama 3,How does the Llama 3 model handle long-context...,"Llama 3 is designed to handle long contexts, s...","Long Context Pre-Training, Section 3.4.2"
6,Llama 3,What is Grouped Query Attention (GQA) and why ...,GQA is an attention mechanism used in Llama 3 ...,"Model Architecture, Section 3.2"
7,Llama 3,What strategies were used to scale the Llama 3...,"Llama 3 uses 4D parallelism, combining tensor,...","Infrastructure, Section 3.3"
8,Llama 3,What ablation studies were conducted on Llama ...,Ablation studies focused on hyperparameters li...,"Ablation Studies, Section 5"
9,Llama 3,"What are the limitations of Llama 3, and what ...",The limitations of Llama 3 include challenges ...,"Limitations and Future Directions, Section 6"


In [11]:

df_questions = pd.concat([
    df_timesfm,
    df_mamba,
    df_lagllama,
    df_colpali,
    df_paligemma,
    df_llama3herd
], axis=0)

df_train_valid, df_test = train_test_split(df_questions, test_size=0.6, random_state= constants.RND_SEED, shuffle=True, stratify=df_questions['paper'])
df_train, df_valid = train_test_split(df_train_valid, test_size=0.5, random_state= constants.RND_SEED, shuffle=True, stratify=df_train_valid['paper'])

df_questions = pd.concat([
    df_train.assign(split_=mle_utils.Splits.TRAIN.value),
    df_valid.assign(split_=mle_utils.Splits.VALID.value),
    df_test.assign(split_=mle_utils.Splits.TEST.value)
], axis=0)


df_questions.head(2)

Unnamed: 0,paper,question,ground_truth,source,split_
9,Mamba: Linear-Time Sequence Modeling with Sele...,What are the main limitations and risks associ...,While Mamba shows strong performance in variou...,Section 5,1.train
4,PaliGemma,How does PaliGemma perform compared to larger ...,"PaliGemma, with less than 3B parameters, achie...","Introduction, Section 1; Results, Section 4",1.train


In [12]:
df_questions.to_csv(path_data_eval_qs / f"{index_name}.csv", index=False)

In [15]:
from langfuse import Langfuse
from dotenv import load_dotenv

load_dotenv()
langfuse = Langfuse()
langfuse.auth_check()

True

In [None]:
dd