In [11]:

mining_qa_prompts = {
  "title": "Q&A Prompts: Discovering Rich Visual Clues through Mining Question-Answer Prompts for VQA Requiring Diverse World Knowledge",
  "authors": ["Haibo Wang", "Weifeng Ge", "..."],
  "affiliation": "School of Computer Science, Fudan University",
  "focus": "Enhancing AI models for Visual Question Answering using Q&A Prompts method",
  "methodology": ["Training a visual question generation model", "Generating question-answer prompts", "Reasoning with these prompts"],
  "key_findings": "Improvement in AI's ability to answer complex visual questions with diverse world knowledge",
  "publication_date": "2024-01-19"
}

clinical_document_qa = {
  "title": "Dynamic Q&A of Clinical Documents with Large Language Models",
  "authors": ["Ran Elgedawy", "Sudarshan Srinivasan", "Ioana Danciu", "..."],
  "affiliation": "University of Tennessee Knoxville, Oak Ridge National Laboratory",
  "focus": "Development of a conversational interface using LLMs for querying clinical notes",
  "key_technologies": ["Large Language Models", "Semantic Embedding Models"],
  "applications": "Efficient querying and retrieval in EHRs",
  "challenges_future_work": ["Model optimization", "Domain-specific fine-tuning", "Evaluation challenges"],
  "publication_date": "2024-01-19"
}

medusa = {
  "title": "MEDUSA: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads",
  "authors": ["Tianle Cai", "Yuhong Li", "Zhengyang Geng", "..."],
  "affiliations": ["Princeton University", "University of Illinois Urbana-Champaign", "Carnegie Mellon University", "..."],
  "focus": "Accelerating LLM inference with MEDUSA method",
  "key_concepts": ["Multiple decoding heads", "Tree-based attention mechanism", "Fine-tuning strategies"],
  "results": "Significant speedup in LLM inference with minimal quality compromise",
  "publication_date": "2024-01-19"
}

antisemitic_detection = {
  "title": "Detection of Emerging Coded Antisemitic Terminology in Online Posts",
  "authors": ["Dhanush Kikkisetti", "Raza Ul Mustafa", "Wendy Melillo", "..."],
  "affiliation": "American University",
  "focus": "Detecting coded antisemitic terminology in social media posts",
  "methodology": ["Extraction of trending terms", "Comparison with known terms", "Fine-tuning BERT model"],
  "key_findings": "Identification of new antisemitic terms and development of methodologies for their detection",
  "publication_date": "2024-01-19"
}

jailbreak_resistance = {
  "title": "Pruning for Protection: Increasing Jailbreak Resistance in Aligned LLMs Without Fine-Tuning",
  "authors": ["Adib Hasan", "Ileana Rugina", "Alex Wang"],
  "affiliation": "Massachusetts Institute of Technology (MIT)",
  "focus": "Examining effects of pruning on LLM safety against jailbreaking prompts",
  "methodology": "Comparative analysis using a new dataset of malicious tasks",
  "key_findings": "Pruning can increase resistance to jailbreaking prompts",
  "publication_date": "2024-01-19"
}

rl_for_qa = {
  "title": "Reinforcement Learning for Question Answering in Programming Domain Using Public Community Scoring as Human Feedback",
  "authors": ["Alexey Gorbatovski", "Sergey Kovalchuk"],
  "affiliations": ["ITMO University", "Huawei"],
  "focus": "Enhancing GPT Neo 125M's performance in Community QA through RLHF",
  "methodology": ["Use of RLHF with community scoring", "Fine-tuning with Proximal Policy Optimization"],
  "key_insights": "Limitations of traditional metrics for programming QA and need for domain-specific evaluation methods",
  "publication_date": "2024-01-19",
}



In [14]:
import os
from typing import Dict
from langchain.document_loaders import PyPDFLoader
import pandas as pd

def load_pdfs(dir_path: str) -> Dict[str, str]:
    file_contents = {}
    files = [f for f in os.listdir(dir_path) if f.endswith('.pdf')]
    for file in files:
        loader = PyPDFLoader(os.path.join(dir_path, file))
        pages = loader.load_and_split()
        full_text = ' '.join([page.page_content for page in pages])  # Extracting page_content from each Document object
        file_contents[file] = full_text
    return file_contents

dir_path = "arxiv_pdfs"

pdf_contents = load_pdfs(dir_path)

structured_data = {
    "mining_qa_prompt.pdf": mining_qa_prompts,
    "clinical_document_qa.pdf": clinical_document_qa,
    "medusa.pdf": medusa,
    "antisemitic_detection.pdf": antisemitic_detection,
    "jailbreak_resistance.pdf": jailbreak_resistance,
    "rl_for_qa.pdf": rl_for_qa
}

df_list = []
for file_name, raw_content in pdf_contents.items():
    structured_info = structured_data.get(file_name, {})
    df_list.append({'raw': raw_content, 'structured': structured_info})

combined_df = pd.DataFrame(df_list)
combined_df.to_csv('arxiv_pdfs_data', index=False)

print(combined_df.head())


                                                 raw  \
0  Dynamic Q&A of Clinical Documents with Large\n...   
1  Pruning for Protection: Increasing Jailbreak\n...   
2  MEDUSA : Simple LLM Inference Acceleration\nFr...   
3  Using LLMs to discover emerging coded antisemi...   
4  Reinforcement learning for question answering ...   

                                          structured  
0  {'title': 'Dynamic Q&A of Clinical Documents w...  
1  {'title': 'Pruning for Protection: Increasing ...  
2  {'title': 'MEDUSA: Simple LLM Inference Accele...  
3  {'title': 'Detection of Emerging Coded Antisem...  
4  {'title': 'Reinforcement Learning for Question...  


In [13]:
print(structured_data)

{'2401.10712.pdf': {'title': 'Q&A Prompts: Discovering Rich Visual Clues through Mining Question-Answer Prompts for VQA Requiring Diverse World Knowledge', 'authors': ['Haibo Wang', 'Weifeng Ge', '...'], 'affiliation': 'School of Computer Science, Fudan University', 'focus': 'Enhancing AI models for Visual Question Answering using Q&A Prompts method', 'methodology': ['Training a visual question generation model', 'Generating question-answer prompts', 'Reasoning with these prompts'], 'key_findings': "Improvement in AI's ability to answer complex visual questions with diverse world knowledge", 'publication_date': '2024-01-19'}, '2401.10733.pdf': {'title': 'Dynamic Q&A of Clinical Documents with Large Language Models', 'authors': ['Ran Elgedawy', 'Sudarshan Srinivasan', 'Ioana Danciu', '...'], 'affiliation': 'University of Tennessee Knoxville, Oak Ridge National Laboratory', 'focus': 'Development of a conversational interface using LLMs for querying clinical notes', 'key_technologies': ['