In [1]:
# Install required libraries
!pip install scholarly pandas tqdm requests beautifulsoup4

# Import libraries
import pandas as pd
import time
import random
from tqdm import tqdm
from scholarly import scholarly
import requests
from bs4 import BeautifulSoup
import os

# Create a directory to store the results
os.makedirs('extracted_papers', exist_ok=True)

Collecting scholarly
  Downloading scholarly-1.7.11-py3-none-any.whl (39 kB)
Collecting fake-useragent
  Downloading fake_useragent-2.0.3-py3-none-any.whl (201 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.1/201.1 KB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting sphinx-rtd-theme
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Collecting free-proxy
  Downloading free_proxy-1.1.3.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting bibtexparser
  Downloading bibtexparser-1.4.3.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting selenium


## Helper functions

In [21]:
import pandas as pd
import numpy as np
import os
import requests
import json
import gzip
import re
import tarfile
from tqdm.notebook import tqdm
import random
import time
from io import BytesIO

def setup_directories():
    """Create necessary directories"""
    os.makedirs('nlp_papers', exist_ok=True)
    os.makedirs('nlp_papers/academic', exist_ok=True)
    os.makedirs('nlp_papers/industry', exist_ok=True)

def download_arxiv_metadata_sample():
    """
    Download a sample of the arXiv metadata instead of the full file
    """
    print("Downloading arXiv metadata sample...")
    
    # URL for the Kaggle dataset (smaller subset)
    # Alternative: use the arXiv API instead
    metadata_url = "https://arxiv.org/help/api/basics"
    
    print(f"Please visit {metadata_url} to learn about the arXiv API usage.")
    print("We'll use the arXiv API to download metadata for specific queries.")
    
    return True

def query_arxiv_api(query, start=0, max_results=100):
    """
    Query the arXiv API for papers
    
    Parameters:
    - query: Search query string
    - start: Starting index
    - max_results: Maximum number of results to return
    
    Returns:
    - List of paper metadata
    """
    base_url = "http://export.arxiv.org/api/query?"
    
    # Build the query URL
    query_url = f"{base_url}search_query={query}&start={start}&max_results={max_results}"
    
    try:
        response = requests.get(query_url)
        
        if response.status_code == 200:
            # Parse the XML response
            import xml.etree.ElementTree as ET
            root = ET.fromstring(response.content)
            
            # Extract papers
            papers = []
            ns = {'atom': 'http://www.w3.org/2005/Atom', 
                  'arxiv': 'http://arxiv.org/schemas/atom'}
            
            for entry in root.findall('.//atom:entry', ns):
                # Extract basic paper information
                paper = {}
                
                # ID (convert from URL to arXiv ID)
                id_elem = entry.find('./atom:id', ns)
                if id_elem is not None:
                    paper['id'] = id_elem.text.split('/')[-1]
                
                # Title
                title_elem = entry.find('./atom:title', ns)
                if title_elem is not None:
                    paper['title'] = title_elem.text
                
                # Abstract
                summary_elem = entry.find('./atom:summary', ns)
                if summary_elem is not None:
                    paper['abstract'] = summary_elem.text
                
                # Authors
                authors = []
                for author_elem in entry.findall('./atom:author', ns):
                    name_elem = author_elem.find('./atom:name', ns)
                    if name_elem is not None:
                        authors.append(name_elem.text)
                paper['authors'] = ', '.join(authors)
                
                # Categories
                categories = []
                for cat_elem in entry.findall('./arxiv:primary_category', ns):
                    if 'term' in cat_elem.attrib:
                        categories.append(cat_elem.attrib['term'])
                for cat_elem in entry.findall('./atom:category', ns):
                    if 'term' in cat_elem.attrib:
                        categories.append(cat_elem.attrib['term'])
                paper['categories'] = ', '.join(categories)
                
                # Publication date
                published_elem = entry.find('./atom:published', ns)
                if published_elem is not None:
                    paper['published'] = published_elem.text[:10]  # YYYY-MM-DD
                
                # Links
                for link_elem in entry.findall('./atom:link', ns):
                    if link_elem.attrib.get('title') == 'pdf':
                        paper['pdf_url'] = link_elem.attrib.get('href')
                    elif link_elem.attrib.get('rel') == 'alternate':
                        paper['url'] = link_elem.attrib.get('href')
                
                papers.append(paper)
            
            return papers
        else:
            print(f"Error: API returned status code {response.status_code}")
            return []
            
    except Exception as e:
        print(f"Error querying arXiv API: {e}")
        return []

def classify_paper_affiliation(paper):
    """
    Classify a paper as academic or industry based on metadata
    with improved author affiliation extraction
    
    Parameters:
    - paper: Paper metadata
    
    Returns:
    - 'academic', 'industry', or None
    """
    # Companies to identify industry papers - expanded list
    companies = [
        'google', 'microsoft', 'amazon', 'facebook', 'meta', 'apple', 'ibm', 
        'deepmind', 'openai', 'anthropic', 'nvidia', 'hugging face', 'tencent',
        'baidu', 'salesforce', 'adobe', 'twitter', 'linkedin', 'samsung',
        'uber', 'netflix', 'intel', 'oracle', 'sap', 'bloomberg', 'spotify',
        'airbnb', 'dropbox', 'ebay', 'snap', 'paypal', 'stripe'
    ]
    
    # Academic institutions keywords
    academic_keywords = [
        'university', 'college', 'institute of technology', 'polytechnic',
        'school of', 'department of', 'faculty of', 'academy of sciences',
        'university of', 'ecole', 'universit', 'academic', 'laboratory',
        'lab,', 'lab.', 'laboratory,', 'laboratory.'
    ]
    
    # Additional heuristics
    email_academic_domains = ['.edu', '.ac.', '.uni-', '.univ']
    
    # Try to determine if it's industry or academic
    is_industry = False
    is_academic = False
    
    # Check authors field for affiliations
    authors = paper.get('authors', '')
    
    # Look for affiliations in different formats
    # Common format: "Author Name (Organization)"
    affiliation_pattern = re.compile(r'\((.*?)\)')
    affiliations = affiliation_pattern.findall(authors)
    
    # Common format: "Author Name, Organization"
    if not affiliations and ',' in authors:
        # Try to extract the part after the first comma
        first_author_parts = authors.split(',', 1)
        if len(first_author_parts) > 1:
            affiliations.append(first_author_parts[1].strip())
    
    # Process found affiliations
    for affiliation in affiliations:
        affiliation_lower = affiliation.lower()
        
        # Check for industry affiliations
        for company in companies:
            if company in affiliation_lower:
                is_industry = True
                break
        
        # Check for academic affiliations
        for keyword in academic_keywords:
            if keyword in affiliation_lower:
                is_academic = True
                break
    
    # If no clear affiliation found in authors field, check abstract
    if not is_industry and not is_academic:
        abstract = paper.get('abstract', '').lower()
        
        # Look for author affiliations sometimes mentioned in abstract
        for company in companies:
            if company in abstract:
                # Check if it appears as an affiliation mention
                if any(marker in abstract for marker in 
                       [f"at {company}", f"from {company}", f"{company} research", 
                        f"{company},", f"{company} inc", f"{company} corporation"]):
                    is_industry = True
                    break
        
        for keyword in academic_keywords:
            if keyword in abstract:
                # Check if it appears as an affiliation mention
                if any(marker in abstract for marker in 
                       [f"at {keyword}", f"from {keyword}", f"{keyword} research",
                        f"{keyword},", f"the {keyword}"]):
                    is_academic = True
                    break
    
    # Check for email domains if available (sometimes in authors field)
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
    emails = email_pattern.findall(authors)
    
    if emails:
        primary_email = emails[0].lower()
        
        # Check academic email domains
        if any(domain in primary_email for domain in email_academic_domains):
            is_academic = True
        
        # Check industry email domains
        if any(company in primary_email for company in companies):
            is_industry = True
    
    # Check paper categories for additional clues
    categories = paper.get('categories', '').lower()
    if 'cs.cl' in categories and not is_industry:
        # Computational Linguistics papers without industry affiliation 
        # are more likely to be academic
        is_academic = True
    
    # Make a decision
    if is_industry and is_academic:
        # Check which is stronger - prioritize by order in author list
        if authors:
            first_half = authors[:len(authors)//2].lower()
            for company in companies:
                if company in first_half:
                    return 'industry'
            
            for keyword in academic_keywords:
                if keyword in first_half:
                    return 'academic'
        
        # Default prioritization for mixed affiliations
        return 'industry'  # Industry papers are rarer, so prioritize them
    elif is_industry:
        return 'industry'
    elif is_academic:
        return 'academic'
    else:
        # If still unclear, we need additional heuristics
        
        # Check if title contains common industry terms
        title = paper.get('title', '').lower()
        industry_title_terms = ['product', 'application', 'platform', 'customer', 'business']
        academic_title_terms = ['theoretical', 'framework', 'formal', 'model of', 'approach to']
        
        if any(term in title for term in industry_title_terms):
            return 'industry'
        
        if any(term in title for term in academic_title_terms):
            return 'academic'
        
        # Default to academic as most papers on arXiv are academic
        return 'academic'

def download_pdf(paper_id, pdf_url, output_dir):
    """
    Download the PDF for a paper
    
    Parameters:
    - paper_id: ID of the paper
    - pdf_url: URL to download the PDF
    - output_dir: Directory to save the PDF
    
    Returns:
    - True if successful, False otherwise
    """
    # Check if we already have this PDF
    pdf_path = os.path.join(output_dir, f"{paper_id}.pdf")
    if os.path.exists(pdf_path):
        return True
    
    try:
        # Add a delay to be polite to the server
        time.sleep(1)
        
        # Download the PDF
        response = requests.get(pdf_url, timeout=30)
        
        if response.status_code == 200:
            with open(pdf_path, 'wb') as f:
                f.write(response.content)
            return True
        else:
            print(f"Failed to download PDF for {paper_id}, status: {response.status_code}")
            return False
    except Exception as e:
        print(f"Error downloading PDF for {paper_id}: {e}")
        return False

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file
    
    Parameters:
    - pdf_path: Path to the PDF file
    
    Returns:
    - Extracted text
    """
    try:
        import PyPDF2
        
        with open(pdf_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            text = ""
            
            # Extract text from each page
            for i in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[i]
                text += page.extract_text() + "\n\n"
            
            return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

def extract_nlp_papers(num_academic=300, num_industry=300, max_iterations=10):
    """
    Extract NLP papers from arXiv using the API
    
    Parameters:
    - num_academic: Number of academic papers to extract
    - num_industry: Number of industry papers to extract
    - max_iterations: Maximum number of API calls
    
    Returns:
    - DataFrames for academic and industry papers
    """
    setup_directories()
    
    # NLP-related search queries
    queries = [
        "cat:cs.CL",  # Computational Linguistics
        "\"natural language processing\"",
        "\"NLP\" AND \"deep learning\"",
        "\"language model\" AND \"transformer\"",
        "\"BERT\" OR \"GPT\" AND \"language\""
    ]
    
    academic_papers = []
    industry_papers = []
    
    processed_ids = set()  # Track papers we've already processed
    
    # Try each query until we have enough papers
    for query in queries:
        if len(academic_papers) >= num_academic and len(industry_papers) >= num_industry:
            break
            
        print(f"Querying arXiv API with: {query}")
        
        # Make multiple API calls with different start indices
        for iteration in range(max_iterations):
            start_index = iteration * 100
            
            # Check if we have enough papers
            if len(academic_papers) >= num_academic and len(industry_papers) >= num_industry:
                break
                
            # Query the API
            results = query_arxiv_api(query, start=start_index, max_results=100)
            
            if not results:
                print("No more results or API error. Moving to next query.")
                break
                
            print(f"Retrieved {len(results)} papers from API.")
            
            # Process the results
            for paper in results:
                # Skip if we've already processed this paper
                if paper['id'] in processed_ids:
                    continue
                    
                processed_ids.add(paper['id'])
                
                # Classify the paper
                paper_type = classify_paper_affiliation(paper)
                paper['type'] = paper_type
                
                # Add to appropriate list if we still need papers of this type
                if paper_type == 'academic' and len(academic_papers) < num_academic:
                    academic_papers.append(paper)
                    print(f"Added academic paper: {paper['title'][:50]}... ({len(academic_papers)}/{num_academic})")
                    
                elif paper_type == 'industry' and len(industry_papers) < num_industry:
                    industry_papers.append(paper)
                    print(f"Added industry paper: {paper['title'][:50]}... ({len(industry_papers)}/{num_industry})")
            
            # Pause between API calls to be respectful
            time.sleep(3)
    
    # Create DataFrames
    academic_df = pd.DataFrame(academic_papers)
    industry_df = pd.DataFrame(industry_papers)
    
    # Save the metadata
    academic_df.to_csv('nlp_papers/academic_papers_metadata.csv', index=False)
    industry_df.to_csv('nlp_papers/industry_papers_metadata.csv', index=False)
    
    print(f"Collected {len(academic_df)} academic papers and {len(industry_df)} industry papers.")
    
    return academic_df, industry_df

def download_paper_pdfs(papers_df, output_dir, limit=None):
    """
    Download PDFs for papers in the DataFrame
    
    Parameters:
    - papers_df: DataFrame containing paper metadata
    - output_dir: Directory to save the PDFs
    - limit: Maximum number of papers to download (None for all)
    
    Returns:
    - List of successfully downloaded paper IDs
    """
    if limit:
        papers_to_download = papers_df.head(limit)
    else:
        papers_to_download = papers_df
    
    successful_downloads = []
    
    for _, paper in tqdm(papers_to_download.iterrows(), total=len(papers_to_download), desc="Downloading PDFs"):
        paper_id = paper['id']
        pdf_url = paper.get('pdf_url')
        
        if not pdf_url:
            # Construct PDF URL if not available
            pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        
        # Download the PDF
        if download_pdf(paper_id, pdf_url, output_dir):
            successful_downloads.append(paper_id)
    
    print(f"Successfully downloaded {len(successful_downloads)} PDFs to {output_dir}")
    return successful_downloads

def extract_text_from_pdfs(paper_ids, input_dir):
    """
    Extract text from downloaded PDFs
    
    Parameters:
    - paper_ids: List of paper IDs
    - input_dir: Directory containing the PDFs
    
    Returns:
    - Dictionary mapping paper IDs to extracted text
    """
    # Install PyPDF2 if not already installed
    try:
        import PyPDF2
    except ImportError:
        !pip install PyPDF2
        import PyPDF2
    
    text_contents = {}
    
    for paper_id in tqdm(paper_ids, desc="Extracting text from PDFs"):
        pdf_path = os.path.join(input_dir, f"{paper_id}.pdf")
        
        if os.path.exists(pdf_path):
            text = extract_text_from_pdf(pdf_path)
            text_contents[paper_id] = text
    
    print(f"Extracted text from {len(text_contents)} PDFs")
    return text_contents

def create_final_datasets(academic_df, industry_df, academic_texts, industry_texts):
    """
    Create final datasets with text content
    
    Parameters:
    - academic_df: DataFrame with academic paper metadata
    - industry_df: DataFrame with industry paper metadata
    - academic_texts: Dictionary of academic paper texts
    - industry_texts: Dictionary of industry paper texts
    
    Returns:
    - DataFrames for academic and industry papers with content
    """
    # Add content to dataframes
    academic_with_content = academic_df.copy()
    academic_with_content['content'] = academic_with_content['id'].apply(
        lambda x: academic_texts.get(x, "")
    )
    
    industry_with_content = industry_df.copy()
    industry_with_content['content'] = industry_with_content['id'].apply(
        lambda x: industry_texts.get(x, "")
    )
    
    # Filter out papers without content
    academic_with_content = academic_with_content[academic_with_content['content'].str.len() > 100]
    industry_with_content = industry_with_content[industry_with_content['content'].str.len() > 100]
    
    # Clean text fields
    def clean_text(text):
        if not isinstance(text, str):
            return ""
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove non-printable characters
        text = ''.join(c for c in text if c.isprintable() or c in ['\n', '\t'])
        
        return text.strip()
    
    academic_with_content['title'] = academic_with_content['title'].apply(clean_text)
    academic_with_content['abstract'] = academic_with_content['abstract'].apply(clean_text)
    academic_with_content['content'] = academic_with_content['content'].apply(clean_text)
    
    industry_with_content['title'] = industry_with_content['title'].apply(clean_text)
    industry_with_content['abstract'] = industry_with_content['abstract'].apply(clean_text)
    industry_with_content['content'] = industry_with_content['content'].apply(clean_text)
    
    # Save final datasets
    academic_with_content.to_csv('nlp_papers/academic_papers_with_content.csv', index=False)
    industry_with_content.to_csv('nlp_papers/industry_papers_with_content.csv', index=False)
    
    # Create balanced dataset
    min_size = min(len(academic_with_content), len(industry_with_content))
    
    if min_size < 300:
        print(f"Warning: Only {min_size} papers available for balanced dataset")
    
    academic_sample = academic_with_content.sample(min(min_size, 300), random_state=42)
    industry_sample = industry_with_content.sample(min(min_size, 300), random_state=42)
    
    balanced_df = pd.concat([academic_sample, industry_sample])
    balanced_df.to_csv('nlp_papers/balanced_nlp_papers.csv', index=False)
    
    return academic_with_content, industry_with_content, balanced_df

# Main execution function
def extract_nlp_papers_main():
    """Main function to extract NLP papers"""
    # Step 1: Set up extraction
    setup_directories()
    
    # Step 2: Extract metadata
    academic_df, industry_df = extract_nlp_papers(num_academic=300, num_industry=300)
    
    # Step 3: Download PDFs
    academic_downloads = download_paper_pdfs(academic_df, 'nlp_papers/academic')
    industry_downloads = download_paper_pdfs(industry_df, 'nlp_papers/industry')
    
    # Step 4: Extract text from PDFs
    academic_texts = extract_text_from_pdfs(academic_downloads, 'nlp_papers/academic')
    industry_texts = extract_text_from_pdfs(industry_downloads, 'nlp_papers/industry')
    
    # Step 5: Create final datasets
    academic_final, industry_final, balanced_final = create_final_datasets(
        academic_df, industry_df, academic_texts, industry_texts
    )
    
    print(f"Final academic papers: {len(academic_final)}")
    print(f"Final industry papers: {len(industry_final)}")
    print(f"Final balanced dataset: {len(balanced_final)}")
    
    return academic_final, industry_final, balanced_final

# Run the extraction process
academic_papers, industry_papers, balanced_papers = extract_nlp_papers_main()

Querying arXiv API with: cat:cs.CL
Retrieved 100 papers from API.
Added academic paper: Linear Segmentation and Segment Significance... (1/300)
Added academic paper: Modelling Users, Intentions, and Structure in Spok... (2/300)
Added academic paper: A Lexicalized Tree Adjoining Grammar for English... (3/300)
Added academic paper: Prefix Probabilities from Stochastic Tree Adjoinin... (4/300)
Added academic paper: Conditions on Consistency of Probabilistic Tree Ad... (5/300)
Added academic paper: Separating Dependency from Constituency in a Tree ... (6/300)
Added academic paper: Incremental Parser Generation for Tree Adjoining G... (7/300)
Added academic paper: A Freely Available Morphological Analyzer, Disambi... (8/300)
Added academic paper: Processing Unknown Words in HPSG... (9/300)
Added academic paper: Computing Declarative Prosodic Morphology... (10/300)
Added academic paper: On the Evaluation and Comparison of Taggers: The E... (11/300)
Added academic paper: Improving Tagging Per

Downloading PDFs:   0%|          | 0/300 [00:00<?, ?it/s]

Failed to download PDF for 9908001v1, status: 500
Failed to download PDF for 0002007v1, status: 500
Failed to download PDF for 0102022v2, status: 500
Failed to download PDF for 0104010v1, status: 500
Successfully downloaded 296 PDFs to nlp_papers/academic


Downloading PDFs:   0%|          | 0/90 [00:00<?, ?it/s]

Successfully downloaded 90 PDFs to nlp_papers/industry


Extracting text from PDFs:   0%|          | 0/296 [00:00<?, ?it/s]

Extracted text from 296 PDFs


Extracting text from PDFs:   0%|          | 0/90 [00:00<?, ?it/s]

Extracted text from 90 PDFs
Final academic papers: 296
Final industry papers: 90
Final balanced dataset: 180


In [11]:
academic_papers['abstract']

0      We present a new method for discovering a segm...
1      We outline how utterances in dialogs can be in...
2      This document describes a sizable grammar of E...
3      Language models for speech recognition typical...
4      Much of the power of probabilistic methods in ...
                             ...                        
295    The problem of measuring sentence similarity i...
296    Natural Language Processing offers new insight...
297    How can a text corpus stored in a customer rel...
298    In this study, a natural language processing-b...
299    Federated Learning aims to learn machine learn...
Name: abstract, Length: 296, dtype: object

In [22]:
industry_papers

Unnamed: 0,id,title,abstract,authors,categories,published,url,pdf_url,type,content
0,2304.14670v2,Prompt Engineering for Healthcare: Methodologi...,Prompt engineering is a critical technique in ...,"Jiaqi Wang, Enze Shi, Sigang Yu, Zihao Wu, Cho...","cs.AI, cs.AI",2023-04-28,http://arxiv.org/abs/2304.14670v2,http://arxiv.org/pdf/2304.14670v2,industry,"JOURNAL OF L ATEX CLASS FILES, VOL. 14, NO. 8,..."
1,2305.02029v1,Natural language processing on customer note data,Automatic analysis of customer data for busine...,"Andrew Hilditch, David Webb, Jozef Baca, Tom A...","cs.CL, cs.CL",2023-05-03,http://arxiv.org/abs/2305.02029v1,http://arxiv.org/pdf/2305.02029v1,industry,Natural language processing on customer note d...
2,2310.17644v1,torchdistill Meets Hugging Face Libraries for ...,Reproducibility in scientific work has been be...,Yoshitomo Matsubara,"cs.CL, cs.CL, cs.CV, cs.LG",2023-10-26,http://arxiv.org/abs/2310.17644v1,http://arxiv.org/pdf/2310.17644v1,industry,torchdistill Meets Hugging Face Libraries for ...
3,2408.00716v1,A Natural Language Processing Framework for Ho...,"Recently, the application of Artificial Intell...","Lavrentia Aravani, Emmanuel Pintelas, Christos...","cs.LG, cs.LG",2024-08-01,http://arxiv.org/abs/2408.00716v1,http://arxiv.org/pdf/2408.00716v1,industry,A Natural Language Processing Framework for Ho...
4,2110.07609v1,Application of Sequence Embedding in Protein S...,"In sequence-based predictions, conventionally ...","Nabil Ibtehaz, Daisuke Kihara","q-bio.QM, q-bio.QM",2021-10-14,http://arxiv.org/abs/2110.07609v1,http://arxiv.org/pdf/2110.07609v1,industry,1 Application of Sequence Embedding in Protein...
...,...,...,...,...,...,...,...,...,...,...
85,2407.17900v5,The Power of Combining Data and Knowledge: GPT...,Lymph node metastasis (LNM) is a crucial facto...,"Danqing Hu, Bing Liu, Xiaofeng Zhu, Nan Wu","cs.CL, cs.CL, cs.LG",2024-07-25,http://arxiv.org/abs/2407.17900v5,http://arxiv.org/pdf/2407.17900v5,industry,THEPOWER OF COMBINING DATA AND KNOWLEDGE : GPT...
86,2404.06904v1,Vision-Language Model-based Physical Reasoning...,There is a growing interest in applying large ...,"Wenqiang Lai, Yuan Gao, Tin Lun Lam","cs.RO, cs.RO",2024-04-10,http://arxiv.org/abs/2404.06904v1,http://arxiv.org/pdf/2404.06904v1,industry,This work has been submitted to the IEEE for p...
87,2306.05122v1,Can AI Moderate Online Communities?,The task of cultivating healthy communication ...,"Henrik Axelsen, Johannes Rude Jensen, Sebastia...","cs.CY, cs.CY",2023-06-08,http://arxiv.org/abs/2306.05122v1,http://arxiv.org/pdf/2306.05122v1,industry,1 Can AI Moderat e Online Communities ? Henrik...
88,2409.11408v1,Optimizing Performance: How Compact Models Mat...,"In this paper, we demonstrate that non-generat...","Baptiste Lefort, Eric Benhamou, Jean-Jacques O...","cs.CL, cs.CL, q-fin.ST",2024-08-22,http://arxiv.org/abs/2409.11408v1,http://arxiv.org/pdf/2409.11408v1,industry,Optimizing Performance: How Compact Models Mat...
