# Introduction

In [6]:
import re

def _find_bibtex_entries(content):
    """Return every brace-balanced ``@type{...}`` span found in ``content``."""
    header_re = re.compile(r'@\w+\{')
    entries = []
    pos = 0
    while True:
        header = header_re.search(content, pos)
        if header is None:
            break

        # Walk forward from the opening brace until it is closed again.
        # Counting depth (instead of looking for a "}" at the start of a line)
        # also finds entries closed on the same line as their last field or
        # closed by an indented brace.
        depth = 1
        idx = header.end()
        while idx < len(content) and depth:
            char = content[idx]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            idx += 1

        if depth:
            # The braces never balanced, so this is not a complete entry.
            # Resume right after this header so later entries are still found.
            pos = header.end()
            continue

        entries.append(content[header.start():idx])
        pos = idx
    return entries


def extract_bibtex_entries(input_path='raw.txt', output_path='input.bib'):
    """
    Extract all BibTeX entries from a text file and save them to a .bib file.

    An entry is any ``@type{`` header followed by text up to the brace that
    closes it. Text outside entries is discarded. Entries are written
    stripped of surrounding whitespace and separated by one blank line, with
    no trailing newline after the last entry.

    Args:
        input_path: Text file to scan (UTF-8). Defaults to 'raw.txt'.
        output_path: File to write (UTF-8); overwritten if it exists.
            Defaults to 'input.bib'.

    Returns:
        None. Progress and problems are reported with print(); read/write
        errors are caught and reported rather than raised. Nothing is
        written when the input cannot be read or contains no entries.
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except FileNotFoundError:
        print(f"Error: File '{input_path}' not found.")
        return None
    except (OSError, UnicodeError) as e:
        print(f"Error processing file: {str(e)}")
        return None

    bibtex_entries = _find_bibtex_entries(content)

    if not bibtex_entries:
        print("No BibTeX entries found in the file.")
        return None

    # Handled separately from the read so that a failure to create the output
    # is not misreported as the input file being missing.
    try:
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write('\n\n'.join(entry.strip() for entry in bibtex_entries))
    except (OSError, UnicodeError) as e:
        print(f"Error processing file: {str(e)}")
        return None

    print(f"Successfully extracted {len(bibtex_entries)} BibTeX entries to {output_path}")

# Run the extraction for its side effects: with no arguments it reads
# 'raw.txt' and writes 'input.bib', both resolved against the current
# working directory.
extract_bibtex_entries()

Successfully extracted 6 BibTeX entries to input.bib


# Literature review

In [2]:
import os
# Make the reference folder the working directory so the relative file names
# used below ('input.bib', 'cleaned_output.bib') resolve inside it.
# NOTE(review): absolute, machine-specific path; this cell fails on any
# machine without this folder. Consider a configurable base directory.
os.chdir('E:/0 Main Codes/Refs/')

def clean_bibtex_file(input_path, output_path, skip_fields=('url', 'source')):
    """
    Copy only the @ARTICLE entries of a BibTeX file, dropping selected fields.

    The file is processed line by line. A line whose stripped text starts
    with '@' opens a new entry, and the entry is kept only when that line
    starts with '@article' (case-insensitive). Every following line up to the
    next '@' line belongs to that entry. Inside a kept entry, a line is
    dropped when the text before its first '=' is one of ``skip_fields``.

    Args:
        input_path: BibTeX file to read (UTF-8).
        output_path: File to write (UTF-8); overwritten if it exists.
        skip_fields: Iterable of field names to remove. Names are compared
            case-insensitively and ignoring surrounding whitespace, so
            ``url = {...}``, ``URL={...}`` and ``Url ={...}`` all match 'url'.

    Returns:
        None. The result is written to ``output_path`` and a confirmation
        message is printed.

    Note:
        Only the line that starts a skipped field is removed. A skipped field
        whose value continues on further lines leaves those lines behind.
    """
    unwanted = {name.strip().lower() for name in skip_fields}

    with open(input_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    cleaned_lines = []
    is_article = False

    for line in lines:
        stripped = line.strip()

        # Detect start of a new BibTeX entry
        if stripped.startswith('@'):
            is_article = stripped.lower().startswith('@article')

        if not is_article:
            continue

        # Compare the field *name* rather than searching the whole line, so
        # that "url =" appearing inside another field's value (or inside a
        # longer name such as "biburl") does not remove the line.
        field_name, equals, _ = stripped.partition('=')
        if equals and field_name.strip().lower() in unwanted:
            continue

        cleaned_lines.append(line)

    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

    print(f"Cleaned BibTeX file with only @ARTICLE entries saved to: {output_path}")

# Example usage: read 'input.bib', keep only its @ARTICLE entries without
# their url/source field lines, and write the result to 'cleaned_output.bib'.
# Both paths are relative to the current working directory.
clean_bibtex_file('input.bib', 'cleaned_output.bib')


Cleaned BibTeX file with only @ARTICLE entries saved to: cleaned_output.bib
