In [5]:
import os
import re
import requests
from bs4 import BeautifulSoup

# Configure the search
AUTHOR_NAME = "Lei Liang"  # Replace with the author's name
SAVE_DIR = "papers"  # Directory to save the downloaded PDFs

# Base URL that search_papers() sends its GET request to.
# NOTE(review): this is the Papers with Code site root, while the query
# parameters built in search_papers() and the ".gs_r.gs_or" selector used in
# extract_paper_links() look like Google Scholar conventions -- possibly why
# the recorded run reports no PDF links; confirm the intended source.
BASE_URL = "https://paperswithcode.com/"
# Browser-like User-Agent header passed to every requests.get() call in this cell.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Ensure save directory exists (no error if it is already there)
os.makedirs(SAVE_DIR, exist_ok=True)

def search_papers(author_name, timeout=30):
    """Fetch the search-results page for an author and return its HTML.

    Sends a GET request to ``BASE_URL`` with ``HEADERS`` and the query
    parameters ``q='author:"<author_name>"'`` and ``hl="en"``.

    Args:
        author_name: Name inserted into the ``author:"..."`` query.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without one a stalled
            connection would block the notebook cell indefinitely.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    params = {"q": f'author:"{author_name}"', "hl": "en"}
    response = requests.get(BASE_URL, headers=HEADERS, params=params, timeout=timeout)
    response.raise_for_status()
    return response.text

def extract_paper_links(html):
    """Collect PDF links found inside search-result containers.

    Parses ``html`` and, for every element matching the CSS selector
    ``.gs_r.gs_or``, gathers the ``href`` of each anchor whose target ends
    with ``.pdf``.

    Args:
        html: Markup of a search-results page.

    Returns:
        A list of PDF URLs in document order, without duplicates. Empty when
        no result container or no PDF anchor is present.
    """
    soup = BeautifulSoup(html, "html.parser")
    links = []
    seen = set()
    for result in soup.select(".gs_r.gs_or"):
        # Look at every anchor in the result: checking only the first one
        # misses the PDF whenever another link (e.g. the title) comes first.
        for anchor in result.find_all("a", href=True):
            href = anchor["href"]
            # Case-insensitive so ".PDF" targets are also recognised.
            if href.lower().endswith(".pdf") and href not in seen:
                seen.add(href)
                links.append(href)
    return links

def download_paper(url, save_dir, timeout=30):
    """Download the file at ``url`` into ``save_dir``.

    This is best-effort: any failure is reported with a printed message
    instead of being raised, so one bad link does not stop a batch.

    Args:
        url: Address of the PDF to fetch.
        save_dir: Existing directory the file is written into.
        timeout: Seconds to wait for the server before giving up; requests
            applies no timeout by default.

    Returns:
        None. Progress and errors are printed.
    """
    # stdlib import kept local so this function does not depend on edits to
    # the notebook's import cell.
    from urllib.parse import unquote, urlsplit

    try:
        # Derive the name from the URL *path* only, so a query string or
        # fragment ("paper.pdf?download=1") does not end up in the filename.
        basename = unquote(urlsplit(url).path).rstrip("/").split("/")[-1]
        # Clean the filename; fall back to a fixed name when the URL has no
        # usable last path segment (e.g. it ends with "/").
        filename = re.sub(r'[^\w\-_\. ]', '_', basename) or "download.pdf"
        filepath = os.path.join(save_dir, filename)
        # Using the response as a context manager releases the streamed
        # connection even when writing fails part-way through.
        with requests.get(url, headers=HEADERS, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(filepath, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # skip empty keep-alive chunks
                        file.write(chunk)
        print(f"Downloaded: {filename}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")

def main():
    """Search for AUTHOR_NAME's papers and download every PDF link found.

    Prints progress messages; when the results page yields no PDF links it
    prints a hint and stops without downloading anything.
    """
    print(f"Searching for papers by {AUTHOR_NAME}...")
    pdf_urls = extract_paper_links(search_papers(AUTHOR_NAME))

    if pdf_urls:
        print(f"Found {len(pdf_urls)} papers. Downloading...")
        for pdf_url in pdf_urls:
            download_paper(pdf_url, SAVE_DIR)

        print("Download complete.")
    else:
        print("No PDF links found. Consider using a different source or API.")

# if __name__ == "__main__":
#     main()


In [6]:
# Run the search-and-download pipeline (the main() defined in the cell above).
main()

Searching for papers by Lei Liang...
No PDF links found. Consider using a different source or API.


In [9]:
import re
import PyPDF2
from cleantext import clean

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of all pages, joined with a newline between consecutive
        pages so the last word of one page does not fuse with the first word
        of the next. Pages that yield no text contribute an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open.
        # ``or ""`` is a defensive guard for pages with no extractable text.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

def clean_text(text):
    """Normalise extracted text into a single whitespace-collapsed line.

    Every character that is not a word character, whitespace, or one of
    ``. , ! ?`` is replaced by a space; runs of whitespace (including
    newlines and tabs) are then collapsed to one space and the result is
    stripped at both ends.
    """
    disallowed_chars = re.compile(r"[^\w\s.,!?]")
    whitespace_run = re.compile(r'\s+')

    # Pass 1: blank out special characters (hyphens, bullets, brackets, ...).
    without_specials = disallowed_chars.sub(" ", text)
    # Pass 2: squeeze newlines, tabs and repeated spaces into single spaces.
    single_spaced = whitespace_run.sub(' ', without_specials)

    # NOTE: an optional extra pass through cleantext's clean() (lowercasing,
    # extra-space removal, stemming) was tried here and is left disabled.
    return single_spaced.strip()

def main(
    pdf_path="/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/notebooks/papers/s11248-015-9867-7.pdf",
    output_path="cleaned_text.txt",
):
    """Extract, clean and save the text of one PDF, printing short previews.

    NOTE(review): this redefines the ``main`` from the paper-download cell;
    once this cell has run, ``main()`` refers to this PDF demo.

    Args:
        pdf_path: PDF to process. The default is an absolute path on the
            author's machine; pass your own path when running elsewhere.
        output_path: File the cleaned text is written to (UTF-8).

    Returns:
        None. Previews are printed and the cleaned text is written to
        ``output_path``.
    """
    print(f"Extracting text from {pdf_path}...")
    raw_text = extract_text_from_pdf(pdf_path)

    print("\nRaw Extracted Text:")
    print(raw_text[:500])  # Print first 500 characters for preview

    cleaned_text = clean_text(raw_text)

    print("\nCleaned Text:")
    print(cleaned_text[:500])  # Print first 500 characters for preview

    # Save the cleaned text; report success only after the file is closed.
    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(cleaned_text)
    print(f"\nCleaned text saved to '{output_path}'.")


In [10]:
# Run the PDF extraction/cleaning demo (the main() defined in the cell above).
main()

Extracting text from /home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/notebooks/papers/s11248-015-9867-7.pdf...

Raw Extracted Text:
REVIEW
Genetic basis and detection of unintended effects
in genetically modiﬁed crop plants
Gregory S. Ladics •Andrew Bartholomaeus •Phil Bregitzer •Nancy G. Doerrer •
Alan Gray •Thomas Holzhauser •Mark Jordan •Paul Keese •Esther Kok •Phil Macdonald •
Wayne Parrott •Laura Privalle •Alan Raybould •Seung Yon Rhee •Elena Rice •
Jo¨rg Romeis •Justin Vaughn •Jean-Michel Wal •Kevin Glenn
Received: 18 January 2015 / Accepted: 14 February 2015 / Published online: 26 February 2015
/C211The Author(s) 2015

Cleaned Text:
REVIEW Genetic basis and detection of unintended effects in genetically modiﬁed crop plants Gregory S. Ladics Andrew Bartholomaeus Phil Bregitzer Nancy G. Doerrer Alan Gray Thomas Holzhauser Mark Jordan Paul Keese Esther Kok Phil Macdonald Wayne Parrott Laura Privalle Alan Raybould Seung Yon Rhee Elena Rice Jo rg Romeis Justin Vaughn J

In [12]:
import os

In [22]:
ROOT_DIR = "/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/data/input/"
username = "olawale_ibrahim"
# Directory holding this user's input files. os.path.join copes with ROOT_DIR
# having or lacking a trailing slash; the final "" keeps the trailing
# separator, so the value matches the original string concatenation.
filepaths = os.path.join(ROOT_DIR, username, "")
os.listdir(filepaths)

['2310.11511v1.pdf', '2312.10997v5.pdf', '2005.11401v4.pdf']

In [21]:
# Side-by-side check of the computed directory against the expected path.
# NOTE(review): the recorded output shows an extra "digital_research_assistant/"
# segment in filepaths that does not match the ROOT_DIR assigned in the
# In [22] cell -- this cell appears to have run earlier (In [21]) with an
# older ROOT_DIR; re-run it after that cell to refresh the output.
filepaths, "/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/data/input/olawale_ibrahim"

('/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/digital_research_assistant/data/input/olawale_ibrahim/',
 '/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/data/input/olawale_ibrahim')

In [23]:
!pip install python-docx

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [24]:
from docx import Document

def extract_text_from_word(doc_path):
    """Return the text of a Word document, one paragraph per line.

    Only the document's paragraphs are read; leading and trailing whitespace
    of the combined text is stripped.
    """
    paragraphs = Document(doc_path).paragraphs
    # Newlines separate paragraphs; strip() trims the ends of the result.
    return "\n".join(paragraph.text for paragraph in paragraphs).strip()

# Example usage
# NOTE(review): doc_path is an absolute path on the author's machine; point it
# at your own .docx file before running this cell.
doc_path = "/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/data/input/olawale_ibrahim/Olawale Ibrahim Bio.docx"  # Replace with your Word document's path
extracted_text = extract_text_from_word(doc_path)
# Prints the full extracted text of the document.
print(extracted_text)

Olawale holds a B.Tech. in Applied Geophysics (first class honors) from the Federal University of Technology, Nigeria. He is a seasoned geoscience software developer with over four years of experience working with oil and gas companies across the world. He specializes in helping geoscience teams adopt latest digital tools like AI and machine learning in improving and automating traditional workflows to better save time, cost and manpower labor, which consequently improves efficiency and generates better results. 
In his latest role as a machine learning engineer at CGG (UK), He leads current R&D efforts and advancements by spearheading research, developing and putting analytical and digital tools in the hands of experienced geoscientists to make quicker and insightful interpretations. At Earth Science Analytics (Norway), he developed machine learning models for both seismic processing, interpretation and petrophysical applications. As a machine learning consultant with dGB Earth Scienc