Import 100 articles abstract from pubmed

In [3]:
import json
import csv
import pandas as pd
from Bio import Entrez
from Bio import Medline

In [5]:
# Define your search query
query = "clinical research in ophthalmology"
Entrez.email = "pa7wel@gmail.com"

# Function to search PubMed
def search_pubmed(query, retmax=100):
    handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    record = Entrez.read(handle)
    return record["IdList"]

# Function to fetch article metadata and abstract
def fetch_article_data(pubmed_ids):
    article_data = []
    handle = Entrez.efetch(db="pubmed", id=','.join(pubmed_ids), rettype="medline", retmode="text")
    records = Medline.parse(handle)
    for record in records:
        article_data.append(record)
    return article_data

# Search PubMed and fetch article metadata and abstracts
pubmed_ids = search_pubmed(query)
article_data = fetch_article_data(pubmed_ids)

# # Save the data in CSV format
with open("pubmed_abstracts.csv", "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Title", "Journal", "Publication Date", "PubMed ID", "Abstract"])
    for article in article_data:
        writer.writerow([
            article.get("TI", "N/A"),
            article.get("JT", "N/A"),
            article.get("DP", "N/A"),
            article.get("PMID", "N/A"),
            article.get("AB", "N/A"),
        ])

print("CSV file created: pubmed_abstracts.csv")
print(article_data)

CSV file created: pubmed_abstracts.csv
[{'PMID': '37031410', 'OWN': 'NLM', 'STAT': 'Publisher', 'LR': '20230409', 'IS': '1110-4902 (Print)', 'VI': '30', 'IP': '2', 'DP': '2023 Apr', 'TI': 'Serum metabolomic profiles and semaphorin-3A as biomarkers of diabetic retinopathy progression.', 'PG': '83-98', 'AB': 'Diabetic retinopathy (DR) is a typical microvascular complication of diabetes mellitus (DM) and it remains one of the leading causes of vision loss worldwide. Studies postulated that a distinct metabolic signature of DR exists and can be resolved from that of diabetes alone. Serum Semaphorin3A (Sema3A) levels have also been found to be correlated with the phenotypes of diabetic retinopathy. We aimed to analyze and identify serum metabolites and serum Sema3A levels that could be useful biomarkers of DR progression. This cross-sectional study included 45 type 2 diabetes (T2D) patients. Diabetic patients were divided into three groups based on the status of their complications: non-DR 

In [8]:
import csv
import random
from sklearn.model_selection import train_test_split

# Load the data from CSV file into a list
data = []
with open("pubmed_abstracts.csv", "r", newline='', encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip the header row
    for row in reader:
        data.append(row)

# Divide the data into training and test sets
random.seed(0)  # Set a seed for reproducibility
train_data, test_data = train_test_split(data, test_size=0.2)

# Save the training and test sets to separate CSV files
with open("train_data.csv", "w", newline='', encoding="utf-8") as train_file:
    writer = csv.writer(train_file)
    writer.writerow(["Title", "Journal", "Publication Date", "PubMed ID", "Abstract"])
    for article in train_data:
        writer.writerow(article)

print(train_data)

[['Serum metabolomic profiles and semaphorin-3A as biomarkers of diabetic retinopathy progression.', 'The Egyptian journal of immunology', '2023 Apr', '37031410', 'Diabetic retinopathy (DR) is a typical microvascular complication of diabetes mellitus (DM) and it remains one of the leading causes of vision loss worldwide. Studies postulated that a distinct metabolic signature of DR exists and can be resolved from that of diabetes alone. Serum Semaphorin3A (Sema3A) levels have also been found to be correlated with the phenotypes of diabetic retinopathy. We aimed to analyze and identify serum metabolites and serum Sema3A levels that could be useful biomarkers of DR progression. This cross-sectional study included 45 type 2 diabetes (T2D) patients. Diabetic patients were divided into three groups based on the status of their complications: non-DR (NDR, n=15), non-proliferative DR (NPDR, n=15), and proliferative DR (PDR, n=15) groups. Serum metabolomic profiles of these patients were determ

In [11]:
import csv
import spacy

# Load the training data from CSV file into a list
train_data = []
with open("train_data.csv", "r", newline='', encoding="utf-8") as train_file:
    reader = csv.reader(train_file)
    next(reader)  # Skip the header row
    for row in reader:
        train_data.append(row[4])  # Store only the abstract text

# Load the default English model
nlp = spacy.load("en_core_web_sm")

# Train the named entity recognizer
ner = nlp.create_pipe("ner")
labels = ["CLINICAL_TRIAL", "ORGANIZATION", "DURATION", "TOPIC"]
for label in labels:
    ner.add_label(label)
nlp.add_pipe(ner, last=True)

# Update the model with the training data
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(10):
        random.shuffle(train_data)
        losses = {}
        for abstract in train_data:
            doc = nlp(abstract)
            gold = [(ent.text, ent.label_) for ent in doc.ents]
            gold = [(text, label) for text, label in gold if label in labels]
            nlp.update([abstract], [gold], sgd=optimizer, losses=losses)
        print(f"Loss after iteration {i}: {losses}")

# Save the trained model to disk
nlp.to_disk("entity_recognition_model")




ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <spacy.pipeline.ner.EntityRecognizer object at 0x2840699a0> (name: 'None').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.