### Task 1: Write a Python program to download at most 10,000 abstracts (in either XML format or just the plain text abstracts) using Entrez utilities belonging to each of the four classes given at the top of the page.

In [2]:
pip install entrezpy --user

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install biopython

Note: you may need to restart the kernel to use updated packages.


#### Using entrezpy 

In [6]:
import entrezpy.conduit
import entrezpy.base.result
import entrezpy.base.analyzer

In [12]:
c = entrezpy.conduit.Conduit('adistar121999@gmail.com')
fetch_pubmed = c.new_pipeline()
fetch_pubmed.add_fetch({'db':'pubmed', 'id':['D064420'], 'retmode':'xml'})
c.run(fetch_pubmed)

<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">
<PubmedArticleSet>
<PubmedArticle>
    <MedlineCitation Status="MEDLINE" Owner="NLM">
        <PMID Version="1">64420</PMID>
        <DateCompleted>
            <Year>1977</Year>
            <Month>03</Month>
            <Day>31</Day>
        </DateCompleted>
        <DateRevised>
            <Year>2013</Year>
            <Month>11</Month>
            <Day>21</Day>
        </DateRevised>
        <Article PubModel="Print">
            <Journal>
                <ISSN IssnType="Print">0323-4347</ISSN>
                <JournalIssue CitedMedium="Print">
                    <Volume>103</Volume>
                    <Issue>6</Issue>
                    <PubDate>
                        <Year>1976</Year>
                    </PubDate>
                </JournalIssue>
                <Title>Folia haematologica (Leipzig, Germany : 1

<entrezpy.efetch.efetch_analyzer.EfetchAnalyzer at 0x22593c97af0>

#### Using Bio

In [102]:
from Bio import Entrez
import pandas as pd
import numpy as np
import os

In [119]:
def search(query):
    Entrez.email = 'adistar121999@gmail.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='1000',
                            retmode='xml',
                            term=query)
    results = Entrez.read(handle)
    return results

In [120]:
def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'adistar121999@gmail.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

In [61]:
results = search('Drug-Related Side Effects and Adverse Reactions')
id_list = results['IdList']
papers = fetch_details(id_list)

In [59]:
import json
print(json.dumps(papers['PubmedArticle'][0], indent=2))

{
  "MedlineCitation": {
    "SpaceFlightMission": [],
    "KeywordList": [
      [
        "adverse drug reaction",
        "allergy",
        "chondroitin",
        "glucosamine",
        "osteoarthritis"
      ]
    ],
    "GeneralNote": [],
    "OtherAbstract": [],
    "CitationSubset": [
      "IM"
    ],
    "OtherID": [],
    "PMID": "31597786",
    "DateCompleted": {
      "Year": "2021",
      "Month": "01",
      "Day": "18"
    },
    "DateRevised": {
      "Year": "2021",
      "Month": "01",
      "Day": "18"
    },
    "Article": {
      "Language": [
        "eng"
      ],
      "ELocationID": [
        "10.1136/postgradmedj-2019-136957"
      ],
      "ArticleDate": [
        {
          "Year": "2019",
          "Month": "10",
          "Day": "09"
        }
      ],
      "Journal": {
        "ISSN": "1469-0756",
        "JournalIssue": {
          "Volume": "96",
          "Issue": "1134",
          "PubDate": {
            "Year": "2020",
            "Month": "Apr"


In [125]:
df_fin = pd.DataFrame(columns=['pmid','title','key words','abstract'])
for paper in papers['PubmedArticle'][:20]:
    klist,abstract='',''
    pmid = paper['MedlineCitation']['PMID']
    title = paper['MedlineCitation']['Article']['ArticleTitle']
    if paper['MedlineCitation']['KeywordList']:
        for k in paper['MedlineCitation']['KeywordList'][0]:
            klist=klist+k+' '
        klist = klist.split(' ')[:-1]
    else:
        klist = []
    for t in paper['MedlineCitation']['Article']['Abstract']['AbstractText']:
        abstract+=t
    df = pd.DataFrame({'pmid':pmid,'title':title,'key words':[klist],'abstract':abstract})
    df_fin = df_fin.append(df,ignore_index=True)
df_fin['Label'] = 'adverse drug'
df_fin.head()

Unnamed: 0,pmid,title,key words,abstract,Label
0,31597786,Hypersensitive adverse drug reactions to gluco...,"[adverse, drug, reaction, allergy, chondroitin...",This study investigates spontaneous adverse dr...,adverse drug
1,31056713,Impact of protocol change on individual factor...,"[Adverse, reactions, Breast, cancer, Chemother...","Asthenia, myalgia, arthralgia, mucositis, abdo...",adverse drug
2,31771857,Adverse drug reactions.,"[Adverse, drug, reaction, Drug, safety, Efecto...",An adverse drug reaction (ADR) is defined as a...,adverse drug
3,32249738,Adverse Drug Reactions in Canada (2009-2018): ...,[],"Annually, thousands of individuals die and ten...",adverse drug
4,30833488,Adverse Drug Reactions in an Oncological Popul...,"[Adverse, drug, reactions, Clinical, oncology,...",Our goal was to determine (a) the prevalence o...,adverse drug


In [135]:
def create_df(query,label):
    
    results = search(query)
    id_list = results['IdList']
    papers = fetch_details(id_list)
    
    df_fin = pd.DataFrame(columns=['pmid','title','key words','abstract'])
    
    
    for paper in papers['PubmedArticle']:
        if 'Abstract' in paper['MedlineCitation']['Article'].keys():
            klist,abstract='',''
        
            pmid = paper['MedlineCitation']['PMID']
        
            title = paper['MedlineCitation']['Article']['ArticleTitle']
        
            if paper['MedlineCitation']['KeywordList']:
                for k in paper['MedlineCitation']['KeywordList'][0]:
                    klist=klist+k+' '
                klist = klist.split(' ')[:-1]
            else:
                klist = []
    
            for t in paper['MedlineCitation']['Article']['Abstract']['AbstractText']:
                abstract+=t
    
            df = pd.DataFrame({'pmid':pmid,'title':title,'key words':[klist],'abstract':abstract})
        
            df_fin = df_fin.append(df,ignore_index=True)
    
    df_fin['Label'] = label
    return df_fin

In [137]:
DAE = ['Drug Related Side Effects and Adverse Reactions',
'Side Effects of Drugs',
'Drug Side Effects',
'Drug Side Effect',
'Effects, Drug Side',
'Side Effect, Drug',
'Side Effects, Drug',
'Adverse Drug Reaction',
'Adverse Drug Reactions',
'Drug Reaction, Adverse',
'Drug Reactions, Adverse',
'Reactions, Adverse Drug',
'Adverse Drug Event',
'Adverse Drug Events',
'Drug Event, Adverse',
'Drug Events, Adverse',
'Drug Toxicity',
'Toxicity, Drug',
'Drug Toxicities',
'Toxicities, Drug']

CA = ['Abnormality, Congenital',
'Congenital Abnormality',
'Deformities',
'Deformity',
'Congenital Defects',
'Congenital Defect',
'Defect, Congenital',
'Defects, Congenital',
'Abnormalities, Congenital',
'Birth Defects',
'Birth Defect',
'Defect, Birth',
'Fetal Malformations',
'Fetal Malformation',
'Malformation, Fetal',
'Fetal Anomalies',
'Anomaly, Fetal',
'Fetal Anomaly'] 

In [152]:
def label_search(lst,label):
    df_lst = [] 
    for q in lst:
        df_lst.append(create_df(q,label))
    df = pd.concat(df_lst,ignore_index=True)
    return df

In [141]:
df_DAE = label_search(DAE,'Drug adverse events')

In [142]:
df_CA = label_search(CA,'Congenital anomalies')

In [145]:
df_DAE.drop_duplicates(subset='pmid',keep='first',inplace=True)
df_CA.drop_duplicates(subset='pmid',keep='first',inplace=True)

In [158]:
OTHER = ['latest','virus','disease','anomalies','deficiency','hair']
df_OTH = label_search(OTHER,'Others')

In [160]:
df_OTH.drop_duplicates(subset='pmid',keep='first',inplace=True)

In [161]:
df

Unnamed: 0,pmid,title,key words,abstract,Label
0,32808142,Latest Generation High-Definition Colonoscopy ...,"[Adenomatous, polyps, Colonic, polyps, Colonos...",Adenoma detection rate (ADR) is an important q...,Others
1,33434275,Evaluation of automated cephalometric analysis...,"[Artificial, intelligence, Deep, learning, Mac...",To compare an automated cephalometric analysis...,Others
2,32852869,Reproducibility and repeatability of identifyi...,"[cardiac, resynchronization, therapy, coronary...",Studies have shown an association between the ...,Others
3,33337279,Early Outcomes of Isolated Aortic Valve Replac...,"[Trifecta, valve, aortic, valve, replacement, ...",The aim of this study is to evaluate early and...,Others
4,32110605,"Knowledge, attitude, and practice of general d...","[Dental, dental, technology, knowledge, practice]",Choosing latest technology for the treatment i...,Others
...,...,...,...,...,...
5674,27106540,Hair Trace Elements are Associated with Increa...,"[Boron, Goiter, Iodine, Schoolchildren, Silico...",The objective of the study was analysis of hai...,Others
5675,27560098,Perception of Hair Transplant for Androgenetic...,[],Hair transplant is among the most common cosme...,Others
5676,28600869,Follow-up on the characterization of peptidic ...,[],Species identification of hair is routinely do...,Others
5677,26884274,Sources of variation in hair cortisol in wild ...,"[Chronic, stress, Cortisol, Hair, cortisol, an...",Hair cortisol analysis is a potentially powerf...,Others


In [163]:
CA_DE = ['Congenital anomalies caused by drug usage',
        'Drug-Related Side Effects and Adverse Reactions/congenital[Mesh]',
        'Congenital Abnormalities/adverse effects[Mesh] OR Congenital Abnormalities/drug effects[Mesh]',
        '(Congenital Abnormalities) AND (Drug effects)',
        'drug reaction congenital',
        'Congenital Abnormalities by drug effects']
df_CADE = label_search(CA_DE,'congenital anomalies caused by drug usage')

In [164]:
df_CADE.drop_duplicates(subset='pmid',keep='first',inplace=True)

In [172]:
df_merge = pd.merge(df_DAE,df_CA,left_on=['pmid','title','abstract'],right_on=['pmid','title','abstract'],how='inner')
df_merge.drop(['key words_x','Label_x'],axis=1,inplace=True)
df_merge.rename(columns={"key words_y": "key words", "Label_y": "Label"},inplace=True)

In [176]:
df_merge

Unnamed: 0,pmid,title,abstract,key words,Label
0,26017034,P-Glycoprotein-Mediated Drug Interactions in P...,Drug use in pregnancy is very common but may c...,[],Congenital anomalies


In [177]:
df_CADE[df_CADE['pmid']=='26017034']

Unnamed: 0,pmid,title,key words,abstract,Label
177,26017034,P-Glycoprotein-Mediated Drug Interactions in P...,[],Drug use in pregnancy is very common but may c...,congenital anomalies caused by drug usage


In [185]:
df_CA = df_CA[~(df_CA['pmid']=='26017034')]
df_DAE = df_DAE[~(df_DAE['pmid']=='26017034')]
df_OTH = df_OTH[~(df_OTH['pmid']=='26017034')]

#### Cheking redundancy in data

In [196]:
df_fin = pd.concat([df_CADE,df_CA,df_DAE,df_OTH]).drop_duplicates(subset=['pmid'],keep='first')

In [208]:
len(set(df_fin['pmid']))==len(set(df_fin['pmid']))

True

In [211]:
df_fin = df_fin.set_index('pmid')

In [200]:
print(df_CADE.shape,df_CA.shape,df_DAE.shape,df_OTH.shape)

(1794, 5) (4831, 5) (3474, 5) (5675, 5)


In [214]:
df_DAE.to_csv('DAE.csv',index=False)
df_CA.to_csv('CA.csv',index=False)
df_CADE.to_csv('CADE.csv',index=False)
df_OTH.to_csv('OTH.csv',index=False)

In [216]:
df_fin.to_csv('data.csv')