# GENPACT: Pubmed Text Classification

## Problem: Classify given set of Pubmed biomedical literature abstracts into 4 classes:
a)	Abstracts containing Drug adverse events  
b)	Abstracts containing Congenital anomalies  
c)	Abstracts containing both (a) and (b)  
d)	Others  

## Dataset: Pubmed 
https://pubmed.ncbi.nlm.nih.gov/


# TASK 1 
## Write a Python program that uses Entrez utilities to download at most 10,000 abstracts (in either XML format or just the plain text abstracts) belonging to each of the four classes 

MeSH (Medical Subject Headings) provides ‘topics’ to every abstract in the Pubmed database. I will use esearch and efetch to download articles belonging to the following topics: 

<li>Class 1: drug side effects (https://www.ncbi.nlm.nih.gov/mesh/68064420)
<li>Class 2: congenital anomalies (https://www.ncbi.nlm.nih.gov/mesh/68000013)
<li>Class 3: both (drug side effects congenital anomalies)
<li>Class 0: others (NOT drug NOT side NOT effects NOT congenital NOT anomalies)

Because PubMed limits the number of articles that can be retrieved per second, I will need to create a local copy of these topics on my computer as XML files. I will then parse these XML files, extract article abstracts, titles and other information, and store them in plain text files.

# DONE
<li>Retrieved 32,082 article IDs and extracted 32,038 abstracts for 4 classes
<table>
<tr><th>class<th>n.of IDs<th>n.of abstracts<th>query
<tr><td>Class 0<td>10000<td>9981<td>NOT drug NOT side NOT effects NOT congenital NOT anomalies
<tr><td>Class 1<td>10000<td>9995<td>drug side effects NOT congenital NOT anomalies
<tr><td>Class 2<td>10000<td>9980<td>congenital anomalies NOT drug NOT side NOT effects 
<tr><td>Class 3<td>2082<td>2082<td>drug side effects congenital anomalies
</table>

# TODO

<li> Play with different queries to increase the total number of retrieved articles with English abstracts, especially for the minority Class 3. 
<li> Extract journal names, year, etc.

# Read IDs from Pubmed

In [2]:
import requests
import xml.etree.ElementTree

import collections
import datetime
import gzip
import os
import sys
import time
import pandas as pd

In [65]:
def requests_query(payload, docmax=10000, retmax=100, sleep=0.34):
    """Page through an NCBI ESearch query and collect the matching IDs.

    Args:
        payload: ESearch query parameters (the notebook passes ``db`` and
            ``term``). The dict is copied, so the caller's object is not
            modified.
        docmax: Maximum number of IDs to return.
        retmax: Number of IDs requested per page.
        sleep: Seconds to pause between successive page requests.

    Returns:
        A list of ID strings, at most ``docmax`` long. The list is empty when
        ESearch reports a count of zero.

    Raises:
        requests.HTTPError: If ESearch answers with a 4xx/5xx status.
        ValueError: If the response contains no ``Count`` element.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

    # Work on a copy so paging state never leaks into the caller's dict.
    params = dict(payload)
    # The output format of ESearch is selected with `retmode`, not `rettype`.
    params['retmode'] = 'xml'
    params['retmax'] = retmax
    params['retstart'] = 0

    ids = list()
    count = 1  # placeholder so the loop runs once; replaced by the real total
    while params['retstart'] < count:
        response = requests.get(url, params=params)
        # Fail loudly instead of trying to parse an HTTP error page.
        response.raise_for_status()

        tree = xml.etree.ElementTree.fromstring(response.text)
        count_text = tree.findtext('Count')
        if count_text is None:
            raise ValueError('ESearch response has no Count element (ERROR={!r})'.format(
                tree.findtext('ERROR')))
        count = int(count_text)
        if count == 0:
            print("No items found, count=0")
            break

        ids += [id_.text for id_ in tree.findall('IdList/Id')]
        params['retstart'] += retmax

        print("ids=", len(ids))
        if len(ids) >= docmax:
            break

        time.sleep(sleep)

    # The last page may overshoot when retmax does not divide docmax.
    return ids[:docmax]

In [66]:
%%time
if True:
    # One ESearch query per class (index in the list == class label).
    queries = ['NOT drug NOT side NOT effects NOT adverse NOT reactions NOT congenital NOT abnormality NOT deformity',
               'drug related side effects adverse reactions NOT congenital NOT abnormality NOT deformity', 
               'congenital abnormality deformity NOT drug NOT side NOT effects NOT adverse NOT reactions', 
               'drug side effects adverse congenital abnormality', 
           ]    
    for iclass in range(4):
        # Make sure the per-class output directory exists before writing to it.
        class_dir = './data/' + str(iclass)
        os.makedirs(class_dir, exist_ok=True)

        # Run the esearch query to collect the PubMed IDs for this class
        pubmed_ids = requests_query({'db': 'pubmed', 'term': queries[iclass]})
        pubmed_ids = sorted(map(int, pubmed_ids))
        print('Class=', iclass)
        print('Total ids=', len(pubmed_ids))
        print('Query=', queries[iclass])

        # Save PubMed IDs to a TSV file (single column `pubmed_id`)
        id_df = pd.DataFrame({'pubmed_id': pubmed_ids})
        id_df.to_csv(class_dir + '/ids.tsv', sep='\t', index=False)

        # Save the query next to the IDs it produced
        with open(class_dir + '/query.txt', 'wt') as write_file:
            write_file.write(queries[iclass])

ids= 100
ids= 200
ids= 300
ids= 400
ids= 500
ids= 600
ids= 700
ids= 800
ids= 900
ids= 1000
ids= 1100
ids= 1200
ids= 1300
ids= 1400
ids= 1500
ids= 1600
ids= 1700
ids= 1800
ids= 1900
ids= 2000
ids= 2100
ids= 2200
ids= 2300
ids= 2400
ids= 2500
ids= 2600
ids= 2700
ids= 2800
ids= 2900
ids= 3000
ids= 3100
ids= 3200
ids= 3300
ids= 3400
ids= 3500
ids= 3600
ids= 3700
ids= 3800
ids= 3900
ids= 4000
ids= 4100
ids= 4200
ids= 4300
ids= 4400
ids= 4500
ids= 4600
ids= 4700
ids= 4800
ids= 4900
ids= 5000
ids= 5100
ids= 5200
ids= 5300
ids= 5400
ids= 5500
ids= 5600
ids= 5700
ids= 5800
ids= 5900
ids= 6000
ids= 6100
ids= 6200
ids= 6300
ids= 6400
ids= 6500
ids= 6600
ids= 6700
ids= 6800
ids= 6900
ids= 7000
ids= 7100
ids= 7200
ids= 7300
ids= 7400
ids= 7500
ids= 7600
ids= 7700
ids= 7800
ids= 7900
ids= 8000
ids= 8100
ids= 8200
ids= 8300
ids= 8400
ids= 8500
ids= 8600
ids= 8700
ids= 8800
ids= 8900
ids= 9000
ids= 9100
ids= 9200
ids= 9300
ids= 9400
ids= 9500
ids= 9600
ids= 9700
ids= 9800
ids= 9900
ids= 10000
Class= 0

# Check ID lists for intersections

In [8]:
# Load every class's ID list exactly once, as a set, so the pairwise
# comparison below does no repeated file reads or set construction.
id_sets = [set(pd.read_csv('./data/' + str(iclass) + '/ids.tsv').pubmed_id)
           for iclass in range(4)]

# Intersection is symmetric, so each unordered pair of classes is checked once.
for iclass1 in range(4):
    for iclass2 in range(iclass1 + 1, 4):
        inter = id_sets[iclass1] & id_sets[iclass2]
        if inter:
            # Report which classes overlap, not just how many IDs they share.
            print('Classes', iclass1, 'and', iclass2, 'share', len(inter), 'ids')

# Read article abstracts as XML

In [67]:
def cleanHtml(sentence):
    """Return ``sentence`` with inline formatting tags removed.

    Only the exact lowercase spellings of the opening and closing
    ``sub``, ``sup``, ``i`` and ``b`` tags are deleted; the text they
    enclose and every other tag are left as they are.
    """
    # Removal order matters for pathological input, so it is kept fixed.
    markers = (
        "<sub>", "</sub>",
        "<sup>", "</sup>",
        "<i>", "</i>",
        "<b>", "</b>",
    )
    cleaned = sentence
    for marker in markers:
        cleaned = cleaned.replace(marker, "")
    return cleaned

In [68]:
def requests_efetch(ids, write_file, retmax=1000, retmin=20, sleep=0.34, error_sleep=10):
    """Fetch PubMed records for ``ids`` with EFetch and write them to ``write_file``.

    The IDs are requested in batches of at most ``retmax``. When a batch
    fails it is put back at the front of the queue: split into two halves if
    it holds at least ``2 * retmin`` IDs, otherwise unchanged. Failed batches
    are retried without an upper bound, pausing longer after each
    consecutive failure.

    Args:
        ids: Sequence of PubMed IDs (must support slicing).
        write_file: Writable text file object that receives the response text
            of each successful batch, one after another.
        retmax: Maximum number of IDs per request.
        retmin: Batches smaller than ``2 * retmin`` are retried whole instead
            of being split.
        sleep: Seconds to pause before every request.
        error_sleep: Base pause in seconds after a failure; multiplied by the
            number of consecutive failures.

    Returns:
        None. The fetched text is written to ``write_file``.
    """
    # Base URL
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

    # Queue of ID batches still to be fetched
    idq = collections.deque(ids[i:i + retmax] for i in range(0, len(ids), retmax))

    # Query until the queue is empty
    successive_errors = 0
    while idq:
        time.sleep(sleep)
        id_subset = idq.popleft()
        id_subset_len = len(id_subset)
        id_string = ','.join(map(str, id_subset))
        payload = {'db': 'pubmed', 'id': id_string, 'retmode': 'xml'}
        try:
            response = requests.get(url, params=payload)
            # Treat HTTP errors as failures so they go through the retry path
            # below instead of being written to the output as if they were data.
            response.raise_for_status()

            resp = cleanHtml(response.text)
            # Turns the text into pure ASCII: non-ASCII characters and
            # backslashes become escape sequences, and escaped newlines are
            # replaced by spaces.
            # NOTE(review): this leaves literal escapes such as "\xe9" in the
            # saved text; kept unchanged because the file encoding is chosen
            # by the caller.
            resp = resp.encode('unicode-escape').decode('utf-8').replace("\\n", " ")
            write_file.write(str(resp))

            successive_errors = 0
        except Exception as e:
            successive_errors += 1
            print('{} successive error: {} IDs [{} ... {}] threw {}'.format(
                    successive_errors, id_subset_len, id_subset[0], id_subset[-1], e))
            if id_subset_len >= retmin * 2:
                mid = id_subset_len // 2
                # Re-queue BOTH halves; the second is pushed first so that the
                # first half ends up at the front and the ID order is kept.
                idq.appendleft(id_subset[mid:])
                idq.appendleft(id_subset[:mid])
            else:
                idq.appendleft(id_subset)
            time.sleep(error_sleep * successive_errors)

In [69]:
%%time
if True:
    for iclass in range(4):    
        # Read the PubMed IDs of this class from its TSV file
        pubmed_ids = list(pd.read_csv('./data/'+str(iclass)+'/ids.tsv').pubmed_id)
        print('Class=', iclass)
        print('Total ids=', len(pubmed_ids))
        # Fetch the records in chunks of `istep` IDs, one XML file per chunk
        istep = 100
        for i in range(0,len(pubmed_ids),istep):
            if i % 100 == 0:
                print('id=', i)
            with open('./data/'+str(iclass)+'/'+str(i).zfill(5)+'.xml', 'w') as write_file:                
                # sleep=0.34 spaces requests out to stay under the PubMed rate
                # limit; slicing already clamps at the end of the list.
                requests_efetch(pubmed_ids[i:i+istep], write_file, retmax=istep,
                                retmin=max(istep//2, 1), sleep=0.34, error_sleep=1)

Class= 0
Total ids= 10000
id= 0
id= 100
id= 200
id= 300
id= 400
id= 500
id= 600
id= 700
id= 800
id= 900
id= 1000
id= 1100
id= 1200
id= 1300
id= 1400
id= 1500
id= 1600
id= 1700
id= 1800
id= 1900
id= 2000
id= 2100
id= 2200
id= 2300
id= 2400
id= 2500
id= 2600
id= 2700
id= 2800
id= 2900
id= 3000
id= 3100
id= 3200
id= 3300
id= 3400
id= 3500
id= 3600
id= 3700
id= 3800
id= 3900
id= 4000
id= 4100
id= 4200
id= 4300
id= 4400
id= 4500
id= 4600
id= 4700
id= 4800
id= 4900
id= 5000
id= 5100
id= 5200
id= 5300
id= 5400
id= 5500
id= 5600
id= 5700
id= 5800
id= 5900
id= 6000
id= 6100
id= 6200
id= 6300
id= 6400
id= 6500
id= 6600
id= 6700
id= 6800
id= 6900
id= 7000
id= 7100
id= 7200
id= 7300
id= 7400
id= 7500
id= 7600
id= 7700
id= 7800
id= 7900
id= 8000
id= 8100
id= 8200
id= 8300
id= 8400
id= 8500
id= 8600
id= 8700
id= 8800
id= 8900
id= 9000
id= 9100
id= 9200
id= 9300
id= 9400
id= 9500
id= 9600
id= 9700
id= 9800
id= 9900
Class= 1
Total ids= 10000
id= 0
id= 100
id= 200
id= 300
id= 400
id= 500
id= 600
id= 70

# Parse XML files, extract article titles and abstracts

In [71]:
import xml.etree.ElementTree as ET

for iclass in range(4):    
    # Read the PubMed IDs of this class from its TSV file
    pubmed_ids = list(pd.read_csv('./data/'+str(iclass)+'/ids.tsv').pubmed_id)
    print('Class=', iclass)
    print('Total ids=', len(pubmed_ids))
    
    # One XML file was saved per chunk of `istep` IDs; walk them in order.
    istep = 100
    for i in range(0,len(pubmed_ids),istep):
        if i % 100 == 0:
            print('id=', i)

        tree = ET.parse('./data/'+str(iclass)+'/'+str(i).zfill(5)+'.xml')
        root = tree.getroot()
        
        # One output line per record: title followed by all abstract sections.
        with open('./data/'+str(iclass)+'/a'+str(i).zfill(5)+'.txt', 'w') as a_file:                
            for ichild, child in enumerate(root):
                # Reset for every record so an unrecognised tag can never
                # reuse the article found for the previous record.
                article = None
                if child.tag == "PubmedArticle":
                    article = child.find('MedlineCitation/Article')
                elif child.tag == "PubmedBookArticle":
                    article = child.find('BookDocument')
                if article is None:
                    print("None "+child.tag+" article for i="+str(i+ichild))    
                    continue
                
                title = article.find('ArticleTitle')
                if title is None:
                    print("None title for i=",i+ichild)
                else:
                    # itertext() also collects text that follows nested tags;
                    # `.text` alone stops at the first child element.
                    ttext = "".join(title.itertext())
                    if not ttext:
                        print("No title for i=",i+ichild)
                    else:
                        a_file.write(ttext+" ")

                for abstract in article.findall('Abstract/AbstractText'):
                    atext = "".join(abstract.itertext())
                    if not atext:
                        print("No abstract for i=",i+ichild)
                    else:
                        a_file.write(atext+" ")    
                a_file.write("\n")    

Class= 0
Total ids= 10000
id= 0
No abstract for i= 36
No abstract for i= 76
id= 100
id= 200
id= 300
id= 400
No abstract for i= 409
No abstract for i= 455
id= 500
id= 600
No abstract for i= 637
No abstract for i= 637
No abstract for i= 637
No abstract for i= 637
No abstract for i= 637
id= 700
id= 800
id= 900
No abstract for i= 963
No abstract for i= 964
id= 1000
No title for i= 1073
id= 1100
No title for i= 1188
No abstract for i= 1192
id= 1200
None title for i= 1245
id= 1300
id= 1400
id= 1500
id= 1600
id= 1700
id= 1800
id= 1900
id= 2000
id= 2100
id= 2200
id= 2300
No abstract for i= 2342
id= 2400
id= 2500
No abstract for i= 2538
No abstract for i= 2539
id= 2600
No abstract for i= 2686
No abstract for i= 2687
No abstract for i= 2688
id= 2700
id= 2800
id= 2900
id= 3000
id= 3100
id= 3200
id= 3300
id= 3400
id= 3500
id= 3600
No abstract for i= 3663
id= 3700
id= 3800
id= 3900
id= 4000
id= 4100
id= 4200
No title for i= 4225
No title for i= 4226
id= 4300
id= 4400
id= 4500
id= 4600
id= 4700
No a