<a href="https://colab.research.google.com/github/mirsadra/casereport/blob/main/CaseSeriesAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [4]:
from Bio import Entrez
import pandas as pd
import numpy as np
from google.colab import files

# Section 1: Data Fetch (PubMed)

In [11]:
def search_multiple_terms(query):
    """Search PubMed for case reports matching ``query``.

    Parameters
    ----------
    query : str
        PubMed search expression. It is wrapped in parentheses before the
        ``Case Reports[PT]`` filter is appended, so the filter constrains the
        whole expression regardless of the operators ``query`` contains.

    Returns
    -------
    The record parsed by ``Entrez.read`` from the ESearch XML response; the
    matching PMIDs are available under its ``'IdList'`` key.
    """
    import warnings

    Entrez.email = 'miirsadra@gmail.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='publication date',
                            # ESearch documents 10,000 as the largest retmax
                            # it honours, so ask for exactly that.
                            retmax=10000,
                            retmode='xml',
                            term=f'({query}) AND Case Reports[PT]')
    try:
        results = Entrez.read(handle)
    finally:
        # Release the HTTP response even when parsing fails.
        handle.close()

    # Make truncation visible instead of silently analysing a partial set.
    total_hits = int(results.get('Count', 0))
    returned_ids = len(results.get('IdList', []))
    if total_hits > returned_ids:
        warnings.warn(
            f"PubMed reports {total_hits} matches but only {returned_ids} IDs "
            "were returned; the result set is truncated."
        )
    return results

# First search expression: ectopic kidney AND ectopic ureter AND crossed/fused wording.
query_original = (
    '((renal ectopia) OR (ectopic kidney))'
    ' AND ((ectopic ureter) OR (ureteral ectopia))'
    ' AND ((crossed) OR (fused) OR (cross) OR (fuse))'
)

# New query:
query_new = (
    '(crossed fused ectopic kidney OR crossed-fused renal ectopia OR crossed renal ectopia'
    ' OR "crossed renal ectopia calculi" OR "crossed renal ectopia stones"'
    ' OR "crossed renal ectopia carcinoma" OR "crossed renal ectopia cancer")'
)

# Combining both queries:
combined_query = f'{query_original} OR {query_new}'

# Run the search once and keep the list of matching PubMed IDs.
studies = search_multiple_terms(combined_query)
studiesIdList = studies['IdList']

def fetch_details(id_list):
    """Fetch the PubMed records for a batch of IDs.

    Parameters
    ----------
    id_list : iterable
        PubMed IDs to retrieve. Each item is converted with ``str`` before
        being joined, so integer IDs are accepted as well as strings.

    Returns
    -------
    The record parsed by ``Entrez.read`` from the EFetch XML response.
    """
    ids = ','.join(str(pmid) for pmid in id_list)
    Entrez.email = 'miirsadra@gmail.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the HTTP response even when parsing fails.
        handle.close()
    return results

# Building dataframe
# One list per output column. Every article appends exactly one value to each
# list, which keeps the lists aligned when the frame is assembled below.
title_list = []
abstract_list = []
journal_list = []
language_list = []
pubdate_year_list = []
pubdate_month_list = []
chunk_size = 10000

for chunk_i in range(0, len(studiesIdList), chunk_size):
    chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
    papers = fetch_details(chunk)
    for paper in papers['PubmedArticle']:
        article = paper['MedlineCitation']['Article']
        title_list.append(article['ArticleTitle'])

        # AbstractText is a sequence; join every element so an abstract split
        # into several sections is not cut down to its first section.
        # Only a missing key or an empty sequence means "no abstract"; any
        # other error is allowed to surface instead of being swallowed.
        try:
            abstract_sections = article['Abstract']['AbstractText']
        except KeyError:
            abstract_sections = []
        if abstract_sections:
            abstract_list.append(' '.join(str(section) for section in abstract_sections))
        else:
            abstract_list.append('No Abstract')

        journal_list.append(article['Journal']['Title'])
        language_list.append(article['Language'][0])

        # Year and Month are each optional inside PubDate; fall back to the
        # 'No Data' placeholder when either one is absent.
        try:
            pub_date = article['Journal']['JournalIssue']['PubDate']
        except KeyError:
            pub_date = {}
        pubdate_year_list.append(pub_date.get('Year', 'No Data'))
        pubdate_month_list.append(pub_date.get('Month', 'No Data'))

df = pd.DataFrame({
    'Title': title_list,
    'Abstract': abstract_list,
    'Journal': journal_list,
    'Language': language_list,
    'Year': pubdate_year_list,
    'Month': pubdate_month_list,
})

# Standardizing month format
# Assign the result back: an inplace replace on the df['Month'] selection is
# not guaranteed to update df itself.
df['Month'] = df['Month'].replace({
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
    'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12', 'No Data': np.nan
})

In [15]:
# Write the fetched records to disk without the index column, then pass the
# same file to the Colab download helper.
output_csv = 'pubmed_fetch.csv'
df.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Section 2: Data Cleaning + Basics

In [16]:
# Reload the exported records so this section can run from the saved CSV.
df = pd.read_csv('pubmed_fetch.csv')
print(f"Total number of papers: {len(df)}")

# Keep English-language papers only. The explicit .copy() makes the filtered
# frame independent of the one read above, so later cells that modify its
# columns operate on a real frame rather than on a slice of another one.
df = df[df['Language'] == 'eng'].copy()
print(f"Total number of papers in English: {len(df)}")

df.head(10)

Total number of papers: 332
Total number of papers in English: 332


Unnamed: 0,Title,Abstract,Journal,Language,Year,Month
0,Recurrent urinary tract infections led to the ...,Crossed renal ectopy (CRE) is a very rare cong...,International journal of surgery case reports,eng,2023,8.0
1,Crossed-fused renal ectopia with renal calculi...,Crossed fused renal ectopia is a rare congenit...,International journal of surgery case reports,eng,2023,5.0
2,Crossed fused renal ectopia with hydronephrosi...,No Abstract,Asian journal of surgery,eng,2023,8.0
3,Mayer-Rokitansky-Kuster-Hauser syndrome type I...,Mayer-Rokitansky-Küster-Hauser (MRKH) syndrome...,Radiology case reports,eng,2023,5.0
4,Urethral Caruncle with Associated Renoureteric...,A urethral caruncle is a rare differential dia...,Sultan Qaboos University medical journal,eng,2023,2.0
5,A Rare Case of Crossed Non-fused Ectopic Kidney.,Crossed and non-fused kidneys are an exceeding...,Cureus,eng,2022,11.0
6,Pelviureteric junction obstruction of the ipsi...,Crossed fused renal ectopia is rare and usuall...,IJU case reports,eng,2022,9.0
7,Obstructed hemivagina with ipsilateral renal a...,Obstructed hemivagina with ipsilateral renal a...,BMJ case reports,eng,2022,6.0
8,Bilateral ureteral reimplantation in a patient...,Diphallia is a highly uncommon congenital urog...,World journal of clinical cases,eng,2022,3.0
9,Primary Chondrosarcoma in L-shaped Crossed Fus...,Primary renal chondrosarcomas are rare tumors ...,Journal of kidney cancer and VHL,eng,2022,


In [17]:
# Missing months become 0 so the column can be stored as integers.
# Build the cleaned column and bind a new frame to df: an inplace fillna on
# the df['Month'] selection is not guaranteed to write back to df, and
# rebinding keeps this cell safe to re-run.
df = df.assign(Month=df['Month'].fillna(0).astype(float).astype(int))

In [20]:
# Publication-year range covered by the retained papers.
year_column = df['Year']
print(f"Time span of papers: {year_column.min()} to {year_column.max()}")

# Papers per journal, most frequent first.
journal_counts = df['Journal'].value_counts()
print(f"{journal_counts}")

# Rows whose abstract is the 'No Abstract' placeholder.
missing_abstract_rows = len(df[df['Abstract'] == 'No Abstract'])
print(f"There are {missing_abstract_rows} rows with 'No Abstract' in the Abstract column.")

Time span of papers: 1960 to 2023
Urology                                          36
The Journal of urology                           31
Clinical nuclear medicine                        16
Cureus                                            8
Urology case reports                              8
                                                 ..
Journal of transplantation                        1
Medicine                                          1
Journal of cardiovascular computed tomography     1
Odontology                                        1
Delaware medical journal                          1
Name: Journal, Length: 146, dtype: int64
There are 82 rows with 'No Abstract' in the Abstract column.
