# Data Collection

## Importing Libraries

In [2]:
from Bio import Entrez
import pandas as pd
import numpy as np
import requests
import time
from bs4 import BeautifulSoup

## Accessing Journal Names

In [43]:
df = pd.read_csv('./journals_in_pmc.csv')
df.head()

Unnamed: 0,Journal title,NLM TA,pISSN,eISSN,Publisher,LOCATORplus ID,Latest issue,Earliest volume,Free access,Open access,Participation level,Deposit status,Journal URL
0,3 Biotech,3 Biotech,2190-572X,2190-5738,Springer,101565857,v.10(9);Sep 2020,v.1;2011,12 months,Some,Full,,http://www.ncbi.nlm.nih.gov/pmc/journals/1811/
1,3D Printing in Medicine,3D Print Med,,2365-6271,BioMed Central,101721758,v.5;Dec 2019,v.2;2016,Immediate,All,Full,,http://www.ncbi.nlm.nih.gov/pmc/journals/3516/
2,AACE Clinical Case Reports,AACE Clin Case Rep,,2376-0605,American Association of Clinical Endocrinologists,101670593,v.6(5);Sep-Oct 2020,v.5;2019,Immediate,No,Full,,http://www.ncbi.nlm.nih.gov/pmc/journals/3582/
3,The AAPS Journal,AAPS J,,1550-7416,American Association of Pharmaceutical Scientists,101223209,v.18(3);May 2016,v.6;2004,,Some,Full,No New Content,http://www.ncbi.nlm.nih.gov/pmc/journals/792/
4,AAPS PharmSci,AAPS PharmSci,,1522-1059,American Association of Pharmaceutical Scientists,100897065,v.6(2);Jun 2004,v.1;1999,Immediate,No,Full,Predecessor,http://www.ncbi.nlm.nih.gov/pmc/journals/989/


In [44]:
df['Participation level'].value_counts()

 Full              3104
 NIH Portfolio      448
Name: Participation level, dtype: int64

In [45]:
df = pd.concat((df, pd.get_dummies(df['Participation level'], prefix='participation')), axis=1)

In [46]:
df['participation_full'] = df['participation_ Full ']

In [47]:
df = df.drop(columns=['participation_ NIH Portfolio ', 'participation_ Full '])

In [48]:
df.loc[df['participation_full'] == 0].index

Int64Index([  17,   23,   24,   25,   26,   27,   28,   29,   30,   35,
            ...
            3396, 3408, 3415, 3427, 3456, 3464, 3476, 3478, 3479, 3544],
           dtype='int64', length=448)

In [49]:
df = df.drop(df.loc[df['participation_full'] == 0].index, axis=0)

In [50]:
print(df.shape)
df['participation_full'].value_counts()

(3104, 14)


1    3104
Name: participation_full, dtype: int64

In [None]:
df.to_csv('./journals_in_pmc_clean.csv')

In [157]:
df = pd.read_csv('./journals_in_pmc_clean.csv', index_col=False)
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Journal title,NLM TA,pISSN,eISSN,Publisher,LOCATORplus ID,Latest issue,Earliest volume,Free access,Open access,Participation level,Deposit status,Journal URL,participation_full
0,3 Biotech,3 Biotech,2190-572X,2190-5738,Springer,101565857,v.10(9);Sep 2020,v.1;2011,12 months,Some,Full,,http://www.ncbi.nlm.nih.gov/pmc/journals/1811/,1
1,3D Printing in Medicine,3D Print Med,,2365-6271,BioMed Central,101721758,v.5;Dec 2019,v.2;2016,Immediate,All,Full,,http://www.ncbi.nlm.nih.gov/pmc/journals/3516/,1
2,AACE Clinical Case Reports,AACE Clin Case Rep,,2376-0605,American Association of Clinical Endocrinologists,101670593,v.6(5);Sep-Oct 2020,v.5;2019,Immediate,No,Full,,http://www.ncbi.nlm.nih.gov/pmc/journals/3582/,1
3,The AAPS Journal,AAPS J,,1550-7416,American Association of Pharmaceutical Scientists,101223209,v.18(3);May 2016,v.6;2004,,Some,Full,No New Content,http://www.ncbi.nlm.nih.gov/pmc/journals/792/,1
4,AAPS PharmSci,AAPS PharmSci,,1522-1059,American Association of Pharmaceutical Scientists,100897065,v.6(2);Jun 2004,v.1;1999,Immediate,No,Full,Predecessor,http://www.ncbi.nlm.nih.gov/pmc/journals/989/,1


In [15]:
df['Journal title'][388]

'BMC Blood Disorders'

## Pulling Data from PubMed/PMC - Retractions

### Pulling Data from PubMed - Retractions

In [None]:
#https://medium.com/@kliang933/scraping-big-data-from-public-research-repositories-e-g-pubmed-arxiv-2-488666f6f29b

In [None]:
ls_num = []
ls_id = []
ls_doi = []
ls_language = []
ls_year = []
ls_month = []
ls_day = []
ls_volume = []
ls_issue = []
ls_journal = []
ls_title = []
ls_page = []
x=0
no_doi = 0

for i in range(0,len(df['NLM TA'])):
    Entrez.email = 'lmpack01@outlook.com'
    handle = Entrez.esearch(db='pubmed',term='(hasretractionin) AND ('+df['NLM TA'][i]+'[Journal])', retmode='xml', retmax=1000)
    results = Entrez.read(handle)
    
    ids = ' , '.join(results['IdList'])
    print(df['NLM TA'][i],[i])
    
    if len(ids)==0:
        pass
    else:
        Entrez.email = 'lmpack01@outlook.com'
        handle = Entrez.efetch(db='pubmed', id = ids, retmode='xml', rettype='full')
        results_id = Entrez.read(handle)
        
        x += len(results['IdList'])
        print(x)
        
        for j in range(0, len(results['IdList'])):
            ls_id.append(results['IdList'][j])
            try:
                doi = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ELocationID'][0])
                ls_doi.append(doi)
            except:
                ls_doi.append(None)
                no_doi += 1
                print(no_doi)
                
            try:
                language = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Language'][0])
                ls_language.append(language)
            except:
                ls_language.append(None)
                
            try:
                year = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ArticleDate'][0]['Year'])
                ls_year.append(year)
            except:
                ls_year.append(None)
            
            try:
                month = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ArticleDate'][0]['Month'])
                ls_month.append(month)
            except:
                ls_month.append(None)
                
            try:
                day = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ArticleDate'][0]['Day'])
                ls_day.append(day)
            except:
                ls_day.append(None)
                
            try:
                volume = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Journal']['JournalIssue']['Volume'])
                ls_volume.append(volume)
            except:
                ls_volume.append(None)
            
            try:
                issue = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Journal']['JournalIssue']['Issue'])
                ls_issue.append(issue)
            except:
                ls_issue.append(None)
                
            try:
                journal = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Journal']['Title'])
                ls_journal.append(journal)
            except:
                ls_journal.append(None)
                
            try:
                title = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ArticleTitle'])
                ls_title.append(title)
            except:
                ls_title.append(None)
                
            try:
                page = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Pagination']['MedlinePgn'])
                ls_page.append(page)
            except:
                ls_page.append(None)
        
    time.sleep(5)

In [26]:
doi = pd.read_csv('./doi.csv')
doi['0'].isnull().sum()

827

In [158]:
data = pd.concat([pd.Series(ls_id), pd.Series(ls_doi), pd.Series(ls_language), pd.Series(ls_year), pd.Series(ls_month), pd.Series(ls_day), pd.Series(ls_volume), pd.Series(ls_issue), pd.Series(ls_journal), pd.Series(ls_title), pd.Series(ls_page)], axis=1)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,16353936,,eng,2005.0,10.0,19.0,7.0,3.0,The AAPS journal,Recent advances for the treatment of cocaine a...,E579-86
1,28667474,10.1208/s12249-017-0838-6,eng,2017.0,6.0,30.0,19.0,6.0,AAPS PharmSciTech,RETRACTED ARTICLE: Development and In Vitro-In...,2750
2,27511111,10.1208/s12249-016-0596-x,eng,2016.0,8.0,10.0,18.0,5.0,AAPS PharmSciTech,Study of the Transformations of Micro/Nano-cry...,1428-1437
3,23835739,10.1208/s12249-013-0001-y,eng,2013.0,7.0,9.0,14.0,3.0,AAPS PharmSciTech,Meloxicam taste-masked oral disintegrating tab...,1118-28
4,23800858,10.1208/s12249-013-9993-6,eng,2013.0,6.0,26.0,14.0,3.0,AAPS PharmSciTech,Design and formulation technique of a novel dr...,1045-54
...,...,...,...,...,...,...,...,...,...,...,...
2965,12619192,,eng,,,,44.0,1.0,Yonsei medical journal,Bilateral popliteal artery aneurysms with rupt...,159-62
2966,11371116,,eng,,,,42.0,2.0,Yonsei medical journal,Tic convulsif caused by cerebellopontine angle...,255-7
2967,22814263,10.3779/j.issn.1009-3419.2012.07.07,eng,,,,15.0,7.0,Zhongguo fei ai za zhi = Chinese journal of lu...,Lung cancer: microRNA and target database.,429-34
2968,20840810,10.3779/j.issn.1009-3419.2010.09.01,chi,,,,13.0,9.0,Zhongguo fei ai za zhi = Chinese journal of lu...,[Effect of Bufalin on proliferation and apopto...,841-5


In [159]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2970 entries, 0 to 2969
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       2970 non-null   object 
 1   1       2143 non-null   object 
 2   2       2970 non-null   object 
 3   3       2202 non-null   float64
 4   4       2202 non-null   float64
 5   5       2202 non-null   float64
 6   6       2896 non-null   float64
 7   7       2189 non-null   float64
 8   8       2970 non-null   object 
 9   9       2970 non-null   object 
 10  10      2851 non-null   object 
dtypes: float64(5), object(6)
memory usage: 255.4+ KB


In [161]:
data.to_csv('./pubmed_data_retraction.csv')

In [None]:
data = pd.read_csv('./pubmed_data_retraction.csv')
data

In [None]:
data = data.dropna(axis=0, subset=['1'])
data

In [164]:
data['2'].value_counts()

eng    2139
chi       2
spa       1
fre       1
Name: 2, dtype: int64

In [165]:
print(data.loc[data['2']=='chi'].index)
print(data.loc[data['2']=='spa'].index)
print(data.loc[data['2']=='fre'].index)

Int64Index([2968, 2969], dtype='int64')
Int64Index([259], dtype='int64')
Int64Index([2243], dtype='int64')


In [166]:
data = data.drop([2968, 2969, 259, 2243], axis=0)
data['2'].value_counts()

eng    2139
Name: 2, dtype: int64

In [None]:
ls_index = []
for i in data['1']:
    if len(i) < 7:
        for j in range(0,len(data.loc[data['1']==i].index)):
            ls_index.append(data.loc[data['1']==i].index[j])
data = data.drop(ls_index)

In [None]:
data = data.drop_duplicates()

In [None]:
data.to_csv('./pubmed_data_retraction_cleaned.csv')

In [2]:
data = pd.read_csv('./pubmed_data_retraction_cleaned.csv')
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,0,1,2,3,4,5,6,7,8,9,10
0,0,1,1,28667474,10.1208/s12249-017-0838-6,eng,2017.0,6.0,30.0,19.0,6.0,AAPS PharmSciTech,RETRACTED ARTICLE: Development and In Vitro-In...,2750
1,1,2,2,27511111,10.1208/s12249-016-0596-x,eng,2016.0,8.0,10.0,18.0,5.0,AAPS PharmSciTech,Study of the Transformations of Micro/Nano-cry...,1428-1437
2,2,3,3,23835739,10.1208/s12249-013-0001-y,eng,2013.0,7.0,9.0,14.0,3.0,AAPS PharmSciTech,Meloxicam taste-masked oral disintegrating tab...,1118-28
3,3,4,4,23800858,10.1208/s12249-013-9993-6,eng,2013.0,6.0,26.0,14.0,3.0,AAPS PharmSciTech,Design and formulation technique of a novel dr...,1045-54
4,4,5,5,18446488,10.1208/s12249-008-9044-x,eng,2008.0,2.0,14.0,9.0,1.0,AAPS PharmSciTech,"The influence of sodium hyaluronate, L-leucine...",243-9


In [4]:
data.shape

(2106, 14)

In [103]:
data['1'].sample(n=25,replace = False)

212             10.1186/1471-2121-13-8
618           10.3389/fpsyg.2016.01298
1877      10.1371/journal.pone.0001444
895            10.1074/jbc.M111.260414
1439             10.1093/neuonc/nor116
1995        10.1038/s41598-019-38519-5
602           10.3389/fphar.2017.00871
511                10.7554/eLife.12248
2108           10.4103/2229-5070.72109
1953         10.1186/s12978-019-0732-7
1175    10.1523/JNEUROSCI.2613-09.2009
1428                10.1038/ncomms6446
98             10.4103/0256-4947.83211
1432                10.1038/ncomms1623
44           10.1107/S160053680706254X
599            10.3389/fonc.2013.00153
1488               10.2147/OTT.S124118
1            10.1208/s12249-016-0596-x
899            10.1074/jbc.M111.247726
2002        10.1038/s41598-017-10365-3
111            10.1074/jbc.M111.329078
1598      10.1371/journal.pone.0218664
1392              10.1128/MCB.01480-09
1340     10.1158/1535-7163.MCT-14-0672
1384              10.1128/MCB.00114-14
Name: 1, dtype: object

### Pulling Data from PMC - Retractions

In [47]:
Entrez.email = 'lmpack01@outlook.com'
handle = Entrez.esearch(db='pmc',term='10.11604/pamj.2013.16.18.2505', retmode='xml')
results = Entrez.read(handle)
results

{'Count': '1', 'RetMax': '1', 'RetStart': '0', 'IdList': ['3909696'], 'TranslationSet': [], 'TranslationStack': [{'Term': '10.11604/pamj.2013.16.18.2505[All Fields]', 'Field': 'All Fields', 'Count': '1', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': '10.11604/pamj.2013.16.18.2505[All Fields]'}

In [48]:
ids = ' , '.join(results['IdList'])
Entrez.email = 'lmpack01@outlook.com'
handle = Entrez.efetch(db='pmc', id = ids, retmode='xml', rettype='full')
text = handle.read()

In [None]:
soup = BeautifulSoup(text, "lxml")
soup.extract

In [None]:
ls_total_text = []
ls_total_keywords = []
ls_total_abstract = []
ls_publisher = []
count = 2075
no_text = 0

for i in data['1'][2075:2106]:
    Entrez.email = 'lmpack01@outlook.com'
    handle = Entrez.esearch(db='pmc',term=i, retmode='xml')
    results = Entrez.read(handle)

    ids = ' , '.join(results['IdList'])
    Entrez.email = 'lmpack01@outlook.com'
    handle = Entrez.efetch(db='pmc', id = ids, retmode='xml', rettype='full')
    text = handle.read()

    soup = BeautifulSoup(text, "lxml")
    print(f"{i} [{count}]")
    
    try:
        ls_raw_text = []
        text = str()
        for i in range(0,len(soup.find_all("sec"))):
            for j in range(0,len(soup.find_all("sec")[i].find_all("p"))):
                ls_raw_text.append(str(soup.find_all("sec")[i].find_all("p")[j].text))

        for i in range(0,len(ls_raw_text)):
            text += ls_raw_text[i]
        ls_total_text.append(text)
        
        if text=='':
            no_text += 1
            print(f'No text --> {no_text}')
    except:
        ls_total_text.append(None)
    
    try:
        ls_keywords = []
        for i in range(0,len(soup.find_all("kwd"))):
            ls_keywords.append(soup.find_all("kwd")[i].text)
        ls_total_keywords.append(ls_keywords)
    except:
        ls_total_keywords.append(None)
    
    try:
        ls_abstract = []
        for i in soup.find_all("abstract")[0].text.split('\n'):
            if i == '':
                pass
            else:
                ls_abstract.append(i)
        ls_total_abstract.append(ls_abstract[0])
    except:
        ls_total_abstract.append(None)
    
    try:      
        ls_publisher.append(soup.find_all("publisher-name")[0].text)
    except:
        ls_publisher.append(None)
    
    count +=1
    time.sleep(3)

In [154]:
print(pd.Series(data['1'][2075:2106]).shape)
print(pd.Series(ls_total_text).shape)
print(pd.Series(ls_total_keywords).shape)
print(pd.Series(ls_total_abstract).shape)
print(pd.Series(ls_publisher).shape)

(31,)
(31,)
(31,)
(31,)
(31,)


In [155]:
pd.Series(data['1'][2075:2106]).to_csv('./doi__2075_2106.csv')
pd.Series(ls_total_text).to_csv('./text__2075_2106.csv')
pd.Series(ls_total_keywords).to_csv('./keywords__2075_2106.csv')
pd.Series(ls_total_abstract).to_csv('./abstract__2075_2106.csv')
pd.Series(ls_publisher).to_csv('./publisher__2075_2106.csv')

## Pulling Data from PubMed/PMC - No Retractions

### Pulling Data from PubMed - No Retractions

In [59]:
day = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16',
                 '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']
len(day)

31

In [60]:
start_month = ['01/01', '02/01', '03/01', '04/01', '05/01', '06/01', 
               '07/01', '08/01', '09/01', '10/01', '11/01', '12/01']
end_month = ['01/31', '02/28', '03/31', '04/30', '05/31', '06/30', 
            '07/31', '08/31', '09/30', '10/31', '11/30', '12/31']

month = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
year = ['2015', '2016', '2017', '2018', '2019']
start_date = []
end_date = []

for i in year:
    for j in range(0,len(month)):
        if j == '01' or j == '03' or j == '05' or j == '07' or j == '08' or j == '10' or j == '12':
            day_num = np.random.choice(day, 1)
        elif j == '02':
            day_num = np.random.choice(day[:28])
        else:
            day_num = np.random.choice(day[:30])
        start_date.append(i+'/'+start_month[j])
        end_date.append(i+'/'+month[j]+'/'+day_num)
        
print(len(start_date))

60


In [None]:
ls_id = []
ls_doi = []
ls_language = []
ls_year = []
ls_month = []
ls_day = []
ls_volume = []
ls_issue = []
ls_journal = []
ls_title = []
ls_page = []
x=0
no_doi = 0

for i in range(0,len(start_date)):
    Entrez.email = 'lmpack01@outlook.com'
    handle = Entrez.esearch(db='pubmed',term='(PLoS One [Journal])', retmode='xml', retmax=167, mindate = start_date[i], maxdate = end_date[i])
    results = Entrez.read(handle)
    
    ids = ' , '.join(results['IdList'])

    
    if len(ids)==0:
        pass
    else:
        Entrez.email = 'lmpack01@outlook.com'
        handle = Entrez.efetch(db='pubmed', id = ids, retmode='xml', rettype='full')
        results_id = Entrez.read(handle)
        
        x += len(results['IdList'])
        print(x)
        
        for j in range(0, len(results['IdList'])):
            ls_id.append(results['IdList'][j])
            try:
                doi = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ELocationID'][0])
                ls_doi.append(doi)
            except:
                ls_doi.append(None)
                no_doi += 1
                print(no_doi)
                
            try:
                language = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Language'][0])
                ls_language.append(language)
            except:
                ls_language.append(None)
                
            try:
                year = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ArticleDate'][0]['Year'])
                ls_year.append(year)
                print(year)
            except:
                ls_year.append(None)
            
            try:
                month = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ArticleDate'][0]['Month'])
                ls_month.append(month)
                print(month)
            except:
                ls_month.append(None)
                
            try:
                day = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ArticleDate'][0]['Day'])
                ls_day.append(day)
                print(day)
            except:
                ls_day.append(None)
                
            try:
                volume = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Journal']['JournalIssue']['Volume'])
                ls_volume.append(volume)
            except:
                ls_volume.append(None)
            
            try:
                issue = int(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Journal']['JournalIssue']['Issue'])
                ls_issue.append(issue)
            except:
                ls_issue.append(None)
                
            try:
                journal = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Journal']['Title'])
                ls_journal.append(journal)
            except:
                ls_journal.append(None)
                
            try:
                title = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['ArticleTitle'])
                ls_title.append(title)
            except:
                ls_title.append(None)
                
            try:
                page = str(results_id['PubmedArticle'][j]['MedlineCitation']['Article']['Pagination']['MedlinePgn'])
                ls_page.append(page)
            except:
                ls_page.append(None)
        
    time.sleep(5)

In [63]:
data_no_retract = pd.concat([pd.Series(ls_id), pd.Series(ls_doi), pd.Series(ls_language), pd.Series(ls_year), pd.Series(ls_month), pd.Series(ls_day), pd.Series(ls_volume), pd.Series(ls_issue), pd.Series(ls_journal), pd.Series(ls_title), pd.Series(ls_page)], axis=1)
data_no_retract

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,25569838,10.1371/journal.pone.0115528,eng,2015,1,8,10,1,PloS one,High incidence is not high exposure: what prop...,e0115528
1,25569796,10.1371/journal.pone.0115194,eng,2015,1,8,10,1,PloS one,Neurological abnormalities in full-term asphyx...,e0115194
2,25569682,10.1371/journal.pone.0117040,eng,2015,1,8,10,1,PloS one,Production of siderophores increases resistanc...,e0117040
3,25569558,10.1371/journal.pone.0116930,eng,2015,1,8,10,1,PloS one,Application of clinico-radiologic-pathologic d...,e0116930
4,25569428,10.1371/journal.pone.0116566,eng,2015,1,8,10,1,PloS one,Cinnamon ameliorates experimental allergic enc...,e0116566
...,...,...,...,...,...,...,...,...,...,...,...
9608,31856208,10.1371/journal.pone.0226734,eng,2019,12,19,14,12,PloS one,Statistical learning and the uncertainty of me...,e0226734
9609,31856207,10.1371/journal.pone.0226837,eng,2019,12,19,14,12,PloS one,Leishmania amazonensis resistance in murine ma...,e0226837
9610,31856205,10.1371/journal.pone.0226726,eng,2019,12,19,14,12,PloS one,Disparities in survival by stage after surgery...,e0226726
9611,31856204,10.1371/journal.pone.0227068,eng,2019,12,19,14,12,PloS one,Correction: Elevated levels of eEF1A2 protein ...,e0227068


In [65]:
data_no_retract[3].value_counts()

2019    1983
2016    1893
2018    1890
2015    1880
2017    1843
2014     124
Name: 3, dtype: int64

In [66]:
data_no_retract[4].value_counts()

12    959
5     930
10    918
7     835
9     835
3     823
2     804
1     768
11    743
8     724
6     668
4     606
Name: 4, dtype: int64

In [67]:
data_no_retract[5].value_counts()

14    806
12    642
13    490
23    477
31    474
15    463
28    424
22    421
11    420
6     385
8     334
7     324
18    317
24    310
5     303
10    301
19    269
1     253
9     243
16    238
3     236
25    219
27    212
17    199
4     172
2     169
26    167
20    156
21     98
29     91
Name: 5, dtype: int64

In [68]:
data_no_retract.to_csv('./pubmed_data_second_no_retraction.csv')

In [None]:
data_no_retract = pd.read_csv('./pubmed_data_second_no_retraction.csv')
data_no_retract = data_no_retract.dropna(axis=0, subset=['1'])
data_no_retract

In [70]:
data_no_retract['2'].value_counts()

eng    9613
Name: 2, dtype: int64

In [None]:
ls_index = []
for i in data_no_retract['1']:
    if len(i) < 7:
        for j in range(0,len(data_no_retract.loc[data_no_retract['1']==i].index)):
            ls_index.append(data_no_retract.loc[data_no_retract['1']==i].index[j])
data_no_retract = data_no_retract.drop(ls_index)
data_no_retract

In [None]:
data_no_retract = data_no_retract.drop_duplicates()
data_no_retract

### Pulling Data from PMC - No Retractions

In [None]:
ls_total_text = []
ls_total_keywords = []
ls_total_abstract = []
ls_publisher = []
count = 6166
no_text = 0

for i in data_no_retract['1'][6166:]:
    Entrez.email = 'lmpack01@outlook.com'
    handle = Entrez.esearch(db='pmc',term=i, retmode='xml')
    results = Entrez.read(handle)

    ids = ' , '.join(results['IdList'])
    Entrez.email = 'lmpack01@outlook.com'
    handle = Entrez.efetch(db='pmc', id = ids, retmode='xml', rettype='full')
    text = handle.read()

    soup = BeautifulSoup(text, "lxml")
    print(f"{i} [{count}]")
    
    try:
        ls_raw_text = []
        text = str()
        for i in range(0,len(soup.find_all("sec"))):
            for j in range(0,len(soup.find_all("sec")[i].find_all("p"))):
                ls_raw_text.append(str(soup.find_all("sec")[i].find_all("p")[j].text))

        for i in range(0,len(ls_raw_text)):
            text += ls_raw_text[i]
        ls_total_text.append(text)
        
        if text=='':
            no_text += 1
            print(f'No text --> {no_text}')
    except:
        ls_total_text.append(None)
    
    try:
        ls_keywords = []
        for i in range(0,len(soup.find_all("kwd"))):
            ls_keywords.append(soup.find_all("kwd")[i].text)
        ls_total_keywords.append(ls_keywords)
    except:
        ls_total_keywords.append(None)
    
    try:
        ls_abstract = []
        for i in soup.find_all("abstract")[0].text.split('\n'):
            if i == '':
                pass
            else:
                ls_abstract.append(i)
        ls_total_abstract.append(ls_abstract[0])
    except:
        ls_total_abstract.append(None)
    
    try:      
        ls_publisher.append(soup.find_all("publisher-name")[0].text)
    except:
        ls_publisher.append(None)
    
    count +=1
    time.sleep(3)

In [79]:
pd.Series(data_no_retract['1'][6166:]).to_csv('./doi__no_retraction__6166_end.csv')
pd.Series(ls_total_text).to_csv('./text__no_retraction__6166_end.csv')
pd.Series(ls_total_keywords).to_csv('./keywords__no_retraction__6166_end.csv')
pd.Series(ls_total_abstract).to_csv('./abstract__no_retraction__6166_end.csv')
pd.Series(ls_publisher).to_csv('./publisher__no_retraction__6166_end.csv')