# Data Cleaning

## Importing Libraries

In [3]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

## Cleaning Retraction Data

In [None]:
# Load the raw retraction dataset. index_col=False tells pandas not to use
# the first column of the file as the index.
retract = pd.read_csv('./total_retraction_data.csv', index_col=False)
# Preview the first rows.
retract.head()

In [5]:
# Remove the 'Unnamed: 0' and 'doi_check' columns.
# NOTE: not re-runnable on its own; a second run raises KeyError because
# the columns are already gone.
retract = retract.drop(columns=['Unnamed: 0', 'doi_check'])

In [None]:
# Keep only the rows that actually have article text.
retract = retract.dropna(axis=0, subset=['text'])
# Report the remaining (rows, columns) and preview the frame.
print(retract.shape)
retract.head()

In [None]:
# Label every row of this dataset with retraction_binary = 1.
retract['retraction_binary'] = 1
# Confirm the label was applied to every row, then preview.
print(retract['retraction_binary'].value_counts())
retract.head()

In [8]:
# Checkpoint the filtered frame to disk, then reload it. The saved index comes
# back as an 'Unnamed: 0' column, which is discarded straight away.
retract.to_csv('./no_null_text_retraction_data.csv')
retract = pd.read_csv('./no_null_text_retraction_data.csv').drop(columns=['Unnamed: 0'])

In [9]:
# Column overview (dtypes and non-null counts) of the reloaded frame.
retract.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1553 entries, 0 to 1552
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1553 non-null   int64  
 1   doi                1553 non-null   object 
 2   language           1553 non-null   object 
 3   year               1315 non-null   float64
 4   month              1315 non-null   float64
 5   day                1315 non-null   float64
 6   volume             1518 non-null   float64
 7   issue              1043 non-null   float64
 8   journal            1553 non-null   object 
 9   title              1553 non-null   object 
 10  page               1487 non-null   object 
 11  text               1553 non-null   object 
 12  abstract           1533 non-null   object 
 13  keywords           1553 non-null   object 
 14  publisher          1549 non-null   object 
 15  retraction_binary  1553 non-null   int64  
dtypes: float64(5), int64(2),

In [10]:
def unpack_keywords(raw_keywords):
    """Split one stringified keyword list into lowercase single-word tokens.

    The 'keywords' column holds the text form of a Python list (for example
    "['monoclinic', 'nano-sized crystals']") because it was read from CSV.
    The string is split on whitespace, list punctuation is stripped from each
    piece, and the remainder is lower-cased.

    Parameters
    ----------
    raw_keywords : object
        One cell of the 'keywords' column.

    Returns
    -------
    list of str
        The cleaned tokens; an empty list when the cell is not a string or
        when nothing is left after stripping (e.g. for '[]').
    """
    # A missing cell arrives as a float NaN, which has no .split().
    if not isinstance(raw_keywords, str):
        return []

    tokens = []
    for piece in raw_keywords.split():
        # Same characters, in the same order, as the original replace chain.
        # '\\n' is a literal backslash followed by 'n', not a newline; the
        # original chain applied that replacement twice, once is kept here.
        for unwanted in ("'", '[', ',', ']', '(', ')', '\\n'):
            piece = piece.replace(unwanted, '')
        # Skip pieces that were nothing but punctuation. The original only
        # caught this when the whole list collapsed to [''].
        if piece:
            tokens.append(piece.lower())
    return tokens


ls_keywords_list = [unpack_keywords(raw) for raw in retract['keywords']]

retract['unpacked_keywords'] = ls_keywords_list

In [None]:
# Build two cleaned versions of every retracted article's text:
#   clean_text     - word tokens joined by single spaces
#   clean_text_lem - the same tokens after WordNet lemmatization
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

clean_text = []
clean_text_lem = []

# Iterate over the values directly instead of looking rows up by label
# (retract['text'][i]), which only works while the index is exactly 0..n-1.
for position, raw_text in enumerate(retract['text']):
    kept_words = []
    kept_lemmas = []
    for token in tokenizer.tokenize(raw_text):
        try:
            int(token)
        except ValueError:
            # Not an integer. ValueError is the only failure int() has for a
            # string, so unrelated errors are no longer swallowed.
            # Tokens of 45 or more characters are discarded.
            # NOTE(review): presumably meant to drop run-together strings; confirm the cutoff.
            if len(token) < 45:
                kept_words.append(token)
                kept_lemmas.append(lemmatizer.lemmatize(token))
        # Tokens that parse as integers are dropped.

    clean_text.append(' '.join(kept_words))
    clean_text_lem.append(' '.join(kept_lemmas))

    # Progress marker every 500 articles instead of one printed line per row.
    if position % 500 == 0:
        print(position)

In [12]:
# Character length of the second article's cleaned text, before and after lemmatization.
print(len(clean_text[1]))
print(len(clean_text_lem[1]))

14994
14855


In [13]:
# Attach both cleaned text versions as new columns and preview the result.
retract['clean_text'] = clean_text
retract['clean_text_lem'] = clean_text_lem
retract.head()

Unnamed: 0,id,doi,language,year,month,day,volume,issue,journal,title,page,text,abstract,keywords,publisher,retraction_binary,unpacked_keywords,clean_text,clean_text_lem
0,27511111,10.1208/s12249-016-0596-x,eng,2016.0,8.0,10.0,18.0,5.0,AAPS PharmSciTech,Study of the Transformations of Micro/Nano-cry...,1428-1437,‘Polymorphism’ generally referred as the abili...,This study elucidates the physical properties ...,"['monoclinic', 'nano-sized crystals', 'orthorh...",Springer International Publishing,1,"[monoclinic, nano-sized, crystals, orthorhombi...",Polymorphism generally referred as the ability...,Polymorphism generally referred a the ability ...
1,31263767,10.1021/acscentsci.9b00224,eng,2019.0,5.0,9.0,5.0,6.0,ACS central science,Targeted Protein Internalization and Degradati...,1079-1084,Traditional\ndrug development efforts are focu...,Targeted,[],American Chemical Society,1,[],Traditional drug development efforts are focus...,Traditional drug development effort are focuse...
2,31458862,10.1021/acsomega.8b00488,eng,2018.0,6.0,27.0,3.0,6.0,ACS omega,Regulating the Microstructure of Intumescent F...,6962-6970,Intumescent flame retardants\nare now being us...,A compatibilizer,[],American Chemical Society,1,[],Intumescent flame retardants are now being use...,Intumescent flame retardant are now being used...
3,31458855,10.1021/acsomega.8b00153,eng,2018.0,6.0,25.0,3.0,6.0,ACS omega,Solid-to-Solid Crystallization of Organic Thin...,6874-6879,Crystal growth process is basic and essential ...,The solid-to-solid crystallization processes o...,[],American Chemical Society,1,[],Crystal growth process is basic and essential ...,Crystal growth process is basic and essential ...
4,21837091,10.1107/S1600536811022574,eng,2011.0,6.0,18.0,67.0,,"Acta crystallographica. Section E, Structure r...",Oxonium picrate.,o1694,For general background to organic salts of pic...,"The title compound, H3O+·C6H2N3O7",[],International Union of Crystallography,1,[],For general background to organic salts of pic...,For general background to organic salt of picr...


## Cleaning No Retraction Data

In [15]:
# The non-retracted articles are stored in three chunk files per field.
# Each field's chunks are read in order into *_one, *_two and *_three.
text_one, text_two, text_three = map(pd.read_csv, (
    './text__no_retraction__0_5545.csv',
    './text__no_retraction__5545_6166.csv',
    './text__no_retraction__6166_end.csv',
))

abstract_one, abstract_two, abstract_three = map(pd.read_csv, (
    './abstract__no_retraction__0_5545.csv',
    './abstract__no_retraction__5545_6166.csv',
    './abstract__no_retraction__6166_end.csv',
))

keywords_one, keywords_two, keywords_three = map(pd.read_csv, (
    './keywords__no_retraction__0_5545.csv',
    './keywords__no_retraction__5545_6166.csv',
    './keywords__no_retraction__6166_end.csv',
))

publisher_one, publisher_two, publisher_three = map(pd.read_csv, (
    './publisher__no_retraction__0_5545.csv',
    './publisher__no_retraction__5545_6166.csv',
    './publisher__no_retraction__6166_end.csv',
))

doi_one, doi_two, doi_three = map(pd.read_csv, (
    './doi__no_retraction__0_5545.csv',
    './doi__no_retraction__5545_6166.csv',
    './doi__no_retraction__6166_end.csv',
))

In [16]:
# Inspect the last rows of chunk one, to compare against the first rows of chunk two.
text_one.tail()

Unnamed: 0.1,Unnamed: 0,0
5541,5541,"Winter air pollution in Ulaanbaatar, Mongolia ..."
5542,5542,Chronic infection with hepatitis C virus (HCV)...
5543,5543,California has one of the most highly engineer...
5544,5544,Thiol-dependent cathepsins are found in all li...
5545,5545,To estimate hepatitis C virus (HCV) viremic ra...


In [17]:
# First rows of chunk two: row 0 repeats the article in the last row (5545) of chunk one.
text_two.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,To estimate hepatitis C virus (HCV) viremic ra...
1,1,Pollinators are crucial in almost all terrestr...
2,2,The association of melanosis coli with the dev...
3,3,"In most plant species, repetitive DNA constitu..."
4,4,"In the last decades, there has been a great in..."


In [18]:
# Inspect the last rows of chunk two, to compare against the first rows of chunk three.
text_two.tail()

Unnamed: 0.1,Unnamed: 0,0
617,617,Bread wheat (Triticum aestivum L.) is one of t...
618,618,"Metabolic syndrome (MetS), defined as a comple..."
619,619,Competitive learning techniques are being succ...
620,620,"In South East Africa, about 100,000 years ago ..."
621,621,In the text of González-Fernández [1] can be f...


In [19]:
# First rows of chunk three: row 0 repeats the article in the last row (621) of chunk two.
text_three.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,In the text of González-Fernández [1] can be f...
1,1,Infantile spasms (IS) are the defining seizure...
2,2,The arylamine N-acetyltransferases are a famil...
3,3,Several biomarkers have been proposed for ultr...
4,4,Osteoporosis is a skeletal disease characteriz...


In [24]:
# The last row of chunk one (label 5545) and of chunk two (label 621) repeat
# the first row of the following chunk, so remove them before concatenating.
# The DOI chunks are left untouched.
# NOTE: not re-runnable on its own; a second run raises KeyError because
# the labels are already gone.
text_one = text_one.drop(index=5545)
text_two = text_two.drop(index=621)
abstract_one = abstract_one.drop(index=5545)
abstract_two = abstract_two.drop(index=621)
keywords_one = keywords_one.drop(index=5545)
keywords_two = keywords_two.drop(index=621)
publisher_one = publisher_one.drop(index=5545)
publisher_two = publisher_two.drop(index=621)

In [25]:
# Stack the three text chunks row-wise (concat's default axis) and check the size.
text_total = pd.concat((text_one, text_two, text_three))
text_total.shape

(9613, 2)

In [26]:
# Stack the three DOI chunks row-wise; the row count should match text_total.
doi_total = pd.concat((doi_one, doi_two, doi_three))
doi_total.shape

(9613, 2)

In [27]:
# Stack the remaining fields the same way: chunk one, then two, then three.
abstract_total = pd.concat((abstract_one, abstract_two, abstract_three))
keywords_total = pd.concat((keywords_one, keywords_two, keywords_three))
publisher_total = pd.concat((publisher_one, publisher_two, publisher_three))

In [28]:
# Write each combined field to its own CSV. The index is written too (to_csv default).
text_total.to_csv('./plos_only_no_retraction_text.csv')
doi_total.to_csv('./plos_only_no_retraction_doi.csv')
abstract_total.to_csv('./plos_only_no_retraction_abstract.csv')
keywords_total.to_csv('./plos_only_no_retraction_keywords.csv')
publisher_total.to_csv('./plos_only_no_retraction_publisher.csv')

In [29]:
# Read the combined per-field files back in; each reloaded frame gets a fresh 0..n-1 index.
text, abstract, keywords, publisher, doi = map(pd.read_csv, (
    './plos_only_no_retraction_text.csv',
    './plos_only_no_retraction_abstract.csv',
    './plos_only_no_retraction_keywords.csv',
    './plos_only_no_retraction_publisher.csv',
    './plos_only_no_retraction_doi.csv',
))

In [30]:
# Copy each field's value column into a plain Python list and print its length,
# so the five lists can be checked for equal size before they are joined.
ls_text = list(text['0'])
print(len(ls_text))

ls_abstract = list(abstract['0'])
print(len(ls_abstract))

ls_keywords = list(keywords['0'])
print(len(ls_keywords))

ls_publisher = list(publisher['0'])
print(len(ls_publisher))

# The DOI file holds its values in column '1' rather than '0'.
ls_doi = list(doi['1'])
print(len(ls_doi))

9613
9613
9613
9613
9613


In [31]:
# Load the PubMed metadata for the non-retracted articles.
no_retract = pd.read_csv('./pubmed_data_second_no_retraction.csv')

In [32]:
# Append the five field lists as new columns, in the order text, abstract,
# keywords, publisher, doi. The unnamed Series come out as columns 0..4.
no_retract = pd.concat(
    [no_retract, *map(pd.Series, (ls_text, ls_abstract, ls_keywords, ls_publisher, ls_doi))],
    axis=1,
)

In [33]:
# Remove the 'Unnamed: 0' column that came in with the metadata CSV.
no_retract = no_retract.drop(columns=['Unnamed: 0'])

In [34]:
# Give every column a readable name. String keys ('0'..'10') are the metadata
# columns read from CSV; integer keys (0..4) are the Series appended by concat.
# The appended DOI is named 'doi_check' so it can be compared with the metadata 'doi'.
no_retract = no_retract.rename(columns={'0':'id', '1':'doi', '2':'language', '3':'year', '4':'month', '5':'day', 
                      '6':'volume', '7':'issue', '8':'journal', '9':'title', '10':'page', 
                      0:'text', 1:'abstract', 2:'keywords', 3:'publisher', 4:'doi_check'})

In [35]:
# Preview the assembled frame with its new column names.
no_retract.head()

Unnamed: 0,id,doi,language,year,month,day,volume,issue,journal,title,page,text,abstract,keywords,publisher,doi_check
0,25569838,10.1371/journal.pone.0115528,eng,2015,1,8,10,1,PloS one,High incidence is not high exposure: what prop...,e0115528,Randomized clinical trials of HIV prevention i...,Objective,[],Public Library of Science,10.1371/journal.pone.0115528
1,25569796,10.1371/journal.pone.0115194,eng,2015,1,8,10,1,PloS one,Neurological abnormalities in full-term asphyx...,e0115194,Perinatal asphyxia (PA) is a leading cause of ...,Background,[],Public Library of Science,10.1371/journal.pone.0115194
2,25569682,10.1371/journal.pone.0117040,eng,2015,1,8,10,1,PloS one,Production of siderophores increases resistanc...,e0117040,"Fusaric acid (FA, 5-butylpyridine-2-carboxylic...",Fusaric acid is produced by pathogenic fungi o...,[],Public Library of Science,10.1371/journal.pone.0117040
3,25569558,10.1371/journal.pone.0116930,eng,2015,1,8,10,1,PloS one,Application of clinico-radiologic-pathologic d...,e0116930,Diffuse parenchymal lung diseases in children ...,Diffuse parenchymal lung diseases in children ...,[],Public Library of Science,10.1371/journal.pone.0116930
4,25569428,10.1371/journal.pone.0116566,eng,2015,1,8,10,1,PloS one,Cinnamon ameliorates experimental allergic enc...,e0116566,Regulatory T cells (Tregs) are regarded as the...,Upregulation and/or maintenance of regulatory ...,[],Public Library of Science,10.1371/journal.pone.0116566


In [36]:
# Metadata DOIs at positions 5540-5549, i.e. around the first chunk boundary.
no_retract['doi'].iloc[5540:5550]

5540    10.1371/journal.pone.0186821
5541    10.1371/journal.pone.0186834
5542    10.1371/journal.pone.0186898
5543    10.1371/journal.pone.0187181
5544    10.1371/journal.pone.0186869
5545    10.1371/journal.pone.0187177
5546    10.1371/journal.pone.0187079
5547    10.1371/journal.pone.0186668
5548    10.1371/journal.pone.0187131
5549    10.1371/journal.pone.0186957
Name: doi, dtype: object

In [37]:
# The same positions in the appended DOI column; they should match the metadata DOIs row for row.
no_retract['doi_check'].iloc[5540:5550]

5540    10.1371/journal.pone.0186821
5541    10.1371/journal.pone.0186834
5542    10.1371/journal.pone.0186898
5543    10.1371/journal.pone.0187181
5544    10.1371/journal.pone.0186869
5545    10.1371/journal.pone.0187177
5546    10.1371/journal.pone.0187079
5547    10.1371/journal.pone.0186668
5548    10.1371/journal.pone.0187131
5549    10.1371/journal.pone.0186957
Name: doi_check, dtype: object

In [39]:
# Checkpoint the assembled non-retracted dataset (the index is written as well).
no_retract.to_csv('./plos_only_no_retraction_data.csv')

In [40]:
# Reload the checkpoint, discarding the saved index column and the
# 'doi_check' helper column.
no_retract = (
    pd.read_csv('./plos_only_no_retraction_data.csv', index_col=False)
    .drop(columns=['Unnamed: 0', 'doi_check'])
)

In [41]:
# Count the missing abstracts and the missing article texts.
print(no_retract['abstract'].isnull().sum())
print(no_retract['text'].isnull().sum())

494
399


In [42]:
# Keep only the rows that have article text; missing abstracts are left in place.
no_retract = no_retract.dropna(axis=0, subset=['text'])
print(no_retract.shape)
no_retract.head()

(9214, 15)


Unnamed: 0,id,doi,language,year,month,day,volume,issue,journal,title,page,text,abstract,keywords,publisher
0,25569838,10.1371/journal.pone.0115528,eng,2015,1,8,10,1,PloS one,High incidence is not high exposure: what prop...,e0115528,Randomized clinical trials of HIV prevention i...,Objective,[],Public Library of Science
1,25569796,10.1371/journal.pone.0115194,eng,2015,1,8,10,1,PloS one,Neurological abnormalities in full-term asphyx...,e0115194,Perinatal asphyxia (PA) is a leading cause of ...,Background,[],Public Library of Science
2,25569682,10.1371/journal.pone.0117040,eng,2015,1,8,10,1,PloS one,Production of siderophores increases resistanc...,e0117040,"Fusaric acid (FA, 5-butylpyridine-2-carboxylic...",Fusaric acid is produced by pathogenic fungi o...,[],Public Library of Science
3,25569558,10.1371/journal.pone.0116930,eng,2015,1,8,10,1,PloS one,Application of clinico-radiologic-pathologic d...,e0116930,Diffuse parenchymal lung diseases in children ...,Diffuse parenchymal lung diseases in children ...,[],Public Library of Science
4,25569428,10.1371/journal.pone.0116566,eng,2015,1,8,10,1,PloS one,Cinnamon ameliorates experimental allergic enc...,e0116566,Regulatory T cells (Tregs) are regarded as the...,Upregulation and/or maintenance of regulatory ...,[],Public Library of Science


In [43]:
# Spot check: draw 50 DOIs at random to look up by hand and confirm that none
# of them has been retracted.
# NOTE(review): np.random.choice samples with replacement by default and no
# seed is set, so the draw can contain repeats and changes on every run.
np.random.choice(no_retract['doi'], 50)
# Result of the manual check: five of the sampled articles had been corrected
# (not retracted). The corrections were:
#   - author byline changes and a fixed citation
#   - a supporting figure that appears incorrectly
#   - misspelled author names
#   - errors in figures
#   - an affiliation listing error

array(['10.1371/journal.pone.0221236', '10.1371/journal.pone.0208797',
       '10.1371/journal.pone.0145158', '10.1371/journal.pone.0184077',
       '10.1371/journal.pone.0169155', '10.1371/journal.pone.0119129',
       '10.1371/journal.pone.0175673', '10.1371/journal.pone.0226358',
       '10.1371/journal.pone.0186309', '10.1371/journal.pone.0144882',
       '10.1371/journal.pone.0217685', '10.1371/journal.pone.0168498',
       '10.1371/journal.pone.0203429', '10.1371/journal.pone.0147806',
       '10.1371/journal.pone.0210432', '10.1371/journal.pone.0201007',
       '10.1371/journal.pone.0155713', '10.1371/journal.pone.0178620',
       '10.1371/journal.pone.0135598', '10.1371/journal.pone.0178231',
       '10.1371/journal.pone.0156508', '10.1371/journal.pone.0118722',
       '10.1371/journal.pone.0179354', '10.1371/journal.pone.0221109',
       '10.1371/journal.pone.0171148', '10.1371/journal.pone.0152195',
       '10.1371/journal.pone.0131134', '10.1371/journal.pone.0216493',
      

In [44]:
# Label every row of this dataset with retraction_binary = 0.
no_retract['retraction_binary'] = 0
# Confirm the label was applied to every row, then preview.
print(no_retract['retraction_binary'].value_counts())
no_retract.head()

0    9214
Name: retraction_binary, dtype: int64


Unnamed: 0,id,doi,language,year,month,day,volume,issue,journal,title,page,text,abstract,keywords,publisher,retraction_binary
0,25569838,10.1371/journal.pone.0115528,eng,2015,1,8,10,1,PloS one,High incidence is not high exposure: what prop...,e0115528,Randomized clinical trials of HIV prevention i...,Objective,[],Public Library of Science,0
1,25569796,10.1371/journal.pone.0115194,eng,2015,1,8,10,1,PloS one,Neurological abnormalities in full-term asphyx...,e0115194,Perinatal asphyxia (PA) is a leading cause of ...,Background,[],Public Library of Science,0
2,25569682,10.1371/journal.pone.0117040,eng,2015,1,8,10,1,PloS one,Production of siderophores increases resistanc...,e0117040,"Fusaric acid (FA, 5-butylpyridine-2-carboxylic...",Fusaric acid is produced by pathogenic fungi o...,[],Public Library of Science,0
3,25569558,10.1371/journal.pone.0116930,eng,2015,1,8,10,1,PloS one,Application of clinico-radiologic-pathologic d...,e0116930,Diffuse parenchymal lung diseases in children ...,Diffuse parenchymal lung diseases in children ...,[],Public Library of Science,0
4,25569428,10.1371/journal.pone.0116566,eng,2015,1,8,10,1,PloS one,Cinnamon ameliorates experimental allergic enc...,e0116566,Regulatory T cells (Tregs) are regarded as the...,Upregulation and/or maintenance of regulatory ...,[],Public Library of Science,0


In [45]:
# Checkpoint the labelled frame to disk, then reload it. The saved index comes
# back as an 'Unnamed: 0' column, which is discarded straight away.
no_retract.to_csv('./no_null_text_plos_only_no_retraction_data.csv')
no_retract = pd.read_csv('./no_null_text_plos_only_no_retraction_data.csv').drop(columns=['Unnamed: 0'])

In [46]:
# Column overview (dtypes and non-null counts) of the reloaded frame.
no_retract.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9214 entries, 0 to 9213
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 9214 non-null   int64 
 1   doi                9214 non-null   object
 2   language           9214 non-null   object
 3   year               9214 non-null   int64 
 4   month              9214 non-null   int64 
 5   day                9214 non-null   int64 
 6   volume             9214 non-null   int64 
 7   issue              9214 non-null   int64 
 8   journal            9214 non-null   object
 9   title              9214 non-null   object
 10  page               9214 non-null   object
 11  text               9214 non-null   object
 12  abstract           9119 non-null   object
 13  keywords           9214 non-null   object
 14  publisher          9214 non-null   object
 15  retraction_binary  9214 non-null   int64 
dtypes: int64(7), object(9)
memory usage: 1.1+ 

In [47]:
# Distribution of the raw keyword strings: all but one article has an empty list.
no_retract['keywords'].value_counts()
# Manually checked 50 random articles; the counts above hold true.

[]                                                                            9213
['Socioeconomic inequality', 'Antenatal care', 'Decomposition', 'Nigeria']       1
Name: keywords, dtype: int64

In [None]:
# Build two cleaned versions of every non-retracted article's text:
#   clean_text     - word tokens joined by single spaces
#   clean_text_lem - the same tokens after WordNet lemmatization
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

clean_text = []
clean_text_lem = []

# Iterate over the values directly instead of looking rows up by label
# (no_retract['text'][i]), which only works while the index is exactly 0..n-1.
for position, raw_text in enumerate(no_retract['text']):
    kept_words = []
    kept_lemmas = []
    for token in tokenizer.tokenize(raw_text):
        try:
            int(token)
        except ValueError:
            # Not an integer. ValueError is the only failure int() has for a
            # string, so unrelated errors are no longer swallowed.
            # Tokens of 45 or more characters are discarded.
            # NOTE(review): presumably meant to drop run-together strings; confirm the cutoff.
            if len(token) < 45:
                kept_words.append(token)
                kept_lemmas.append(lemmatizer.lemmatize(token))
        # Tokens that parse as integers are dropped.

    clean_text.append(' '.join(kept_words))
    clean_text_lem.append(' '.join(kept_lemmas))

    # Progress marker every 500 articles instead of one printed line per row.
    if position % 500 == 0:
        print(position)

# Character length of the second article: raw text, cleaned, cleaned + lemmatized.
print(len(no_retract['text'].iloc[1]))
print(len(clean_text[1]))
print(len(clean_text_lem[1]))

In [49]:
# Attach both cleaned text versions as new columns and preview the result.
no_retract['clean_text'] = clean_text
no_retract['clean_text_lem'] = clean_text_lem
no_retract.head()

Unnamed: 0,id,doi,language,year,month,day,volume,issue,journal,title,page,text,abstract,keywords,publisher,retraction_binary,clean_text,clean_text_lem
0,25569838,10.1371/journal.pone.0115528,eng,2015,1,8,10,1,PloS one,High incidence is not high exposure: what prop...,e0115528,Randomized clinical trials of HIV prevention i...,Objective,[],Public Library of Science,0,Randomized clinical trials of HIV prevention i...,Randomized clinical trial of HIV prevention in...
1,25569796,10.1371/journal.pone.0115194,eng,2015,1,8,10,1,PloS one,Neurological abnormalities in full-term asphyx...,e0115194,Perinatal asphyxia (PA) is a leading cause of ...,Background,[],Public Library of Science,0,Perinatal asphyxia PA is a leading cause of mo...,Perinatal asphyxia PA is a leading cause of mo...
2,25569682,10.1371/journal.pone.0117040,eng,2015,1,8,10,1,PloS one,Production of siderophores increases resistanc...,e0117040,"Fusaric acid (FA, 5-butylpyridine-2-carboxylic...",Fusaric acid is produced by pathogenic fungi o...,[],Public Library of Science,0,Fusaric acid FA butylpyridine carboxylic acid ...,Fusaric acid FA butylpyridine carboxylic acid ...
3,25569558,10.1371/journal.pone.0116930,eng,2015,1,8,10,1,PloS one,Application of clinico-radiologic-pathologic d...,e0116930,Diffuse parenchymal lung diseases in children ...,Diffuse parenchymal lung diseases in children ...,[],Public Library of Science,0,Diffuse parenchymal lung diseases in children ...,Diffuse parenchymal lung disease in child chDP...
4,25569428,10.1371/journal.pone.0116566,eng,2015,1,8,10,1,PloS one,Cinnamon ameliorates experimental allergic enc...,e0116566,Regulatory T cells (Tregs) are regarded as the...,Upregulation and/or maintenance of regulatory ...,[],Public Library of Science,0,Regulatory T cells Tregs are regarded as the m...,Regulatory T cell Tregs are regarded a the mas...


In [61]:
# Row labels of the articles dated 2014.
no_retract.loc[no_retract['year']==2014].index

Int64Index([ 43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
            ...
            157, 158, 159, 160, 161, 162, 163, 164, 165, 166],
           dtype='int64', length=124)

In [68]:
# Remove the 2014 articles by filtering on the year itself, not on a
# hard-coded range of row labels copied from the previous cell's output.
# This selects the same 124 rows (labels 43..166), keeps working if the data
# or its ordering changes, and is safe to re-run: dropping labels that are
# already gone raises KeyError, filtering does not.
no_retract = no_retract.loc[no_retract['year'] != 2014]
# Articles per remaining publication year.
no_retract['year'].value_counts()

2019    1891
2016    1833
2015    1792
2018    1788
2017    1786
Name: year, dtype: int64

## Cleaning Combined Datasets

In [None]:
# Stack the retracted rows on top of the non-retracted rows. Both frames keep
# their own row labels, so labels repeat in the combined index.
# 'unpacked_keywords' exists only in `retract`, so it is missing for the
# non-retracted rows.
total = pd.concat([retract, no_retract], axis=0)
total

In [71]:
# Remove the 'id', 'language' and 'publisher' columns from the combined frame.
total = total.drop(columns=['id', 'language', 'publisher'])

In [72]:
# How often each 'page' value occurs.
total['page'].value_counts()

26          7
4           5
123         4
85          4
34          4
           ..
e0181077    1
e0189277    1
3852        1
e0186435    1
e0196047    1
Name: page, Length: 10451, dtype: int64

In [73]:
# Drop 'page': the count above shows 10451 distinct values, nearly one per article.
total = total.drop(columns=['page'])

In [74]:
# Store the date and issue fields as strings.
# NOTE(review): astype(str) turns missing values into the literal text 'nan',
# so these columns report no nulls afterwards even though `retract` had gaps
# in year/month/day/volume/issue. Confirm that this is intended.
total = total.astype({column: str for column in ['year', 'month', 'day', 'volume', 'issue']})

In [75]:
# Column overview (dtypes and non-null counts) of the combined frame.
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10643 entries, 0 to 9213
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   doi                10643 non-null  object
 1   year               10643 non-null  object
 2   month              10643 non-null  object
 3   day                10643 non-null  object
 4   volume             10643 non-null  object
 5   issue              10643 non-null  object
 6   journal            10643 non-null  object
 7   title              10643 non-null  object
 8   text               10643 non-null  object
 9   abstract           10528 non-null  object
 10  keywords           10643 non-null  object
 11  retraction_binary  10643 non-null  int64 
 12  unpacked_keywords  1553 non-null   object
 13  clean_text         10643 non-null  object
 14  clean_text_lem     10643 non-null  object
dtypes: int64(1), object(14)
memory usage: 1.3+ MB


In [76]:
# Checkpoint the combined frame and reload it; the reloaded frame gets a fresh 0..n-1 index.
# NOTE(review): unlike the earlier round trips, the saved index column
# ('Unnamed: 0') is not dropped after reloading. Confirm whether it should be.
total.to_csv('./total_plos_only_data.csv')
total = pd.read_csv('./total_plos_only_data.csv')

In [77]:
# The 25 most frequent DOIs; any count above 1 is a duplicated article.
total['doi'].value_counts().head(25)

10.1074/jbc.M111.329078           4
10.1371/journal.pone.0194078      2
10.1038/cr.2011.194               2
10.1074/jbc.M111.275073           2
10.1371/journal.pone.0164378      2
10.1038/cdd.2010.114              2
10.1371/journal.pone.0212021      2
10.1074/jbc.M110.175802           2
10.3390/nano4020203               2
10.1074/jbc.M808084200            2
10.1371/journal.pone.0216079      2
10.1371/journal.pone.0155697      2
10.1523/JNEUROSCI.0372-13.2013    2
10.1371/journal.pone.0146671      2
10.1074/jbc.M112.387738           2
10.1200/JCO.2017.74.7824          2
10.1155/2012/236409               2
10.1074/jbc.M709854200            2
10.1371/journal.pone.0140044      2
10.1371/journal.pone.0125542      2
10.3389/fnins.2018.00529          2
10.1371/journal.pone.0183066      1
10.1371/journal.pone.0225345      1
10.1371/journal.pone.0193981      1
10.1371/journal.pone.0166478      1
Name: doi, dtype: int64

In [78]:
# Keep one row per DOI. keep='first' retains the earliest occurrence; since the
# retracted rows were stacked first, a DOI present in both datasets keeps its
# retracted row.
total = total.drop_duplicates(subset='doi', keep='first')

In [79]:
# Verify the de-duplication: the most frequent DOI should now occur exactly once.
total['doi'].value_counts().head()

10.1371/journal.pone.0156737    1
10.1371/journal.pone.0164836    1
10.1371/journal.pone.0168679    1
10.1371/journal.pone.0210251    1
10.1371/journal.pone.0168758    1
Name: doi, dtype: int64

In [80]:
# Checkpoint the de-duplicated frame and reload it; the reloaded frame gets a fresh 0..n-1 index.
# NOTE(review): the saved index column is again kept after reloading. Confirm whether it should be dropped.
total.to_csv('./total_plos_only_data_no_duplicates.csv')
total = pd.read_csv('./total_plos_only_data_no_duplicates.csv')

In [81]:
# Inspect the cleaned text at positions 950-954.
total['clean_text'].iloc[950:955]

950    For the chemotherapeutic activity of pyrimidin...
951    Urinary tract infection UTI is a bacterial inf...
952    The Bible descrbies the case of a woman with h...
953                                                  NaN
954    Human betaherpesviruses 6A and 6B HHV 6A and H...
Name: clean_text, dtype: object

In [82]:
# Remove the row labelled 953, whose cleaned text is missing (NaN in the cell above).
# NOTE(review): the label is hard-coded from that output, and a second run
# raises KeyError because the label is already gone.
total = total.drop(index=953)
# Same positional window as before; label 953 should no longer appear.
total['clean_text'].iloc[950:955]

950    For the chemotherapeutic activity of pyrimidin...
951    Urinary tract infection UTI is a bacterial inf...
952    The Bible descrbies the case of a woman with h...
954    Human betaherpesviruses 6A and 6B HHV 6A and H...
955    A cesarean section is the most frequently perf...
Name: clean_text, dtype: object

In [83]:
# Labels from position 1536 onward, to locate where the retracted rows end and
# the non-retracted rows begin.
total['retraction_binary'].iloc[1536:]

1537     1
1538     0
1539     0
1540     0
1541     0
        ..
10615    0
10616    0
10617    0
10618    0
10619    0
Name: retraction_binary, Length: 9083, dtype: int64

In [84]:
# Write the final cleaned dataset, plus one file per class.
total.to_csv('./total_data_plos_only_cleaned.csv')
# Split on the label itself, not at the hard-coded position 1537. The
# retracted rows come first in `total`, so this writes the same rows as
# total[1537:] / total[:1537], and it stays correct if the number of
# retracted rows ever changes.
total[total['retraction_binary'] == 0].to_csv('./no_retraction_data_plos_only_cleaned.csv')
total[total['retraction_binary'] == 1].to_csv('./retraction_data_plos_only_cleaned.csv')