In [57]:
#importing the dependencies

import pandas as pd
import pycountry
import numpy as np

In [58]:
df = pd.read_csv('../data/external/scopus.csv')
df.columns

Index(['Authors', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Volume',
       'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count', 'Cited by',
       'DOI', 'Link', 'Affiliations', 'Authors with affiliations', 'Abstract',
       'Author Keywords', 'Index Keywords', 'Molecular Sequence Numbers',
       'Chemicals/CAS', 'Tradenames', 'Manufacturers', 'Funding Details',
       'Funding Text 1', 'Funding Text 2', 'Funding Text 3', 'Funding Text 4',
       'Funding Text 5', 'Funding Text 6', 'Funding Text 7', 'Funding Text 8',
       'Funding Text 9', 'Funding Text 10', 'References',
       'Correspondence Address', 'Editors', 'Sponsors', 'Publisher',
       'Conference name', 'Conference date', 'Conference location',
       'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID',
       'Language of Original Document', 'Abbreviated Source Title',
       'Document Type', 'Publication Stage', 'Source', 'EID'],
      dtype='object')

In [59]:
columns =['Title','Year','Source title', 'Affiliations', 'Authors with affiliations', 'Abstract','Cited by',
        'Author Keywords', 'Index Keywords','Funding Details', 'Correspondence Address','Publisher',  'Language of Original Document',  'Document Type']

df = df[columns]
df.head()

Unnamed: 0,Title,Year,Source title,Affiliations,Authors with affiliations,Abstract,Cited by,Author Keywords,Index Keywords,Funding Details,Correspondence Address,Publisher,Language of Original Document,Document Type
0,Severe proliferative retinopathy in a patient ...,2023,American Journal of Ophthalmology Case Reports,"Tufts Medical Center, 800 Washington Street, B...","Caranfa, J.T., Tufts Medical Center, 800 Washi...",[No abstract available],,,adult; Article; autoimmune hepatitis; blurred ...,,"Caranfa, J.T.; New England Eye Center, 800 Was...",Elsevier Inc.,English,Article
1,Plasma levels of E-selectin are associated wit...,2023,European Journal of Haematology,Centre de référence des syndromes drépanocytai...,"Agouti, I., Centre de référence des syndromes ...",Background: The vascular endothelium is marked...,,E-selectin; endothelium markers; sickle cell d...,creatinine; endothelial leukocyte adhesion mol...,2012‐13,"Agouti, I.; Laboratoire de Génétique Moléculai...",John Wiley and Sons Inc,English,Article
2,Routine Ophthalmological Examination Rates in ...,2023,International Journal of Environmental Researc...,"Albert Einstein College of Medicine, New York,...","Zulueta, P., Albert Einstein College of Medici...",The American Academy of Ophthalmology and the ...,,screening; sickle cell disease; sickle cell re...,blood; cell organelle; COVID-19; epidemic; hea...,"Albert Einstein College of Medicine, Yeshiva U...","Mian, U.K.; Department of Ophthalmology & Visu...",MDPI,English,Article
3,Progress in and Prospects of Genome Editing To...,2023,Genes,"Department of Physiology, Korea University Col...","Phan, H.T.L., Department of Physiology, Korea ...","Programmable nucleases, such as zinc finger nu...",,genome editing; mouse model of human disease; ...,atonal BHLH transcription factor 1; beta actin...,"2013M3A9D5072550, 2014M3A9D5A01075128, 2019R1A...","Lee, H.; Graduate School of Cancer Science and...",MDPI,English,Review
4,Screening for sickle-cell retinopathy,2023,Eye (Basingstoke),Northern Lincolnshire and Goole NHS Foundation...,"Ashwin, P.T., Northern Lincolnshire and Goole ...",[No abstract available],,,crizanlizumab; vasculotropin antibody; clinica...,,"Ashwin, P.T.; Northern Lincolnshire and Goole ...",Springer Nature,English,Letter


In [60]:
df.describe()

Unnamed: 0,Year,Cited by
count,799.0,656.0
mean,2003.77597,27.45122
std,16.625949,65.125697
min,1954.0,1.0
25%,1989.5,4.0
50%,2010.0,12.0
75%,2018.0,27.0
max,2023.0,974.0


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799 entries, 0 to 798
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Title                          799 non-null    object 
 1   Year                           799 non-null    int64  
 2   Source title                   799 non-null    object 
 3   Affiliations                   745 non-null    object 
 4   Authors with affiliations      797 non-null    object 
 5   Abstract                       799 non-null    object 
 6   Cited by                       656 non-null    float64
 7   Author Keywords                352 non-null    object 
 8   Index Keywords                 745 non-null    object 
 9   Funding Details                193 non-null    object 
 10  Correspondence Address         714 non-null    object 
 11  Publisher                      441 non-null    object 
 12  Language of Original Document  799 non-null    obj

# Cleaning the data

## Afiliations

1. Extract the countries from the text into a list of countries

In [62]:
#Functions
        
def extract_countries(affiliations :str)-> list:
    """ Extracts countries from a string of affiliations.

    Args:
        affiliations (str): A string of affiliations.

    Returns:
        list:  A list of countries extracted from the affiliations
    """
    countries = []
    other_countries = [' South Korea', ' Iran', ' Toronto', " Cote d'Ivoire", ' Lagos University Teaching Hospital.', ' Lagos.', ' North Carolina',' Créteil.', ' Southern California College of Optometry']
    for country in pycountry.countries:
        if country.name in affiliations:
            countries.append(country.name) 

    for other_country in other_countries:
        if other_country in affiliations.split(','):
            stripped = other_country.strip()
            if stripped in ['Lagos University Teaching Hospital.', 'Lagos.']:
                stripped = 'Nigeria'
            if stripped == 'Toronto':
                stripped = 'Canada'
            if stripped == 'North Carolina' or stripped == 'Southern California College of Optometry':
                stripped = 'United States'
            if stripped =='Créteil.':
                stripped = 'France'
            countries.append(stripped)   
    return (countries)





In [63]:
text =  "United States (New York), United Kingdom (London), Iran"
print(extract_countries(text))

['United Kingdom', 'United States', 'Iran']


In [64]:
df['Affiliations'] = df['Affiliations'].astype('object')
df.dtypes

Title                             object
Year                               int64
Source title                      object
Affiliations                      object
Authors with affiliations         object
Abstract                          object
Cited by                         float64
Author Keywords                   object
Index Keywords                    object
Funding Details                   object
Correspondence Address            object
Publisher                         object
Language of Original Document     object
Document Type                     object
dtype: object

In [65]:
df['Affiliations'].fillna('', inplace=True)

In [66]:
df['Affiliations'].isnull().sum()

0

In [67]:
#extract countries from dataframe rows using the extract_country function
countries = df['Affiliations'].apply(extract_countries)


In [68]:
countries

0           [United States]
1      [France, Guadeloupe]
2           [United States]
3             [South Korea]
4          [United Kingdom]
               ...         
794                      []
795                      []
796                      []
797                      []
798                      []
Name: Affiliations, Length: 799, dtype: object

In [69]:
df['countries'] = countries

In [70]:
df.head()

Unnamed: 0,Title,Year,Source title,Affiliations,Authors with affiliations,Abstract,Cited by,Author Keywords,Index Keywords,Funding Details,Correspondence Address,Publisher,Language of Original Document,Document Type,countries
0,Severe proliferative retinopathy in a patient ...,2023,American Journal of Ophthalmology Case Reports,"Tufts Medical Center, 800 Washington Street, B...","Caranfa, J.T., Tufts Medical Center, 800 Washi...",[No abstract available],,,adult; Article; autoimmune hepatitis; blurred ...,,"Caranfa, J.T.; New England Eye Center, 800 Was...",Elsevier Inc.,English,Article,[United States]
1,Plasma levels of E-selectin are associated wit...,2023,European Journal of Haematology,Centre de référence des syndromes drépanocytai...,"Agouti, I., Centre de référence des syndromes ...",Background: The vascular endothelium is marked...,,E-selectin; endothelium markers; sickle cell d...,creatinine; endothelial leukocyte adhesion mol...,2012‐13,"Agouti, I.; Laboratoire de Génétique Moléculai...",John Wiley and Sons Inc,English,Article,"[France, Guadeloupe]"
2,Routine Ophthalmological Examination Rates in ...,2023,International Journal of Environmental Researc...,"Albert Einstein College of Medicine, New York,...","Zulueta, P., Albert Einstein College of Medici...",The American Academy of Ophthalmology and the ...,,screening; sickle cell disease; sickle cell re...,blood; cell organelle; COVID-19; epidemic; hea...,"Albert Einstein College of Medicine, Yeshiva U...","Mian, U.K.; Department of Ophthalmology & Visu...",MDPI,English,Article,[United States]
3,Progress in and Prospects of Genome Editing To...,2023,Genes,"Department of Physiology, Korea University Col...","Phan, H.T.L., Department of Physiology, Korea ...","Programmable nucleases, such as zinc finger nu...",,genome editing; mouse model of human disease; ...,atonal BHLH transcription factor 1; beta actin...,"2013M3A9D5072550, 2014M3A9D5A01075128, 2019R1A...","Lee, H.; Graduate School of Cancer Science and...",MDPI,English,Review,[South Korea]
4,Screening for sickle-cell retinopathy,2023,Eye (Basingstoke),Northern Lincolnshire and Goole NHS Foundation...,"Ashwin, P.T., Northern Lincolnshire and Goole ...",[No abstract available],,,crizanlizumab; vasculotropin antibody; clinica...,,"Ashwin, P.T.; Northern Lincolnshire and Goole ...",Springer Nature,English,Letter,[United Kingdom]


# Year

In [71]:
df['Year'].value_counts()

Year
2021    50
2019    47
2022    45
2020    36
2017    35
        ..
1964     1
1963     1
1960     1
1958     1
1954     1
Name: count, Length: 61, dtype: int64

# Author and Index Keywords
1. Split on ; into a list of keywords

In [72]:
cols = ['Author Keywords', 'Index Keywords']
keywords_df  = df[cols]
keywords_df = keywords_df.apply(lambda row: row.str.split(';'))
df[cols] = keywords_df


In [73]:
df.head()

Unnamed: 0,Title,Year,Source title,Affiliations,Authors with affiliations,Abstract,Cited by,Author Keywords,Index Keywords,Funding Details,Correspondence Address,Publisher,Language of Original Document,Document Type,countries
0,Severe proliferative retinopathy in a patient ...,2023,American Journal of Ophthalmology Case Reports,"Tufts Medical Center, 800 Washington Street, B...","Caranfa, J.T., Tufts Medical Center, 800 Washi...",[No abstract available],,,"[adult, Article, autoimmune hepatitis, blur...",,"Caranfa, J.T.; New England Eye Center, 800 Was...",Elsevier Inc.,English,Article,[United States]
1,Plasma levels of E-selectin are associated wit...,2023,European Journal of Haematology,Centre de référence des syndromes drépanocytai...,"Agouti, I., Centre de référence des syndromes ...",Background: The vascular endothelium is marked...,,"[E-selectin, endothelium markers, sickle cel...","[creatinine, endothelial leukocyte adhesion m...",2012‐13,"Agouti, I.; Laboratoire de Génétique Moléculai...",John Wiley and Sons Inc,English,Article,"[France, Guadeloupe]"
2,Routine Ophthalmological Examination Rates in ...,2023,International Journal of Environmental Researc...,"Albert Einstein College of Medicine, New York,...","Zulueta, P., Albert Einstein College of Medici...",The American Academy of Ophthalmology and the ...,,"[screening, sickle cell disease, sickle cell...","[blood, cell organelle, COVID-19, epidemic,...","Albert Einstein College of Medicine, Yeshiva U...","Mian, U.K.; Department of Ophthalmology & Visu...",MDPI,English,Article,[United States]
3,Progress in and Prospects of Genome Editing To...,2023,Genes,"Department of Physiology, Korea University Col...","Phan, H.T.L., Department of Physiology, Korea ...","Programmable nucleases, such as zinc finger nu...",,"[genome editing, mouse model of human disease...","[atonal BHLH transcription factor 1, beta act...","2013M3A9D5072550, 2014M3A9D5A01075128, 2019R1A...","Lee, H.; Graduate School of Cancer Science and...",MDPI,English,Review,[South Korea]
4,Screening for sickle-cell retinopathy,2023,Eye (Basingstoke),Northern Lincolnshire and Goole NHS Foundation...,"Ashwin, P.T., Northern Lincolnshire and Goole ...",[No abstract available],,,"[crizanlizumab, vasculotropin antibody, clin...",,"Ashwin, P.T.; Northern Lincolnshire and Goole ...",Springer Nature,English,Letter,[United Kingdom]


In [74]:
df[cols].isnull().sum()

Author Keywords    447
Index Keywords      54
dtype: int64

In [75]:
keywords_df  = df[cols]
keywords_df.head()

Unnamed: 0,Author Keywords,Index Keywords
0,,"[adult, Article, autoimmune hepatitis, blur..."
1,"[E-selectin, endothelium markers, sickle cel...","[creatinine, endothelial leukocyte adhesion m..."
2,"[screening, sickle cell disease, sickle cell...","[blood, cell organelle, COVID-19, epidemic,..."
3,"[genome editing, mouse model of human disease...","[atonal BHLH transcription factor 1, beta act..."
4,,"[crizanlizumab, vasculotropin antibody, clin..."


In [76]:
keywords_df.fillna("",inplace=True)
keywords_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keywords_df.fillna("",inplace=True)


Unnamed: 0,Author Keywords,Index Keywords
0,,"[adult, Article, autoimmune hepatitis, blur..."
1,"[E-selectin, endothelium markers, sickle cel...","[creatinine, endothelial leukocyte adhesion m..."
2,"[screening, sickle cell disease, sickle cell...","[blood, cell organelle, COVID-19, epidemic,..."
3,"[genome editing, mouse model of human disease...","[atonal BHLH transcription factor 1, beta act..."
4,,"[crizanlizumab, vasculotropin antibody, clin..."


In [77]:
df[cols]=keywords_df
df[cols].isnull().sum()

Author Keywords    0
Index Keywords     0
dtype: int64

# Funding details

In [78]:
cols=['Funding Details']
funding_df = df[cols]
funding_df.head()

Unnamed: 0,Funding Details
0,
1,2012‐13
2,"Albert Einstein College of Medicine, Yeshiva U..."
3,"2013M3A9D5072550, 2014M3A9D5A01075128, 2019R1A..."
4,


In [79]:
#maybe some named entity recognition to determine the organisations in each row

import spacy
from spacy import displacy


In [80]:
#loading spacy
nlp = spacy.load("en_core_web_sm")


In [81]:
#testing
txt = nlp(funding_df['Funding Details'].values[15])
displacy.render(txt[:], style='ent', jupyter=True)

In [82]:
#function
def ent(row :str) -> list:
    """Extracts named entities of type "ORG" from a string using spacy.

    Args:
        row (str): A string to extract named entities from.

    Returns:
        list: A list of named entities of type "ORG" extracted from the string.
    """
    if type(row) == str:
        return [x for x in (nlp(row)).ents if x.label_ == "ORG"]
    else:
        return ('')
    


In [83]:
funding_df['Funding Details'] = funding_df['Funding Details'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  funding_df['Funding Details'] = funding_df['Funding Details'].astype(str)


In [84]:
type(funding_df['Funding Details'].values[0])

str

In [85]:
funding_df['Funding Details'] = funding_df['Funding Details'].apply(lambda row: ent(row))
funding_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  funding_df['Funding Details'] = funding_df['Funding Details'].apply(lambda row: ent(row))


Unnamed: 0,Funding Details
0,[]
1,[]
2,"[(Albert, Einstein, College, of, Medicine), (Y..."
3,"[(National, Research, Foundation)]"
4,[]


replace empty funding details with unfunded?

In [86]:
#number of empty funding details
print(len([i for i in funding_df['Funding Details'] if len(i) == 0])
)

#rows with no funding
funding_df[funding_df['Funding Details'].str.len() == 0]

610


Unnamed: 0,Funding Details
0,[]
1,[]
4,[]
7,[]
8,[]
...,...
794,[]
795,[]
796,[]
797,[]


In [87]:
df['Funding Entities'] = funding_df
df.columns

Index(['Title', 'Year', 'Source title', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Cited by', 'Author Keywords',
       'Index Keywords', 'Funding Details', 'Correspondence Address',
       'Publisher', 'Language of Original Document', 'Document Type',
       'countries', 'Funding Entities'],
      dtype='object')

# First Author country
Assuming the corresponding author is the first author

In [88]:
cols =['Correspondence Address']
first_author_df = df[cols]
first_author_df.head()

Unnamed: 0,Correspondence Address
0,"Caranfa, J.T.; New England Eye Center, 800 Was..."
1,"Agouti, I.; Laboratoire de Génétique Moléculai..."
2,"Mian, U.K.; Department of Ophthalmology & Visu..."
3,"Lee, H.; Graduate School of Cancer Science and..."
4,"Ashwin, P.T.; Northern Lincolnshire and Goole ..."


In [89]:
first_author_df.isnull().sum()

Correspondence Address    85
dtype: int64

In [90]:
first_author_df.fillna('', inplace=True)
first_author_df.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_author_df.fillna('', inplace=True)


Correspondence Address    0
dtype: int64

In [91]:
first_author_df['Correspondence Address'] = first_author_df['Correspondence Address'].apply(lambda row: extract_countries(row))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_author_df['Correspondence Address'] = first_author_df['Correspondence Address'].apply(lambda row: extract_countries(row))


In [92]:
first_author_df.head()

Unnamed: 0,Correspondence Address
0,[United States]
1,[France]
2,[United States]
3,[]
4,[United Kingdom]


In [93]:
df['Main Country'] = first_author_df
df.head()

Unnamed: 0,Title,Year,Source title,Affiliations,Authors with affiliations,Abstract,Cited by,Author Keywords,Index Keywords,Funding Details,Correspondence Address,Publisher,Language of Original Document,Document Type,countries,Funding Entities,Main Country
0,Severe proliferative retinopathy in a patient ...,2023,American Journal of Ophthalmology Case Reports,"Tufts Medical Center, 800 Washington Street, B...","Caranfa, J.T., Tufts Medical Center, 800 Washi...",[No abstract available],,,"[adult, Article, autoimmune hepatitis, blur...",,"Caranfa, J.T.; New England Eye Center, 800 Was...",Elsevier Inc.,English,Article,[United States],[],[United States]
1,Plasma levels of E-selectin are associated wit...,2023,European Journal of Haematology,Centre de référence des syndromes drépanocytai...,"Agouti, I., Centre de référence des syndromes ...",Background: The vascular endothelium is marked...,,"[E-selectin, endothelium markers, sickle cel...","[creatinine, endothelial leukocyte adhesion m...",2012‐13,"Agouti, I.; Laboratoire de Génétique Moléculai...",John Wiley and Sons Inc,English,Article,"[France, Guadeloupe]",[],[France]
2,Routine Ophthalmological Examination Rates in ...,2023,International Journal of Environmental Researc...,"Albert Einstein College of Medicine, New York,...","Zulueta, P., Albert Einstein College of Medici...",The American Academy of Ophthalmology and the ...,,"[screening, sickle cell disease, sickle cell...","[blood, cell organelle, COVID-19, epidemic,...","Albert Einstein College of Medicine, Yeshiva U...","Mian, U.K.; Department of Ophthalmology & Visu...",MDPI,English,Article,[United States],"[(Albert, Einstein, College, of, Medicine), (Y...",[United States]
3,Progress in and Prospects of Genome Editing To...,2023,Genes,"Department of Physiology, Korea University Col...","Phan, H.T.L., Department of Physiology, Korea ...","Programmable nucleases, such as zinc finger nu...",,"[genome editing, mouse model of human disease...","[atonal BHLH transcription factor 1, beta act...","2013M3A9D5072550, 2014M3A9D5A01075128, 2019R1A...","Lee, H.; Graduate School of Cancer Science and...",MDPI,English,Review,[South Korea],"[(National, Research, Foundation)]",[]
4,Screening for sickle-cell retinopathy,2023,Eye (Basingstoke),Northern Lincolnshire and Goole NHS Foundation...,"Ashwin, P.T., Northern Lincolnshire and Goole ...",[No abstract available],,,"[crizanlizumab, vasculotropin antibody, clin...",,"Ashwin, P.T.; Northern Lincolnshire and Goole ...",Springer Nature,English,Letter,[United Kingdom],[],[United Kingdom]


## More cleaning

In [94]:
df.columns

Index(['Title', 'Year', 'Source title', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Cited by', 'Author Keywords',
       'Index Keywords', 'Funding Details', 'Correspondence Address',
       'Publisher', 'Language of Original Document', 'Document Type',
       'countries', 'Funding Entities', 'Main Country'],
      dtype='object')

In [95]:
df['Cited by'].isnull().sum()

143

In [99]:
citations_df = df['Cited by']
citations_df.dtypes

dtype('float64')

In [101]:
cols = ['Title', 'Year', 'Source title', 'Abstract', 'Index Keywords', 'Cited by',
        'Publisher', 'Language of Original Document', 'Document Type',
        'countries', 'Funding Entities', 'Main Country']

df2 = df[cols]
df2.head()

Unnamed: 0,Title,Year,Source title,Abstract,Index Keywords,Cited by,Publisher,Language of Original Document,Document Type,countries,Funding Entities,Main Country
0,Severe proliferative retinopathy in a patient ...,2023,American Journal of Ophthalmology Case Reports,[No abstract available],"[adult, Article, autoimmune hepatitis, blur...",,Elsevier Inc.,English,Article,[United States],[],[United States]
1,Plasma levels of E-selectin are associated wit...,2023,European Journal of Haematology,Background: The vascular endothelium is marked...,"[creatinine, endothelial leukocyte adhesion m...",,John Wiley and Sons Inc,English,Article,"[France, Guadeloupe]",[],[France]
2,Routine Ophthalmological Examination Rates in ...,2023,International Journal of Environmental Researc...,The American Academy of Ophthalmology and the ...,"[blood, cell organelle, COVID-19, epidemic,...",,MDPI,English,Article,[United States],"[(Albert, Einstein, College, of, Medicine), (Y...",[United States]
3,Progress in and Prospects of Genome Editing To...,2023,Genes,"Programmable nucleases, such as zinc finger nu...","[atonal BHLH transcription factor 1, beta act...",,MDPI,English,Review,[South Korea],"[(National, Research, Foundation)]",[]
4,Screening for sickle-cell retinopathy,2023,Eye (Basingstoke),[No abstract available],"[crizanlizumab, vasculotropin antibody, clin...",,Springer Nature,English,Letter,[United Kingdom],[],[United Kingdom]


In [102]:
df2.isnull().sum()

Title                              0
Year                               0
Source title                       0
Abstract                           0
Index Keywords                     0
Cited by                         143
Publisher                        358
Language of Original Document      0
Document Type                      3
countries                          0
Funding Entities                   0
Main Country                       0
dtype: int64

In [103]:
df2['Publisher'].value_counts()

Publisher
BMJ Publishing Group                                                  35
Elsevier Inc.                                                         32
Lippincott Williams and Wilkins                                       26
Elsevier Masson SAS                                                   14
Wiley-Liss Inc.                                                       11
                                                                      ..
Korean Society of Ultrasound in Medicine                               1
National Academy of Sciences                                           1
American Society for Pharmacology and Experimental Therapy (ASPET)     1
Iranian Society of Ophthalmology                                       1
Birkhäuser-Verlag                                                      1
Name: count, Length: 131, dtype: int64

In [104]:
df2['Publisher'].fillna('',inplace = True)
df2['Document Type'].fillna('', inplace=True)
df2.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Publisher'].fillna('',inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Document Type'].fillna('', inplace=True)


Title                              0
Year                               0
Source title                       0
Abstract                           0
Index Keywords                     0
Cited by                         143
Publisher                          0
Language of Original Document      0
Document Type                      0
countries                          0
Funding Entities                   0
Main Country                       0
dtype: int64

In [105]:
df2.head()

Unnamed: 0,Title,Year,Source title,Abstract,Index Keywords,Cited by,Publisher,Language of Original Document,Document Type,countries,Funding Entities,Main Country
0,Severe proliferative retinopathy in a patient ...,2023,American Journal of Ophthalmology Case Reports,[No abstract available],"[adult, Article, autoimmune hepatitis, blur...",,Elsevier Inc.,English,Article,[United States],[],[United States]
1,Plasma levels of E-selectin are associated wit...,2023,European Journal of Haematology,Background: The vascular endothelium is marked...,"[creatinine, endothelial leukocyte adhesion m...",,John Wiley and Sons Inc,English,Article,"[France, Guadeloupe]",[],[France]
2,Routine Ophthalmological Examination Rates in ...,2023,International Journal of Environmental Researc...,The American Academy of Ophthalmology and the ...,"[blood, cell organelle, COVID-19, epidemic,...",,MDPI,English,Article,[United States],"[(Albert, Einstein, College, of, Medicine), (Y...",[United States]
3,Progress in and Prospects of Genome Editing To...,2023,Genes,"Programmable nucleases, such as zinc finger nu...","[atonal BHLH transcription factor 1, beta act...",,MDPI,English,Review,[South Korea],"[(National, Research, Foundation)]",[]
4,Screening for sickle-cell retinopathy,2023,Eye (Basingstoke),[No abstract available],"[crizanlizumab, vasculotropin antibody, clin...",,Springer Nature,English,Letter,[United Kingdom],[],[United Kingdom]


In [106]:
df2.to_csv('../data/interim/data.csv')