## We'll start with importing our libraries and reading the dataset

In [1]:
import requests
import re 
import pandas as pd
import numpy as np
from urllib.error import HTTPError
from bs4 import BeautifulSoup

In [2]:
df = pd.read_csv('covid_abstracts.csv')

In [3]:
df1 = pd.read_csv("Covid_Papers.csv")

In [5]:
df

Unnamed: 0,title,abstract,url
0,Real-World Experience with COVID-19 Including...,This article summarizes the experiences of COV...,https://pubmed.ncbi.nlm.nih.gov/35008137
1,Successful outcome of pre-engraftment COVID-19...,Coronavirus disease 2019 COVID-19 caused by...,https://pubmed.ncbi.nlm.nih.gov/35008104
2,The impact of COVID-19 on oncology professiona...,BACKGROUND COVID-19 has had a significant imp...,https://pubmed.ncbi.nlm.nih.gov/35007996
3,ICU admission and mortality classifiers for CO...,The coronavirus disease 2019 COVID-19 which ...,https://pubmed.ncbi.nlm.nih.gov/35007991
4,Clinical evaluation of nasopharyngeal midturb...,In the setting of supply chain shortages of na...,https://pubmed.ncbi.nlm.nih.gov/35007959
...,...,...,...
9995,Rooming-in Breastfeeding and Neonatal Follow-...,INTRODUCTION Due to growing evidence suggesti...,https://pubmed.ncbi.nlm.nih.gov/34851815
9996,Acute Retinal Necrosis from Reactivation of Va...,PURPOSE To report a case of acute retinal nec...,https://pubmed.ncbi.nlm.nih.gov/34851795
9997,Acute Abducens Nerve Palsy Following the Secon...,The authors report the case of an otherwise he...,https://pubmed.ncbi.nlm.nih.gov/34851785
9998,Planning and Implementing the Protocol for Psy...,The present study aims to plan the protocol fo...,https://pubmed.ncbi.nlm.nih.gov/34851781


In [7]:
# Remove the "Unnamed: 0" column from df1 in place (presumably the index that
# was saved along with the CSV -- verify against the file if in doubt).
df1.drop(columns="Unnamed: 0", inplace=True)

In [11]:
df1

Unnamed: 0,title,abstract,tags
0,Real-World Experience with COVID-19 Including...,This article summarizes the experiences of COV...,"['Aged', 'Aged, 80 and over', 'Antibodies, Mon..."
1,Successful outcome of pre-engraftment COVID-19...,Coronavirus disease 2019 COVID-19 caused by...,"['COVID-19*', 'Hematopoietic Stem Cell Transpl..."
2,The impact of COVID-19 on oncology professiona...,BACKGROUND COVID-19 has had a significant imp...,"['Burnout, Professional* / epidemiology', 'COV..."
3,ICU admission and mortality classifiers for CO...,The coronavirus disease 2019 COVID-19 which ...,"['Bayes Theorem', 'COVID-19*', 'Hospitalizatio..."
4,Clinical evaluation of nasopharyngeal midturb...,In the setting of supply chain shortages of na...,"['COVID-19 Testing', 'COVID-19* / diagnosis', ..."
...,...,...,...
5468,Hypersensitivity Reactions to Vaccines Curren...,The first reports of hypersensitivity reaction...,"['Anaphylaxis* / chemically induced', 'COVID-1..."
5469,Rooming-in Breastfeeding and Neonatal Follow-...,INTRODUCTION Due to growing evidence suggesti...,"['Breast Feeding*', 'COVID-19*', 'Female', 'Fo..."
5470,Acute Abducens Nerve Palsy Following the Secon...,The authors report the case of an otherwise he...,['Abducens Nerve Diseases* / chemically induce...
5471,Planning and Implementing the Protocol for Psy...,The present study aims to plan the protocol fo...,"['COVID-19*', 'Delivery of Health Care', 'Huma..."


In [15]:
df['title']

0       Real-World Experience with COVID-19  Including...
1       Successful outcome of pre-engraftment COVID-19...
2       The impact of COVID-19 on oncology professiona...
3       ICU admission and mortality classifiers for CO...
4       Clinical evaluation of nasopharyngeal  midturb...
                              ...                        
9995    Rooming-in  Breastfeeding and Neonatal Follow-...
9996    Acute Retinal Necrosis from Reactivation of Va...
9997    Acute Abducens Nerve Palsy Following the Secon...
9998    Planning and Implementing the Protocol for Psy...
9999    Prolonged corrected QT interval in hospitalize...
Name: title, Length: 10000, dtype: object

In [16]:
# Keep only the papers from df whose title does not occur in the tagged set df1.
already_tagged = df['title'].isin(df1['title'])
new_df = df[~already_tagged]

In [18]:
new_df.to_csv("NonTaggedPapers.csv")

## Next we loop through the dataset, request the URL given in each row, and store the downloaded HTML page in a list

In [None]:
# Download the page behind every URL in the dataset. Each entry of `htmls` is
# either the requests.Response for that row or None when the request itself
# failed, so the list stays aligned with the rows of `df`.
# Note: an HTTP error status such as 404 does not raise; the response is stored
# as-is and simply contains no keywords when parsed later.
htmls = []
for url in df['url']:
    try:
        # The timeout keeps one unresponsive server from stalling the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # requests reports connection problems, timeouts and invalid URLs
        # through its own exception hierarchy, never urllib's HTTPError.
        htmls.append(None)
    else:
        htmls.append(response)

In [None]:
len(htmls)

## Next we need to process the html to find our relevant text. Here I'll run the process on an individual case, so you can see how we're handling each link

In [None]:
a = htmls

In [None]:
a[0]

In [None]:
# Parse the first downloaded page, look up the element whose id is
# 'mesh-terms', and print its markup in an indented, readable form.
soup = BeautifulSoup(a[0].content, features="html.parser")
results = soup.find(attrs={"id": "mesh-terms"})
print(results.prettify())

In [None]:
re.findall(r'Toggle dropdown menu for keyword (.*)" class', str(results))

## Next we need to run it on the entire list of 10,000 pages and then add the result as a new column to our dataframe

In [None]:
# Extract the keywords from every downloaded page. `tags` gets one entry per
# page, in the same order as `a`:
#   - a list of keyword strings (possibly empty) when the page was parsed,
#   - None when there is no page or parsing it failed.
# The pattern is compiled once because it is reused for every page.
keyword_pattern = re.compile(r'Toggle dropdown menu for keyword (.*)" class')

tags = []
for page in a:
    if page is None:
        # The download failed for this row, so there is nothing to parse.
        tags.append(None)
        continue
    try:
        soup = BeautifulSoup(page.content, "html.parser")
        results = soup.find(id='mesh-terms')
        # When the element is missing, `results` is None and str(None)
        # contains no match, so such pages yield an empty list.
        found = keyword_pattern.findall(str(results))
    except Exception:
        # Best effort: a page that cannot be parsed is recorded as missing.
        # Unlike a bare `except:`, this still lets KeyboardInterrupt stop the
        # loop when the run is interrupted by hand.
        tags.append(None)
    else:
        tags.append(found)

In [None]:
df['tags'] = tags

In [None]:
df['tags']

## As we can see on row 9996, some rows hold an empty list instead of a NaN value. pandas does not recognise an empty list as a null value, so we would not be able to filter those rows out. I'm therefore converting all the empty lists into NaN

In [None]:
# Normalise the tags column so that "no tags" is always NaN: non-empty lists
# are kept, while empty lists, None (pages that could not be fetched or parsed)
# and existing NaN values all map to NaN. Calling len() on None or NaN would
# raise TypeError, so the type is checked first; this also makes the cell safe
# to run more than once.
df.tags = df.tags.apply(lambda y: y if isinstance(y, list) and y else np.nan)

In [None]:
df.info()

## Nearly half of our rows have no tag values. One reason for this could be that the link was no longer accessible, as we had seen previously. However, it is unlikely that half the dataset consists of dead links.

In [None]:
df[df['tags'].isnull()]

## Let's look at an individual case, for example row 14. Earlier we found that the first link returning a 404 error was at row 70, so row 14's link should be accessible.
## After manually opening the link, it loads fine. However, a new problem is that the author has not specified any MeSH terms (i.e. keywords). As a result, our parser returned nothing.
## The combination of dead links and papers without MeSH terms means half our dataset will need to be discarded, at least in the training phase.

In [None]:
# Discard every row whose tags value is missing.
df = df.dropna(subset=['tags'])

In [None]:
df.info()

In [None]:
df.head()

## We can also drop the url column, as we no longer need it to retrieve our tags

In [None]:
# Remove the url column. The result is rebound rather than dropped in place:
# `df` was produced by filtering another frame, and modifying such a selection
# in place can trigger pandas' SettingWithCopyWarning.
df = df.drop(columns='url')

## Now we finally have our clean and complete dataset and are ready to use it!

In [None]:
df.head(10)

In [None]:
# Save the cleaned, tagged dataset to Covid_Papers.csv. The DataFrame index is
# written as well, because to_csv includes it by default.
df.to_csv('Covid_Papers.csv')