# Package Library

In [None]:
!pip install BeautifulSoup
!pip install requests
!pip install selenium

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.request

# Creates a list of article acronyms and their links

In [None]:
my_url = "https://www.wikijournalclub.org/wiki/WikiJournalClub:Usable_articles"

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

# This request object will integrate your URL and the headers defined above
req = urllib.request.Request(url=my_url, headers=headers)

# Calling urlopen this way will automatically handle closing the request
with urllib.request.urlopen(req) as response:
    page_html = response.read()

soup = BeautifulSoup(page_html, "html.parser")

# Adding the XPath expression to select the desired elements. This expression was gotten from the Chrome extension SelectorGadget
selected_elements = soup.select('div#mw-content-text li a')

article_acronym = []
article_link = []

# Looping through the selected elements to extract text and links
for element in selected_elements:
    text = element.get_text()
    link = "https://www.wikijournalclub.org/" + element.get('href')
    article_acronym.append(text)
    article_link.append(link)

df = pd.DataFrame(list(zip(article_acronym, article_link)),
               columns =['Acronym', 'link'])

# Clicks each link and colects text and pdf

In [None]:
# Create lists to store text and pdf links
text_list = []
pdf_list = []

# Cliking links and scraping text and pdf links
for url in df['link']:
    req = urllib.request.Request(url=url, headers=headers)

    with urllib.request.urlopen(req) as response:
        page_html = response.read()

    soup = BeautifulSoup(page_html, "html.parser")

    text = soup.find("div", class_="vector-body")
    pdf_link = soup.find("span", class_="pdflink")

    text_list.append(text.get_text(strip=True) if text else 'N/A')
    pdf_list.append(pdf_link.find("a")["href"] if pdf_link else 'N/A')

# Adding it all to the dataframe
df['Text'] = text_list
df['PDF_Link'] = pdf_list

print(df)

                   Acronym                                               link  \
0                    EXTRA        https://www.wikijournalclub.org//wiki/EXTRA   
1                      ACT          https://www.wikijournalclub.org//wiki/ACT   
2     ATLAS ACS-2, TIMI 51  https://www.wikijournalclub.org//wiki/ATLAS_AC...   
3                CAPRICORN    https://www.wikijournalclub.org//wiki/CAPRICORN   
4                   CAST I       https://www.wikijournalclub.org//wiki/CAST_I   
...                    ...                                                ...   
1085                 TRICC        https://www.wikijournalclub.org//wiki/TRICC   
1086                 ERSPC        https://www.wikijournalclub.org//wiki/ERSPC   
1087                  PCPT         https://www.wikijournalclub.org//wiki/PCPT   
1088             SWOG 8949    https://www.wikijournalclub.org//wiki/SWOG_8949   
1089           S-HYDRACYST  https://www.wikijournalclub.org//wiki/S-HYDRACYST   

                           

In [None]:
from google.colab import files
df.to_csv('df.csv')
files.download('df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Extracting text from NEJM

Downloading the pdfs didn't really work, so now I will scrappe the full text from the journal's website.

The New England Journal of Medicine has by far the most number of artciles in the Wiki Journal page. As such, I will get the full links for the articles in the NEJM, and then write script to extract the text specific for the journals website.

In [None]:
nejm_df = df[df['PDF_Link'].str.contains('NEJM', case=False)]

nejm_df

# Create lists to store text and pdf links
article_link = []
full_article = []

# Cliking links and scraping text and pdf links
for url in nejm_df['link']:
    req = urllib.request.Request(url=url, headers=headers)

    with urllib.request.urlopen(req) as response:
        page_html = response.read()

    soup = BeautifulSoup(page_html, "html.parser")

    full_link = soup.find("span", class_="fulltextlink")
    article_link.append(full_link.find("a")["href"] if full_link else 'N/A')


# Adding it all to the dataframe
nejm_df['Full Article Link'] = article_link



In [None]:
nejm_df

nejm_df.to_csv('nejm_df.csv')
files.download('nejm_df.csv')

# Extracting text from Lancet



In [None]:
lancet_df = df[df['PDF_Link'].str.contains('lancet', case=False)]

lancet_df

# Create lists to store text and pdf links
article_link = []
full_article = []

# Cliking links and scraping text and pdf links
for url in lancet_df['link']:
    req = urllib.request.Request(url=url, headers=headers)

    with urllib.request.urlopen(req) as response:
        page_html = response.read()

    soup = BeautifulSoup(page_html, "html.parser")

    full_link = soup.find("span", class_="fulltextlink")
    article_link.append(full_link.find("a")["href"] if full_link else 'N/A')


# Adding it all to the dataframe
lancet_df['Full Article Link'] = article_link

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lancet_df['Full Article Link'] = article_link


In [None]:
lancet_df

lancet_df.to_csv('lancet_df.csv')
files.download('lancet_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Extracting the Text

The NEJM has a pretty tricky captcha to fool by using beuatifulsoup. My work around was using Selenium, where I can control the behavior of everything inside the browser, including closing pop-up windows and etc.

I am used to using Selenium in R, so the code bellow iterates over every full text link acquired from the Python code above, and fools the captcha from the NEJM into thinking the information is being gathered by human.

In total, there were 347 unique articles that I could scrappe from the NEJM.

### The code bellow is from R!!





```
# Package Library
library(RSelenium)
library(wdman)
library(netstat)
library(purrr)
library(httpuv)
library(dplyr)

# Data import
articles <- read.csv("C:\\Users\\lglea\\Downloads\\nejm_df.csv")

# Starting selenium
selenium()
selenium_object <- selenium(retcommand = TRUE, check = F)
print(selenium_object)

# Creating a temporary list for the scrapped articles
data_list <- list()

      # Looping over the full text urls from the file
      for (url in articles$Full.Article.Link) {
        
        # Opening the server and creating a remote driver
        driver <- rsDriver(browser = "firefox", verbose = FALSE, port = free_port())
        remDr <- driver$client
        remDr$open()
        
        # Opening the URL
        remDr$navigate(url)
        
        # Scrapping the text
        text <- remDr$findElements(using = "xpath", "//*[(@id = 'full')]")
        full_text <- map(text, ~.$getElementText() %>% unlist())
        
        # Get the acronym (article title name) corresponding to the current text
        current_acronym <- articles$Acronym[articles$Full.Article.Link == url]
        
        # Storing the collected information in a temporary data frame
        temp_df <- setNames(data.frame(
          Text = full_text,
          Acronym = current_acronym,
          stringsAsFactors = FALSE
        ), c("Full Article Text", "Acronym"))
        
        # Append the temporary data frame to the list
        data_list <- c(data_list, list(temp_df))
        
        # Close the browser
        remDr$close()
        
        # Close the server
        driver$server$stop()
      }

      # Combine all the data frames in the list into a single data frame
      final_df <- bind_rows(data_list)
```

Selenium runs through GitHub, and there is a limit on how many times you can open a new server using a free gitHub account in one hour. For that reason, I had to run this loop and store the ~60 articles I was able to scrappe in a new data.frame, and create a new dataframe excluding the articles that were already scrapped.

**This process was repeated 8 times**

```
# Excludes repeated scrappes
df_one <- unique(final_df)

# Excludes scrapped articles from the new data.frame
articles_two<- anti_join(articles, df_one, by = "Acronym")

```

Finally, all temporary dataframes are binded togheter and joined with the original articles csv file, roughly cleaned, and the final csv file with 347 articles and their corresponding wiki text was created!

```
df_full_one <- bind_rows(df_one, df_two, df_three, df_four, df_five, df_six, df_seven, df_eight)

nejm_table <- left_join(df_full_one, articles, by = "Acronym")

nejm_unique <- nejm_table %>%
  distinct(Acronym, .keep_all = TRUE) %>%
  mutate(`Wiki Text` = Text) %>%
  select(`Acronym`, `Full Article Text`, `Wiki Text`)

  write.csv(nejm_unique, "C:\\Users\\lglea\\OneDrive\\Documents\\UMn MPH\\Courses\\3rd Semester\\Biostat II - PUBH6451\\BiostatsPUBH6451\\Web_Scrapping_Stroke\\unique_articles.csv")

```

#Read in CSV dataset into Panda Dataframe

##Read Dataset From Google Drive

In [9]:
!pip install oauth2client --upgrade oauth2client

from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
import pandas as pd

#setup
creds, _ = default()
gc = gspread.authorize(creds)

#read data and put in a dataframe
gsheets = gc.open_by_url('https://docs.google.com/spreadsheets/d/12wNv2hhQLM3C6TJKcEANbf0NieUCXHR2l-1EydGNi2Q/')
sheets = gsheets.worksheet('unique_articles').get_all_values()



##Read Dataset from Github

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/jonas060/ResearchWikiArticleGeneration/main/unique_articles.csv")

#Clean Dataset

In [10]:
import re

df = pd.DataFrame(sheets[1:], columns = sheets[0])

def clean_full_article_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text)
    text = re.sub('\d+ References', '', text)
    text = re.sub('\d+ Citing Articles', '', text)
    text = re.sub('\d+ Comments', '', text)
    text = re.sub('References \(\d+\)', '', text)
    text = re.sub('Citing Articles \(\d+\)', '', text)
    text = re.sub('Comments \(\d+\)', '', text)
    text = re.sub('Figure \d+', '', text)
    text = re.sub('Table \d+', '', text)
    text = re.sub('\n', ' ', text)
    return text

def clean_full_wiki_text(text):
    text = str(text)
    text = re.sub('From Wiki Journal ClubJump to navigationJump to searchPublished*', '', text)
    text = re.sub('\w+ \w+, et al\.', '', text)
    return text

In [11]:
for index, row in df.iterrows():
  cleaned_text_full_article = clean_full_article_text(row['Full Article Text'])
  row['Full Article Text'] = cleaned_text_full_article

  cleaned_text_full_wiki = clean_full_wiki_text(row['Wiki Text'])
  row['Wiki Text'] = cleaned_text_full_wiki

print(df['Wiki Text'][0])

 "Rivaroxaban in Patients with Recent Acute Coronary Syndrome".The New England Journal of Medicine. 2012. 366(1):9-19.PubMed•Full text•PDFContents1Clinical Question2Bottom Line3Major Points4Guidelines5Design6Population6.1Inclusion Criteria6.2Exclusion Criteria6.3Baseline Characteristics6.4Baseline Medications7Interventions8Outcomes8.1Primary Outcomes8.2Secondary Outcome8.3Additional Analyses8.4Adverse Events8.5Subgroup Analysis9Criticisms10Funding11Further ReadingClinical QuestionIn patients with recent ACS, does the addition of rivaroxaban to standard dual antiplatelet therapy improve CV morbidity and mortality?Bottom LineIn patients with recent ACS, the addition of rivaroxaban to standard dual antiplatelet therapy reduced the composite of CV mortality, recurrent MI, or stroke, but increased the risk of nonfatal bleeding.Major PointsTwo meta-analyses of RCTs investigating the addition of warfarin to aspirin among ACS patients suggested that combination therapy may reduce the risk of i

In [12]:
df

Unnamed: 0,Unnamed: 1,Acronym,Full Article Text,Wiki Text
0,1,"ATLAS ACS-2, TIMI 51",Abstract BACKGROUND Acute coronary syndrome...,"""Rivaroxaban in Patients with Recent Acute Co..."
1,2,CAST I,Letters Abstract BACKGROUND AND METHODS. In ...,"""Mortality and morbidity in patients receivin..."
2,3,CHAMPION PHOENIX,Letters Abstract BACKGROUND The intensity of...,"""Effect of Platelet Inhibition with Cangrelor..."
3,4,CHARISMA,Letters Abstract BACKGROUND Dual antiplatele...,"""Clopidogrel and aspirin versus aspirin alone..."
4,5,COLCOT,Letters Abstract BACKGROUND Experimental an...,"""Efficacy and safety of low-dose colchicine a..."
...,...,...,...,...
342,343,UPLIFT,Letters Abstract BACKGROUND Previous studies...,"""A 4-Year Trial of Tiotropium in Chronic Obst..."
343,344,WISDOM,Letters 1 Comment Abstract BACKGROUND Treatm...,"""Withdrawal of inhaled glucocorticoids and ex..."
344,345,INPULSIS Trials,Letters Abstract BACKGROUND Nintedanib (form...,"""Efficacy and safety of nintedanib in idiopat..."
345,346,ASCEND (IPF),Letters Abstract BACKGROUND In two of three ...,"""A phase 3 trial of pirfenidone in patients w..."


In [None]:
#df.to_csv("data.csv")
from google.colab import drive
drive.mount('drive')

df.to_csv(data.csv)
!cp data.csv "drive/My Drive/"