## Pub Med Scraper

In [1]:
# scrape dependencies
import requests
import re
from bs4 import BeautifulSoup as bs

# data analysis dependencies
import pandas as pd
import numpy as np
import csv

# ipynb dependencies
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

# viz dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import datetime as dt
import time

In [2]:
# set the url to scrape
url = 'https://www.ncbi.nlm.nih.gov/pubmed/trending/'
print(url)

https://www.ncbi.nlm.nih.gov/pubmed/trending/


In [3]:
# set up beautiful soup to scrape
response = requests.get(url)
soup = bs(response.text, 'html.parser')

In [4]:
# lets scrape the article titles
journals = soup.find_all("p", attrs={'class':'title'})

In [5]:
# searching for the journal titles
journals_len = len(journals)
print(f"There are {journals_len} journals to scrape on the first page.")

There are 20 journals to scrape on the first page.


In [6]:
# loop through journals to print titles
for i in range(journals_len):
    journals[i].text.strip()

'Reprogramming human T cell function and specificity with non-viral genome targeting.'

'Prediction of acute myeloid leukaemia risk in healthy individuals.'

'Author Correction: What models eat.'

'Physical and Mental Effects of Bathing: A Randomized Intervention Study.'

'Günter Blobel (1936–2018)'

'Histidine catabolism is a major determinant of methotrexate sensitivity.'

'Enhancing T cell therapy through TCR-signaling-responsive nanoparticle drug delivery.'

'A randomized placebo-controlled clinical trial of nicotinamide riboside in obese men: safety, insulin-sensitivity, and lipid-mobilizing effects.'

'The Health Insurance Marketplaces.'

'Mechanism of parkin activation by PINK1.'

'Cancer-associated thrombosis in patients with implanted ports: a prospective multicenter French cohort study (ONCOCIP).'

'CRISPR-enhanced engineering of therapy-sensitive cancer cells for self-targeting of primary and metastatic tumors.'

'Evolution of a central neural circuit underlies Drosophila mate preferences.'

'Phenotype molding of stromal cells in the lung tumor microenvironment.'

'X-ray and cryo-EM structures of the mitochondrial calcium uniporter.'

'NAD+ Depletion Triggers Macrophage Necroptosis, a Cell Death Pathway Exploited by Mycobacterium tuberculosis.'

'Somatic mutations precede acute myeloid leukemia years before diagnosis.'

'Long-acting Reversible Contraception-Highly Efficacious, Safe, and Underutilized.'

'Prevention of M. tuberculosis Infection with H4:IC31 Vaccine or BCG Revaccination.'

"RIG-I Recognizes the 5' Region of Dengue and Zika Virus Genomes."

## Set main url to concat with pubmed ids

In [7]:
# set the main url that we will concatanate with the pubmed id
main_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
print(main_url)

https://www.ncbi.nlm.nih.gov/pubmed/


<br>
<br>
# TESTING SECTION ------------------------------------------------------------------

In [8]:
# set empty links_all list to append to 
links_all = []

# set pubmed ids list to append to
pubmed_ids = []

# set empty list to append scrape_links to
scrape_links = []

In [9]:
# use bs to scarpe p tags with class - title
links = soup.find_all("p",attrs={'class':'title'})
print(links[0])

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29995861" ref="ordinalpos=1&amp;ncbi_uid=29995861&amp;link_uid=29995861&amp;linksrc=docsum_title">Reprogramming human T cell function and specificity with non-viral genome targeting.</a></p>


In [10]:
# testing to see how my links / journals to scrape
articles_to_scrape = len(links)
print(f"There are {articles_to_scrape} articles to scrape.")
print("----------------------------------------------")

There are 20 articles to scrape.
----------------------------------------------


In [11]:
# loop through links to convert to string
for i in range (len(links)):
    links_all.append(str(links[i]))
    print(links[i])
    print("----------------------------------------------")

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29995861" ref="ordinalpos=1&amp;ncbi_uid=29995861&amp;link_uid=29995861&amp;linksrc=docsum_title">Reprogramming human T cell function and specificity with non-viral genome targeting.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29988082" ref="ordinalpos=2&amp;ncbi_uid=29988082&amp;link_uid=29988082&amp;linksrc=docsum_title">Prediction of acute myeloid leukaemia risk in healthy individuals.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985486" ref="ordinalpos=3&amp;ncbi_uid=29985486&amp;link_uid=29985486&amp;linksrc=docsum_title">Author Correction: What models eat.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29977318" ref="ordinalpos=4&a

In [12]:
# for i in links_all:
#     print(i)
#     print("----------------------------------------------")

In [13]:
# slice through links_all to test
len(links_all)
links_all[0]

20

'<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29995861" ref="ordinalpos=1&amp;ncbi_uid=29995861&amp;link_uid=29995861&amp;linksrc=docsum_title">Reprogramming human T cell function and specificity with non-viral genome targeting.</a></p>'

### Regex test

In [14]:
regex_test = '<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985486" ref="ordinalpos=1&amp;ncbi_uid=29985486&amp;link_uid=29985486&amp;linksrc=docsum_title">Author Correction: What models eat.</a></p>'

In [15]:
print(regex_test)

<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985486" ref="ordinalpos=1&amp;ncbi_uid=29985486&amp;link_uid=29985486&amp;linksrc=docsum_title">Author Correction: What models eat.</a></p>


In [16]:
re.findall(r'\d{8}',regex_test)

['29985486', '29985486', '29985486']

### End Regex Test

In [17]:
# loop through links all and use regex to grab the id numbers
for i in range (len(links_all)):
    pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))

In [18]:
# print out info for pubmed_ids
len(pubmed_ids)
type(pubmed_ids)
print(pubmed_ids)
print("----------------------------------------------")

20

list

[['29995861', '29995861', '29995861'], ['29988082', '29988082', '29988082'], ['29985486', '29985486', '29985486'], ['29977318', '29977318', '29977318'], ['29975496', '29975496', '29975496'], ['29995852', '29995852', '29995852'], ['29985479', '29985479', '29985479'], ['29992272', '29992272', '29992272'], ['29987334', '29987334', '29987334'], ['29995846', '29995846', '29995846'], ['29980524', '29980524', '29980524'], ['29997250', '29997250', '29997250'], ['29995860', '29995860', '29995860'], ['29988129', '29988129', '29988129'], ['29995856', '29995856', '29995856'], ['29996103', '29996103', '29996103'], ['29988143', '29988143', '29988143'], ['29984374', '29984374', '29984374'], ['29996082', '29996082', '29996082'], ['29996094', '29996094', '29996094']]
----------------------------------------------


In [19]:
# use itertools to transform pubmed ids from an array withn an array into one list
import itertools
pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))

In [20]:
# slice through pubmed_merged to see what itertools did
pubmed_merged[0]
print(pubmed_merged)

'29995861'

['29995861', '29995861', '29995861', '29988082', '29988082', '29988082', '29985486', '29985486', '29985486', '29977318', '29977318', '29977318', '29975496', '29975496', '29975496', '29995852', '29995852', '29995852', '29985479', '29985479', '29985479', '29992272', '29992272', '29992272', '29987334', '29987334', '29987334', '29995846', '29995846', '29995846', '29980524', '29980524', '29980524', '29997250', '29997250', '29997250', '29995860', '29995860', '29995860', '29988129', '29988129', '29988129', '29995856', '29995856', '29995856', '29996103', '29996103', '29996103', '29988143', '29988143', '29988143', '29984374', '29984374', '29984374', '29996082', '29996082', '29996082', '29996094', '29996094', '29996094']


In [21]:
# concat main_url with a slice of pubmed_merged before we loop
print(main_url + str(pubmed_merged[0]))

https://www.ncbi.nlm.nih.gov/pubmed/29995861


In [22]:
# append merged links to links_all
for i in range (len(pubmed_merged)):
    scrape_links.append(main_url + str(pubmed_merged[i]))

In [23]:
for i in scrape_links:
    print(i)

https://www.ncbi.nlm.nih.gov/pubmed/29995861
https://www.ncbi.nlm.nih.gov/pubmed/29995861
https://www.ncbi.nlm.nih.gov/pubmed/29995861
https://www.ncbi.nlm.nih.gov/pubmed/29988082
https://www.ncbi.nlm.nih.gov/pubmed/29988082
https://www.ncbi.nlm.nih.gov/pubmed/29988082
https://www.ncbi.nlm.nih.gov/pubmed/29985486
https://www.ncbi.nlm.nih.gov/pubmed/29985486
https://www.ncbi.nlm.nih.gov/pubmed/29985486
https://www.ncbi.nlm.nih.gov/pubmed/29977318
https://www.ncbi.nlm.nih.gov/pubmed/29977318
https://www.ncbi.nlm.nih.gov/pubmed/29977318
https://www.ncbi.nlm.nih.gov/pubmed/29975496
https://www.ncbi.nlm.nih.gov/pubmed/29975496
https://www.ncbi.nlm.nih.gov/pubmed/29975496
https://www.ncbi.nlm.nih.gov/pubmed/29995852
https://www.ncbi.nlm.nih.gov/pubmed/29995852
https://www.ncbi.nlm.nih.gov/pubmed/29995852
https://www.ncbi.nlm.nih.gov/pubmed/29985479
https://www.ncbi.nlm.nih.gov/pubmed/29985479
https://www.ncbi.nlm.nih.gov/pubmed/29985479
https://www.ncbi.nlm.nih.gov/pubmed/29992272
https://ww

# END TESTING SECTION------------------------------------------------------------
<br>
<br>

## Function to create array of links to scrape

In [24]:
# set empty links_all list to append to 
links_all = []

# set pubmed ids list to append to
pubmed_ids = []

# set empty list to append scrape_links to
scrape_links = []

# function to get links
def get_links(main_url):
    
    # use bs to scarpe p tags with class - title
    links = soup.find_all("p",attrs={'class':'title'})
      
    # testing to see how my links / journals to scrape
    articles_to_scrape = len(links)
    print(f"There are {articles_to_scrape} articles to scrape.")
    print("----------------------------------------------")
    
    # loop through links to convert to string
    for i in range (len(links)):
        links_all.append(str(links[i]))
        print(links[i])
        print("----------------------------------------------")
        
    # slice through links_all to test
    len(links_all)
    links_all[1]
    
    # loop through links all and use regex to grab the id numbers
    for i in range (len(links_all)):
        pubmed_ids.append(re.findall(r'\d{8}',links_all[i]))
    
    # print out info for pubmed_ids
    len(pubmed_ids)
    type(pubmed_ids)
    print(pubmed_ids)
    print("----------------------------------------------")
    
    # use itertools to transform pubmed ids from an array withn an array into one list
    import itertools
    pubmed_merged = list(itertools.chain.from_iterable(pubmed_ids))
    
    # slice through pubmed_merged to see what itertools did
    pubmed_merged[0]
    
    # concat main_url with a slice of pubmed_merged before we loop
    print(main_url + str(pubmed_merged[0]))
    
    # append merged links to links_all
    for i in range (len(pubmed_merged)):
        scrape_links.append(main_url + str(pubmed_merged[i]))

In [25]:
# RUN FUNCTION
get_links(main_url)

There are 20 articles to scrape.
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29995861" ref="ordinalpos=1&amp;ncbi_uid=29995861&amp;link_uid=29995861&amp;linksrc=docsum_title">Reprogramming human T cell function and specificity with non-viral genome targeting.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29988082" ref="ordinalpos=2&amp;ncbi_uid=29988082&amp;link_uid=29988082&amp;linksrc=docsum_title">Prediction of acute myeloid leukaemia risk in healthy individuals.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/29985486" ref="ordinalpos=3&amp;ncbi_uid=29985486&amp;link_uid=29985486&amp;linksrc=docsum_title">Author Correction: What models eat.</a></p>
----------------------------------------------
<p class="title" xmlns:mml="ht

<br>
There are duplicates in our **scrape_links** array. Use `list` to delete the duplicates.

In [26]:
# delete duplicates in scrape_links and assign to new variable scrape_links_final
scrape_links_final = list(set(scrape_links))
len(scrape_links_final)
scrape_links_final

20

['https://www.ncbi.nlm.nih.gov/pubmed/29985486',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995846',
 'https://www.ncbi.nlm.nih.gov/pubmed/29997250',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995860',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995852',
 'https://www.ncbi.nlm.nih.gov/pubmed/29996082',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987334',
 'https://www.ncbi.nlm.nih.gov/pubmed/29988129',
 'https://www.ncbi.nlm.nih.gov/pubmed/29988082',
 'https://www.ncbi.nlm.nih.gov/pubmed/29996103',
 'https://www.ncbi.nlm.nih.gov/pubmed/29984374',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995856',
 'https://www.ncbi.nlm.nih.gov/pubmed/29977318',
 'https://www.ncbi.nlm.nih.gov/pubmed/29975496',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995861',
 'https://www.ncbi.nlm.nih.gov/pubmed/29996094',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980524',
 'https://www.ncbi.nlm.nih.gov/pubmed/29992272',
 'https://www.ncbi.nlm.nih.gov/pubmed/29985479',
 'https://www.ncbi.nlm.nih.gov/pubmed/29988143']

## Main array of links to scrape:

Here we use selenium to iterate through these links. Seleium will click on each link then scrape the title and abstracts on each page. 

In [27]:
# testing scrape_links
for i in scrape_links_final:
    print(i)

https://www.ncbi.nlm.nih.gov/pubmed/29985486
https://www.ncbi.nlm.nih.gov/pubmed/29995846
https://www.ncbi.nlm.nih.gov/pubmed/29997250
https://www.ncbi.nlm.nih.gov/pubmed/29995860
https://www.ncbi.nlm.nih.gov/pubmed/29995852
https://www.ncbi.nlm.nih.gov/pubmed/29996082
https://www.ncbi.nlm.nih.gov/pubmed/29987334
https://www.ncbi.nlm.nih.gov/pubmed/29988129
https://www.ncbi.nlm.nih.gov/pubmed/29988082
https://www.ncbi.nlm.nih.gov/pubmed/29996103
https://www.ncbi.nlm.nih.gov/pubmed/29984374
https://www.ncbi.nlm.nih.gov/pubmed/29995856
https://www.ncbi.nlm.nih.gov/pubmed/29977318
https://www.ncbi.nlm.nih.gov/pubmed/29975496
https://www.ncbi.nlm.nih.gov/pubmed/29995861
https://www.ncbi.nlm.nih.gov/pubmed/29996094
https://www.ncbi.nlm.nih.gov/pubmed/29980524
https://www.ncbi.nlm.nih.gov/pubmed/29992272
https://www.ncbi.nlm.nih.gov/pubmed/29985479
https://www.ncbi.nlm.nih.gov/pubmed/29988143


### Add timer to main function when scraping

In [28]:
# slice out scrape_links_final so we can scrape 5 articles at a time
links_1 = scrape_links_final[0:5]
links_1

links_2 = scrape_links_final[5:10]
links_2

links_3 = scrape_links_final[10:15]
links_3

links_4 = scrape_links_final[15:20]
links_4

['https://www.ncbi.nlm.nih.gov/pubmed/29985486',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995846',
 'https://www.ncbi.nlm.nih.gov/pubmed/29997250',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995860',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995852']

['https://www.ncbi.nlm.nih.gov/pubmed/29996082',
 'https://www.ncbi.nlm.nih.gov/pubmed/29987334',
 'https://www.ncbi.nlm.nih.gov/pubmed/29988129',
 'https://www.ncbi.nlm.nih.gov/pubmed/29988082',
 'https://www.ncbi.nlm.nih.gov/pubmed/29996103']

['https://www.ncbi.nlm.nih.gov/pubmed/29984374',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995856',
 'https://www.ncbi.nlm.nih.gov/pubmed/29977318',
 'https://www.ncbi.nlm.nih.gov/pubmed/29975496',
 'https://www.ncbi.nlm.nih.gov/pubmed/29995861']

['https://www.ncbi.nlm.nih.gov/pubmed/29996094',
 'https://www.ncbi.nlm.nih.gov/pubmed/29980524',
 'https://www.ncbi.nlm.nih.gov/pubmed/29992272',
 'https://www.ncbi.nlm.nih.gov/pubmed/29985479',
 'https://www.ncbi.nlm.nih.gov/pubmed/29988143']

## Regex Notes

In [29]:
# Regex
# Identifiers:
# \d any number
# \D anything but a number
# \s space
# \S anything but a space
# \w any character
# \W anything but a character
# . any character, except for a newline
# \b the whitespace around words
# \. a period

# Modifiers:
# {1,3} we're expecting 1-3 \d{1-3}
# + Match 1 or more
# ? Match 0 or more
# * Match 0 or more
# $ Match the end of a string
# ^ matching the beginning of a string
# | either or
# [] range or "variance" [A-Za-z] [1-5a-qA-Z]
# {x} expecting "x" amount

# White Space Characters: 
# \n new line
# \s space
# \t tab
# \e escape
# \f form feed
# \r return

# DONT FORGET!:
# . + * [] $ ^ () {} | \

## Selenium
**Web Browser Automation**

In [30]:
from splinter import Browser
from selenium import webdriver

In [31]:
# make sure chrome browser exe is in current directory
# chrome browser exe is not necessary for MACS
executable_path = {'executable_path': 'chromedriver'}

## Set up dictionary to append data to

In [32]:
article_dict = {}

## Create get_article_info function

In [33]:
title = []
abstract = []

def get_article_info(links_1):
    
    # iterate through articles
    for i in links_1:
        
        # sets up scraper
        browser = Browser('chrome', headless=False)
        html = browser.html
        response2 = requests.get(i)
        soup2 = bs(response2.text, 'html.parser')
    
        browser.visit(i)
    
        # there are two 'h1' tags on this page. slice out index 0
        title_one = soup2.find_all('h1')
        article_one_title = title_one[1].text.strip()
    
        # slice h1 at index 1 to grab article title
        title.append(article_one_title)
    
        # get abstract 
        # abstract.append(soup2.find("div", attrs={'class': 'rprt_all'}).text.strip())     
        abstract.append(soup2.find("div", attrs={'class': 'rprt abstract'}).text.strip())
        
        article_dict["title"] = title
        article_dict["abstract"] = abstract

## Run article_info function

In [37]:
title

['Author Correction: What models eat.',
 'Mechanism of parkin activation by PINK1.',
 'CRISPR-enhanced engineering of therapy-sensitive cancer cells for self-targeting of primary and metastatic tumors.',
 'Evolution of a central neural circuit underlies Drosophila mate preferences.',
 'Histidine catabolism is a major determinant of methotrexate sensitivity.']

In [36]:
get_article_info(links_1)

In [51]:
get_article_info(links_2)

In [None]:
get_article_info(links_3)

In [None]:
get_article_info(links_4)

In [52]:
for i in title:
    print(i + "\n")

len(title)

Author Correction: What models eat.

Mechanism of parkin activation by PINK1.

CRISPR-enhanced engineering of therapy-sensitive cancer cells for self-targeting of primary and metastatic tumors.

Evolution of a central neural circuit underlies Drosophila mate preferences.

Histidine catabolism is a major determinant of methotrexate sensitivity.

Prevention of M. tuberculosis Infection with H4:IC31 Vaccine or BCG Revaccination.

The Health Insurance Marketplaces.

Phenotype molding of stromal cells in the lung tumor microenvironment.

Prediction of acute myeloid leukaemia risk in healthy individuals.

NAD+ Depletion Triggers Macrophage Necroptosis, a Cell Death Pathway Exploited by Mycobacterium tuberculosis.



10

In [39]:
for i in abstract:
    print(i)
    print("\n")

len(abstract)

Nat Med. 2018 Jul 9. doi: 10.1038/s41591-018-0133-4. [Epub ahead of print]Author Correction: What models eat.Engber D1.Author information1Freelance science journalist and can be found on Twitter at @danengber, . danengber@yahoo.com.AbstractIn the version of this article originally published, there was an error in the sentence "That quest, which started in the 1980s, continues through until today: For a paper published online in 2017, a group of researchers based at Brazil's Federal University of Rio Grande do Sul tested Sprague Dawley rats on either a high-fat diet, a cafeteria diet or what they called a 'Western diet', formulated to match up more closely with eating patterns in developed nations (it had 42.5% of its calories from fat and added salt and carbohydrates)." Wistar rats were used in the experiment, not Sprague Dawley rats. Also, in ref. 7, the first author's last name was listed as Bortoloin. The correct spelling is Bortolin. The errors have been corrected in the HTML and P

5

## Add title and abstract to article_dict

In [53]:
# check article dict
article_dict

{'abstract': ['Nat Med. 2018 Jul 9. doi: 10.1038/s41591-018-0133-4. [Epub ahead of print]Author Correction: What models eat.Engber D1.Author information1Freelance science journalist and can be found on Twitter at @danengber, . danengber@yahoo.com.AbstractIn the version of this article originally published, there was an error in the sentence "That quest, which started in the 1980s, continues through until today: For a paper published online in 2017, a group of researchers based at Brazil\'s Federal University of Rio Grande do Sul tested Sprague Dawley rats on either a high-fat diet, a cafeteria diet or what they called a \'Western diet\', formulated to match up more closely with eating patterns in developed nations (it had 42.5% of its calories from fat and added salt and carbohydrates)." Wistar rats were used in the experiment, not Sprague Dawley rats. Also, in ref. 7, the first author\'s last name was listed as Bortoloin. The correct spelling is Bortolin. The errors have been correcte

## Save article_dict to json

In [55]:
import json

In [56]:
# check json file
json = json.dumps(article_dict)
f = open("trending2.json","w")
f.write(json)
f.close()

33939

In [43]:
# Import sqlalchemy Dependencies
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy import func
from sqlalchemy import Column, Float, Integer, String, Date
from sqlalchemy.ext.declarative import declarative_base

In [44]:
# Create an engine for the photopharm.sqlite database
engine = create_engine("sqlite:///photopharm.sqlite")


In [45]:
# Reflect Database into ORM classes
Base = declarative_base()

In [46]:
# Create ORM class for titles and abstracts
class Articles(Base):
    __tablename__ = "Title_Abstracts"
    
    __table_args__ = {'extend_existing': True} 
    
    id = Column(Integer, primary_key = True)
    title = Column(String)
    abstract = Column(String)
    

In [47]:
# create tables
Base.metadata.create_all(engine)

In [57]:
# read in trending1 json into a dataframe
df2 = pd.read_json('trending2.json')

In [58]:
df2

Unnamed: 0,abstract,title
0,Nat Med. 2018 Jul 9. doi: 10.1038/s41591-018-0...,Author Correction: What models eat.
1,Nature. 2018 Jun 6. doi: 10.1038/s41586-018-02...,Mechanism of parkin activation by PINK1.
2,Sci Transl Med. 2018 Jul 11;10(449). pii: eaao...,CRISPR-enhanced engineering of therapy-sensiti...
3,Nature. 2018 Jul 11. doi: 10.1038/s41586-018-0...,Evolution of a central neural circuit underlie...
4,Nature. 2018 Jul 11. doi: 10.1038/s41586-018-0...,Histidine catabolism is a major determinant of...
5,N Engl J Med. 2018 Jul 12;379(2):138-149. doi:...,Prevention of M. tuberculosis Infection with H...
6,JAMA. 2018 Jul 9. doi: 10.1001/jama.2018.8117....,The Health Insurance Marketplaces.
7,Nat Med. 2018 Jul 9. doi: 10.1038/s41591-018-0...,Phenotype molding of stromal cells in the lung...
8,Nature. 2018 Jul 9. doi: 10.1038/s41586-018-03...,Prediction of acute myeloid leukaemia risk in ...
9,Cell Rep. 2018 Jul 10;24(2):429-440. doi: 10.1...,NAD+ Depletion Triggers Macrophage Necroptosis...


In [59]:
# import json to sqlite
df2.to_sql(con=engine, index_label='id', name=Articles.__tablename__, if_exists='append')

IntegrityError: (sqlite3.IntegrityError) UNIQUE constraint failed: Title_Abstracts.id [SQL: 'INSERT INTO "Title_Abstracts" (id, abstract, title) VALUES (?, ?, ?)'] [parameters: ((0, 'Nat Med. 2018 Jul 9. doi: 10.1038/s41591-018-0133-4. [Epub ahead of print]Author Correction: What models eat.Engber D1.Author information1Freelance s ... (942 characters truncated) ... DOI: 10.1038/s41591-018-0133-4 SharePublication typePublication typePublished ErratumLinkOut - more resourcesFull Text SourcesNature Publishing Group', 'Author Correction: What models eat.'), (1, 'Nature. 2018 Jun 6. doi: 10.1038/s41586-018-0224-x. [Epub ahead of print]Mechanism of parkin activation by PINK1.Gladkova C1, Maslen SL1, Skehel JM1, ... (1858 characters truncated) ... 0.1038/s41586-018-0224-x ShareLinkOut - more resourcesFull Text SourcesNature Publishing GroupMiscellaneousNCI CPTC Antibody Characterization Program', 'Mechanism of parkin activation by PINK1.'), (2, 'Sci Transl Med. 2018 Jul 11;10(449). pii: eaao3240. doi: 10.1126/scitranslmed.aao3240.CRISPR-enhanced engineering of therapy-sensitive cancer cells f ... (2395 characters truncated) ... n primary, recurrent, and metastatic settings.PMID: 29997250 DOI: 10.1126/scitranslmed.aao3240 ShareLinkOut - more resourcesFull Text SourcesHighWire', 'CRISPR-enhanced engineering of therapy-sensitive cancer cells for self-targeting of primary and metastatic tumors.'), (3, 'Nature. 2018 Jul 11. doi: 10.1038/s41586-018-0322-9. [Epub ahead of print]Evolution of a central neural circuit underlies Drosophila mate preferences ... (1549 characters truncated) ... ntribute to behavioural evolution.PMID: 29995860 DOI: 10.1038/s41586-018-0322-9 ShareLinkOut - more resourcesFull Text SourcesNature Publishing Group', 'Evolution of a central neural circuit underlies Drosophila mate preferences.'), (4, 'Nature. 2018 Jul 11. doi: 10.1038/s41586-018-0316-7. [Epub ahead of print]Histidine catabolism is a major determinant of methotrexate sensitivity.Kan ... (2900 characters truncated) ... 6-018-0316-7 ShareGrant supportGrant supportR01 CA129105/CA/NCI NIH HHS/United StatesLinkOut - more resourcesFull Text SourcesNature Publishing Group', 'Histidine catabolism is a major determinant of methotrexate sensitivity.'), (5, "N Engl J Med. 2018 Jul 12;379(2):138-149. doi: 10.1056/NEJMoa1714021.Prevention of M. tuberculosis Infection with H4:IC31 Vaccine or BCG Revaccinatio ... (7392 characters truncated) ... l Text SourcesAtypon - PDFEurope PubMed Central - Author ManuscriptOvid Technologies, Inc.PubMed Central - Author ManuscriptMedicalClinicalTrials.gov", 'Prevention of M. tuberculosis Infection with H4:IC31 Vaccine or BCG Revaccination.'), (6, 'JAMA. 2018 Jul 9. doi: 10.1001/jama.2018.8117. [Epub ahead of print]The Health Insurance Marketplaces.Sacks DW1.Author information1Department of Busi ... (64 characters truncated) ... iana University, Bloomington.PMID: 29987334 DOI: 10.1001/jama.2018.8117 ShareLinkOut - more resourcesFull Text SourcesSilverchair Information Systems', 'The Health Insurance Marketplaces.'), (7, 'Nat Med. 2018 Jul 9. doi: 10.1038/s41591-018-0096-5. [Epub ahead of print]Phenotype molding of stromal cells in the lung tumor microenvironment.Lambr ... (3339 characters truncated) ... lung cancer diagnosis and therapy.PMID: 29988129 DOI: 10.1038/s41591-018-0096-5 ShareLinkOut - more resourcesFull Text SourcesNature Publishing Group', 'Phenotype molding of stromal cells in the lung tumor microenvironment.'), (8, 'Nature. 2018 Jul 9. doi: 10.1038/s41586-018-0317-6. [Epub ahead of print]Prediction of acute myeloid leukaemia risk in healthy individuals.Abelson S1 ... (8058 characters truncated) ... d may help to inform intervention.PMID: 29988082 DOI: 10.1038/s41586-018-0317-6 ShareLinkOut - more resourcesFull Text SourcesNature Publishing Group', 'Prediction of acute myeloid leukaemia risk in healthy individuals.'), (9, 'Cell Rep. 2018 Jul 10;24(2):429-440. doi: 10.1016/j.celrep.2018.06.042.NAD+ Depletion Triggers Macrophage Necroptosis, a Cell Death Pathway Exploited ... (1558 characters truncated) ... 996103 DOI: 10.1016/j.celrep.2018.06.042 ShareLinkOut - more resourcesMiscellaneousNCI CPTAC Assay PortalNCI CPTAC Assay PortalNCI CPTAC Assay Portal', 'NAD+ Depletion Triggers Macrophage Necroptosis, a Cell Death Pathway Exploited by Mycobacterium tuberculosis.'))]