In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import numpy as np
import time
import sqlite3 as sql
import re

In [11]:
# HTTP headers sent with every request made in this notebook: a desktop-browser
# User-Agent string, split across lines only for readability.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    ),
}

In [12]:
# Scrape metadata, authors, abstract, keywords and references for every article
# listed in the links CSV. Results accumulate in the module-level containers
# defined below (papers, authors, abstracts, totalkws/totalkwids,
# reftitles/refjournals/refids).

REQUEST_TIMEOUT = 30  # seconds before a stalled HTTP request is abandoned
REQUEST_DELAY = 0.25  # seconds to pause before each article is fetched


def _fetch_soup(url):
    """Download `url` and return it parsed with html.parser, or None on failure.

    Connection errors, timeouts and non-2xx responses are reported with a
    printed message instead of aborting the whole scrape.
    """
    try:
        response = requests.get(url, headers = headers, timeout = REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("Skipping {}: {}".format(url, exc))
        return None
    return BeautifulSoup(response.content, 'html.parser')


def _plain(value):
    """Return `value` as a built-in str; None is passed through unchanged.

    Beautiful Soup strings keep a reference to their whole parse tree, so they
    are converted before being stored in the long-lived result lists.
    """
    return None if value is None else str(value)


def _author_name(au):
    """Return the stripped text of the first child of an author anchor, or 'NA'."""
    if not au.contents:
        return 'NA'
    first = au.contents[0]
    if isinstance(first, str):
        return str(first).strip()
    # First child is a nested tag: use its text rather than its markup.
    return first.get_text(strip = True)


def _author_affiliation(au):
    """Return the affiliation text from the anchor's first <span>, or 'NA'.

    Uses the span's single string when it has one; otherwise falls back to the
    span's first child if that child is text.
    """
    span = au.find('span')
    if span is None:
        return 'NA'
    text = span.string
    if text is None and span.contents and isinstance(span.contents[0], str):
        text = span.contents[0]
    text = str(text).strip() if text is not None else ''
    return text or 'NA'


links = pd.read_csv('../papers_info/jits/jits_paper_links.csv', encoding = 'latin1')
papers = []
totalkws, totalkwids = [], []
abstracts = {}
authors = []
reftitles, refjournals, refids = [], [], []

for _, paper in links.iterrows():
    pdf_url, title = paper.Urls, paper.Titles
    url_parts = pdf_url.split("/")
    # The last URL segment is used as the paper id; the last two form the DOI.
    paper_id, doi = url_parts[-1], "/".join(url_parts[-2:])
    issue_no, year = paper.BookId, paper.BookId.split('-')[0]

    # Pause at the top of the iteration so the delay also applies to articles
    # that are skipped further down.
    time.sleep(REQUEST_DELAY)

    # get page
    soup = _fetch_soup(pdf_url)
    if soup is None:
        continue

    # Editorials are not recorded.
    heading = soup.find("h3")
    heading_text = heading.string.strip() if heading is not None and heading.string else ""
    if heading_text in ("Editorial", "Editorials"):
        continue

    # page information: text before the hyphen (last word) and text after it
    page_span = soup.find('span', {'class': 'contentItemPageRange'})
    try:
        pps = page_span.string.strip()
        startpp, endpp = pps.split('-')[0].split()[-1], pps.split('-')[1]
    except (AttributeError, IndexError):
        # Span missing, empty, or not in "<start>-<end>" form.
        startpp, endpp = 0, 0

    # citedby
    try:
        cited = int(soup.find('a', {'class': 'crossRef'}).find('span', {'class': 'value'}).string)
    except (AttributeError, TypeError, ValueError):
        cited = 0

    # author information (held locally until the article is known to be kept)
    author_div = soup.find('div', {'class': 'hlFld-ContribAuthor'})
    author_links = author_div.find_all('a', {'class': "entryAuthor"}) if author_div is not None else []
    paper_authors = [
        (paper_id, count, _author_name(au), 0, _author_affiliation(au))
        for count, au in enumerate(author_links)
    ]

    # abstract: articles without a plain-text abstract paragraph are skipped
    try:
        abstract = soup.find('div', {'class':  'hlFld-Abstract'}).find('p')
        if abstract.string.strip() == 'Abstract':
            abstract = abstract.find_next('p')
        abstract_text = abstract.string.strip()
    except AttributeError:
        continue

    # Commit abstract, authors and paper row together so a skipped article
    # leaves no orphan author rows behind.
    abstracts[paper_id] = abstract_text
    authors.extend(paper_authors)
    papers.append((paper_id, doi, title, year, startpp, endpp, cited, issue_no, pdf_url))

    # keywords
    kws = []
    kw_div = soup.find('div', {'class':  'hlFld-KeywordText'})
    if kw_div is not None:
        for kw in kw_div.find_all('a'):
            kws.extend(_plain(item) for item in kw.contents)
    if kws:
        totalkws.append(kws)
        totalkwids.append([paper_id] * len(kws))

    # references: same URL with the "/full/" path segment swapped for "/ref/"
    ref_url = pdf_url.replace('/full/', '/ref/', 1)
    refsoup = _fetch_soup(ref_url)
    ref_list = refsoup.find('ul', {'class': 'references'}) if refsoup is not None else None
    ref_items = ref_list.find_all('li') if ref_list is not None else []
    ref_journals, ref_titles = [], []
    for li in ref_items:
        title_tag = li.find('span', {'class': 'NLM_article-title'})
        if title_tag is None:
            # Entries without an article title are left out entirely.
            continue
        ref_titles.append(_plain(title_tag.string))

        # Journal name: first <i> element, else the conference-name span, else "NA".
        journal_tag = li.find('i')
        if journal_tag is None:
            journal_tag = li.find('span', {'class': 'NLM_conf-name'})
        ref_journals.append(_plain(journal_tag.string) if journal_tag is not None else "NA")

    ref_ids = [paper_id] * len(ref_titles)
    refids.append(ref_ids)
    reftitles.append(ref_titles)
    refjournals.append(ref_journals)









In [13]:
# Turn the {paper_id: abstract} mapping into a two-column table and save it.
# Building from (key, value) pairs with explicit column names also works when
# no abstracts were collected; the from_dict(...).reset_index() route produces
# a single column in that case, so assigning two column names raises.
abstracts = pd.DataFrame(list(abstracts.items()), columns = ['paper_id', 'abstract'])
abstracts.to_csv('../papers_info/jits/jits_paper_abstracts.csv', index = False)

In [14]:
# Tabulate the collected author tuples and write them to CSV without the index.
author_columns = ['paper_id', 'author_order', 'author_name', 'author_id', 'affiliation']
authors = pd.DataFrame(data = authors, columns = author_columns)
authors.to_csv('../papers_info/jits/jits_paper_authors.csv', index = False)

In [15]:
# Tabulate the collected per-paper tuples and write them to CSV without the index.
paper_columns = [
    'paper_id', 'doi', 'title', 'year',
    'startpp', 'endpp', 'cited', 'issue_no', 'pdf_url',
]
papers = pd.DataFrame(data = papers, columns = paper_columns)
papers.to_csv('../papers_info/jits/jits_paper_info.csv', index = False)

In [16]:
# Flatten the per-paper keyword lists into (keyword, paper_id) rows and build
# the table once. Concatenating one frame per paper inside a loop re-copies the
# accumulated rows on every iteration, and the column assignment that followed
# raised when no keywords had been collected at all.
keyword_rows = [
    (term, pid)
    for terms, pids in zip(totalkws, totalkwids)
    for term, pid in zip(terms, pids)
]
total_kws = pd.DataFrame(keyword_rows, columns = ['keyword_terms', 'paper_id'])
total_kws.to_csv('../papers_info/jits/jits_paper_keywords.csv', index = False)

In [17]:
# Flatten the per-paper reference lists into (paper_id, title, journal) rows
# and build the table once. This replaces a per-paper pd.concat loop, which
# re-copies the accumulated rows each iteration and fails at the column
# assignment when there are no reference rows.
reference_rows = [
    (pid, ref_title, ref_journal)
    for pids, titles, journals in zip(refids, reftitles, refjournals)
    for pid, ref_title, ref_journal in zip(pids, titles, journals)
]
refs = pd.DataFrame(reference_rows, columns = ['paper_id', 'ref_title', 'ref_journal'])
refs.to_csv('../papers_info/jits/jits_paper_references.csv', index = False)

In [18]:
# Scratch check of the author / affiliation extraction against one article page.
test_url = "https://www.tandfonline.com/doi/full/10.1080/15472450903385999"
paper_id = 1  # placeholder id, only used to fill the printed tuples

# get page (timeout so a stalled connection cannot hang the cell)
page = requests.get(test_url, headers = headers, timeout = 30)
soup = BeautifulSoup(page.content, 'html.parser')
count = 0

# author information
author_div = soup.find('div', {'class': 'hlFld-ContribAuthor'})
author_links = author_div.find_all('a') if author_div is not None else []

for author in author_links:
    # Name: first child of the anchor; a nested tag contributes its text.
    first = author.contents[0] if author.contents else ''
    if isinstance(first, str):
        name = str(first).strip()
    else:
        name = first.get_text(strip = True)

    # Affiliation: the first <span>'s single string, else its first text child.
    # Explicit checks replace the nested try/except, whose fallback raised an
    # uncaught IndexError when the anchor had no <span> at all.
    affiliation = None
    span = author.find('span')
    if span is not None:
        affiliation = span.string
        if affiliation is None and span.contents and isinstance(span.contents[0], str):
            affiliation = span.contents[0]
    affiliation = str(affiliation).strip() if affiliation is not None else ''

    author_info = (paper_id, count, name, 0, affiliation or "NA")
    count += 1
    print(author_info)

(1, 0, 'Cindy Cappelle', 0, 'Laboratoire Systèmes et Transports , Université de Technologie de Belfort-Montbéliard ,')
(1, 1, 'Maan El Badaoui El Najjar', 0, 'Laboratoire Lorrain de Recherche en Informatique et ses Applications , Institut National de Recherche en Informatique et Automatique ,')
(1, 2, 'Denis Pomorski', 0, "Laboratoire d'Automatique, Génie Informatique et Signal , Université de Lille Nord de France ,")
(1, 3, 'François Charpillet', 0, 'Laboratoire Lorrain de Recherche en Informatique et ses Applications , Institut National de Recherche en Informatique et Automatique ,')
