### Importing dependencies

In [1]:
import json
import requests
from tqdm import tqdm
import pandas as pd
import numpy as np
from collections import Counter

import nltk
import spacy

### Uncomment and run the following lines of code ONCE

In [2]:
# # essential entity models downloads
# nltk.downloader.download('maxent_ne_chunker')
# nltk.downloader.download('words')
# nltk.downloader.download('treebank')
# nltk.downloader.download('maxent_treebank_pos_tagger')
# nltk.downloader.download('punkt')
# nltk.download('averaged_perceptron_tagger')

### Functions

In [6]:
# Not all articles have a DOI and PII, that's why we'll be using the Scopus ID function to get abstracts. EID works too.
# Check Try-Except inside the article_info() function

# Set your keywords
my_input = 'salinization AND flooding'
# Set your API Key
key = 'c944c310dad119e7f4ad0078f29540fa'

def scopus_search(my_input: str) -> list:
    api_resource = "https://api.elsevier.com/content/search/scopus?"
    search_param = f'query=title-abs-key({my_input})'  # for example

    # headers
    headers = dict()
    headers['X-ELS-APIKey'] = key
    headers['X-ELS-ResourceVersion'] = 'XOCS'
    headers['Accept'] = 'application/json'

    # Set the desired number of results per page
    results_per_page = 25

    # Send the first request to get the total number of results
    first_page_request = requests.get(api_resource + search_param + f"&count={results_per_page}&start=0", headers=headers)
    first_page = json.loads(first_page_request.content.decode("utf-8"))

    total_results = int(first_page['search-results']['opensearch:totalResults'])
    total_pages = (total_results // results_per_page) + 1

    # List to store all articles
    articles_list = []

    print(f"Scrapping Data Pages from Scopus using {my_input}...")
    # Iterate over all pages
    for page_number in tqdm(range(total_pages)):
        # print(f"Page Number: {page_number} out of {total_pages}" )
        start_index = page_number * results_per_page
        page_request = requests.get(api_resource + search_param + f"&count={results_per_page}&start={start_index}", headers=headers)
        page = json.loads(page_request.content.decode("utf-8"))
        try:
            articles_list.extend(page['search-results']['entry'])
        except:
            continue

    print(f"Number of articles: {len(articles_list)}")
    return articles_list

def article_info(articles_list: list) -> set:
    print(f"\nGetting article titles...")

    article_title = []
    article_doi = []
    article_eid = []
    article_ID = []
    article_pii = []
    global outliers
    outliers = {}
    # Access individual articles
    for article in tqdm(range(len(articles_list))):
        try:
            article_pii.append( articles_list[article]["pii"]) #dc:title
            article_title.append( articles_list[article]["dc:title"]) #dc:title
            article_doi.append( articles_list[article]["prism:doi"]) #dc:title
            article_eid.append( articles_list[article]["eid"]) #dc:title
            article_ID.append( articles_list[article]["dc:identifier"]) #dc:title

        except:
            article_pii.append(None) #dc:title
            article_doi.append(None)
            article_title.append(articles_list[article]["dc:title"])
            article_eid.append( articles_list[article]["eid"]) #dc:title
            article_ID.append( articles_list[article]["dc:identifier"])
            
    return article_title, article_doi, article_eid, article_ID, article_pii

def scopus_id_abstract_retreiver(scopus_id: str) -> str:
    api_endpoint = f"https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}"

    # Set the headers
    headers = {
        "X-ELS-APIKey": key,
        "Accept": "application/json"
    }

    # Make the request to retrieve the abstract
    response = requests.get(api_endpoint, headers=headers)
    data = json.loads(response.content.decode("utf-8"))

    # Extract the abstract from the response
    try:
        abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]
    except:
        abstract = "NA"

    # Return the abstract
    return abstract

def location_finder(text: str) -> dict:
    # Load the pre-trained model
    nlp = spacy.load("en_core_web_sm")

    # Sample text
    sample_text = text

    # Process the text
    doc = nlp(sample_text)

    # Find location words and their locations
    # locations = [(entity.text, entity.label_) for entity in doc.ents if entity.label_ == "GPE" or entity.label_ == "LOC"]
    locations = [entity.text for entity in doc.ents if entity.label_ == "GPE" or entity.label_ == "LOC"]

    # Print the location words and their locations
    # for word in locations:
    #     print(f"Location: {word}")

    # Sorting locations by frequency
    my_dict = dict(Counter(locations))
    sorted_dict = dict(sorted(my_dict.items(), key=lambda x: x[1], reverse=True))
    first_five_elements = dict(list(sorted_dict.items())[:5])

    return first_five_elements

In [7]:
def doi_abstract_retriever(doi: str) -> str:
    api_endpoint = f"https://api.elsevier.com/content/abstract/doi/{doi}"

    # Set the headers
    headers = {
        "X-ELS-APIKey": key,
        "Accept": "application/json"
    }

    # Make the request to retrieve the abstract
    response = requests.get(api_endpoint, headers=headers)
    data = json.loads(response.content.decode("utf-8"))

    # Extract the abstract from the response
    abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]

    # Return the abstract
    return abstract

def eid_abstract_retreiver(eid: str) -> str:
    api_endpoint = f"https://api.elsevier.com/content/abstract/eid/{eid}"

    # Set the headers
    headers = {
        "X-ELS-APIKey": key,
        "Accept": "application/json"
    }

    # Make the request to retrieve the abstract
    response = requests.get(api_endpoint, headers=headers)
    data = json.loads(response.content.decode("utf-8"))

    # Extract the abstract from the response
    abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]

    # Return the abstract
    return abstract

def pii_abstract_retreiver(pii: str) -> str:
    api_endpoint = f"https://api.elsevier.com/content/abstract/pii/{pii}"

    # Set the headers
    headers = {
        "X-ELS-APIKey": key,
        "Accept": "application/json"
    }

    # Make the request to retrieve the abstract
    response = requests.get(api_endpoint, headers=headers)
    data = json.loads(response.content.decode("utf-8"))

    # Extract the abstract from the response
    abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]

    # Return the abstract
    return abstract

### Code output

In [8]:
# Perform the search and retrieve article info
my_set = article_info(scopus_search(my_input))

# Create an empty list to store the output dictionary keys
list_of_lists = []

# Loop over the IDs and find locations
print(f'\nGetting locations from {len(my_set[3])} abstracts...')
for n, scopus_id in tqdm(enumerate(my_set[3])):
    output_dict = location_finder(scopus_id_abstract_retreiver(scopus_id))
    list_of_lists.append(list(output_dict.keys()))

print(f"\nMaking Dataframe...")
        
# Extract first and second elements
first_elements = [inner_list[0] if len(inner_list) > 0 else None for inner_list in list_of_lists]
second_elements = [inner_list[1] if len(inner_list) > 1 else None for inner_list in list_of_lists]
third_elements = [inner_list[2] if len(inner_list) > 2 else None for inner_list in list_of_lists]
fourth_elements = [inner_list[3] if len(inner_list) > 3 else None for inner_list in list_of_lists]

# Make DataFrame
df = pd.DataFrame({
    "Paper Title": my_set[0],
    "Scopus ID" : my_set[3],
    "DOI": my_set[1],
    "first_location" : first_elements,
    "second_location" : second_elements,
    "third_location" : third_elements,
    "fourth_location": fourth_elements})

# Saving file
df.to_csv("output.csv")

print("DONE!")


Scrapping Data Pages from Scopus using salinization AND flooding...


100%|██████████| 15/15 [00:12<00:00,  1.17it/s]


Number of articles: 350

Getting article titles...


100%|██████████| 350/350 [00:00<00:00, 232132.57it/s]



Getting locations from 350 abstracts...


6it [00:06,  1.15s/it]


ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'NoneType'>

### Request Output Structure

In [None]:
articles_list[0]

### Another way to get locations

In [None]:
import locationtagger

# initializing sample text
sample_text = "India has very rich and vivid culture\
        widely spread from Kerala to Nagaland to Haryana to Maharashtra. " \
        "Delhi being capital with Mumbai financial capital.\
        Can be said better than some western cities such as " \
        " Munich, London etc. Pakistan and Bangladesh share its borders"

# extracting entities.
place_entity = locationtagger.find_locations(text = full_text)

# getting all countries
print("The countries in text : ")
print(place_entity.countries)

# getting all states
print("The states in text : ")
print(place_entity.regions)

# getting all cities
print("The cities in text : ")
print(place_entity.cities)