### Importing dependencies

In [1]:
import json
import requests
from tqdm import tqdm
import pandas as pd
import numpy as np
from collections import Counter
from concurrent.futures import ThreadPoolExecutor

import nltk
import spacy

### Uncomment and run the following lines of code ONCE

In [2]:
# # essential entity models downloads
# nltk.downloader.download('maxent_ne_chunker')
# nltk.downloader.download('words')
# nltk.downloader.download('treebank')
# nltk.downloader.download('maxent_treebank_pos_tagger')
# nltk.downloader.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# !pip3 install country_list


### Functions

In [6]:
import requests
import json
import spacy
from collections import Counter
from tqdm import tqdm

# Set your keywords
my_input = 'salinization AND flooding'
# Set your API Key
key = 'c944c310dad119e7f4ad0078f29540fa'

# Create a session for making requests
session = requests.Session()
session.headers['X-ELS-APIKey'] = key
session.headers['X-ELS-ResourceVersion'] = 'XOCS'
session.headers['Accept'] = 'application/json'

def scopus_search(my_input: str) -> list:
    api_resource = "https://api.elsevier.com/content/search/scopus?"
    search_param = f'query=title-abs-key({my_input})'  # for example

    # Set the desired number of results per page
    results_per_page = 25

    # Send the first request to get the total number of results
    first_page_request = session.get(api_resource + search_param + f"&count={results_per_page}&start=0")
    first_page = json.loads(first_page_request.content.decode("utf-8"))

    total_results = int(first_page['search-results']['opensearch:totalResults'])
    total_pages = (total_results // results_per_page) + 1

    # List to store all articles
    articles_list = []

    print(f"Scrapping Data Pages from Scopus using {my_input}...")
    # Iterate over all pages
    with ThreadPoolExecutor() as executor:
        for page_number in tqdm(range(total_pages)):
            start_index = page_number * results_per_page
            page_request = session.get(api_resource + search_param + f"&count={results_per_page}&start={start_index}")
            page = json.loads(page_request.content.decode("utf-8"))
            try:
                articles_list.extend(page['search-results']['entry'])
            except:
                continue

        print(f"Number of articles: {len(articles_list)}")
        return articles_list

def article_info(articles_list: list) -> set:
    print(f"\nGetting article titles...")

    article_title = []
    article_doi = []
    article_eid = []
    article_ID = []
    article_pii = []
    article_url = []
    article_creator = []
    article_pub = []
    article_coverDate = []
    article_coverDisplayDate = []

    global outliers
    outliers = {}
    # Access individual articles
    with ThreadPoolExecutor() as executor:
        for article in tqdm(range(len(articles_list))):
            try:
                article_pii.append(articles_list[article].get("pii"))
                article_title.append(articles_list[article].get("dc:title"))
                article_doi.append(articles_list[article].get("prism:doi"))
                article_eid.append(articles_list[article].get("eid"))
                article_ID.append(articles_list[article].get("dc:identifier"))
                article_url.append(articles_list[article].get("prism:url"))
                article_creator.append(articles_list[article].get("dc:creator"))
                article_pub.append(articles_list[article].get("prism:publicationName"))
                article_coverDate.append(articles_list[article].get("prism:coverDate"))
                article_coverDisplayDate.append(articles_list[article].get("prism:coverDisplayDate"))

            except:
                article_pii.append(None)
                article_doi.append(None)
                article_title.append(articles_list[article].get("dc:title"))
                article_eid.append(articles_list[article].get("eid"))
                article_ID.append(articles_list[article].get("dc:identifier"))
                article_url.append(articles_list[article].get("prism:url"))
                article_creator.append(None)
                article_pub.append(articles_list[article].get("prism:publicationName"))
                article_coverDate.append(articles_list[article].get("prism:coverDate"))
                article_coverDisplayDate.append(articles_list[article].get("prism:coverDisplayDate"))

        return (
            article_title, article_doi, article_eid, article_ID,
            article_pii, article_url, article_creator,
            article_pub, article_coverDate, article_coverDisplayDate
        )

def scopus_id_abstract_retriever(scopus_id: str) -> str:
    api_endpoint = f"https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}"
    # api_endpoint = f"https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}?apiKey=c944c310dad119e7f4ad0078f29540fa"

    # Make the request to retrieve the abstract
    response = session.get(api_endpoint)
    data = json.loads(response.content.decode("utf-8"))

    print(data)

    # Extract the abstract from the response
    try:
        abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]
    except:
        abstract = "NA"

    # Return the abstract
    return abstract

def location_finder(text: str) -> dict:
    # Load the pre-trained model
    nlp = spacy.load("en_core_web_sm")

    # Sample text
    sample_text = text

    # Process the text
    doc = nlp(sample_text)

    # Find location words and their locations
    locations = [entity.text for entity in doc.ents if entity.label_ == "GPE" or entity.label_ == "LOC"]

    # Sorting locations by frequency
    my_dict = dict(Counter(locations))
    sorted_dict = dict(sorted(my_dict.items(), key=lambda x: x[1], reverse=True))
    first_five_elements = dict(list(sorted_dict.items())[:5])

    return first_five_elements

In [7]:
def doi_abstract_retriever(doi: str) -> str:
    api_endpoint = f"https://api.elsevier.com/content/abstract/doi/{doi}"

    # Set the headers
    headers = {
        "X-ELS-APIKey": key,
        "Accept": "application/json"
    }

    # Make the request to retrieve the abstract
    response = requests.get(api_endpoint, headers=headers)
    data = json.loads(response.content.decode("utf-8"))

    # Extract the abstract from the response
    abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]

    # Return the abstract
    return abstract

def eid_abstract_retreiver(eid: str) -> str:
    api_endpoint = f"https://api.elsevier.com/content/abstract/eid/{eid}"

    # Set the headers
    headers = {
        "X-ELS-APIKey": key,
        "Accept": "application/json"
    }

    # Make the request to retrieve the abstract
    response = requests.get(api_endpoint, headers=headers)
    data = json.loads(response.content.decode("utf-8"))

    # Extract the abstract from the response
    abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]

    # Return the abstract
    return abstract

def pii_abstract_retreiver(pii: str) -> str:
    api_endpoint = f"https://api.elsevier.com/content/abstract/pii/{pii}"

    # Set the headers
    headers = {
        "X-ELS-APIKey": key,
        "Accept": "application/json"
    }

    # Make the request to retrieve the abstract
    response = requests.get(api_endpoint, headers=headers)
    data = json.loads(response.content.decode("utf-8"))

    # Extract the abstract from the response
    abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]

    # Return the abstract
    return abstract

### Code output

In [17]:
# Perform the search and retrieve article info
my_set = article_info(scopus_search(my_input))

# Create an empty list to store the output dictionary keys
list_of_lists = []

# Loop over the IDs and find locations
print(f'\nGetting locations from {len(my_set[3])} abstracts...')
with ThreadPoolExecutor() as executor:
    for n, scopus_id in tqdm(enumerate(my_set[3])):
        output_dict = location_finder(scopus_id_abstract_retriever(scopus_id))
        list_of_lists.append(list(output_dict.keys()))
        
print(f"\nMaking Dataframe...")
        
# Extract first and second elements
first_elements = [inner_list[0] if len(inner_list) > 0 else None for inner_list in list_of_lists]
second_elements = [inner_list[1] if len(inner_list) > 1 else None for inner_list in list_of_lists]
third_elements = [inner_list[2] if len(inner_list) > 2 else None for inner_list in list_of_lists]
fourth_elements = [inner_list[3] if len(inner_list) > 3 else None for inner_list in list_of_lists]

# Make DataFrame
df = pd.DataFrame({
    "Paper Title": my_set[0],
    "Scopus ID" : my_set[3],
    "DOI": my_set[1],
    "URL": my_set[5],
    "Creator": my_set[6],
    "Publication": my_set[7],
    "Cover_Date": my_set[8],
    "Cover_Display_Date": my_set[9],
    "first_location" : first_elements,
    "second_location" : second_elements,
    "third_location" : third_elements,
    "fourth_location": fourth_elements})

# Saving file
df.to_csv("output.csv")

print("DONE!")

Scrapping Data Pages from Scopus using salinization AND flooding...


100%|██████████| 15/15 [00:07<00:00,  2.04it/s]


Number of articles: 350

Getting article titles...


100%|██████████| 350/350 [00:00<00:00, 387643.62it/s]



Getting locations from 350 abstracts...


0it [00:00, ?it/s]


NameError: name 'response' is not defined

In [11]:
from country_list import countries_for_language
# countries_for_language returns a list of tuples now, might be changed to an OrderedDict
countries = dict(countries_for_language('en'))
del countries["US"]

# List of countries
countries = list(countries.values())
countries.append("Africa")

# Columns to check
columns_to_check = df.columns[-4:]

# Function to check if a country is present in a given value
def contains_country(value):
    for country in countries:
        if country in value:
            return True
    return False

print(columns_to_check)

# Function to check if a row should be deleted
def should_delete_row(row):
    for column in columns_to_check:
        value = str(row[column])
        if contains_country(value) and "USA" not in value and "U.S." not in value and "United States" not in value:
            return False
    return True

# Check if the last four columns are empty
last_four_columns_empty = df.iloc[:, -4:].isnull().all(axis=1)
# Delete rows where the last four columns are empty
df = df[~last_four_columns_empty]

# Delete rows that meet the criteria
df = df[df.apply(should_delete_row, axis=1) == True]
df.to_csv('excluded.csv')


Index(['first_location', 'second_location', 'third_location',
       'fourth_location'],
      dtype='object')


In [14]:
columns_to_check

Index(['first_location', 'second_location', 'third_location',
       'fourth_location'],
      dtype='object')

### Request Output Structure

In [None]:
articles_list[0]

### Another way to get locations

In [None]:
import locationtagger

# initializing sample text
sample_text = "India has very rich and vivid culture\
        widely spread from Kerala to Nagaland to Haryana to Maharashtra. " \
        "Delhi being capital with Mumbai financial capital.\
        Can be said better than some western cities such as " \
        " Munich, London etc. Pakistan and Bangladesh share its borders"

# extracting entities.
place_entity = locationtagger.find_locations(text = full_text)

# getting all countries
print("The countries in text : ")
print(place_entity.countries)

# getting all states
print("The states in text : ")
print(place_entity.regions)

# getting all cities
print("The cities in text : ")
print(place_entity.cities)