### Importing dependencies

In [4]:
import json
import os
import re
from collections import Counter
from concurrent.futures import ThreadPoolExecutor

import nltk
import numpy as np
import pandas as pd
import requests
import spacy
from tqdm import tqdm

### Uncomment and run the following lines of code ONCE

In [5]:
# # essential entity models downloads
# nltk.downloader.download('maxent_ne_chunker')
# nltk.downloader.download('words')
# nltk.downloader.download('treebank')
# nltk.downloader.download('maxent_treebank_pos_tagger')
# nltk.downloader.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# !pip3 install country_list

# Example Scopus search request (substitute your own key):
# https://api.elsevier.com/content/search/scopus?query=title-abs-key(salinization)&apiKey=<YOUR_API_KEY>

### Functions

In [6]:
import requests
import json
import spacy
from collections import Counter
from tqdm import tqdm

# Set your keywords
my_input = 'salinization AND flooding'
# Set your API Key.
# The ELSEVIER_API_KEY environment variable takes precedence so the credential
# does not have to live in the notebook.
# NOTE(review): the literal fallback is kept only so existing runs keep working;
# it is stored here in plain text and should be rotated and then removed.
key = os.environ.get('ELSEVIER_API_KEY', '2a0215213a533fe5b5c7cafc2655348c')

# Create a session for making requests; every Scopus call below reuses it so the
# authentication and content-negotiation headers are set in one place.
session = requests.Session()
session.headers.update({
    'X-ELS-APIKey': key,
    'X-ELS-ResourceVersion': 'XOCS',
    'Accept': 'application/json',
})

def scopus_search(my_input: str) -> list:
    """Collect every Scopus search entry matching ``my_input``.

    The query is wrapped as ``title-abs-key(<my_input>)`` and sent through the
    module-level ``session``. Results are fetched 25 at a time until the total
    reported by the first response has been covered.

    Args:
        my_input: Search expression placed inside ``title-abs-key(...)``.

    Returns:
        A list with the raw ``entry`` dictionaries of all pages, in page order.
        An empty list is returned when the API reports zero results.

    Raises:
        KeyError: If the first response has no ``search-results`` /
            ``opensearch:totalResults`` field.
    """
    api_resource = "https://api.elsevier.com/content/search/scopus"
    query = f'title-abs-key({my_input})'

    # Set the desired number of results per page
    results_per_page = 25

    def fetch_page(start_index: int) -> dict:
        # Passing the query through `params` lets requests URL-encode it, so
        # keywords containing characters such as '&' or '#' are not mangled.
        response = session.get(
            api_resource,
            params={"query": query, "count": results_per_page, "start": start_index},
        )
        return json.loads(response.content.decode("utf-8"))

    # The first page also tells us how many results exist in total
    first_page = fetch_page(0)
    total_results = int(first_page['search-results']['opensearch:totalResults'])
    # Ceiling division: an exact multiple of the page size must not add an empty page
    total_pages = -(-total_results // results_per_page)

    # List to store all articles
    articles_list = []

    print(f"Scrapping Data Pages from Scopus using {my_input}...")
    for page_number in tqdm(range(total_pages)):
        # Page 0 was already downloaded above; reuse it instead of requesting it twice
        if page_number == 0:
            page = first_page
        else:
            page = fetch_page(page_number * results_per_page)
        try:
            articles_list.extend(page['search-results']['entry'])
        except (KeyError, TypeError):
            # Best effort: a page without an entry list is skipped
            continue

    print(f"Number of articles: {len(articles_list)}")
    return articles_list

def article_info(articles_list: list) -> tuple:
    """Split a list of Scopus search entries into parallel per-field lists.

    Args:
        articles_list: Search entries (dictionaries) as returned by the Scopus
            search API.

    Returns:
        A tuple of ten lists, each as long as ``articles_list``, in this order:
        title, DOI, EID, identifier, PII, URL, creator, publication name,
        cover date, cited-by count. A field missing from an entry is stored
        as ``None``.

    Raises:
        AttributeError: If an element of ``articles_list`` is not a mapping.
    """
    print(f"\nGetting article titles...")

    # Kept for backward compatibility: the original function reset this
    # module-level name on every call, although nothing here fills it.
    global outliers
    outliers = {}

    # Entry keys, listed in the order of the returned tuple
    fields = (
        "dc:title",
        "prism:doi",
        "eid",
        "dc:identifier",
        "pii",
        "prism:url",
        "dc:creator",
        "prism:publicationName",
        "prism:coverDate",
        "citedby-count",
    )
    columns = tuple([] for _ in fields)

    # Plain in-memory lookups: no thread pool or progress bar is needed here
    for article in articles_list:
        for column, field in zip(columns, fields):
            column.append(article.get(field))

    return columns

# Side-channel results of scopus_id_abstract_retriever: every call appends
# exactly one element to each list, so index i describes the i-th call made
# since the lists were created.
affiliation = []
area = []
author_count = []

# Errors raised by indexing into a response that lacks the expected structure
_LOOKUP_ERRORS = (KeyError, TypeError, IndexError)


def _record_abstract_metadata(data) -> None:
    """Append affiliation, subject areas and author count of one response to the module lists."""
    # Affiliation: store the name when the field is a single mapping, otherwise
    # store the raw field, or None when it is absent
    try:
        affiliation.append(data["abstracts-retrieval-response"]["affiliation"]["affilname"])
    except _LOOKUP_ERRORS:
        try:
            affiliation.append(data["abstracts-retrieval-response"]["affiliation"])
        except _LOOKUP_ERRORS:
            affiliation.append(None)

    # Study Area: all subject names joined with " & "
    try:
        result = data["abstracts-retrieval-response"]["subject-areas"]["subject-area"]
        subjects = [subject["$"] for subject in result]
        area.append(" & ".join(subjects))
    except _LOOKUP_ERRORS:
        area.append(None)

    # Authors
    try:
        author_count.append(len(data["abstracts-retrieval-response"]["authors"]['author']))
    except _LOOKUP_ERRORS:
        author_count.append(None)


def scopus_id_abstract_retriever(scopus_id: str) -> str:
    """Fetch one record from the Scopus abstract endpoint and return its abstract.

    As a side effect, one element is appended to each of the module-level lists
    ``affiliation``, ``area`` and ``author_count`` (``None`` where the response
    does not provide the value).

    Args:
        scopus_id: Identifier appended to the ``/content/abstract/scopus_id/`` URL.

    Returns:
        The ``dc:description`` text, or ``"NA"`` when it is missing or empty.
    """
    api_endpoint = f"https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}"

    # Make the request to retrieve the abstract
    response = session.get(api_endpoint)
    data = json.loads(response.content.decode("utf-8"))

    # Extract the abstract from the response
    try:
        abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]
    except _LOOKUP_ERRORS:
        abstract = "NA"
    if not abstract:
        # A null or empty description is treated like a missing one so callers
        # always receive a string
        abstract = "NA"

    # The metadata is recorded whether or not an abstract was found
    _record_abstract_metadata(data)

    # Return the abstract
    return abstract

### SKIP

In [7]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

def extract_text_from_pages(pdf_path, num_pages):
    """Return the text of the leading pages of a PDF as one space-separated string.

    Pages are read in the order ``extract_pages`` yields them; reading stops at
    the first page whose ``pageid`` is greater than ``num_pages``. Only
    ``LTTextContainer`` elements contribute, each stripped of surrounding
    whitespace.
    """
    chunks = []
    for layout in extract_pages(pdf_path):
        if layout.pageid > num_pages:
            break
        chunks.extend(
            item.get_text().strip()
            for item in layout
            if isinstance(item, LTTextContainer)
        )
    return " ".join(chunks).strip()

# Loaded spaCy pipelines, keyed by model name, so the model is read from disk
# once per kernel instead of once per call
_SPACY_MODELS = {}


def _get_spacy_model(model_name: str = "en_core_web_sm"):
    """Return the spaCy pipeline ``model_name``, loading it on first use."""
    if model_name not in _SPACY_MODELS:
        _SPACY_MODELS[model_name] = spacy.load(model_name)
    return _SPACY_MODELS[model_name]


def location_finder(text: str) -> dict:
    """Return the five most frequent location mentions in ``text``.

    Entities labelled ``GPE`` or ``LOC`` by the ``en_core_web_sm`` pipeline are
    counted by their exact text.

    Args:
        text: Text to scan.

    Returns:
        A dict mapping entity text to its count, ordered from most to least
        frequent, with at most five items.
    """
    # Process the text with the cached pre-trained model
    doc = _get_spacy_model()(text)

    # Find location words
    locations = [entity.text for entity in doc.ents if entity.label_ in ("GPE", "LOC")]

    # most_common(5) is the documented equivalent of sorting by count
    # (descending) and keeping the first five
    return dict(Counter(locations).most_common(5))

### GPT API?

In [10]:
# OpenAI credential used by ask_question.
# The OPENAI_API_KEY environment variable takes precedence so the secret does
# not have to live in the notebook.
# NOTE(review): the literal fallback is kept only so existing runs keep working;
# it is stored here in plain text and should be revoked and then removed.
API_key = os.environ.get("OPENAI_API_KEY", "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")

import openai
from transformers import GPT2Tokenizer

# Tokenizer instances keyed by pretrained name; building one is expensive, and
# truncate_text is called once per abstract
_TOKENIZER_CACHE = {}


def _gpt2_tokenizer():
    """Return the shared 'gpt2' tokenizer, creating it on first use."""
    if 'gpt2' not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE['gpt2'] = GPT2Tokenizer.from_pretrained('gpt2')
    return _TOKENIZER_CACHE['gpt2']


def truncate_text(text, max_tokens):
    """Shorten ``text`` so that it encodes to at most ``max_tokens`` GPT-2 tokens.

    Args:
        text: Text to shorten.
        max_tokens: Token budget. Values below zero are treated as zero.

    Returns:
        ``text`` itself when it already fits, otherwise the decoded first
        ``max_tokens`` tokens.
    """
    tokenizer = _gpt2_tokenizer()
    # The text is only tokenized here, never run through the GPT-2 model, so a
    # tokenizer warning about exceeding the model's own sequence length does
    # not affect the result.
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) <= max_tokens:
        return text
    # Clamp the budget: a negative slice bound would drop tokens from the end
    # instead of returning an empty result
    return tokenizer.decode(tokens[:max(max_tokens, 0)])

# truncate_text takes a text and a max_tokens budget and shortens the text to fit that budget.
# It counts tokens with the GPT-2 tokenizer; text that already fits is returned unchanged,
# otherwise only the first max_tokens tokens are kept and decoded back to a string.

# ask_question always passes its context through truncate_text before building the prompt.
# The budget is derived from the model's 4096-token window minus an allowance for the rest
# of the prompt, so the finished prompt is meant to stay inside that window.

def ask_question(context, question, answer_tokens=100):
    """Ask the completion model ``question`` about ``context`` and return its answer.

    Args:
        context: Text placed after "Research Paper Abstract:" in the prompt. It
            is truncated with ``truncate_text`` so prompt plus answer fit the
            model window.
        question: Question appended to the prompt.
        answer_tokens: Maximum number of tokens the model may generate.

    Returns:
        The first completion's text with surrounding whitespace removed.
    """
    # Set up the OpenAI API client
    openai.api_key = API_key

    # Everything in the prompt except the context itself
    scaffold = f"Research Paper Abstract: \nQuestion: {question}\nAnswer:"
    # Token budget left for the context. The scaffold is measured in
    # characters, which over-estimates its token count for ordinary English
    # text, and the answer's tokens are reserved as well because the window
    # covers prompt and completion together. Never go below zero.
    context_budget = max(0, 4096 - answer_tokens - len(scaffold))
    truncated_context = truncate_text(context, context_budget)

    # Create a prompt by combining the context and the question
    prompt = f"Research Paper Abstract: {truncated_context}\nQuestion: {question}\nAnswer:"

    # Generate a response from the completion model
    response = openai.Completion.create(
        engine='text-davinci-003',  # Completion model variant
        prompt=prompt,
        max_tokens=answer_tokens,  # Upper bound on the response length
        temperature=0.7,  # Adjust the randomness of the response
        n=1,  # Generate a single response
        stop=None,  # Let the model generate a complete answer
        # NOTE(review): in the 0.x openai client the HTTP timeout appears to be
        # `request_timeout`; `timeout` may not bound the request - confirm.
        timeout=10,
    )

    # Extract the answer from the model's response
    return response.choices[0].text.strip()


### Code output

In [11]:
# Perform the search and retrieve article info
my_set = article_info(scopus_search(my_input))

# One raw model answer (a string) per abstract, in the same order as my_set[3]
list_of_lists = []

question = "I'm making a map of all research about Salt Water Intrusion and Sea Level Rise (SWISLR). Using this abstract, tell me in which coastal region this SWISLR Research is done? I want your answer to be concise in this format: {Country: Country Name, State: State Name, City: City Name}. Write (None) if you can't find this information"

# Loop over the IDs and find locations
print(f'\nGetting locations from {len(my_set[3])} abstracts...')
# The loop is sequential on purpose: scopus_id_abstract_retriever appends to
# module-level lists in call order, so the calls must follow the order of
# my_set[3]. Iterating the list directly also lets tqdm show the total.
for scopus_id in tqdm(my_set[3]):
    abstract = scopus_id_abstract_retriever(scopus_id)
    list_of_lists.append(ask_question(abstract, question))

Scrapping Data Pages from Scopus using salinization AND flooding...


100%|██████████| 15/15 [00:07<00:00,  1.97it/s]


Number of articles: 353

Getting article titles...


100%|██████████| 353/353 [00:00<00:00, 147630.80it/s]



Getting locations from 353 abstracts...


43it [00:53,  1.13s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1081 > 1024). Running this sequence through the model will result in indexing errors
353it [07:29,  1.27s/it]


In [19]:
print(f"\nMaking Dataframe...")

data = list_of_lists

# Field labels requested from the model, in output order
_LOCATION_FIELDS = ("Country", "State", "City")
# Values (compared lower-cased, without surrounding parentheses) that mean "not found"
_MISSING_VALUES = {"", "none", "n/a", "null"}


def _parse_location(entry):
    """Extract (country, state, city) from one model answer.

    Each field is located by its label rather than by position, so answers
    wrapped in braces, answers with fewer than three fields, and a bare
    "None" / "(None)" no longer raise. A field that is absent or marked as
    not found becomes ``None``.
    """
    answer = str(entry)
    parsed = []
    for field in _LOCATION_FIELDS:
        # Capture everything after "<field>:" up to the next comma, brace or newline
        match = re.search(rf"\b{field}\s*:\s*([^,{{}}\n]*)", answer, flags=re.IGNORECASE)
        value = match.group(1).strip(" \t'\"") if match else ""
        is_missing = value.strip("() ").lower() in _MISSING_VALUES
        parsed.append(None if is_missing else value)
    return tuple(parsed)


# Initialize empty lists for country, state, and city
countries = []
states = []
cities = []

# Extract country, state, and city from every answer
for entry in data:
    country, state, city = _parse_location(entry)
    countries.append(country)
    states.append(state)
    cities.append(city)



Making Dataframe...


In [30]:

# Make DataFrame
# Make DataFrame
n_articles = len(my_set[3])


def _latest(values, count):
    """Return the last ``count`` items of ``values``.

    The metadata lists are filled by side effect and keep growing across
    calls; the rows belonging to the current search are the most recent ones.
    Taking them from the end works on a fresh kernel as well as after earlier
    calls, unlike a fixed ``[1:]`` offset.
    """
    if len(values) < count:
        raise ValueError(
            f"expected at least {count} collected values, found {len(values)}"
        )
    return values[len(values) - count:]


df = pd.DataFrame({
    "Paper Title": my_set[0],
    "Scopus ID" : my_set[3],
    "DOI": my_set[1],
    "URL": my_set[5],
    "Lead Author": my_set[6],
    "Affiliation": _latest(affiliation, n_articles),
    "Author count": _latest(author_count, n_articles),
    "Area of Study": _latest(area, n_articles),
    "Publication": my_set[7],
    "Cover Date": my_set[8],
    "Number of citations": my_set[9],
    "Country" : countries,
    "State" : states,
    "City" : cities,
    })

# An affiliation stored as a non-empty list is reduced to the name of its
# first element; any other value is kept as is
affiliation_series = df['Affiliation']

modified_series = affiliation_series.apply(lambda x: x[0]['affilname'] if isinstance(x, list) and len(x) > 0 else x)

df['Affiliation'] = modified_series

# Saving file
df.to_csv("output_.csv")

print("DONE!")

DONE!


In [31]:
# Load the table stored in "output.csv" for comparison with `df`.
# NOTE(review): the cell above writes "output_.csv" (with an underscore), so
# this reads a different, pre-existing file - confirm that is intended.
df1 = pd.read_csv("output.csv")

In [32]:
# Preview the first rows of the frame built in this notebook
df.head()

Unnamed: 0,Paper Title,Scopus ID,DOI,URL,Lead Author,Affiliation,Author count,Area of Study,Publication,Cover Date,Number of citations,Country,State,City
0,Linking soil organic carbon dynamics to microb...,SCOPUS_ID:85153864873,10.1016/j.apsoil.2023.104931,https://api.elsevier.com/content/abstract/scop...,Chen Y.,Northwest A&amp;F University,12.0,Ecology & Agricultural and Biological Sciences...,Applied Soil Ecology,2023-09-01,0,,,
1,Enhanced phosphorus fixation in red mud-amende...,SCOPUS_ID:85153933867,10.1016/j.envres.2023.115960,https://api.elsevier.com/content/abstract/scop...,Ding S.,China Agricultural University,8.0,Biochemistry & Environmental Science (all),Environmental Research,2023-07-15,1,,,
2,Drip irrigation in agricultural saline-alkali ...,SCOPUS_ID:85151557628,10.1016/j.scitotenv.2023.163226,https://api.elsevier.com/content/abstract/scop...,Du Y.,Northwest A&amp;F University,4.0,Environmental Engineering & Environmental Chem...,Science of the Total Environment,2023-07-01,0,,,
3,Future flooding of the Volta Delta caused by s...,SCOPUS_ID:85161886673,10.1007/s11852-023-00952-0,https://api.elsevier.com/content/abstract/scop...,Brempong E.K.,University of Cape Coast Ghana,9.0,Oceanography & Ecology & Nature and Landscape ...,Journal of Coastal Conservation,2023-06-01,0,Ghana,,
4,Toward sustainable inland aquaculture: Coastal...,SCOPUS_ID:85149064957,10.1016/j.rsase.2023.100930,https://api.elsevier.com/content/abstract/scop...,Hung W.C.,National Yang Ming Chiao Tung University,9.0,"Geography, Planning and Development & Computer...",Remote Sensing Applications: Society and Envir...,2023-04-01,0,Taiwan,Pingtung,Linbian


In [33]:
# Preview the first rows of the frame read from "output.csv"
df1.head()

Unnamed: 0.1,Unnamed: 0,Paper Title,Scopus ID,DOI,URL,Lead Author,Affiliation,Author count,Area of Study,Publication,Cover Date,Number of citations,first_location,second_location,third_location,fourth_location
0,0,Linking soil organic carbon dynamics to microb...,SCOPUS_ID:85153864873,10.1016/j.apsoil.2023.104931,https://api.elsevier.com/content/abstract/scop...,Chen Y.,Northwest A&amp;F University,12.0,Ecology & Agricultural and Biological Sciences...,Applied Soil Ecology,2023-09-01,0,Proteobacteria,Actinobacteria,Symbiobacterium,BG
1,1,Enhanced phosphorus fixation in red mud-amende...,SCOPUS_ID:85153933867,10.1016/j.envres.2023.115960,https://api.elsevier.com/content/abstract/scop...,Ding S.,China Agricultural University,8.0,Biochemistry & Environmental Science (all),Environmental Research,2023-07-15,1,Calcium,,,
2,2,Drip irrigation in agricultural saline-alkali ...,SCOPUS_ID:85151557628,10.1016/j.scitotenv.2023.163226,https://api.elsevier.com/content/abstract/scop...,Du Y.,Northwest A&amp;F University,4.0,Environmental Engineering & Environmental Chem...,Science of the Total Environment,2023-07-01,0,,,,
3,3,Toward sustainable inland aquaculture: Coastal...,SCOPUS_ID:85149064957,10.1016/j.rsase.2023.100930,https://api.elsevier.com/content/abstract/scop...,Hung W.C.,National Yang Ming Chiao Tung University,9.0,"Geography, Planning and Development & Computer...",Remote Sensing Applications: Society and Envir...,2023-04-01,0,Pingtung,Taiwan,,
4,4,Progress of Euhalophyte Adaptation to Arid Are...,SCOPUS_ID:85151496711,10.3390/agriculture13030704,https://api.elsevier.com/content/abstract/scop...,Wang Y.,Xinjiang Institute of Ecology and Geography Ch...,6.0,Food Science & Agronomy and Crop Science & Pla...,Agriculture (Switzerland),2023-03-01,0,,,,


In [44]:
# Columns taken from each side of the join; "Scopus ID" is the shared key
left_columns = [
    'Paper Title', 'Scopus ID', 'DOI', 'URL',
    'first_location', 'second_location', 'third_location', 'fourth_location',
]
right_columns = ['Scopus ID', 'Country', 'State', 'City']

# Left join: every row of df1 is kept, and the Country/State/City values from
# df are attached where the Scopus ID matches
merged_df = df1[left_columns].merge(df[right_columns], on="Scopus ID", how="left")
merged_df.to_csv('merged.csv')