### Importing dependencies

In [29]:
import json
import requests
from tqdm import tqdm
import pandas as pd
import numpy as np
from collections import Counter
from concurrent.futures import ThreadPoolExecutor

import nltk
import spacy

### Uncomment and run the following lines of code ONCE

In [3]:
# # essential entity models downloads
# nltk.downloader.download('maxent_ne_chunker')
# nltk.downloader.download('words')
# nltk.downloader.download('treebank')
# nltk.downloader.download('maxent_treebank_pos_tagger')
# nltk.downloader.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# !pip3 install country_list

# https://api.elsevier.com/content/search/scopus?query=title-abs-key(salinization)&apiKey=33db67edfe2549a23e9da7d09078b777

### Functions

In [57]:
import requests
import json
import spacy
from collections import Counter
from tqdm import tqdm

# Set your keywords
my_input = 'salinization AND flooding'
# Set your API Key
key = '2a0215213a533fe5b5c7cafc2655348c'

# Create a session for making requests
session = requests.Session()
session.headers['X-ELS-APIKey'] = key
session.headers['X-ELS-ResourceVersion'] = 'XOCS'
session.headers['Accept'] = 'application/json'

def scopus_search(my_input: str) -> list:
    api_resource = "https://api.elsevier.com/content/search/scopus?"
    search_param = f'query=title-abs-key({my_input})'  # for example

    # Set the desired number of results per page
    results_per_page = 25

    # Send the first request to get the total number of results
    first_page_request = session.get(api_resource + search_param + f"&count={results_per_page}&start=0")
    first_page = json.loads(first_page_request.content.decode("utf-8"))

    total_results = int(first_page['search-results']['opensearch:totalResults'])
    total_pages = (total_results // results_per_page) + 1

    # List to store all articles
    articles_list = []

    print(f"Scrapping Data Pages from Scopus using {my_input}...")
    # Iterate over all pages
    with ThreadPoolExecutor() as executor:
        for page_number in tqdm(range(total_pages)):
            start_index = page_number * results_per_page
            page_request = session.get(api_resource + search_param + f"&count={results_per_page}&start={start_index}")
            page = json.loads(page_request.content.decode("utf-8"))
            try:
                articles_list.extend(page['search-results']['entry'])
            except:
                continue

        print(f"Number of articles: {len(articles_list)}")
        return articles_list

def article_info(articles_list: list) -> set:
    print(f"\nGetting article titles...")

    article_title = []
    article_doi = []
    article_eid = []
    article_ID = []
    article_pii = []
    article_url = []
    article_creator = []
    article_pub = []
    article_coverDate = []
    article_number_citations = []
    

    global outliers
    outliers = {}
    # Access individual articles
    with ThreadPoolExecutor() as executor:
        for article in tqdm(range(len(articles_list))):
            try:
                article_pii.append(articles_list[article].get("pii"))
                article_title.append(articles_list[article].get("dc:title"))
                article_doi.append(articles_list[article].get("prism:doi"))
                article_eid.append(articles_list[article].get("eid"))
                article_ID.append(articles_list[article].get("dc:identifier"))
                article_url.append(articles_list[article].get("prism:url"))
                article_creator.append(articles_list[article].get("dc:creator"))
                article_pub.append(articles_list[article].get("prism:publicationName"))
                article_coverDate.append(articles_list[article].get("prism:coverDate"))
                article_number_citations.append(articles_list[article].get("citedby-count"))

            except:
                article_pii.append(None)
                article_doi.append(None)
                article_title.append(articles_list[article].get("dc:title"))
                article_eid.append(articles_list[article].get("eid"))
                article_ID.append(articles_list[article].get("dc:identifier"))
                article_url.append(articles_list[article].get("prism:url"))
                article_creator.append(None)
                article_pub.append(articles_list[article].get("prism:publicationName"))
                article_coverDate.append(articles_list[article].get("prism:coverDate"))
                article_number_citations.append(articles_list[article].get("citedby-count"))


        return (
            article_title, article_doi, article_eid, article_ID,
            article_pii, article_url, article_creator,
            article_pub, article_coverDate, article_number_citations
        )

affiliation = []
area = []
author_count = []

def scopus_id_abstract_retriever(scopus_id: str) -> str:
    api_endpoint = f"https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}"

    # Make the request to retrieve the abstract
    response = session.get(api_endpoint)
    data = json.loads(response.content.decode("utf-8"))
    # Extract the abstract from the response
    # try:
    abstract = data["abstracts-retrieval-response"]["coredata"]["dc:description"]
    try: affiliation.append(data["abstracts-retrieval-response"]["affiliation"]["affilname"])
    except:
        try: affiliation.append(data["abstracts-retrieval-response"]["affiliation"])
        except: affiliation.append(None)
    # Study Area
    try:
        result = data["abstracts-retrieval-response"]["subject-areas"]["subject-area"]
        subjects = [subject["$"] for subject in result]
        area.append(" & ".join(subjects))
    except:
        area.append(None)
    # Authors
    try: author_count.append(len(data["abstracts-retrieval-response"]["authors"]['author']))
    except: author_count.append(None)

    # except:
    #     abstract = "NA"
    #     try: affiliation.append(data["abstracts-retrieval-response"]["affiliation"]["affilname"])
    #     except:
    #         try: affiliation.append(data["abstracts-retrieval-response"]["affiliation"])
    #         except: affiliation.append(None)
    #     # Study Area
    #     try:
    #         result = data["abstracts-retrieval-response"]["subject-areas"]["subject-area"]
    #         subjects = [subject["$"] for subject in result]
    #         area.append(" & ".join(subjects))
    #     except:
    #         area.append(None)
    #     # Authors
    #     try: author_count.append(len(data["abstracts-retrieval-response"]["authors"]['author']))
    #     except: author_count.append(None)
        
    # Return the abstract
    return abstract

### SKIP

In [43]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

def extract_text_from_pages(pdf_path, num_pages):
    text = ""
    for page_layout in extract_pages(pdf_path):
        if page_layout.pageid > num_pages:
            break
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                text += element.get_text().strip() + " "
    return text.strip()

def location_finder(text: str) -> dict:
    # Load the pre-trained model
    nlp = spacy.load("en_core_web_sm")

    # Sample text
    sample_text = text

    # Process the text
    doc = nlp(sample_text)

    # Find location words and their locations
    locations = [entity.text for entity in doc.ents if entity.label_ == "GPE" or entity.label_ == "LOC"]

    # Sorting locations by frequency
    my_dict = dict(Counter(locations))
    sorted_dict = dict(sorted(my_dict.items(), key=lambda x: x[1], reverse=True))
    first_five_elements = dict(list(sorted_dict.items())[:5])

    return first_five_elements

### GPT API?

In [44]:
API_key = "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"

import openai
from transformers import GPT2Tokenizer

def truncate_text(text, max_tokens):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) <= max_tokens:
        return text
    else:
        return tokenizer.decode(tokens[:max_tokens])

# In this updated code, a new function truncate_text is introduced, which takes the text and max_tokens
# as input and truncates the text to fit within the maximum token limit. It uses the GPT-2 tokenizer
# to count the tokens and then truncates the text accordingly. If the text is already within the token limit,
# it returns the original text.

# The context extracted from the PDF is passed to truncate_text to get the truncated_context.
# The length of the prompt is then checked to ensure it does not exceed 4096 tokens. If it does,
# the truncated_context is used in the prompt.

def ask_question(context, question):
    # Set up the OpenAI API client
    openai.api_key = API_key  # Replace with your API key
    truncated_context = truncate_text(context, 4096 - len(f"Research Paper Abstract: \nQuestion: {question}\nAnswer:"))

    # Create a prompt by combining the context and the question
    prompt = f"Research Paper Abstract: {truncated_context}\nQuestion: {question}\nAnswer:"

    # Generate a response from the ChatGPT model
    response = openai.Completion.create(
        engine='text-davinci-003',  # Choose the ChatGPT model variant
        prompt=prompt,
        max_tokens=100,  # Adjust the response length as needed
        temperature=0.7,  # Adjust the randomness of the response
        n=1,  # Generate a single response
        stop=None,  # Let the model generate a complete answer
        timeout=10,  # Set a timeout (in seconds) for the API request
    )

    # Extract the answer from the model's response
    answer = response.choices[0].text.strip()

    return answer


### Code output

In [45]:
# Perform the search and retrieve article info
my_set = article_info(scopus_search(my_input))

# Create an empty list to store the output dictionary keys
list_of_lists = []

question = "I'm making a map of all research about Salt Water Intrusion and Sea Level Rise (SWISLR). Using this abstract, tell me in which coastal region this SWISLR Research is done? I want your answer to be concise in this format: {Country: Country Name, State: State Name, City: City Name}. Write (None) if you can't find this information"

# Loop over the IDs and find locations
print(f'\nGetting locations from {len(my_set[3])} abstracts...')
with ThreadPoolExecutor() as executor:
    for n, scopus_id in tqdm(enumerate(my_set[3])):
        output_dict = ask_question(scopus_id_abstract_retriever(scopus_id), question)
        list_of_lists.append(output_dict)

Scrapping Data Pages from Scopus using salinization AND flooding...


100%|██████████| 15/15 [00:07<00:00,  1.90it/s]


Number of articles: 353

Getting article titles...


100%|██████████| 353/353 [00:00<00:00, 145928.38it/s]



Getting locations from 353 abstracts...


353it [04:07,  1.43it/s]


In [59]:
scopus_id_abstract_retriever('SCOPUS_ID:0017628037')

KeyError: 'dc:description'

In [38]:


print(f"\nMaking Dataframe...")
        
# Extract first and second elements
first_elements = [inner_list[0] if len(inner_list) > 0 else None for inner_list in list_of_lists]
second_elements = [inner_list[1] if len(inner_list) > 1 else None for inner_list in list_of_lists]
third_elements = [inner_list[2] if len(inner_list) > 2 else None for inner_list in list_of_lists]
fourth_elements = [inner_list[3] if len(inner_list) > 3 else None for inner_list in list_of_lists]

# Make DataFrame
df = pd.DataFrame({
    "Paper Title": my_set[0],
    "Scopus ID" : my_set[3],
    "DOI": my_set[1],
    "URL": my_set[5],
    "Lead Author": my_set[6],
    "Affiliation": affiliation,
    "Author count": author_count,
    "Area of Study": area,
    "Publication": my_set[7],
    "Cover Date": my_set[8],
    "Number of citations": my_set[9],
    "first_location" : first_elements,
    "second_location" : second_elements,
    "third_location" : third_elements,
    "fourth_location": fourth_elements})

# Saving file

affiliation_series = df['Affiliation']

modified_series = affiliation_series.apply(lambda x: x[0]['affilname'] if isinstance(x, list) and len(x) > 0 else x)

df['Affiliation'] = modified_series

df.to_csv("output.csv")

print("DONE!")

### Another way to get locations

In [None]:
import locationtagger

# initializing sample text
sample_text = "India has very rich and vivid culture\
        widely spread from Kerala to Nagaland to Haryana to Maharashtra. " \
        "Delhi being capital with Mumbai financial capital.\
        Can be said better than some western cities such as " \
        " Munich, London etc. Pakistan and Bangladesh share its borders"

# extracting entities.
place_entity = locationtagger.find_locations(text = full_text)

# getting all countries
print("The countries in text : ")
print(place_entity.countries)

# getting all states
print("The states in text : ")
print(place_entity.regions)

# getting all cities
print("The cities in text : ")
print(place_entity.cities)