In [None]:
#needed to import the latest version that has the timeout parameter
%%capture
!pip install google-generativeai==0.7.2
!pip install pypdf
from pypdf import PdfReader
from typing import Dict
import google.generativeai as genai
import pandas as pd
import os
from typing import Callable, List, Any

In [None]:
# Display the installed SDK version to confirm that the pinned release (0.7.2) is the one in use.
genai.__version__

'0.7.2'

In [None]:
# Open the SDG guidelines PDF; `reader` is reused further down to extract one guideline page per SDG.
reader = PdfReader('SDG_guide_lines.pdf')

# Print the number of pages as a quick sanity check on the loaded file.
print(len(reader.pages))

# Page at 0-based index 2 (the third page); presumably the index page, judging by the name - TODO confirm.
# NOTE(review): `index_page` is not used by any other cell visible in this notebook.
index_page = reader.pages[2]

# Uncomment to inspect the text extracted from that page.
# print(index_page.extract_text())

20


In [None]:
from google.colab import userdata
# Read the API key from Colab's user data under the name 'GEMINI_API' instead of hardcoding it.
API_KEY = userdata.get('GEMINI_API')
genai.configure(api_key=API_KEY)
# Model handle used by the generate_content calls in gemini_call_loop.
model = genai.GenerativeModel('gemini-1.5-flash-001')

In [None]:
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Set the block threshold of the four listed harm categories to BLOCK_NONE.
# This mapping is passed as `safety_settings` to every generate_content call in gemini_call_loop.
safety_settings={
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    }

## Guidelines definition

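Extraction and definition of additional meta information from the guidelines to ease the prompting process. Each entry contains the official description of the SDG and the guideline text extracted from the corresponding page of the guidelines PDF.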

In [None]:
# Official one-sentence description of each Sustainable Development Goal,
# keyed by goal number (1 to 17) in the order the goals are listed.
sdg_descriptions = dict(enumerate((
    'End poverty in all its forms everywhere',
    'End hunger, achieve food security and improved nutrition and promote sustainable agriculture',
    'Ensure healthy lives and promote well-being for all at all ages',
    'Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all',
    'Achieve gender equality and empower all women and girls',
    'Ensure availability and sustainable management of water and sanitation for all',
    'Ensure access to affordable, reliable, sustainable and modern energy for all',
    'Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all',
    'Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation',
    'Reduce inequality within and among countries',
    'Make cities and human settlements inclusive, safe, resilient and sustainable',
    'Ensure sustainable consumption and production patterns',
    'Take urgent action to combat climate change and its impacts',
    'Conserve and sustainably use the oceans, seas and marine resources for sustainable development',
    'Protect, restore and promote sustainable use of terrestrial ecosystems, sustainably manage forests, combat desertification, and halt and reverse land degradation and halt biodiversity loss',
    'Promote peaceful and inclusive societies for sustainable development, provide access to justice for all and build effective, accountable and inclusive institutions at all levels',
    'Strengthen the means of implementation and revitalize the Global Partnership for Sustainable Development',
), start=1))

In [None]:
def _clean_guideline_text(raw_text: str) -> str:
  """Tidy text extracted from a guideline page.

  Applies, in order: ' , ' -> ', ', ' . ' -> '. ', double space -> single space,
  removal of the word 'Index', and finally strips leading/trailing whitespace.
  """
  cleaned = raw_text
  for old, new in ((' , ', ', '), (' . ', '. '), ('  ', ' '), ('Index', '')):
    cleaned = cleaned.replace(old, new)
  return cleaned.strip()


# One entry per goal, keyed 'sdg 1' .. 'sdg 17'. The guideline text of goal n is read
# from the PDF page at 0-based index n + 2.
sdg_guidelines = {
    'sdg ' + str(goal): {
        'description': sdg_descriptions[goal],
        'guidelines': _clean_guideline_text(reader.pages[goal + 2].extract_text()),
    }
    for goal in range(1, 18)
}

## Data load

In [None]:
# The dataset is expected in ../dataset relative to the current working directory.
# NOTE(review): the file name ends in .csv but it is read with pd.read_excel and a sheet name;
# confirm the file really is an Excel workbook (a plain CSV would need pd.read_csv instead).
path = os.path.join(os.pardir, 'dataset', 'alma_sdg.csv')
df = pd.read_excel(path, sheet_name= 'ArticlesToTag')

## Gemini looped calls


In [None]:
# Module-level accumulator: one (initially empty) list of answers per column 'SDG 1' .. 'SDG 17'.
# NOTE(review): gemini_call_loop creates its own local `results`, so this global is not filled by it.
results = {'SDG '+ str(i):[] for i in range(1,18)}
# Name of the output workbook. '---' looks like a placeholder - TODO set a real file name before running.
file_name = '---'

In [None]:
# Sampling parameters for the Gemini calls; passed to gemini_call_loop as its `config` argument.
generation_config = dict(
    temperature=0.0,
    top_p=1.0,
    top_k=32,
)

### Functions


In [None]:
def process_gemini_answer(answers:pd.DataFrame)-> pd.DataFrame:
  label_mapping = {'contributes': True,
                  'does not contribute': False}
  ## Save all the explanations, ease readability.

  processed_answers = answers.map(lambda x: label_mapping[x.split('\n')[0].replace('*','').replace('OUTPUT:', '').strip().lower()],
                                  na_action= 'ignore')

  return processed_answers

def save_results(results: pd.DataFrame, processed_results:pd.DataFrame, original_data:pd.DataFrame,path:str, start_index:int = 0)-> None:
  """Write the raw and the processed Gemini answers to the Excel workbook at `path`.

  Unless `results` already has a 'title' column, both frames are re-indexed in
  place to start at `start_index` and joined (on the index) to the matching
  rows of `original_data`, so every answer sits next to the article it refers to.
  The raw answers go to sheet 'GeminiAnswers', the processed ones to
  'ProcessedOutput'. If the workbook already exists it is opened in append mode
  with `if_sheet_exists='new'`; otherwise it is created.

  Args:
    results: raw answers, one row per article and one column per SDG.
    processed_results: processed counterpart of `results`.
    original_data: the article dataset; a 'comments' column, if present, is left out.
      Assumed to have a default 0..n-1 index so that the positional slice lines up
      with the re-indexed answers - TODO confirm.
    path: destination workbook.
    start_index: position in `original_data` of the first article in `results`.
  """
  # errors='ignore' keeps the frame as is when there is no 'comments' column.
  data_to_add = original_data.drop(columns='comments', errors='ignore')
  size = results.shape[0]

  if 'title' not in results.columns:
    # NOTE: this re-indexing mutates the caller's frames (kept for backward compatibility).
    results.index = list(range(start_index, start_index + size))
    # Use the processed frame's own length: it can differ from `results` when the
    # caller passes a fallback frame, and a mismatched index assignment raises.
    processed_results.index = list(range(start_index, start_index + processed_results.shape[0]))
    article_rows = data_to_add.iloc[start_index:start_index + size, :]
    results_with_text = article_rows.join(results)
    processed_with_text = article_rows.join(processed_results)
  else:
    # The frames already carry the article columns: write them unchanged
    # (previously this path failed because the two names below were never bound).
    results_with_text = results
    processed_with_text = processed_results

  # Decide on the destination given as argument, not on a module-level variable.
  if os.path.isfile(path):
    # Append mode needs the openpyxl engine; xlsxwriter rejects mode='a'.
    writer = pd.ExcelWriter(path, mode='a', engine='openpyxl', if_sheet_exists='new')
  else:
    writer = pd.ExcelWriter(path, mode='w')

  with writer:
    results_with_text.to_excel(writer, sheet_name='GeminiAnswers')
    processed_with_text.to_excel(writer, sheet_name='ProcessedOutput')

  return None

##### Prompt Functions

In [None]:
def generate_prompt(sdg:str, sdg_guidelines:Dict[str,Dict[str,str]], info:List[str]) -> str:
  """Build the classification prompt for one article and one SDG.

  Args:
    sdg: key of the goal in `sdg_guidelines`.
    sdg_guidelines: per-goal mapping with a 'description' and a 'guidelines' text.
    info: article fields; positions 0, 1 and 2 are rendered as TITLE, ABSTRACT
      and KEYWORDS respectively.

  Returns:
    The prompt text, ending with an 'OUTPUT:' line for the model to complete.
  """
  goal_entry = sdg_guidelines[sdg]
  goal_description = goal_entry['description']
  goal_guidelines = goal_entry['guidelines']
  title, abstract, keywords = info[0], info[1], info[2]

  return f"""You are an expert in sustainable development. You must determine if a given scientific article contributes to {sdg}: {goal_description}

You must base your decision on the article's TITLE, ABSTRACT, and KEYWORDS, and on the following GUIDELINES:

GUIDELINES:
The purpose of the study is key to discriminate the themes the paper
contributes to (CONTRIBUTES) from those the paper does not contribute to but mentions
them as part of the general background context (DOES NOT CONTRIBUTE).

Possible impacts or implications relevant to an SDG should be considered only if they
are stated explicitly in the document.

{goal_guidelines}

Your output should be either "CONTRIBUTES" or "DOES NOT CONTRIBUTE".
You should also explain your output based on the GUIDELINES.

TITLE:\t{title}
ABSTRACT: {abstract}
KEYWORDS: {keywords}
OUTPUT:
"""



# Registry of prompt builders keyed by strategy name. Each builder takes four positional
# arguments (the SDG key followed by the three article fields) and uses the module-level `sdg_guidelines`.
prompt_functions = {'guidelines': lambda x,y,z,w: generate_prompt(x, sdg_guidelines,[y,z,w]),}

## Gemini Calls

In [None]:
# NOTE(review): this cell uses `tokenizer`, `terminators`, `sdg`, `title`, `abstract` and `keywords`,
# none of which is defined in the visible part of this notebook, and it calls `model.generate` /
# `model.device`, whereas `model` is created above as a genai.GenerativeModel. It looks like a
# leftover from a different (chat-template based) workflow - confirm whether it should be removed,
# since it will fail when the notebook is run top to bottom.

# Chat-style messages: a system instruction followed by the guideline prompt as the user turn.
messages = [
      {"role": "system", "content": "You are an helpful and honest Sustainable Development Goals (SDGs) expert."},
      {"role": "user", "content": prompt_functions['guidelines'](sdg, title, abstract, keywords)},
  ]

# Render the messages with the tokenizer's chat template and move the tensor to the model's device.
input_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt"
  ).to(model.device)

outputs = model.generate(
    input_ids,
    max_new_tokens= 512,
    eos_token_id=terminators,
    do_sample= True,
    temperature= 0.01,
    top_p= 1.0,
)
# Keep only what comes after the prompt positions in the first returned sequence.
response = outputs[0][input_ids.shape[-1]:]

In [None]:
def _persist_gemini_results(results: Dict[str, List[str]], original_data: pd.DataFrame,
                            file_name: str, start_index: int) -> pd.DataFrame:
  """Save the answers collected so far to `file_name` and trigger the Colab download.

  Builds the raw-answer frame (one column per SDG), derives the processed frame and
  hands both to save_results. If an answer cannot be mapped to a label (KeyError),
  a placeholder frame with the same number of rows is saved instead, so the raw
  answers are never lost because of a parsing problem.

  Returns:
    The raw-answer frame.
  """
  frame_results = pd.DataFrame.from_dict(results, orient='index').transpose()
  try:
    processed_frame = process_gemini_answer(frame_results)
  except KeyError:
    print("Answer Processing Error...")
    # Same row count as the raw frame: save_results re-indexes both frames together.
    processed_frame = pd.DataFrame({'Nothing': ['Yes'] * frame_results.shape[0]})
  save_results(frame_results, processed_frame, original_data, file_name, start_index=start_index)
  files.download("/content/"+ file_name)
  return frame_results


def gemini_call_loop(generate_prompt_function: Callable, data_to_analyze:pd.DataFrame,
                     sdg_guidelines:Dict[str,Dict[str,str]], relevant_columns:List[str],
                     config: Dict[str,Any],
                     file_name:str ='./default.xlsx', type_descriptor:str ='Unknown',
                     ) -> pd.DataFrame:
  """Ask Gemini, for every article and each of the 17 SDGs, whether the article contributes to the goal.

  If `file_name` already exists, the number of rows in its 'ProcessedOutput' sheet is
  taken as the number of articles already processed and the loop resumes after them.
  When a call fails, the answers gathered so far are saved and downloaded before the
  exception is re-raised.

  Relies on module-level names: `model`, `safety_settings`, `process_gemini_answer`,
  `save_results`, `tqdm`, `time`, `core_exceptions` and `files`.

  Args:
    generate_prompt_function: called as f(sdg_key, sdg_guidelines, information) to build the prompt.
    data_to_analyze: one article per row.
    sdg_guidelines: per-goal metadata keyed 'sdg 1' .. 'sdg 17'.
    relevant_columns: columns of `data_to_analyze` whose values are passed to the prompt builder, in order.
    config: generation settings forwarded to every generate_content call.
    file_name: workbook used both for resuming and for saving.
    type_descriptor: label used in progress output.

  Returns:
    Frame of raw answers with one column per SDG ('SDG 1' .. 'SDG 17').

  Raises:
    Exception: whatever the Gemini call (or reading its text) raised, after partial results were saved.
  """
  sdg_numbers = range(1, 18)
  results = {'SDG ' + str(i): [] for i in sdg_numbers}
  generate_prompt = generate_prompt_function
  df = data_to_analyze

  # Start from scratch unless a previous run left a workbook to resume from.
  already_queried = 0
  if os.path.isfile(file_name):
    already_queried = pd.read_excel(file_name, sheet_name= 'ProcessedOutput').shape[0]
  print(f'Starting from {type_descriptor} number: {already_queried}\n')

  request_options = {'timeout': 30}

  for index in tqdm(range(df[already_queried:].shape[0]), desc= type_descriptor, position= 0, leave = True):
    adapted_index = index + already_queried
    article = df.iloc[adapted_index, :]
    information = [article[column] for column in relevant_columns]

    for sdg in tqdm(sdg_numbers, leave= False, desc='SDG'):
      # The guidelines are keyed in lower case ('sdg n'), the result columns in upper case ('SDG n').
      guideline_key = 'sdg ' + str(sdg)
      result_key = 'SDG ' + str(sdg)
      prompt_model = generate_prompt(guideline_key, sdg_guidelines, information)

      try:
        try:
          response = model.generate_content(prompt_model, generation_config= config,
                                            request_options=request_options, stream=False,
                                            safety_settings=safety_settings)
          time.sleep(2.5)  # pause between calls
        except core_exceptions.TooManyRequests: # Too many calls in one minute, stopping for a little while
          print('Too many reqeusts in current time window, pausing for 30 sec.')
          time.sleep(30)
          # Retry once with the same generation settings as the first attempt.
          response = model.generate_content(prompt_model, generation_config= config,
                                            request_options=request_options, stream=False,
                                            safety_settings=safety_settings)
        # Read the text inside the guard so a failure here also triggers the save below.
        answer_text = response.text

      except Exception: # Exhausted free API calls or any other issue
        print('\nSomething went wrong, saving results...')
        # NOTE(review): if the failure happens part-way through an article, that article is saved
        # with missing answers and still counts as processed on the next resume - confirm this is acceptable.
        _persist_gemini_results(results, df, file_name, already_queried)
        raise

      results[result_key].append(answer_text)

  return _persist_gemini_results(results, df, file_name, already_queried)

In [None]:
#Load dataset and repeat this cycle for each sample, gathering the results
import time
from google.api_core import exceptions as core_exceptions
from google.colab import files
from tqdm.auto import tqdm

# Columns whose values are handed, in this order, to the prompt builder for each article.
relevant_dataset_columns = ['title', 'abstract', 'keywords']

# Query Gemini for every article/SDG pair. The loop saves its answers to `file_name` itself
# and returns the frame of raw answers, which is displayed as this cell's output.
gemini_call_loop(generate_prompt_function= generate_prompt,
                 data_to_analyze= df,
                 sdg_guidelines=sdg_guidelines,
                 relevant_columns= relevant_dataset_columns,
                 config = generation_config,
                 file_name=file_name,
                 type_descriptor= 'Research Article',)


In [None]:
# Manual save of the module-level `results` dictionary.
# NOTE(review): gemini_call_loop collects answers in its own local dictionary and saves them itself,
# so the module-level `results` is still empty here unless it was filled elsewhere - this cell looks
# like a leftover; confirm before running it.
frame_results = pd.DataFrame.from_dict(results, orient='index').transpose()
processed_frame = process_gemini_answer(frame_results)
save_results(frame_results, processed_frame, df, file_name)
files.download("/content/"+ file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>