# Indução dos modelos

In [1]:
import logging

import numpy as np
# Configure the root logger at WARNING level so lower-severity records are suppressed.
logging.basicConfig(level=logging.WARNING)

In [2]:
import getpass
import os

In [3]:
# Make only GPU 0 visible to CUDA-aware code started from this kernel.
# NOTE(review): no visible cell uses CUDA directly — presumably this targets a locally
# hosted model backend; confirm it is still needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
# Never store the API key in the notebook: read it from the environment and fall back
# to an interactive prompt (input is not echoed and is not saved in the cell output).
openai_key = os.environ.get('OPENAI_API_KEY') or getpass.getpass('OpenAI API key: ')

In [5]:
# Label columns of the CNPq hierarchy, ordered from level 1 to level 4.
cnpq = [f'new_cnpq_level_{level}a' for level in range(1, 5)]

## Lendo a base traduzida

In [6]:
import pandas as pd

In [7]:
# Load every column as text; na_filter=False keeps empty cells as '' instead of NaN.
df = pd.read_csv('unidades_26082024-translated.csv', dtype=str, na_filter=False)

In [8]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0.1,Unnamed: 0,fiocruz_id,Título,Resumo,Palavras-chaves do autor,Veículo de publicação,new_cnpq_level_1a,new_cnpq_level_2a,new_cnpq_level_3a,new_cnpq_level_4a,all
0,0,766be0fb94fb303594145c62dbe438b276d64344,Phylogenetic diversity of aerobic spore-formin...,The phylum Firmicutes comprises seven classes ...,"Firmicutes,Bacillus and related genera,Sporula...",INTERNATIONAL MICROBIOLOGY,Ciências Biológicas,Genética,Genética Molecular e de Microorganismos,,Phylogenetic diversity of aerobic spore-formin...
1,1,45676e2647955cc6f6c451fdda86309e2a2e91d3,"Isolation, Characterization and Antifungal Act...",Capsicum species belong to the Solanaceae fami...,"Trypsin inhibitor,Antimicrobial peptides,Capsi...",PROTEIN JOURNAL,Ciências Biológicas,Bioquímica,,,"Isolation, Characterization and Antifungal Act..."
2,4,d8638c08b1385dd7363f55dc2548bae5b0dcba5f,Immune response and pathogenesis of neuroschis...,The involvement of the central nervous system ...,"Schistosomiasis,Neuroschistosomiasis,Schistoso...",Acta tropica,Ciências Biológicas,Imunologia,,,Immune response and pathogenesis of neuroschis...
3,5,3426f42ecddc619fd8d0f551800df9ef5cc3b711,"Eosinophil activation status, cytokines and li...",We have been investigating whether human eosin...,"Flow cytometry,Eosinophils,Activation status,C...",Acta tropica,Ciências Biológicas,Imunologia,,,"Eosinophil activation status, cytokines and li..."
4,6,2076749904137385851c93837e028685d290e515,"Cytokines, chemokine receptors, CD4(+)CD25(HIG...",Previous studies have demonstrated that distin...,"Schistosomiasis,Cytokines,Chemokine receptors,...",Acta tropica,Ciências Biológicas,Imunologia,Imunologia Celular,,"Cytokines, chemokine receptors, CD4(+)CD25(HIG..."


In [9]:
# (rows, columns) of the loaded table.
df.shape

(8155, 11)

## Carregando o GPTClassifier

In [None]:
import random, openai, json
from sklearn.base import BaseEstimator
from sklearn.utils.multiclass import unique_labels

class GPTClassifier(BaseEstimator):
    """Zero-shot text classifier that asks an OpenAI chat model to pick a label.

    The estimator does not learn anything: ``fit`` only records the candidate
    labels, and ``predict`` sends each text to the chat-completions API with a
    forced function call whose argument is restricted to those labels.

    Parameters
    ----------
    model : str
        Name of the OpenAI chat model passed to ``chat.completions.create``.
    key : str
        OpenAI API key. It is installed globally on the ``openai`` module when
        the estimator is constructed.
    """

    def __init__(self, model, key):
        self.model = model
        self.key = key
        # Module-level side effect: every client call made through `openai` uses this key.
        openai.api_key = self.key

    def fit(self, X, y):
        """Record the candidate labels found in ``y``; ``X`` is ignored.

        ``classes_`` keeps the labels exactly as received. ``labels_`` keeps only
        the text after the last ``::HiClass::Separator::`` of each class, and is
        what the model is shown (presumably the separator is added by hiclass to
        encode the parent path — confirm).

        Returns
        -------
        self
        """
        self.classes_ = unique_labels(y)
        self.labels_ = [item.split('::HiClass::Separator::')[-1] for item in self.classes_.tolist()]
        return self

    def predict(self, X):
        """Classify each text in ``X`` with one API call per text.

        When the response cannot be parsed into one of ``labels_`` a label is
        drawn at random (unseeded) and a warning is logged, so fallbacks are
        visible instead of silently contaminating the results. Errors raised by
        the API call itself are not caught.

        Returns
        -------
        numpy.ndarray
            One entry of ``classes_`` per input text.
        """
        log = logging.getLogger(__name__)

        # The schema and the forced tool choice are the same for every text.
        tools = self.classify_content(self.labels_)
        tool_choice = {'type': 'function', 'function': {'name': 'classify_content'}}

        predictions = []
        for text in X:
            prompt = f'Classify the article content into one correct research area:\n {text}'
            completion = openai.chat.completions.create(
                model = self.model,
                temperature = 0,
                messages = [{'role': 'user', 'content': prompt}],
                tools = tools,
                tool_choice = tool_choice
            )

            try:
                arguments = completion.choices[0].message.tool_calls[0].function.arguments
                pred = json.loads(arguments)['prediction'][0]
                idx = self.labels_.index(pred)
            except Exception as exc:
                # Missing tool call, malformed JSON, empty list or unknown label.
                pred = random.choice(self.labels_)
                idx = self.labels_.index(pred)
                log.warning('GPTClassifier: unusable model response (%s: %s); '
                            'falling back to random label %r',
                            type(exc).__name__, exc, pred)

            predictions.append(self.classes_[idx])

        return np.array(predictions)

    def classify_content(self, labels):
        """Build the ``tools`` payload describing the ``classify_content`` function.

        The function takes a single required argument ``prediction``: an array of
        strings, each constrained to one of ``labels``.
        """
        return [{
                'type': 'function',
                'function': {
                    'name': 'classify_content',
                    'description': 'Predict the research area for a given article content',
                    'parameters': {
                        'type': 'object',
                        'properties': {
                            'prediction': {
                                'type': 'array',
                                'items': {
                                    'type': 'string',
                                    'enum': labels
                                },
                                'description': 'The predicted research areas.'
                            }
                        },
                        'required': [
                            'prediction'
                        ]
                    }
                }
        }]

## Carregando o OllamaClassifier

In [None]:
import json, random
from sklearn.base import BaseEstimator
from sklearn.utils.multiclass import unique_labels
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

class OllamaClassifier(BaseEstimator):
    """Zero-shot text classifier that asks a model served by Ollama to pick a label.

    ``fit`` only records the candidate labels and builds the prompt chain;
    ``predict`` invokes that chain once per text and parses the JSON answer.

    Parameters
    ----------
    model : str
        Ollama model tag. It is wrapped in a ``ChatOllama`` configured with
        ``format='json'`` and ``temperature=0``.

    NOTE(review): ``self.model`` holds the ``ChatOllama`` object rather than the
    constructor argument, so rebuilding the estimator from ``get_params()`` would
    pass that object back as ``model`` — confirm nothing relies on that.
    """

    def __init__(self, model):
        self.model = ChatOllama(model=model, format='json', temperature=0)

    def fit(self, X, y):
        """Record the candidate labels found in ``y`` and build the chain; ``X`` is ignored.

        ``classes_`` keeps the labels exactly as received. ``labels_`` keeps only
        the text after the last ``::HiClass::Separator::`` of each class, and is
        what the model is shown.

        Returns
        -------
        self
        """
        self.classes_ = unique_labels(y)
        self.labels_ = [item.split('::HiClass::Separator::')[-1] for item in self.classes_.tolist()]
        # The chain captures the current `self.model`; later rebinding of
        # `self.model` does not change what this chain invokes.
        self.template = ChatPromptTemplate.from_template("""
                Based on the article content:\n\n
                {text}\n\n
                Classify the content into one correct research area:
                {labels}
                Return a JSON object like ['Research Area': '']."""
            ) | self.model
        return self

    def predict(self, X):
        """Classify each text in ``X`` with one chain invocation per text.

        When the answer cannot be parsed into one of ``labels_`` a label is drawn
        at random (unseeded) and a warning is logged, so fallbacks are visible
        instead of silently contaminating the results. Errors raised by the
        invocation itself are not caught.

        Returns
        -------
        numpy.ndarray
            One entry of ``classes_`` per input text.
        """
        log = logging.getLogger(__name__)

        # Same label list for every text. The previous per-text call to
        # `classify_content` was dropped: it only rebound `self.model`, which the
        # chain built in `fit` never sees.
        label_text = '; '.join(self.labels_)

        predictions = []
        for text in X:
            response = self.template.invoke({'text': text, 'labels': label_text})

            try:
                pred = json.loads(response.content)['Research Area']
                idx = self.labels_.index(pred)
            except Exception as exc:
                # Malformed JSON, missing key or a label outside the candidate list.
                pred = random.choice(self.labels_)
                idx = self.labels_.index(pred)
                log.warning('OllamaClassifier: unusable model response (%s: %s); '
                            'falling back to random label %r',
                            type(exc).__name__, exc, pred)

            predictions.append(self.classes_[idx])

        return np.array(predictions)

    def classify_content(self, labels):
        """Rebind ``self.model`` with a ``classify_content`` tool restricted to ``labels``.

        This replaces ``self.model`` with the bound runnable. A chain already
        built by ``fit`` is not affected; only chains created afterwards would
        use the binding. ``predict`` does not call this method.
        """
        self.model = self.model.bind(
            tools = [{
                'name': 'classify_content',
                'description': 'Predict the research area for a given article content',
                'parameters': {
                    'type': 'object',
                    'properties': {
                        'prediction': {
                            'type': 'array',
                            'description': 'The predicted research areas.',
                            'items': {
                                'type': 'string',
                                'enum': labels
                            },
                        }
                    },
                    'required': ['prediction']
                }
            }],
            function_call={'name': 'classify_content'}
        )

## Definindo função para salvar as predições

In [None]:
import pickle, gzip

In [None]:
def save(config, prediction):
    """Pickle ``prediction`` into a gzip-compressed file under ``prediction/``.

    Parameters
    ----------
    config : str
        Run identifier; becomes part of the file name
        (``prediction/unidades_26082024-<config>.pickle``).
    prediction : object
        Any picklable object, written with ``pickle.dump``.

    The output directory is created when missing so that a long inference run
    is not lost to a ``FileNotFoundError`` at the very last step. Note that the
    file is gzip-compressed even though its extension is ``.pickle``.
    """
    path = 'prediction/unidades_26082024-' + config + '.pickle'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with gzip.open(path, 'wb') as handle:
        pickle.dump(prediction, handle)

## Executando os Modelos de Linguagem

In [None]:
import tqdm
import numpy as np
from itertools import product
from sklearn.pipeline import Pipeline
from hiclass import LocalClassifierPerParentNode

In [None]:
# Short strategy name -> hierarchical classifier class to instantiate.
strategies = dict(LCPPN=LocalClassifierPerParentNode)

In [None]:
# Display name -> classifier instance, built in the same order as before:
# the two OpenAI models first, then the two Ollama models.
classifiers = {
    **{name: GPTClassifier(model_id, openai_key)
       for name, model_id in (('GPT4o', 'gpt-4o'),
                              ('GPT3.5', 'gpt-3.5-turbo-0125'))},
    **{name: OllamaClassifier(tag)
       for name, tag in (('Llama3.0', 'llama3:70b'),
                         ('Llama3.1', 'llama3.1:70b'))},
}

In [None]:
# The inputs are the same for every configuration, so extract them once:
# the concatenated text column and the four hierarchy-level label columns.
X_train, y_train = df['all'].to_numpy(), df[cnpq].to_numpy()

# Materialise the combinations so the progress bar knows its total.
configurations = list(product(strategies, classifiers))

for stg, cls in tqdm.tqdm(configurations):

    # Single-step pipeline; the step name doubles as the run identifier.
    steps = [(stg + ' ' + cls, strategies[stg](classifiers[cls], bert=True))]
    pipeline = Pipeline(steps)

    # Predictions are made on the same texts passed to fit.
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_train)
    print(prediction)
    config = ' '.join(list(pipeline.named_steps.keys()))
    save(config, prediction)