### Importamos los paquetes que necesitamos. 
### 1- Función `sustract_keywords`: extrae las keywords de la claim.
### 2- Función `build_query`: monta la query en el formato de Whoosh para buscar en la colección.
### 3- Función `search`: dada una claim, el directorio del índice y una k, devuelve los documentos relevantes para la claim. Si hay más de k documentos, devuelve los k primeros; si no, los devuelve todos.

In [5]:
from whoosh.index import open_dir
from sentence_transformers import SentenceTransformer
from whoosh.qparser import QueryParser
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification,AutoModelForSequenceClassification
import torch
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F
import nltk
nltk.download("punkt")
from keybert import KeyBERT
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from transformers import TrainingArguments, AdamW
import torch.nn as nn

# Keyword extractor built with KeyBERT's default settings; sustract_keywords()
# uses it to pull the keywords out of a claim.
kw_model = KeyBERT()

# Sentence encoder used by the later cells to embed evidence sentences and
# claims before ranking them by cosine similarity.
modelSBERT = SentenceTransformer('mitra-mir/setfit-model-Feb11-Misinformation-on-Media-Traditional-Social')

# Hugging Face tokenizer/model pair loaded from the same checkpoint name.
# NOTE(review): neither is referenced in the visible part of this notebook;
# presumably they are meant to be used together with mean_pooling() — confirm.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/stsb-roberta-large')
model = AutoModel.from_pretrained('sentence-transformers/stsb-roberta-large')

def sustract_keywords(query):
    """Return the keywords that KeyBERT extracts from ``query``.

    Only single-word keyphrases are requested and no stop-word list is
    applied. The score that comes with each keyword is discarded.

    Args:
        query: Text of the claim.

    Returns:
        list: The extracted keywords, in the order KeyBERT returned them.
    """
    scored_keywords = kw_model.extract_keywords(
        query,
        keyphrase_ngram_range=(1, 1),
        stop_words=None,
    )
    return [keyword for keyword, _score in scored_keywords]

def build_query(lista):
    """Build the query string that looks each keyword up in both index fields.

    Every keyword contributes ``evidences:<kw> OR keywords:<kw>`` and all the
    per-keyword clauses are chained with ``OR``.

    Args:
        lista: Iterable of keywords.

    Returns:
        str: The combined query string; empty when ``lista`` is empty.
    """
    clauses = (f'evidences:{term} OR keywords:{term}' for term in lista)
    return " OR ".join(clauses)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked tokens.

    Args:
        model_output: Model output whose first element holds the token
            embeddings (assumed to be laid out as batch x tokens x hidden,
            since the reduction runs over dimension 1).
        attention_mask: Mask with one entry per token; tokens whose mask is 0
            do not contribute to the average.

    Returns:
        torch.Tensor: One pooled embedding per sequence.
    """
    embeddings = model_output[0]
    # Broadcast the mask over the hidden dimension so it can weight every
    # component of each token embedding.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = (embeddings * mask).sum(1)
    # Clamp the token count so a fully masked sequence does not divide by zero.
    token_count = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / token_count


def search(q,dirindex,k):
    """Return up to ``k`` evidences from a Whoosh index that match a claim.

    The keywords of ``q`` are extracted with ``sustract_keywords``, combined
    into a query string with ``build_query`` and run against the index found
    in ``dirindex``. The keywords, the query, every hit and the total number
    of hits are printed along the way.

    Args:
        q: Text of the claim.
        dirindex: Directory that holds the Whoosh index. Its documents are
            read through their ``evidences`` and ``keywords`` fields.
        k: Maximum number of evidences to return.

    Returns:
        list: The ``evidences`` value of the matching documents, in the order
        the searcher returned them, truncated to the first ``k``.
    """
    keyw = sustract_keywords(q)
    print(f'-> Keywords of the query: {keyw}')

    ix = open_dir(dirindex)

    personal_query = build_query(keyw)
    print(f'query --> {personal_query}')

    with ix.searcher() as searcher:
        parser = QueryParser("evidences", ix.schema)
        query = parser.parse(personal_query)

        # limit=None: fetch every hit so that all of them can be listed and
        # counted, even though only the first k are returned.
        results = searcher.search(query, limit=None)
        ev = []
        for r in results:
            print(f'-> {r["evidences"]} ({r["keywords"]})')
            ev.append(r["evidences"])

        print("=================================")
        print(f'{len(results)} results')

    # Trim the list collected above instead of querying `results` again: the
    # searcher backing it is closed once the `with` block exits. Slicing
    # already returns everything when there are k hits or fewer.
    return ev[:k]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pepe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Example run: retrieve at most 3 evidences for a sample claim from the index
# stored in the 'index' directory.
query="Show me a proud #Nigerian after today's catastrophic selection dubbed election. An unimaginable ruinous charade exhibited by a country deservedly nicknamed a zoo. How I wish Chinua Achebe is alive today he would have penned down a befitting eulogy for this falling giant"
docs=search(query,'index',3)
# Number of evidences actually returned (shown as the cell output).
len(docs)

-> Keywords of the query: ['nigerian', 'achebe', 'chinua', 'giant', 'zoo']
query --> evidences:nigerian OR keywords:nigerian OR evidences:achebe OR keywords:achebe OR evidences:chinua OR keywords:chinua OR evidences:giant OR keywords:giant OR evidences:zoo OR keywords:zoo
-> The iVerify fact-checking process has determined as unproven the claim that the UPND chairperson has been extorting money from businessmen seeking favours or protection. The article claims that Nigerian prophet Seer 1 allegedly exposed the scandal, yet contrary to what was claimed, Seer 1 referred to “some top UPND officials”, without mentioning names. He did threaten to expose names if the act persists. The iVerify team made efforts to reach Blogger Musamba Mumba, but unfortunately without success. The iVerify team further called the Drug Enforcement Commission to substantiate if indeed such acts of money extortion had been received. In response, the Drug Enforcement Commission Public Relation Officer Mr. Kamanga 

1

### Juntamos los dos datasets que tenemos hasta el momento

In [None]:
# Numeric label scheme shared by both datasets:
#   2 = true / checked, 1 = unproven, 0 = false, fake or misleading.
# Any label that is not listed below is left exactly as it was read.
iverify_label_codes = {
    'True': 2,
    'Partly False': 0,
    'False': 0,
    'Misleading': 0,
    'Unproven': 1,
}
africa_check_label_codes = {
    'checked': 2,
    'Checked': 2,
    'false': 0,
    'False': 0,
    'Fake': 0,
    'Misleading': 0,
}

data=pd.read_csv('iverifyzm.csv')
data=data.drop(['Unnamed: 0'], axis=1)
# Recode the whole column in one assignment. Writing element by element with
# `data['label'][i] = ...` is chained assignment: it raises
# SettingWithCopyWarning and does not update the frame when pandas
# Copy-on-Write is active.
data['label'] = data['label'].map(lambda label: iverify_label_codes.get(label, label))

d=pd.read_csv('africa_check_nigeria.csv')
# Align the claim column name with the first dataset before concatenating.
d = d.rename(columns={'claim':'claims'})
d['label'] = d['label'].map(lambda label: africa_check_label_codes.get(label, label))

# NOTE(review): concat keeps each frame's own index, so index labels repeat in
# the result; pass ignore_index=True if unique row labels are needed later.
data=pd.concat([data,d])

### Extraemos los embeddings de las evidencias

In [None]:
# Split every evidence text into sentences on '.', keep the fragments longer
# than 10 characters and encode each one with the sentence model.
# evidence_sentences[i] is the text whose embedding is embedding_train[i].
embedding_train = []
evidence_sentences = []
for cont, ev in enumerate(data['evidences'], start=1):
    print(f"Analizando {cont}")
    for fragment in ev.split('.'):
        # The length filter looks at the raw fragment, before stripping.
        if len(fragment) <= 10:
            continue
        sentence = fragment.strip()
        evidence_sentences.append(sentence)
        embedding_train.append(modelSBERT.encode(sentence))

### Extraemos los embeddings de las claims y creamos una nueva lista con las top 5 evidences para cada claim

In [None]:
# For every claim, rank all evidence sentences by cosine similarity to the
# claim embedding and keep the five most similar ones, best match first.
top5 = []
for cont, claim in enumerate(data['claims'], start=1):
    print(f"Analizando {cont}")
    emb_claim = modelSBERT.encode(claim.strip())
    similarities = cosine_similarity(emb_claim.reshape(1, -1), embedding_train)
    # argsort is ascending, so the best matches are the last five positions;
    # reverse them to get descending similarity.
    ascending = np.argsort(similarities)[0]
    indx = ascending[-5:][::-1]
    top5.append([evidence_sentences[i] for i in indx])

# Join the selected sentences of each claim into a single text.
newevidences = ['. '.join(sentences) for sentences in top5]
newevidences

### Creamos el nuevo dataframe

In [None]:
# One row per claim: the claim text, its joined top-5 evidences and its label.
train_columns = ['claim', 'top5evidences', 'label']
train_rows = zip(data['claims'], newevidences, data['label'])
train = pd.DataFrame(train_rows, columns = train_columns)
train