## In this notebook we will create a searcher to find queries in the index

In [1]:
from whoosh.index import open_dir
from sentence_transformers import SentenceTransformer
from whoosh.qparser import QueryParser
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification,AutoModelForSequenceClassification
import torch
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F
import nltk
nltk.download("punkt")
from keybert import KeyBERT
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from transformers import TrainingArguments, AdamW
import torch.nn as nn

kw_model = KeyBERT()

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/stsb-roberta-large')
model = AutoModel.from_pretrained('sentence-transformers/stsb-roberta-large')

def sustract_keywords(query):
    resul=[]
    kw = kw_model.extract_keywords(query,keyphrase_ngram_range=(1, 1), stop_words=None)
    for k,v in kw:
        resul.append(k)
        
    return resul

def build_query(lista):
    new_l=[]
    for elem in lista:
        
        new_l.append(f'evidences:{elem} OR keywords:{elem}')
        
    return " OR ".join(new_l)

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pepe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### New query comes: "Dr Edgar Chagwa Lungu will try to run for presidency in 2026"

We can sustract the keywords of this claim using the Voicelab Keywords Extraction model. 


-> We get the keywords.


-> We build our query


-> We search for documents given these keywords that we got in our collection

In [2]:
query="Show me a proud #Nigerian after today's catastrophic selection dubbed election. An unimaginable ruinous charade exhibited by a country deservedly nicknamed a zoo. How I wish Chinua Achebe is alive today he would have penned down a befitting eulogy for this falling giant"

In [76]:
k=sustract_keywords(query)
build_query(k)

'evidences:nigerian OR keywords:nigerian OR evidences:achebe OR keywords:achebe OR evidences:chinua OR keywords:chinua OR evidences:giant OR keywords:giant OR evidences:zoo OR keywords:zoo'

In [77]:
def search(q):
    keyw=sustract_keywords(q)
    
    print(f'-> Keywords of the query: {keyw}')
    
    dir1='index'
    ix=open_dir(dir1)
    
    personal_query=build_query(keyw)
    full_query=f'query --> {personal_query}'


    print(full_query)

    with ix.searcher() as searcher:
        parser=QueryParser("evidences",ix.schema)
        query=parser.parse(personal_query)

        results=searcher.search(query,limit=None)
        ev=[]
        for r in results:
            print(f'-> {r["evidences"]} ({r["keywords"]})')
            ev.append(r["evidences"])

        print("=================================")
        print(f'{len(results)} results')
        
    return ev

In [78]:
docs=search(query)

-> Keywords of the query: ['nigerian', 'achebe', 'chinua', 'giant', 'zoo']
query --> evidences:nigerian OR keywords:nigerian OR evidences:achebe OR keywords:achebe OR evidences:chinua OR keywords:chinua OR evidences:giant OR keywords:giant OR evidences:zoo OR keywords:zoo
-> The iVerify fact-checking process has determined as unproven the claim that the UPND chairperson has been extorting money from businessmen seeking favours or protection. The article claims that Nigerian prophet Seer 1 allegedly exposed the scandal, yet contrary to what was claimed, Seer 1 referred to “some top UPND officials”, without mentioning names. He did threaten to expose names if the act persists. The iVerify team made efforts to reach Blogger Musamba Mumba, but unfortunately without success. The iVerify team further called the Drug Enforcement Commission to substantiate if indeed such acts of money extortion had been received. In response, the Drug Enforcement Commission Public Relation Officer Mr. Kamanga 

### Once we have got our documents related to the query given, we are going to perform an embedding process, to represent the texts of query and each doc in numeric terms, as vectors. Then we will compute the cosine similarity between each document and the query.

In [79]:
frases=[]
for elem in docs:
    l=nltk.tokenize.sent_tokenize(elem)
    for e in l:
        e=e.replace('\n',' ')
        frases.append(e)

In [80]:
frases

['The iVerify fact-checking process has determined as unproven the claim that the UPND chairperson has been extorting money from businessmen seeking favours or protection.',
 'The article claims that Nigerian prophet Seer 1 allegedly exposed the scandal, yet contrary to what was claimed, Seer 1 referred to “some top UPND officials”, without mentioning names.',
 'He did threaten to expose names if the act persists.',
 'The iVerify team made efforts to reach Blogger Musamba Mumba, but unfortunately without success.',
 'The iVerify team further called the Drug Enforcement Commission to substantiate if indeed such acts of money extortion had been received.',
 'In response, the Drug Enforcement Commission Public Relation Officer Mr. Kamanga stated that the Commission had not received an official report concerning the case.',
 'The iVerify fact checking team attended a press briefing organized by the UPND spokesperson Mr. Cornelius Mwitwa and attended by the UPND Chairperson Mr. Steven Katuk

In [65]:
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, max pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings=F.normalize(sentence_embeddings,p=2,dim=1)
print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[ 0.0045,  0.0428, -0.0010,  ..., -0.0067, -0.0298,  0.0363],
        [-0.0176,  0.0403,  0.0102,  ..., -0.0321,  0.0069, -0.0017],
        [-0.0133,  0.0265,  0.0428,  ..., -0.0357, -0.0132,  0.0733],
        ...,
        [ 0.0188,  0.0236, -0.0157,  ..., -0.0322, -0.0321,  0.0146],
        [ 0.0095,  0.0153,  0.0067,  ...,  0.0148, -0.0226,  0.0814],
        [ 0.0499,  0.0127, -0.0162,  ...,  0.0067, -0.0216,  0.0420]])


In [66]:
best,sim='',0
for i in range(1,len(sentence_embeddings)):
    cos=cosine_similarity(sentence_embeddings[0].reshape(1,-1),sentence_embeddings[i].reshape(1,-1))
    if cos > sim:
        best=sentences[i]
        sim=cos
    print(f'Cosine similarity between query and document {i}: {cos}')
    
print(best,sim)

Cosine similarity between query and document 1: [[0.12796442]]
Cosine similarity between query and document 2: [[0.20836748]]
Cosine similarity between query and document 3: [[-0.00150992]]
Cosine similarity between query and document 4: [[0.13368455]]
Cosine similarity between query and document 5: [[0.02912207]]
Cosine similarity between query and document 6: [[0.10472247]]
Cosine similarity between query and document 7: [[0.0408664]]
Cosine similarity between query and document 8: [[0.0509023]]
Cosine similarity between query and document 9: [[0.12690613]]
The article claims that Nigerian prophet Seer 1 allegedly exposed the scandal, yet contrary to what was claimed, Seer 1 referred to “some top UPND officials”, without mentioning names. [[0.20836748]]


### Trying to compute the SBERT for classify the claim with the evidences

In [3]:
modelSBERT = SentenceTransformer('mitra-mir/setfit-model-Feb11-Misinformation-on-Media-Traditional-Social')

In [57]:
data=pd.read_csv('iverifyzm.csv')
data=data.drop(['Unnamed: 0'], axis=1)
data

Unnamed: 0,claims,evidences,label,date
0,"On March 2, 2023, Koswe Facebook page publish...",The iVerify Zambia has determined as false the...,False,"Mar, 09 2023"
1,"On 22 February 2023, a Facebook page called Za...",iVerify has determined as true reports that Cy...,True,"Feb, 23 2023"
2,"On February 21, 2023, a Facebook page called K...",iVerify Zambia has determined as misleading th...,Misleading,"Feb, 23 2023"
3,"The Patriotic Front (PF) Facebook page, on Jan...",iVerify Zambia has determined as false the cla...,False,"Feb, 22 2023"
4,"On Wednesday, February 17, 2023, Grindstone Te...",iVerify Zambia has determined as misleading th...,Misleading,"Feb, 18 2023"
...,...,...,...,...
206,"On 13th October, 2021, The Candidates uploaded...",The Fact Checking process has determined as tr...,True,"Oct, 20 2021"
207,"On 28th July 2021, News Diggers Newspaper publ...",The fact checking process has determined as mi...,Misleading,"Aug, 04 2021"
208,The Patriotic Front (PF) Facebook Page publish...,The Fact Checking Process has determined as fa...,False,"Nov, 12 2021"
209,"On 12th July, 2021, the Smart Eagles Facebook ...",The fact checking process has determined the c...,Partly False,"Jul, 20 2021"


In [58]:
for i in range(len(data)):
    if data['label'][i]=='True' :
        data['label'][i]=2
    elif data['label'][i]=='Partly False' or data['label'][i]=='False' or data['label'][i]=='Misleading':
        data['label'][i]=0
    elif data['label'][i]=='Unproven':
        data['label'][i]=1

In [6]:
data['label'].unique()

array([0, 2, 1], dtype=object)

In [59]:
d=pd.read_csv('africa_check_nigeria.csv')
d = d.rename(columns={'claim':'claims'})
for i in range(len(d)):
    if d['label'][i]=='checked' or d['label'][i]=='Checked':
        d['label'][i]=2
    elif d['label'][i]=='false' or d['label'][i]=='False' or d['label'][i]=='Fake' or d['label'][i]=='Misleading':
        d['label'][i]=0
    
    

In [8]:
d['label'].unique()

array([0, 2, 1], dtype=object)

In [45]:
type(data)

pandas.core.frame.DataFrame

In [60]:

data=pd.concat([data,d])
data

Unnamed: 0,claims,evidences,label,date
0,"On March 2, 2023, Koswe Facebook page publish...",The iVerify Zambia has determined as false the...,0,"Mar, 09 2023"
1,"On 22 February 2023, a Facebook page called Za...",iVerify has determined as true reports that Cy...,2,"Feb, 23 2023"
2,"On February 21, 2023, a Facebook page called K...",iVerify Zambia has determined as misleading th...,0,"Feb, 23 2023"
3,"The Patriotic Front (PF) Facebook page, on Jan...",iVerify Zambia has determined as false the cla...,0,"Feb, 22 2023"
4,"On Wednesday, February 17, 2023, Grindstone Te...",iVerify Zambia has determined as misleading th...,0,"Feb, 18 2023"
...,...,...,...,...
61,A screenshot shared on social media in Nigeria...,A screenshot showing the presidential campaign...,0,19/10/2022
62,A number of Facebook pages claim to offer inve...,Several Facebook pages have been created claim...,0,28/09/2022
63,An online rumour is that Tokunbo Awolowo-Dosum...,Nigeria is set to choose a new president in mo...,0,19/09/2022
64,As reports emerged that presidential hopeful P...,"“Any Nigerian who lives abroad, funding the ca...",0,19/09/2022


In [61]:
cont = 1
embedding_train = []
evidence_sentences = []
for ev in data['evidences']:
    print(f"Analizando {cont}")
    cont += 1
    for evs in ev.split('.'):
        if len(evs) > 10:
            evidence_sentences.append(evs.strip())
            embedding_train.append(modelSBERT.encode(evs.strip()))

Analizando 1
Analizando 2
Analizando 3
Analizando 4
Analizando 5
Analizando 6
Analizando 7
Analizando 8
Analizando 9
Analizando 10
Analizando 11
Analizando 12
Analizando 13
Analizando 14
Analizando 15
Analizando 16
Analizando 17
Analizando 18
Analizando 19
Analizando 20
Analizando 21
Analizando 22
Analizando 23
Analizando 24
Analizando 25
Analizando 26
Analizando 27
Analizando 28
Analizando 29
Analizando 30
Analizando 31
Analizando 32
Analizando 33
Analizando 34
Analizando 35
Analizando 36
Analizando 37
Analizando 38
Analizando 39
Analizando 40
Analizando 41
Analizando 42
Analizando 43
Analizando 44
Analizando 45
Analizando 46
Analizando 47
Analizando 48
Analizando 49
Analizando 50
Analizando 51
Analizando 52
Analizando 53
Analizando 54
Analizando 55
Analizando 56
Analizando 57
Analizando 58
Analizando 59
Analizando 60
Analizando 61
Analizando 62
Analizando 63
Analizando 64
Analizando 65
Analizando 66
Analizando 67
Analizando 68
Analizando 69
Analizando 70
Analizando 71
Analizando 72
A

In [62]:
cont = 1
top5 = []
for claim in data['claims']:
    print(f"Analizando {cont}")
    cont += 1
    emb_claim = modelSBERT.encode(claim.strip())
    indx = np.flip(np.argsort(cosine_similarity(emb_claim.reshape(1, -1), embedding_train))[0][-5:])
    claim5 = []
    for i in indx:
        claim5.append(evidence_sentences[i])
    top5.append(claim5)
    
newevidences = []
for t in top5:
    newevidences.append('. '.join(t))
newevidences

Analizando 1
Analizando 2
Analizando 3
Analizando 4
Analizando 5
Analizando 6
Analizando 7
Analizando 8
Analizando 9
Analizando 10
Analizando 11
Analizando 12
Analizando 13
Analizando 14
Analizando 15
Analizando 16
Analizando 17
Analizando 18
Analizando 19
Analizando 20
Analizando 21
Analizando 22
Analizando 23
Analizando 24
Analizando 25
Analizando 26
Analizando 27
Analizando 28
Analizando 29
Analizando 30
Analizando 31
Analizando 32
Analizando 33
Analizando 34
Analizando 35
Analizando 36
Analizando 37
Analizando 38
Analizando 39
Analizando 40
Analizando 41
Analizando 42
Analizando 43
Analizando 44
Analizando 45
Analizando 46
Analizando 47
Analizando 48
Analizando 49
Analizando 50
Analizando 51
Analizando 52
Analizando 53
Analizando 54
Analizando 55
Analizando 56
Analizando 57
Analizando 58
Analizando 59
Analizando 60
Analizando 61
Analizando 62
Analizando 63
Analizando 64
Analizando 65
Analizando 66
Analizando 67
Analizando 68
Analizando 69
Analizando 70
Analizando 71
Analizando 72
A

["The iVerify Zambia has determined as false the claim by the Koswe Facebook page that Zambia’s former Republican President, Dr Edgar Chagwa Lungu had  declared himself ready for the 2026 General Elections and that he was determined to remove the current Republican President, Mr. “PDP G5 Governors led by Governor Nyesom Wike have today endorsed Bola Ahmed Tinubu for President, saying Peter Obi's South East support is not enough for him to be President in 2023,” begins a message circulating on Facebook since November 2022. Following remarks from the members of the public on social media platforms criticizing the President’s recent meeting with the Pope at the Vatican, the Chief Government Spokesperson Chushi Kasanda in a statement, also clarified that President Hakainde Hichilema did not visit the Vatican as an Adventist but as a Head of State whose citizens belong to various religious persuasions including Catholics and is a president for all Zambians, who belong to different denominat

In [11]:
query

"Show me a proud #Nigerian after today's catastrophic selection dubbed election. An unimaginable ruinous charade exhibited by a country deservedly nicknamed a zoo. How I wish Chinua Achebe is alive today he would have penned down a befitting eulogy for this falling giant"

In [83]:
newevidences

["The iVerify Zambia has determined as false the claim by the Koswe Facebook page that Zambia’s former Republican President, Dr Edgar Chagwa Lungu had  declared himself ready for the 2026 General Elections and that he was determined to remove the current Republican President, Mr. “PDP G5 Governors led by Governor Nyesom Wike have today endorsed Bola Ahmed Tinubu for President, saying Peter Obi's South East support is not enough for him to be President in 2023,” begins a message circulating on Facebook since November 2022. Following remarks from the members of the public on social media platforms criticizing the President’s recent meeting with the Pope at the Vatican, the Chief Government Spokesperson Chushi Kasanda in a statement, also clarified that President Hakainde Hichilema did not visit the Vatican as an Adventist but as a Head of State whose citizens belong to various religious persuasions including Catholics and is a president for all Zambians, who belong to different denominat

In [63]:
train = pd.DataFrame(zip(data['claims'], newevidences, data['label']), columns = ['claim', 'top5evidences', 'label'])
train

Unnamed: 0,claim,top5evidences,label
0,"On March 2, 2023, Koswe Facebook page publish...",The iVerify Zambia has determined as false the...,0
1,"On 22 February 2023, a Facebook page called Za...",iVerify has determined as true reports that Cy...,2
2,"On February 21, 2023, a Facebook page called K...",According to the statement dated October 19 an...,0
3,"The Patriotic Front (PF) Facebook page, on Jan...",iVerify Zambia has determined as false the cla...,0
4,"On Wednesday, February 17, 2023, Grindstone Te...",But did the former military governor write thi...,0
...,...,...,...
272,A screenshot shared on social media in Nigeria...,An image circulating on social media appears t...,0
273,A number of Facebook pages claim to offer inve...,In a statement posted on their social media pa...,0
274,An online rumour is that Tokunbo Awolowo-Dosum...,"“Tokunbo Awolowo-Dosunmu, the daughter of Nige...",0
275,As reports emerged that presidential hopeful P...,A video also fact checked by iVerify team post...,0


In [25]:
train['claim'][0]

'On March 2,  2023, Koswe Facebook page published an article claiming that Zambia’s Former Republican President, Dr Edgar Changwa Lungu had declared himself ready for the 2026 General Elections and was determined to remove the current Republican President, Mr. Hakainde Hichilema from office. The post was accompanied by a video clip of the Former President, addressing a public gathering where he allegedly declared his readiness for the elections.'

In [26]:
train['top5evidences'][0]

"The iVerify Zambia has determined as false the claim by the Koswe Facebook page that Zambia’s former Republican President, Dr Edgar Chagwa Lungu had  declared himself ready for the 2026 General Elections and that he was determined to remove the current Republican President, Mr. “PDP G5 Governors led by Governor Nyesom Wike have today endorsed Bola Ahmed Tinubu for President, saying Peter Obi's South East support is not enough for him to be President in 2023,” begins a message circulating on Facebook since November 2022. Following remarks from the members of the public on social media platforms criticizing the President’s recent meeting with the Pope at the Vatican, the Chief Government Spokesperson Chushi Kasanda in a statement, also clarified that President Hakainde Hichilema did not visit the Vatican as an Adventist but as a Head of State whose citizens belong to various religious persuasions including Catholics and is a president for all Zambians, who belong to different denominati

In [78]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokenized_dataset = []
for i,row in train.iterrows():
    tokenized_dataset.append(tokenizer(row["claim"], row['top5evidences'],
                             pad_to_max_length = True,  max_length = 256,return_attention_mask=True,add_special_tokens = True
                            ,return_tensors = 'pt'))
    
model = AutoModelForSequenceClassification.from_pretrained("vikram71198/distilroberta-base-finetuned-fake-news-detection", num_labels=3,
                                                          ignore_mismatched_sizes=True)


X_train, X_val,  y_train, y_val = train_test_split(tokenized_dataset,train['label'], test_size = 0.2)

train_input_ids = torch.cat([d['input_ids'] for d in X_train],dim=0)
train_attention_masks = torch.cat([d['attention_mask'] for d in X_train],dim=0)
train_labels = torch.tensor(y_train.map({0: 0,  1: 1,  2: 2}).values)
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
#train_dataset = TensorDataset(train_input_ids,  train_labels)

val_input_ids = torch.cat([d['input_ids'] for d in X_val],dim=0)
val_attention_masks = torch.cat([d['attention_mask'] for d in X_val],dim=0)
val_labels = torch.tensor(y_val.map({0: 0,  1: 1,  2: 2}).values)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
#val_dataset = TensorDataset(val_input_ids, val_labels)

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Crear un DataLoader que divide el conjunto de datos de val en lotes y les aplica aleatorización y paralelización en la CPU o GPU
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will alwa

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [61]:
"""for param in model.roberta.parameters():
    param.requires_grad = False"""

In [79]:
optim = AdamW(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()
# Número de épocas que deseas entrenar
num_epochs = 10

# Entrenamiento
for epoch in range(num_epochs):
    # Entrenamiento del modelo
    model.train()
    for batch_idx, (inputs, masks, target) in enumerate(train_dataloader):
        # Reinicia el gradiente acumulado en el optimizador
        optim.zero_grad()

        # Pasa los datos por el modelo para obtener las predicciones
        
        output = model(input_ids=inputs, attention_mask=masks, labels=target)
        #print(output)
        # Calcula la pérdida
        loss = loss_fn(output.logits, target)

        # Retropropagación
        loss.backward()

        # Actualiza los parámetros del modelo
        optim.step()

    # Validación del modelo
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for (inputs, masks, target) in val_dataloader:
            # Pasa los datos por el modelo para obtener las predicciones
            output = model(input_ids=inputs, attention_mask=masks, labels=target)

            # Predice la clase con la mayor probabilidad
            _, predicted = torch.max(output.logits, 1)

            # Calcula el número de predicciones correctas y el número total de ejemplos
            total += target.size(0)
            correct += (predicted == target).sum().item()

        # Calcula la precisión en la validación
        val_acc = correct / total

    # Imprime la precisión en la validación
    print('Epoch: {}, Val Acc: {}'.format(epoch, val_acc))

Epoch: 0, Val Acc: 0.75
Epoch: 1, Val Acc: 0.75
Epoch: 2, Val Acc: 0.75


KeyboardInterrupt: 

### Trying SVM with the embeddings of the top5evidences

In [64]:
train

Unnamed: 0,claim,top5evidences,label
0,"On March 2, 2023, Koswe Facebook page publish...",The iVerify Zambia has determined as false the...,0
1,"On 22 February 2023, a Facebook page called Za...",iVerify has determined as true reports that Cy...,2
2,"On February 21, 2023, a Facebook page called K...",According to the statement dated October 19 an...,0
3,"The Patriotic Front (PF) Facebook page, on Jan...",iVerify Zambia has determined as false the cla...,0
4,"On Wednesday, February 17, 2023, Grindstone Te...",But did the former military governor write thi...,0
...,...,...,...
272,A screenshot shared on social media in Nigeria...,An image circulating on social media appears t...,0
273,A number of Facebook pages claim to offer inve...,In a statement posted on their social media pa...,0
274,An online rumour is that Tokunbo Awolowo-Dosum...,"“Tokunbo Awolowo-Dosunmu, the daughter of Nige...",0
275,As reports emerged that presidential hopeful P...,A video also fact checked by iVerify team post...,0


In [65]:
train['evclaim']=train['claim']+train['top5evidences']

In [66]:
ev=train['evclaim']
emb=[]
cont=0
for e in ev:
    cont+=1
    print(f'Analizando {cont}')
    
    emb.append(modelSBERT.encode(e.strip()))
        
        
    

Analizando 1
Analizando 2
Analizando 3
Analizando 4
Analizando 5
Analizando 6
Analizando 7
Analizando 8
Analizando 9
Analizando 10
Analizando 11
Analizando 12
Analizando 13
Analizando 14
Analizando 15
Analizando 16
Analizando 17
Analizando 18
Analizando 19
Analizando 20
Analizando 21
Analizando 22
Analizando 23
Analizando 24
Analizando 25
Analizando 26
Analizando 27
Analizando 28
Analizando 29
Analizando 30
Analizando 31
Analizando 32
Analizando 33
Analizando 34
Analizando 35
Analizando 36
Analizando 37
Analizando 38
Analizando 39
Analizando 40
Analizando 41
Analizando 42
Analizando 43
Analizando 44
Analizando 45
Analizando 46
Analizando 47
Analizando 48
Analizando 49
Analizando 50
Analizando 51
Analizando 52
Analizando 53
Analizando 54
Analizando 55
Analizando 56
Analizando 57
Analizando 58
Analizando 59
Analizando 60
Analizando 61
Analizando 62
Analizando 63
Analizando 64
Analizando 65
Analizando 66
Analizando 67
Analizando 68
Analizando 69
Analizando 70
Analizando 71
Analizando 72
A

In [67]:
len(emb[0])

768

In [68]:
from sklearn.decomposition import PCA
import time

pca_model=PCA(n_components=50)
pca_model.fit(emb)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9866741754961157


In [69]:
emb_comps = pca_model.transform(emb)
emb_comps.shape

(277, 50)

In [70]:
emb_comps=list(emb_comps)
len(emb_comps[0])

50

In [71]:
train_ev_label=pd.DataFrame()
train_ev_label['ev']=emb_comps
train_ev_label['label']=train['label']

In [72]:
train_ev_label

Unnamed: 0,ev,label
0,"[0.14130706588783418, -0.16928630477653253, 0....",0
1,"[0.4381842942546462, -0.029237316533280298, 0....",2
2,"[-0.1707170538386425, -0.24186457064568703, 0....",0
3,"[0.532045374289575, -0.05688646205040172, -0.0...",0
4,"[-0.6651761804732204, 0.20337295783948103, -0....",0
...,...,...
272,"[-0.5418617176292417, -0.006220766348764834, 0...",0
273,"[0.19073723711258644, 0.023844795074883143, 0....",0
274,"[0.1374581970275183, -0.22419119931879056, 0.0...",0
275,"[0.26516229689717996, -0.20844564974367508, -0...",0


In [74]:
from sklearn.svm import SVC
start = time.time() 

svm_classifier = SVC()
svm_classifier.fit(x_train,y_train)

end = time.time()
process = round(end-start,2)
print("Support Vector Machine Classifier has fitted, this process took {} seconds".format(process))

Support Vector Machine Classifier has fitted, this process took 0.01 seconds


In [75]:
svm_classifier.score(x_test,y_test)

0.7857142857142857

In [76]:
# More algorithms!!!!
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB

rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
print("Score of RFC",rfc.score(x_test,y_test))

logreg = LogisticRegression()
logreg.fit(x_train,y_train)
print("Score of LogReg",logreg.score(x_test,y_test))

gnb = GaussianNB()
gnb.fit(x_train,y_train)
print("Score of GaussianNB",gnb.score(x_test,y_test))

bnb = BernoulliNB()
bnb.fit(x_train,y_train)
print("Score of BernoulliNB",bnb.score(x_test,y_test))

Score of RFC 0.8214285714285714
Score of LogReg 0.7857142857142857
Score of GaussianNB 0.6428571428571429
Score of BernoulliNB 0.75


In [51]:
pred=rfc.predict(x_test)

len(pred)

56

In [77]:
from sklearn.metrics import confusion_matrix

confusion_matrix(pred,y_test)

array([[37,  1,  8],
       [ 7,  1,  1],
       [ 0,  0,  1]], dtype=int64)

In [81]:
cont=0
for i in range(len(train)):
    if train['label'][i]==0:
        cont+=1
        
cont

213