In [None]:
# Mount Google Drive inside the Colab VM at /content/drive. Later cells read the
# dataset and the fine-tuned checkpoint from paths below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Make the project folder on Drive the working directory so that the relative
# CSV paths used by later cells (pd.read_csv / to_csv) resolve inside it.
%cd /content/drive/MyDrive/news_tagging_model
# One-time setup, kept disabled: creates the project folder.
# !mkdir news_tagging_model

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
# Disabled one-off preprocessing, kept for reference only. It builds balanced.csv
# from estadao_2010.csv by keeping every row whose category is 'politica' plus an
# equally sized random sample (random_state=10) of all other rows.
# NOTE(review): the df_health / df_not_health names are misleading -- the filter
# is on category == 'politica'. If this is re-enabled, the pandas import below
# must be uncommented too, and DataFrame.append may need replacing with
# pd.concat on newer pandas releases -- TODO confirm against the installed version.
# # import pandas as pd
# df = pd.read_csv("estadao_2010.csv")
# df.category.value_counts()
# df_health = df[df['category'] == 'politica']
# df_health
# df_not_health = df[df['category'] != 'politica']
# df_not_health = df_not_health.sample(n=len(df_health),random_state=10)
# df_balanced = df_not_health.append(df_health)
# df_balanced.to_csv("balanced.csv")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset
# from omegaconf import DictConfig, OmegaConf
# import hydra
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
# import wandb
import os
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [None]:
def compute_metrics(outs):
    """Score classifier outputs against the reference labels.

    Args:
        outs: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is reduced with ``argmax`` over its last axis;
            ``labels`` holds the reference class ids.

    Returns:
        dict: ``accuracy``, ``f1``, ``recall`` and ``precision`` as plain
        floats scaled by 100. Everything except accuracy is macro-averaged.
    """
    scores, references = outs
    predicted = np.argmax(scores, axis=-1)

    def as_percent(value):
        # Scale first, then convert, so the result is a built-in float.
        return float(value * 100)

    macro_scorers = {
        "f1": f1_score,
        "recall": recall_score,
        "precision": precision_score,
    }

    report = {"accuracy": as_percent(accuracy_score(references, predicted))}
    for metric_name, scorer in macro_scorers.items():
        report[metric_name] = as_percent(scorer(references, predicted, average="macro"))
    return report



In [None]:
def encode_labels(labels):
  """Convert raw label values into binary class ids.

  Only the exact string 'TRUE' is encoded as 1; every other value (including
  'FALSE' and any unexpected value) is encoded as 0.

  Args:
    labels: Any iterable of raw label values.

  Returns:
    list: A new list of 0/1 ints, one per input value. The input is left
    untouched -- the earlier implementation overwrote the caller's list in
    place and therefore also failed on immutable sequences such as tuples.
  """
  return [1 if label == 'TRUE' else 0 for label in labels]

# Smoke check: none of these values is 'TRUE', so every entry encodes to 0.
encode_labels(["x","health","y","x","z"])

In [None]:
def load_data(path):
    """Read a labelled CSV and return its titles and binary labels.

    Args:
        path: Location of a CSV file that has a ``title`` column and a
            ``Politics_Label`` column.

    Returns:
        tuple: ``(titles, labels)`` -- the ``title`` column as a list, and the
        ``Politics_Label`` column after it has been passed through
        ``encode_labels``.
    """
    df = pd.read_csv(path)
    titles = df['title'].tolist()
    labels = encode_labels(df['Politics_Label'].tolist())
    # Debug output showing the largest encoded label. `default=None` keeps a
    # CSV without data rows from raising ValueError on an empty list.
    print("max(labels)")

    print(max(labels, default=None))
    return titles, labels

In [None]:
class MultiDialectDataset(Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name to an indexable sequence with one
            entry per example (anything exposing ``.items()`` / ``.values()``).
        labels: Optional indexable sequence with one label per example. Pass
            ``None`` (the default) for unlabelled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus 'labels' if any)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Identity test on purpose: `!= None` is evaluated element-wise by
        # array-like label containers and cannot then be used as a condition.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, with or without labels."""
        if self.labels is not None:
            return len(self.labels)
        # Unlabelled data: take the size from the first encoding field instead
        # of calling len(None); an empty mapping counts as zero examples.
        first_field = next(iter(self.encodings.values()), ())
        return len(first_field)

In [None]:
# Load the labelled export CSV. The path is relative, so it resolves against
# the working directory selected by the earlier %cd cell.
df = pd.read_csv("Brazilian politics - estadao-export-2022-08-09.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Show how often each raw Politics_Label value occurs, before any filtering.
df.Politics_Label.value_counts()

In [None]:
# Keep only the rows whose label is exactly the string 'FALSE' or 'TRUE'.
# NOTE(review): this assumes the column was read as strings; if pandas parsed
# it as booleans the filter would match nothing -- TODO confirm with the
# value_counts output above.
df =  df[df.Politics_Label.isin(['FALSE','TRUE'])]
# Bare expression: display the filtered frame as the cell output.
df

In [None]:
# Build the evaluation inputs and targets from the SAME row selection, so that
# val_tweets[i] and val_labels[i] always describe the same article. The earlier
# loop skipped rows with any other label value while val_tweets still kept
# them, which silently misaligned the two lists whenever the filtering cell
# had not been run first.
LABEL_TO_ID = {'FALSE': 0, 'TRUE': 1}
labelled_rows = df[df.Politics_Label.isin(list(LABEL_TO_ID))]
val_tweets = labelled_rows.title.tolist()
val_labels = [LABEL_TO_ID[raw_label] for raw_label in labelled_rows.Politics_Label.tolist()]

In [None]:
# Fine-tuned checkpoint directory on Drive; the model weights and the
# tokenizer files are both read from this one location.
checkpoint_dir = "/content/drive/MyDrive/news_tagging_model/finetune_out/best_ckpt"

print("Loading Model...")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir, num_labels = 2)
print("Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)


In [None]:
# Inference-only run: the calls that loaded the CSV through load_data and split
# it into train/validation sets are kept below, disabled, for reference.
# train_all_tweets, train_all_labels = load_data("/content/drive/MyDrive/news_tagging_model/Brazilian politics - estadao-export-2022-08-09.csv")
# test_tweets, test_labels = load_data("/content/drive/MyDrive/news_tagging_model/Brazilian politics - estadao-export-2022-08-09.csv")

# #split the train_all to train and validation
# train_tweets, val_tweets, train_labels, val_labels = train_test_split(
#     train_all_tweets,
#     train_all_labels,
#     test_size=.99,
#     random_state= 5)

# Tokenise the evaluation titles. The train-set tokenisation and the explicit
# max_length cap stay disabled.
print("Tokenizeing the inputs...")
# train_encodings = tokenizer(train_tweets,
#                             truncation=True,
#                             padding=True,
#                             #max_length=model.config.max_position_embeddings
#                             )
tokenizer_options = {
    "truncation": True,
    "padding": True,
    # "max_length": model.config.max_position_embeddings,
}
val_encodings = tokenizer(val_tweets, **tokenizer_options)

In [None]:
# Pair the tokenised titles with their 0/1 labels in a dataset that can be
# handed to trainer.predict.
val_ds = MultiDialectDataset(val_encodings, val_labels)


In [None]:
# The Trainer is used here only to run batched prediction; no training
# arguments, datasets or metric callbacks are passed to it.
trainer = Trainer(model=model)
# Guarded so that a CPU-only runtime no longer fails on an unconditional
# .cuda() call.
# NOTE(review): Trainer normally places the model on its own device already, so
# this explicit move is likely redundant -- confirm before removing it.
if torch.cuda.is_available():
    trainer.model = model.cuda()

In [None]:
# Run the model over every item of val_ds. The `.predictions` attribute of the
# result is read by the following cell, one row of two scores per title.
val_pred = trainer.predict(val_ds)


In [None]:
# Split each prediction row into its two class scores, then pick the class.
# Class 0 wins only when its score is strictly greater; a tie goes to class 1.
pred_0 = [scores[0] for scores in val_pred.predictions]
pred_1 = [scores[1] for scores in val_pred.predictions]
ispolitics_pred = [
    0 if score_0 > score_1 else 1
    for score_0, score_1 in zip(pred_0, pred_1)
]

In [None]:
# One row per title: the reference label, the predicted label, and the two raw
# class scores that the prediction was derived from.
result_columns = {
    'title': val_tweets,
    'ispolitics_truth': val_labels,
    'ispolitics_pred': ispolitics_pred,
    'pred_0': pred_0,
    'pred_1': pred_1,
}
df_results = pd.DataFrame(result_columns)

In [None]:
# Persist the per-title predictions next to the input file (relative path, so
# it lands in the current working directory). to_csv is called without
# index=False, so the frame's index is written as an extra leading column.
df_results.to_csv('Brazilian politics - estadao-export-2022-08-09 - prediction_output.csv')

In [None]:
# NOTE(review): this import sits mid-notebook; the main import cell already
# pulls other names from sklearn.metrics and would be the natural home for it.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Reference labels are passed first, predicted labels second.
cm = confusion_matrix(val_labels, ispolitics_pred)

# Render the 2-class matrix; the bare disp.plot() call is the cell's output.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()



In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1 of the predictions against the reference labels, shown as the cell output.
# NOTE(review): f1_score is called with its default averaging here, whereas
# compute_metrics passes average="macro" and scales by 100, so the two numbers
# are not directly comparable -- confirm which one is intended for reporting.
f1_score(val_labels, ispolitics_pred)

