In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# files referenced by the later cells are reachable through the filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Make the project folder on the mounted Drive the working directory, so the
# relative file names used below ("estadao_2010.csv", "balanced.csv") resolve
# inside it.
%cd /content/drive/MyDrive/news_tagging_model
# Disabled by the author; presumably a one-off to create the project folder.
# !mkdir news_tagging_model

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install -U sentence-transformers

In [None]:
import pandas as pd

# Build a class-balanced dataset for the binary task "politica" vs. every
# other category, and store it as balanced.csv in the working directory.
df = pd.read_csv("estadao_2010.csv")

# Show the class distribution. (As a bare expression in the middle of the
# cell the result was silently discarded, so it is printed explicitly.)
print(df["category"].value_counts())

# NOTE: despite the "health" in their names, these frames hold the rows whose
# category is / is not 'politica'. The names are kept unchanged because they
# are module-level variables that other cells may rely on.
df_health = df[df['category'] == 'politica']
df_not_health = df[df['category'] != 'politica']

# Down-sample the negative class to the size of the positive class. The size
# is capped at the number of available negatives so that `sample` does not
# raise when there are fewer negatives than positives. The fixed random_state
# keeps the selection reproducible.
df_not_health = df_not_health.sample(
    n=min(len(df_health), len(df_not_health)),
    random_state=10,
)

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` yields the same
# frame (negatives first, then positives, original index labels preserved).
df_balanced = pd.concat([df_not_health, df_health])
df_balanced.to_csv("balanced.csv")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [None]:
def compute_metrics(outs):
    """
    Compute accuracy and macro-averaged F1 / recall / precision, in percent.

    Parameters
    ----------
    outs : pair
        ``(predictions, labels)``. ``predictions`` holds one score per class
        along its last axis; the predicted class is the arg-max over that
        axis. ``labels`` holds the reference class ids.
        (Presumably this is the object a ``transformers.Trainer`` hands to
        its ``compute_metrics`` callback -- confirm against the caller.)

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"f1"``, ``"recall"`` and ``"precision"``, each
        mapped to a plain ``float`` in the range 0-100.
    """
    class_scores, gold = outs
    predicted = np.argmax(class_scores, axis=-1)

    # Accuracy has no averaging mode; the other three are macro-averaged.
    results = {"accuracy": float(accuracy_score(gold, predicted) * 100)}

    macro_scorers = (
        ("f1", f1_score),
        ("recall", recall_score),
        ("precision", precision_score),
    )
    for metric_name, scorer in macro_scorers:
        results[metric_name] = float(scorer(gold, predicted, average="macro") * 100)

    return results

def encode_labels(labels, positive_label='politica'):
  """
  Convert category names into binary integer labels.

  Parameters
  ----------
  labels : iterable
      Category values, one per example.
  positive_label : optional
      The category that is encoded as 1. Defaults to 'politica', which is the
      value this function always used.

  Returns
  -------
  list of int
      A new list holding 1 where the category equals ``positive_label`` and 0
      everywhere else. The input is not modified (earlier versions overwrote
      the caller's list in place and therefore failed on immutable sequences).
  """
  return [1 if label == positive_label else 0 for label in labels]

def load_data(path):
    """
    Read a CSV file and return its titles and binary labels.

    Parameters
    ----------
    path : str or path-like
        Location of a CSV file that has a ``title`` and a ``category`` column.

    Returns
    -------
    (titles, labels) : tuple of two lists
        ``titles`` holds the values of the ``title`` column and ``labels`` the
        values of the ``category`` column after ``encode_labels`` (1 for
        'politica', 0 for anything else), in file order.

    As a quick sanity check the largest encoded label is printed.
    """
    df = pd.read_csv(path)
    titles = df['title'].tolist()
    labels = encode_labels(df['category'].tolist())
    print("max(labels)")

    # `default=None` keeps this sanity print from raising ValueError when the
    # file contains a header but no rows.
    print(max(labels, default=None))
    return titles, labels

In [None]:
# Load the balanced dataset and carve a validation split out of it.
# The variables are called "*_tweets", but they hold the news titles returned
# by load_data.
BALANCED_CSV_PATH = "/content/drive/MyDrive/news_tagging_model/balanced.csv"

train_all_tweets, train_all_labels = load_data(BALANCED_CSV_PATH)

# NOTE(review): the "test" data is read from the very same file as the
# training data, so it contains exactly the same rows. Any score computed on
# test_tweets / test_labels would not measure generalisation -- point this at
# a separate held-out file before using it.
test_tweets, test_labels = load_data(BALANCED_CSV_PATH)

# Keep 25% of the examples for validation; the fixed seed makes the split
# reproducible.
train_tweets, val_tweets, train_labels, val_labels = train_test_split(
    train_all_tweets, train_all_labels, test_size=0.25, random_state=10
)

In [None]:
from sentence_transformers import SentenceTransformer

# First experiment: embed the titles with the 'all-MiniLM-L12-v2' sentence
# encoder. The same model instance encodes the training split first and the
# validation split second.
model = SentenceTransformer('all-MiniLM-L12-v2')

train_tweets_embeddings, val_tweets_embeddings = [
    model.encode(split_texts, show_progress_bar=True)
    for split_texts in (train_tweets, val_tweets)
]

In [None]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
# from matplotlib.pyplot import figure

# pca = PCA()
# Xt = pca.fit_transform(train_tweets_embeddings)

# figure(figsize=(10, 10), dpi=80)
# plot = plt.scatter(Xt[:,0], Xt[:,1], c=train_labels)
# plt.legend(handles=plot.legend_elements()[0], labels=list(train_labels))
# plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [None]:
def make_embeddings_positive(embeddings):
  """
  Shift every value by one constant so that the smallest value becomes 0.1.

  Parameters
  ----------
  embeddings : array-like
      Rectangular 2-D collection of numbers (one row per example), e.g. the
      array returned by a sentence encoder.

  Returns
  -------
  numpy.ndarray
      A new array of the same shape in which every entry is ``>= 0.1``.
      Floating-point inputs keep their dtype; other inputs are converted to
      float. The input itself is left untouched (earlier versions modified
      it in place).

  Notes
  -----
  The offset is derived from the data passed in. To make two arrays
  comparable (for example a training and a validation split) they have to be
  shifted together in a single call, otherwise each gets its own offset.
  """
  values = np.asarray(embeddings)
  if not np.issubdtype(values.dtype, np.floating):
    # Integer input would otherwise truncate the fractional offset.
    values = values.astype(float)

  if values.size == 0:
    # Nothing to shift; still hand back a fresh array like in the normal case.
    return values.copy()

  # Use the true global minimum. The previous running minimum started at the
  # sentinel 100 and was therefore wrong whenever every value exceeded 100.
  # The extra 0.1 keeps exact zeros out of the result. A plain Python float
  # is used so the result keeps the dtype of the input array.
  shift = 0.1 - float(values.min())
  return values + shift

In [None]:
# MultinomialNB only accepts non-negative features, so the embeddings are
# shifted into the positive range first.
# Both splits are shifted TOGETHER: make_embeddings_positive derives its
# offset from the data it receives, so calling it once per split would move
# training and validation vectors by different amounts and the classifier
# would see validation features in a different coordinate system than the
# one it was fitted on. The offset is a single constant and carries no label
# information.
n_train_rows = len(train_tweets_embeddings)
all_embeddings_fixed = make_embeddings_positive(
    np.vstack([train_tweets_embeddings, val_tweets_embeddings])
)
train_tweets_embeddings_fixed = all_embeddings_fixed[:n_train_rows]
val_tweets_embeddings_fixed = all_embeddings_fixed[n_train_rows:]

In [None]:
# Fit a multinomial Naive Bayes classifier on the shifted training embeddings
# and their binary labels.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X=train_tweets_embeddings_fixed, y=train_labels)

In [None]:
# Predict a label for every validation example with the fitted classifier.
y_pred = naive_bayes_classifier.predict(val_tweets_embeddings_fixed)

In [None]:
# Print the text report scikit-learn builds from the validation labels and
# the predictions made for them.
validation_report = metrics.classification_report(y_true=val_labels, y_pred=y_pred)
print(validation_report)

In [None]:
from sentence_transformers import SentenceTransformer

# Second experiment: same pipeline as above, but with the
# 'distiluse-base-multilingual-cased-v2' sentence encoder.
# NOTE: this cell overwrites model, the embeddings, the classifier and y_pred
# from the first experiment.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

train_tweets_embeddings = model.encode(train_tweets, show_progress_bar=True)
val_tweets_embeddings = model.encode(val_tweets, show_progress_bar=True)

# MultinomialNB only accepts non-negative features. Both splits are shifted
# TOGETHER: make_embeddings_positive derives its offset from the data it
# receives, so one call per split would move training and validation vectors
# by different amounts and make their features inconsistent.
n_train_rows = len(train_tweets_embeddings)
all_embeddings_fixed = make_embeddings_positive(
    np.vstack([train_tweets_embeddings, val_tweets_embeddings])
)
train_tweets_embeddings_fixed = all_embeddings_fixed[:n_train_rows]
val_tweets_embeddings_fixed = all_embeddings_fixed[n_train_rows:]

naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(train_tweets_embeddings_fixed, train_labels)

y_pred = naive_bayes_classifier.predict(val_tweets_embeddings_fixed)

print(metrics.classification_report(val_labels, y_pred))