In [1]:
import re
import os
import tqdm
import torch
import warnings
import numpy as np
import pandas as pd
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Silence every warning category for the rest of the session to keep cell output compact.
warnings.filterwarnings(action='ignore')

In [2]:
# Set both logging-integration switches in the process environment in one call.
os.environ.update({
    "DISABLE_MLFLOW_INTEGRATION": "TRUE",
    "WANDB_DISABLED": "TRUE",
})

### Read Test Dataset

In [3]:
# Location of the CSV to score, relative to the notebook's working directory.
file_path = '../datasets/test.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

### Clean Text

In [4]:
# Compiled once at definition time instead of on every call. Raw strings are used
# so that `\s` / `\S` are regex escapes rather than invalid Python string escapes.
_DISALLOWED_CHARS_RE = re.compile(r'[^a-zA-Z0-9\sáéíóúàèìòùâêîôûãõçÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÃÕÇ]')
_URL_RE = re.compile(r'http\S+')


def Clear(text):
    """Normalize a raw text string before classification.

    Steps, in order:
      1. Remove the literal substrings 'rt @user' and '@user' (case-sensitive).
      2. Remove every run of non-whitespace characters starting with 'http'.
      3. Replace each character that is not an ASCII letter, a digit, whitespace,
         or one of the listed accented letters with a single space.
      4. Collapse all whitespace runs (including newlines) into single spaces
         and strip leading/trailing whitespace.

    Args:
        text: The raw text. Non-string values (e.g. NaN from an empty CSV cell,
            or None) are treated as empty text.

    Returns:
        The cleaned string; '' for non-string input.
    """
    # Empty CSV cells arrive as float NaN, which has no string methods.
    if not isinstance(text, str):
        return ''
    text = text.replace('rt @user', '')
    text = text.replace('@user', '')
    text = _URL_RE.sub('', text)
    text = _DISALLOWED_CHARS_RE.sub(' ', text)
    # split() with no arguments splits on any whitespace, so newlines are
    # normalized here as well.
    return ' '.join(text.split())

In [5]:
# Clean every entry of the text column; the function is passed directly, no wrapper needed.
data['text'] = data['text'].apply(Clear)

### Load Model

In [6]:
# Both the tokenizer and the two-label sequence classifier are read from the
# same local checkpoint directory.
model_name = './finetuned/'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

### Predict

In [7]:
# Use the first CUDA GPU when one is available; otherwise fall back to CPU
# (device=-1) so the notebook still runs on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)

In [8]:
# Translate the pipeline's label strings back to integer class ids through the
# model config rather than by stripping a 'LABEL_' prefix, so checkpoints whose
# config defines custom label names are handled as well.
label_to_id = {label: int(idx) for idx, label in pipe.model.config.id2label.items()}

results = []
for item in tqdm.tqdm(data.text):
    # The pipeline returns a list with one prediction dict per input text.
    prediction = pipe(item)[0]
    results.append(label_to_id[prediction['label']])

100%|██████████| 4200/4200 [00:47<00:00, 88.66it/s]


In [9]:
# Attach the predicted class ids as the 'label' column.
data = data.assign(label=results)

In [10]:
# Keep only the predictions. The explicit copy makes the result an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
data = data[['label']].copy()

In [11]:
# Expose the row index as an explicit 'id' column.
data = data.assign(id=data.index)

In [12]:
# Put the columns in the order 'id', 'label'.
data = data.loc[:, ['id', 'label']]

In [13]:
# Write the frame to CSV without the index column.
data.to_csv(path_or_buf='submission.csv', index=False)