In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
sub = pd.read_csv("./data/sample_submission.csv")

In [81]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [4]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
def clf(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    #
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    return config.id2label[ranking[0]]

In [82]:
train["s"] = train["text"].apply(lambda x: preprocess(x))
test["s"] = test["text"].apply(lambda x: preprocess(x))

In [84]:
ans, err = {}, []
for sample in tqdm(range(len(test))):
    id, text = test.iloc[sample]["id"], test.iloc[sample]["text"]
    try: ans[id] = clf(text)
    except Exception as e:
        print(f"Err: {id} {e}")
        err.append(id)

  3%|▎         | 1242/48000 [00:50<28:03, 27.77it/s]

Err: TEST_01237 The expanded size of the tensor (536) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 536].  Tensor sizes: [1, 514]


100%|██████████| 48000/48000 [32:51<00:00, 24.35it/s]


In [85]:
ans_df = []
for k, v in zip(ans.keys(), ans.values()):
    ans_df.append(pd.DataFrame({"id": [k], "Ans":[v]}))
ans = pd.concat(ans_df, axis=0)

In [86]:
def clf_ans(x):
    x = x.lower()
    if x=="neutral": return 0
    elif x=="positive": return 1
    elif x=="negative": return 2
    
ans["ans"] = ans["Ans"].apply(clf_ans)

In [87]:
ans2 = ans[["id", "ans"]]

In [64]:
sub

Unnamed: 0,id,sentiment
0,TEST_00000,-1
1,TEST_00001,-1
2,TEST_00002,-1
3,TEST_00003,-1
4,TEST_00004,-1
...,...,...
47995,TEST_47995,-1
47996,TEST_47996,-1
47997,TEST_47997,-1
47998,TEST_47998,-1


In [88]:
sub_df = pd.merge(left=sub, right=ans2, how="left")[["id", "ans"]]

In [93]:
sub_df = sub_df.fillna(0)
sub_df.columns = ["id", "sentiment"]

In [94]:
sub_df.to_csv("./data/sub231012_2.csv", index=False)

In [96]:
sub_df = pd.read_csv("./data/sub231012_2.csv")

In [98]:
sub_df

Unnamed: 0,id,sentiment
0,TEST_00000,1.0
1,TEST_00001,2.0
2,TEST_00002,0.0
3,TEST_00003,0.0
4,TEST_00004,0.0
...,...,...
47995,TEST_47995,2.0
47996,TEST_47996,1.0
47997,TEST_47997,1.0
47998,TEST_47998,1.0
