In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the tab-separated train and test tables with identical read options.
train, test = (
    pd.read_csv(path, sep='\t')
    for path in (
        '/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
        '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
    )
)

In [None]:
# Only the last expression of a cell is rendered automatically, so the train
# preview has to be displayed explicitly; the test preview stays the cell output.
display(train.head())
test.head()

In [None]:
# Identifier of the pretrained checkpoint; the tokenizer and the classification
# model are both loaded from it via `from_pretrained(checkpoint)`.
checkpoint = "siebert/sentiment-roberta-large-english"

In [None]:
from datasets import load_dataset
from datasets import load_metric

from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler

import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

In [None]:
# Tokenizer and classifier are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# NOTE(review): num_labels=5 presumably matches the number of distinct values in
# train['Sentiment'] -- confirm. ignore_mismatched_sizes=True is requested so that
# loading proceeds even if the checkpoint's head has a different size.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=5,
    ignore_mismatched_sizes=True,
)

In [None]:
def tokenize_function(example):
    """Tokenize one phrase (or a batch of phrases) with the notebook's tokenizer.

    Args:
        example: A string, a list of strings, or a missing scalar (None / NaN)
            as produced by pandas for an empty cell.

    Returns:
        Whatever `tokenizer(example, truncation=True)` returns.
    """
    # A missing cell reaches us as a NaN/None scalar, which the tokenizer cannot
    # encode; tokenize it as an empty string so every row still gets an encoding.
    # Lists are not scalars, so batched input passes through untouched.
    if pd.api.types.is_scalar(example) and pd.isna(example):
        example = ""
    return tokenizer(example, truncation=True)

# Collator handed to the DataLoaders as their collate_fn.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# One tokenizer call per training phrase; the result is a Series of encodings
# aligned with the rows of `train`.
tokenized_datasets = train["Phrase"].map(tokenize_function)

In [None]:
# Store each row's sentiment on its encoding under the 'label' key.
sentiment = train['Sentiment'].tolist()

for encoding, phrase_label in zip(tokenized_datasets, sentiment):
    encoding['label'] = phrase_label

# Show the first two labelled encodings as a sanity check.
print(tokenized_datasets[:2])

In [None]:
# Shuffled batches of 64 labelled encodings, assembled by the data collator.
train_dataloader = DataLoader(
    dataset=tokenized_datasets,
    batch_size=64,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
# Fine-tune the classifier on the training batches.

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the defaults transformers.AdamW used (1e-6 and 0.0)
# because the PyTorch defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

num_epochs = 3
# One scheduler step per optimizer step, so the schedule spans every batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Report the loss on the progress bar instead of printing one tensor
        # repr per step, which floods the cell output.
        progress_bar.set_postfix(loss=loss.item())
        progress_bar.update(1)
        


In [None]:
# Tokenize the test phrases with the same function used for the training phrases.
test_phrases = test["Phrase"]
tokenized_datasets_test = test_phrases.map(tokenize_function)

In [None]:
# No shuffling here: predictions must come out in the same order as the test rows.
test_dataloader = DataLoader(
    dataset=tokenized_datasets_test,
    batch_size=64,
    collate_fn=data_collator,
)

In [None]:
# Predict a class index for every test phrase, batch by batch.
model.eval()
test_predictions = list()

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Index of the highest logit along the last dimension.
    predictions = torch.argmax(logits, dim=-1)

    # Copy the batch of predictions to the CPU once, so the list holds CPU
    # scalars rather than one device-resident tensor per prediction.
    test_predictions.extend(predictions.cpu())

In [None]:
# Convert the collected predictions to plain Python ints. int() accepts both
# 0-d tensors and ints, so re-running this cell does not fail the way a second
# round of `.item()` calls on already-converted ints would.
test_predictions = [int(prediction) for prediction in test_predictions]

In [None]:
# Ids of the test phrases, taken from the PhraseId column of the test frame.
test_id = test['PhraseId']

# Print the first four ids and the first four predictions side by side as a
# quick visual check before building the submission.
print(f"test ids: {test_id[:4]}")
print(f"test preds: {test_predictions[:4]}")

In [None]:
# Build the submission column by column. Unlike zip(), which silently stops at
# the shorter input, this raises if the number of predictions does not match
# the number of test ids, so an incomplete submission cannot slip through.
submission = pd.DataFrame({'PhraseId': test_id, 'Sentiment': test_predictions})
submission.head(20)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)