In [None]:
!pip install -U -q datasets==2.15 evaluate accelerate

In [3]:
import pandas as pd
import torch
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Authenticate this session with the Hugging Face Hub. Later cells request a
# push to the Hub (push_to_hub=True in the training arguments and
# trainer.push_to_hub()).
from huggingface_hub import notebook_login
notebook_login()

In [4]:
# Load the raw reviews CSV from the Kaggle input directory. Later cells read
# the 'Review' (text) and 'Rating' (numeric score) columns of this frame.
data = pd.read_csv('/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv')

In [5]:
def labelling(rating):
    """Map a numeric rating onto a three-class sentiment name.

    Ratings below 3 are 'negative', ratings from 3 up to (but excluding) 4
    are 'neutral', and everything else is 'positive'.

    Args:
        rating: Numeric rating value.

    Returns:
        One of 'negative', 'neutral' or 'positive'.
    """
    # Exclusive upper bounds, checked in ascending order; the first match wins.
    thresholds = ((3, 'negative'), (4, 'neutral'))
    for upper_bound, sentiment in thresholds:
        if rating < upper_bound:
            return sentiment
    return 'positive'
# Derive the sentiment class name for every row from its numeric 'Rating'.
data['label'] = data['Rating'].apply(labelling)

In [None]:
# Class balance check: number of reviews per sentiment label.
data['label'].value_counts()

In [7]:
import emoji
import re

def text_cleaning(text):
    """Normalise a raw review string before tokenisation.

    Emoji are passed through ``emoji.demojize`` and the negative contraction
    ``n't`` is expanded into a separate word ``not``.

    Args:
        text: Review text as a ``str``.

    Returns:
        The cleaned text.
    """
    text = emoji.demojize(text)
    # Substituting a bare 'not' glues the negation onto its stem
    # ("didn't" -> "didnot"), which hides it from the tokenizer. Insert a
    # space instead, and absorb any whitespace already in front of "n't"
    # so pre-tokenised input ("did n't") still becomes "did not".
    text = re.sub(r"\s*n't", ' not', text)
    return text
# Clean every review in place, then discard the numeric rating: from here on
# only the review text and the derived 'label' column are used.
data['Review'] = data['Review'].apply(text_cleaning)
data = data.drop(columns=['Rating'])

In [None]:
from collections import Counter

# Count every whitespace-separated token across all reviews.
most_word = " ".join(data["Review"].values).split()
word_counts = Counter(most_word)

# Build the records from most_common(), which yields (word, count) pairs in
# descending count order. Iterating .items() keeps first-seen order instead,
# so the head of the frame would just be the first words of the first review.
most_common_words_list = [
    {'word': word, 'count': count} for word, count in word_counts.most_common()
]

# One row per distinct token, most frequent first.
most_words_df = pd.DataFrame(data=most_common_words_list, columns=["word", "count"])
most_words_df.head()

In [None]:
# visualize the frequency of top 20 most common words
# Select by the 'count' column itself rather than slicing the first 20 rows:
# slicing is only correct if most_words_df happens to be sorted already.
# nlargest returns the rows in descending count order.
top20_words = most_words_df.nlargest(20, "count")

plt.figure(figsize=(15,15))
plt.subplot(2,1,1)
plt.title('Top 20 most common words')
sns.barplot(data=top20_words, y="word", x="count", palette="viridis")
plt.show()

In [None]:
# Pick the 8 most frequent words by count (not the first 8 rows, which are
# only the most frequent if the frame is already sorted), and compute the
# selection once for both the labels and the wedge sizes.
top8_words = most_words_df.nlargest(8, "count")
labels = top8_words["word"].values
sizes = top8_words["count"].values
# Pull the first (largest) wedge slightly out of the ring.
explode = (0.1, 0, 0, 0, 0, 0, 0, 0)
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue',"cyan","red","orange","brown"]

plt.figure(figsize=(15,15))
plt.subplot(2,1,2)

# Percentages shown are shares among these 8 words only.
plt.pie(sizes,explode=explode,labels=labels,colors=colors,autopct='%1.1f%%', shadow=True,textprops={'fontsize': 10},labeldistance=0.85, startangle=0)
plt.title("most used words")
# White disc over the centre turns the pie into a donut chart.
plt.gca().add_artist(plt.Circle((0,0),0.70,fc='white'))

plt.gcf().set_facecolor('#f0f0f0')

plt.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud


# Join the reviews into one document. str(list(...)) would instead pass the
# Python list repr (brackets, quote characters around every review, and
# backslash escapes) to the word cloud as if it were review text.
text = " ".join(data["Review"])
plt.rcParams['figure.figsize'] = (15, 15)
# Render at most 121 words on a 1200x1200 white canvas.
wordcloud = WordCloud(background_color = 'white', width = 1200,  height = 1200, max_words = 121).generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [9]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split membership is identical on every run. Without it
# each Restart & Run All trains and evaluates on different rows, and reported
# metrics are not comparable between runs.
SEED = 42

# 80% train / 20% held out, stratified so class proportions are preserved.
X_train, X_valid_test, y_train, y_valid_test = train_test_split(data.loc[:, data.columns!='label'],
                                                                data['label'],
                                                                test_size=0.2,
                                                                stratify=data['label'],
                                                                random_state=SEED)
# Split the held-out 20% evenly into validation and test (10% each overall).
X_valid, X_test, y_valid, y_test = train_test_split(X_valid_test,
                                                    y_valid_test,
                                                    test_size=0.5,
                                                    stratify=y_valid_test,
                                                    random_state=SEED)

In [10]:
# Re-assemble each split as a two-column frame ('Review' text + 'label' name),
# keeping the original row index of the split.
data_train = pd.DataFrame({'Review': X_train['Review'], 'label': y_train})
data_test = pd.DataFrame({'Review': X_test['Review'], 'label': y_test})
data_valid = pd.DataFrame({'Review': X_valid['Review'], 'label': y_valid})

In [11]:
from datasets import DatasetDict, Dataset

# Wrap each pandas split as a Dataset and collect them under their split name.
dts = DatasetDict({
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (
        ('train', data_train),
        ('test', data_test),
        ('valid', data_valid),
    )
})

In [12]:
# Sentiment class name -> integer class id used as the training target.
label2id = dict(negative=0, neutral=1, positive=2)
# Inverse lookup, used to turn predicted class ids back into names.
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

In [13]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Pretrained checkpoint shared by the tokenizer and the classifier.
model_ckpt ='google-bert/bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Load the checkpoint with a 3-way classification head and attach the label
# name mappings to the model config, then move the model to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=3,
    label2id=label2id,
    id2label=id2label,
)
model = model.to(device)

In [None]:
def tokenize(examples):
    """Tokenise a batch of reviews and encode their labels as integer ids.

    Written for ``dts.map(..., batched=True)``: every value in ``examples``
    is a list with one entry per row of the batch.

    Args:
        examples: Batch mapping with a 'Review' list of strings and a
            'label' list of class names (keys of ``label2id``).

    Returns:
        The tokenizer output for the reviews (truncated to at most 512
        tokens, not padded) plus a 'label' entry holding the integer class
        id of each row.

    Raises:
        KeyError: If a label is not a key of ``label2id``.
    """
    tokens = tokenizer(examples['Review'],
                       max_length=512,
                       truncation=True
                      )
    # Return the encoded labels with the tokens instead of writing them into
    # `examples`. Mutating the input batch only reaches the dataset as a side
    # effect of how map() merges inputs and outputs; returning the column
    # makes the update explicit.
    tokens['label'] = [label2id[l] for l in examples['label']]
    return tokens

# Run tokenize over the DatasetDict in batches of 64 rows.
dts = dts.map(tokenize, batched=True, batch_size=64)

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

def compute_metrics(eval_pred):
    """Score predictions with weighted precision, recall and F1.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores; the
            argmax over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference class ids).

    Returns:
        Dict with keys 'precision', 'recall' and 'f1', each computed with
        ``average='weighted'``.
    """
    predicted_ids = np.array(eval_pred.predictions).argmax(-1)
    true_ids = eval_pred.label_ids
    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    return {
        metric_name: scorer(true_ids, predicted_ids, average='weighted')
        for metric_name, scorer in scorers
    }

In [None]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

# Pads each batch to a common length at collation time (tokenize() truncates
# but does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tuning configuration: 3 epochs, evaluation and checkpointing once per
# epoch, cosine LR schedule with 10% warmup, mixed precision.
# With per-device batch 16 and 2 accumulation steps, each optimizer step sees
# 32 examples per device.
# NOTE(review): metric_for_best_model is set but load_best_model_at_end is
# not, so the best-by-recall checkpoint is presumably not reloaded after
# training — confirm whether that is intended.
training_arguments = TrainingArguments(output_dir='sentiment-bert-base-uncased',
                                       do_eval=True,
                                       num_train_epochs=3,
                                       evaluation_strategy='epoch',
                                       per_device_train_batch_size=16,
                                       per_device_eval_batch_size=16,
                                       learning_rate=2e-5,
                                       lr_scheduler_type='cosine',
                                       warmup_ratio=0.1,
                                       logging_strategy='steps',
                                       logging_steps=10,
                                       save_strategy='epoch',
                                       metric_for_best_model='recall',
                                       gradient_accumulation_steps=2,
                                       fp16=True,
                                       # The comma after this argument was missing, which made the
                                       # whole cell a SyntaxError.
                                       report_to='wandb',
                                       push_to_hub=True
                                       )

In [None]:
# Wire the model, arguments, collator, data splits and metric function into a
# Trainer. Training uses the 'train' split and evaluation uses the 'valid'
# split; the 'test' split is not passed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_arguments,
    data_collator=data_collator,
    train_dataset=dts['train'],
    eval_dataset=dts['valid'],
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub (needs the login above).
trainer.push_to_hub()

In [73]:
# Release cached, currently unused GPU memory held by PyTorch's allocator
# before the model check below.
torch.cuda.empty_cache()

### Model checking


In [55]:
# Load a classifier from the Hub for a standalone sanity check (presumably the
# model pushed by the training cells above).
# NOTE(review): this repo id ends in 'bert-based-uncased' while the training
# output_dir is 'sentiment-bert-base-uncased' — confirm the id matches the
# repository that was actually pushed.
# NOTE(review): AutoModel is imported here but not used; the class called below
# is the one imported in the model-loading cell.
from transformers import AutoModel
check_model = AutoModelForSequenceClassification.from_pretrained('1-13-am/sentiment-bert-based-uncased')

In [56]:
# Single hand-written review used as a smoke test. It is passed as a
# one-element list, i.e. a batch of one.
text = ['Everything about this hotel is amazing. I will definitely come back!']
# No return_tensors is requested here; the next cell converts the ids to tensors.
tokens = tokenizer(text)

In [59]:
# Inference only: run the forward pass under no_grad so no autograd graph is
# built or kept alive for the logits. torch.tensor(..., dtype=torch.long)
# replaces the legacy torch.LongTensor constructor.
with torch.no_grad():
    out = check_model(input_ids=torch.tensor(tokens['input_ids'], dtype=torch.long),
                      attention_mask=torch.tensor(tokens['attention_mask'], dtype=torch.long))

In [73]:
# Take the index of the largest logit and look up its class name in the model
# config. .item() only works for a single-element result, so this assumes a
# batch of exactly one example.
predict = check_model.config.id2label[out.logits.argmax(-1).item()]

In [None]:
# Display the predicted sentiment label for the sample review.
predict