In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import time
import pandas as pd
from transformers import pipeline
import torch
from tqdm.notebook import tqdm

# Prefer the GPU when CUDA is available; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# sentiment analysis

In [2]:
def sentiment_analyze_data(input_df_path, model, output_path, batch_size=128):
    """Label every row of a CSV with ``model`` and write the result to a new CSV.

    The input file is read in full and processed in consecutive batches. For
    each batch, the values of the ``"Content"`` column are converted to strings
    and passed to ``model``; the ``"label"`` of each returned item then replaces
    the original ``"Content"`` value. All other columns are copied through
    unchanged, and each finished batch is appended to ``output_path``.

    Args:
        input_df_path: Path of the CSV file to read. It must contain a
            ``"Content"`` column.
        model: Callable that accepts a list of strings and returns one item
            per string, each indexable by the key ``"label"``.
        output_path: Path of the CSV file to write. An existing file is
            overwritten.
        batch_size: Number of rows handed to ``model`` per call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the input has no ``"Content"`` column.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    df = pd.read_csv(input_df_path)
    if "Content" not in df.columns:
        # Fail before the output file is truncated.
        raise KeyError("Content")

    # Write only the header first (replacing any earlier output); the labelled
    # batches are appended below so partial progress survives an interruption.
    df.head(0).to_csv(output_path, mode="w", index=False, header=True)

    # Stepping by batch_size yields no trailing empty batch when the row count
    # is an exact multiple of batch_size, and no batch at all for empty input.
    for start_index in tqdm(range(0, len(df), batch_size)):
        # iloc is positional and end-exclusive, so consecutive batches never
        # share a row (a label-based .loc slice would include the end label).
        batch = df.iloc[start_index:start_index + batch_size].reset_index(drop=True)
        comments = batch["Content"].apply(str).tolist()
        batch["Content"] = [result["label"] for result in model(comments)]
        batch.to_csv(output_path, mode="a", index=False, header=False)

# model 1

In [3]:
# Checkpoint used for both the model weights and the tokenizer of pipeline 1.
model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
model1 = pipeline(
    "sentiment-analysis",
    model=model_name,
    tokenizer=model_name,
    max_length=512,   # together with truncation=True, caps the input length
    truncation=True,
    device=device,
)

In [4]:
# Label the review file with model1 and store the result next to the input.
sentiment_analyze_data(
    input_df_path="reviews/reviews first 5000.csv",
    model=model1,
    output_path="reviews/reviews first 5000 - sentiment lxyuan-distilbert.csv",
)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]


# model 2

In [5]:
# Checkpoint used for both the model weights and the tokenizer of pipeline 2.
model_name = "finiteautomata/bertweet-base-sentiment-analysis"
model2 = pipeline(
    "sentiment-analysis",
    model=model_name,
    tokenizer=model_name,
    max_length=512,   # together with truncation=True, caps the input length
    truncation=True,
    device=device,
)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [6]:
# Label the review file with model2 (the finiteautomata BERTweet pipeline), so
# the labels match the model named in the output file.
sentiment_analyze_data(
    input_df_path="reviews/reviews first 5000.csv",
    model=model2,
    output_path="reviews/reviews first 5000 - sentiment finiteautomata-bertweet-base.csv",
)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]


# model 3

In [7]:
# Checkpoint used for both the model weights and the tokenizer of pipeline 3.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
model3 = pipeline(
    "sentiment-analysis",
    model=model_name,
    tokenizer=model_name,
    max_length=512,   # together with truncation=True, caps the input length
    truncation=True,
    device=device,
)

In [8]:
# Label the review file with model3 (the cardiffnlp Twitter RoBERTa pipeline),
# so the labels match the model named in the output file.
sentiment_analyze_data(
    input_df_path="reviews/reviews first 5000.csv",
    model=model3,
    output_path="reviews/reviews first 5000 - cardiffnlp-twitter-roberta.csv",
)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]
