In [4]:
import torch
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# ================================================================
# 1. Load zero-shot model
# ================================================================
# NOTE(review): the original header called this checkpoint "best for EC2";
# the rationale is not recorded in this notebook — confirm before relying on it.
#
# Single source of truth for the checkpoint name, so the log line below can
# never drift from the model that was actually loaded.
MODEL_NAME = "facebook/bart-large-mnli"

classifier = pipeline(
    "zero-shot-classification",
    model=MODEL_NAME,
    # Device index 0 when CUDA is available, otherwise -1 (CPU).
    device=0 if torch.cuda.is_available() else -1,
)

print(f"🔧 Loaded {MODEL_NAME} successfully.")

# ================================================================
# 2. Labels for sentiment classification
# ================================================================
# Candidate labels passed to the classifier for every text.
sentiment_labels = ["positive", "neutral", "negative"]
print("Using sentiment labels:", sentiment_labels)


# ================================================================
# 3. Zero-shot sentiment classification function
# ================================================================
def zero_shot_sentiment(df, text_col, labels=None, clf=None):
    """Label every row of ``df[text_col]`` with a zero-shot sentiment prediction.

    Each usable text is passed to the classifier together with the candidate
    labels; the first entry of the returned ``"labels"`` / ``"scores"`` lists
    is stored as the prediction and its confidence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts. It is modified IN PLACE: the columns
        ``zs_sentiment`` and ``zs_sentiment_conf`` are added (or overwritten).
    text_col : str
        Name of the column in ``df`` that contains the text to classify.
    labels : sequence of str, optional
        Candidate labels. Defaults to the module-level ``sentiment_labels``.
    clf : callable, optional
        Called as ``clf(text, labels)`` and expected to return a mapping with
        ``"labels"`` and ``"scores"`` sequences. Defaults to the module-level
        ``classifier``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the two new columns.

    Notes
    -----
    Missing values (NaN/None) and empty strings are not sent to the
    classifier; those rows get ``None`` as label and NaN as confidence so a
    single bad cell cannot abort a long-running labeling job. Any other
    non-string value is converted with ``str()`` before classification.
    """
    # Fall back to the notebook-level objects so existing two-argument calls
    # behave exactly as before.
    if labels is None:
        labels = sentiment_labels
    if clf is None:
        clf = classifier

    preds = []
    confs = []

    print(f"\n🔎 Predicting SENTIMENT for column: {text_col}")
    for text in tqdm(df[text_col], desc="Sentiment"):
        # Nothing to classify: record a placeholder to keep row alignment.
        if pd.isna(text) or str(text) == "":
            preds.append(None)
            confs.append(float("nan"))
            continue

        out = clf(str(text), labels)
        preds.append(out["labels"][0])     # best label
        confs.append(out["scores"][0])     # confidence score

    # Lists are in row order, so positional assignment lines up with df.
    df["zs_sentiment"] = preds
    df["zs_sentiment_conf"] = confs
    return df


# ================================================================
# 4. Load your datasets
# ================================================================
threads_df = pd.read_csv("Data/threads_reviews.csv")
twitter_df = pd.read_csv("Data/twitter_reviews.csv")

print("\nLoaded datasets:")
print("threads_df:", threads_df.shape)
print("twitter_df:", twitter_df.shape)

# Fill missing values BEFORE casting to str. In the opposite order the cast
# can turn NaN into the literal text "nan", leaving fillna nothing to fill.
twitter_df["review_text"] = twitter_df["review_text"].fillna("").astype(str)

# Remove rows where text is empty (after trimming whitespace) or is the
# literal 'nan'; the 'nan' guard is kept so the retained rows match the
# previous cleaning logic exactly.
review_text_stripped = twitter_df["review_text"].str.strip()
has_review_text = (review_text_stripped != "") & (review_text_stripped.str.lower() != "nan")
twitter_df = twitter_df[has_review_text].reset_index(drop=True)

print("\nAfter cleaning Twitter text:", twitter_df.shape)



Device set to use cuda:0


🔧 Loaded facebook/bart-large-mnli successfully.
Using sentiment labels: ['positive', 'neutral', 'negative']

Loaded datasets:
threads_df: (32910, 4)
twitter_df: (34788, 9)

After cleaning Twitter text: (34785, 9)


In [5]:
# ================================================================
# 5. Apply zero-shot sentiment labeling
# ================================================================
def _label_and_save(frame, text_col, csv_name):
    """Run zero_shot_sentiment on ``frame[text_col]``, write the result to
    ``csv_name`` without the index column, and print the saved file name.

    Returns the frame returned by zero_shot_sentiment.
    """
    labeled = zero_shot_sentiment(frame, text_col=text_col)
    labeled.to_csv(csv_name, index=False)
    # One print call, newline-separated: same output as two separate prints.
    print("\nSaved:", csv_name, sep="\n")
    return labeled


twitter_df = _label_and_save(
    twitter_df, "review_text", "twitter_sentiment_zero_shot.csv"
)
threads_df = _label_and_save(
    threads_df, "review_description", "threads_sentiment_zero_shot.csv"
)

print("\nZero-shot sentiment labeling completed.")



🔎 Predicting SENTIMENT for column: review_text


Sentiment: 100%|██████████████████████████████████████████████████████████████████████| 34785/34785 [25:19<00:00, 22.89it/s]



Saved:
twitter_sentiment_zero_shot.csv

🔎 Predicting SENTIMENT for column: review_description


Sentiment: 100%|██████████████████████████████████████████████████████████████████████| 32910/32910 [23:51<00:00, 22.98it/s]



Saved:
threads_sentiment_zero_shot.csv

Zero-shot sentiment labeling completed.
