In [None]:
!pip install torch
!pip install transformers
!pip install bertopic
!pip install sentence_transformers
!pip install datasets

In [None]:
# imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer, TrainingArguments, AutoModelForSeq2SeqLM

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

from datasets import Dataset

In [None]:
# reproducibility: seed every random number generator the notebook touches
import random

import numpy as np

seed = 42

# Python and NumPy global generators
random.seed(seed)
np.random.seed(seed)

# PyTorch: CPU generator, current CUDA device, and all CUDA devices
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [None]:
# Use the GPU when torch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Load the dataset
# The CSV is resolved relative to the notebook's working directory; every later
# section reads its data from the resulting `df` frame.
df = pd.read_csv(filepath_or_buffer="book_texts.csv")

# Sentiment Analysis

In [None]:
model_id = "clapAI/modernBERT-base-multilingual-sentiment"

# Half precision only pays off on the GPU. When the notebook fell back to CPU keep
# the weights in float32 (float16 on CPU can be slow or unsupported for some
# operations, depending on the torch build).
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=model_dtype)

model.to(device)
model.eval()


# Retrieve labels from the model's configuration
id2label = model.config.id2label


def _sample_reviews(label, score, n=3):
    """Draw up to ``n`` random reviews whose ``score`` column equals ``score``.

    Args:
        label: Expected sentiment label attached to every sampled review.
        score: Value of the ``score`` column to filter on.
        n: Number of reviews to draw; capped at the number of matching rows so a
            sparsely populated score does not raise.

    Returns:
        A list of ``{'label': label, 'text': review}`` dicts.
    """
    pool = df.loc[df['score'] == score, 'all']
    return [{'label': label, 'text': review} for review in pool.sample(min(n, len(pool)))]


# Spot-check set: the expected label is taken from the review's score
# (5.0 -> positive, 3.0 -> neutral, 1.0 -> negative).
true_positive = _sample_reviews('positive', 5.0)
true_neu = _sample_reviews('neutral', 3.0)
true_negative = _sample_reviews('negative', 1.0)


texts = true_positive + true_neu + true_negative

for item in texts:
    text = item["text"]
    label = item["label"]

    # truncation=True clips the input at the tokenizer's maximum length so an
    # unusually long review cannot exceed what the model accepts.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

    # Perform inference in inference mode
    with torch.inference_mode():
        outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)
    print(f"Text: {text}")
    print(f"Label: {label}")
    print(f"Prediction: {id2label[predictions.item()]}")
    print("______________________________")


# Topic modelling

In [None]:
# Embed every review once; the vectors are reused by the plots further down.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
documents = df["all"].to_list()
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# Fit the topic model on the precomputed embeddings.
bertopic_model = BERTopic()
topics, probs = bertopic_model.fit_transform(documents, embeddings=embeddings)

# Optional: keep the per-document topic assignment on the DataFrame
#df["topic"] = topics

# Show the first ten rows of the topic summary table
topic_info = bertopic_model.get_topic_info()
print(topic_info.iloc[:10])

In [None]:
# Topic overview plot for the fitted BERTopic model. As the last expression of the
# cell, the call's return value is rendered as the cell output.
bertopic_model.visualize_topics()

In [None]:
# Alternative that passes the full embeddings instead of a precomputed 2-D projection:
#bertopic_model.visualize_documents(documents, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# `documents` is the list that was embedded and used to fit the topic model above,
# so its order matches the rows of `reduced_embeddings`.
bertopic_model.visualize_documents(documents, reduced_embeddings=reduced_embeddings)

In [None]:
# Bar-chart view of the fitted topics, called with BERTopic's default arguments.
# The call's return value is rendered as the cell output.
bertopic_model.visualize_barchart()
# Variant restricted to an explicit list of topic ids:
#bertopic_model.visualize_barchart(topics=[3,4,5,6])

In [None]:
# Hierarchy view of the fitted topics; the call's return value is rendered as the
# cell output.
bertopic_model.visualize_hierarchy()

# Summarization

In [None]:
# Build the summarization pipeline on top of the google/flan-t5-large checkpoint.
# `summarize_review` below looks this object up by name at call time.
summarizer = pipeline(task="summarization", model="google/flan-t5-large")

# Function to summarize reviews
def summarize_review(text, max_length=100, min_length=30):
    """Summarize one review with the notebook-level ``summarizer`` pipeline.

    Args:
        text: Review text to condense.
        max_length: Forwarded to the pipeline as ``max_length``.
        min_length: Forwarded to the pipeline as ``min_length``.

    Returns:
        The ``summary_text`` of the first pipeline result. If the pipeline raises,
        the exception message is returned instead of propagating (best-effort;
        presumably so one failing review does not abort a loop over many).
    """
    try:
        # truncation=True asks the pipeline's tokenizer to clip inputs that exceed
        # its maximum length; this function is applied to the longest reviews.
        summary = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['summary_text']
    except Exception as e:
        return str(e)

# Order the reviews from longest to shortest (by character count of "all"), then
# summarize the five reviews at positions 10-14 of that ranking.
df = (
    df.set_index(df['all'].str.len())
    .sort_index(ascending=False)
    .reset_index(drop=True)
)

for review in df["all"].iloc[10:15]:
    # Original text, blank line, summary, blank line, separator.
    print(f"{review}\n")
    print(f"{summarize_review(review)}\n")
    print('_________________________________________________')


# Opinion target extraction

In [None]:
# Aspect-based sentiment: load the checkpoint and wrap it in a text-classification pipeline.
model_name = "yangheng/deberta-v3-base-absa-v1.1"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

# Single hand-picked review used as the demo input.
text = """
I just completed W.E.B. Griffin's latest book in his Presidential Agent series and man it is long! The Hunters begins where the last book, The Hostage ended. Charley and company are tracking down the baddies involved in the Iraqi oil-for-food scandal. An interesting idea that does not realize its full potential. The characters of Charley Castillo, Col Jake Torine, and others are well drawn. But the various meetings and travel detail were at the expense of far less action scenes than there could have been. If there had been more action, it would have been a better book.I enjoyed the first Presidential Agent book, By Order of the President. I have become a fan of Griffin's excellent Badge of Honor series in the past few years as well. However, the only reason I finished The Hunters was my own stubborness. I will think twice before spending (wasting?) the time to read the next adventure of Charley and company."""

# Book-related opinion targets; each one is passed as the second segment (text_pair)
# alongside the review.
book_targets = ('characters', 'book')
for target in book_targets:
    verdict = classifier(text, text_pair=target)
    print(target, verdict)

# Review helpfulness prediction: zero-shot

In [None]:
# Zero-shot-classification pipeline backed by facebook/bart-large-mnli.
# Note: this rebinds `classifier`, replacing any earlier pipeline of that name.
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

# The two labels the pipeline chooses between.
candidate_labels = ['useful review', 'unuseful review']

# Five randomly drawn reviews plus one hand-written, very short one.
texts = [*df["all"].sample(5), 'Amazing book!']
for text in texts:
    # Review, blank line, raw pipeline result, blank line, separator.
    print(f"{text}\n")
    print(f"{classifier(text, candidate_labels)}\n")
    print('_________________________________________________')


# Use zero-shot examples to create a dataset and fine-tune the model

In [None]:
def classify_helpfulness(text):
    """Return the top-ranked candidate label for ``text``.

    Uses the notebook-level ``classifier`` (at this point the zero-shot pipeline
    created in the previous section) together with ``candidate_labels``. The
    pipeline result carries a ``labels`` list ordered by descending score, so the
    first entry is the predicted label.
    """
    return classifier(text, candidate_labels)["labels"][0]


# Apply zero-shot classification
# NOTE(review): this section reads the "reviewText" column while the rest of the
# notebook reads "all" — confirm that "reviewText" exists in book_texts.csv.
df["helpfulness_score"] = df["reviewText"].apply(classify_helpfulness)

# Convert helpfulness labels to binary values (1 = "useful review", 0 = anything else)
df["helpfulness_binary"] = df["helpfulness_score"].eq("useful review").astype("int64")

model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Convert dataset to Hugging Face Dataset format
def preprocess_data(examples):
    """Tokenize a batch of reviews, padding/truncating each to exactly 512 tokens."""
    return tokenizer(examples["reviewText"], truncation=True, padding="max_length", max_length=512)


dataset = Dataset.from_pandas(df)
dataset = dataset.map(preprocess_data, batched=True)
# Trainer expects the target column to be called "labels".
dataset = dataset.rename_column("helpfulness_binary", "labels")
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Seed the split so the train/test partition is the same on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=seed)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define training arguments
# `eval_strategy` is the current name of the former `evaluation_strategy` argument;
# recent transformers releases deprecate or reject the old spelling.
# save_strategy must match eval_strategy for load_best_model_at_end to work.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    seed=seed,
)

# Define trainer
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_deberta_helpfulness")
tokenizer.save_pretrained("./fine_tuned_deberta_helpfulness")