In [9]:
import pandas as pd
from transformers import pipeline
import torch
import numpy as np
from tqdm import tqdm


In [4]:
# Read the books dataset from CSV into a DataFrame.
books = pd.read_csv(filepath_or_buffer="books_with_categories.csv")

In [6]:

# Pick the best available accelerator instead of assuming Apple-Silicon "mps",
# so the notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# top_k=None makes the pipeline return a score for every emotion label
# instead of only the best one.
# `device` has to be handed to the pipeline explicitly: previously it was
# created but never used, so it had no influence on where the model ran.
classifier = pipeline("text-classification",
                      model="j-hartmann/emotion-english-distilroberta-base",
                      top_k = None,
                      device = device)

# Smoke test: displays the per-label scores for one short sentence.
classifier("I love this!")

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/294 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

[{'label': 'joy', 'score': 0.9771687984466553},
 {'label': 'surprise', 'score': 0.008528691716492176},
 {'label': 'neutral', 'score': 0.005764588713645935},
 {'label': 'anger', 'score': 0.004419783595949411},
 {'label': 'sadness', 'score': 0.0020923903211951256},
 {'label': 'disgust', 'score': 0.0016119909705594182},
 {'label': 'fear', 'score': 0.0004138521908316761}]

In [8]:
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    """Return, for each emotion, the highest score found across all predictions.

    Parameters
    ----------
    predictions : iterable of list of dict
        One entry per classified text. Each entry is a list of
        ``{"label": ..., "score": ...}`` dicts, one per emotion label.

    Returns
    -------
    dict
        Maps every label in ``emotion_labels`` to the maximum score that
        label received in any of the predictions.

    Raises
    ------
    KeyError
        If a prediction has no entry for one of ``emotion_labels``.
    ValueError
        If ``predictions`` is empty (``np.max`` of an empty sequence).
    """
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        # Look each score up by its label. Pairing an alphabetically sorted
        # prediction with ``emotion_labels`` by position is wrong because
        # ``emotion_labels`` is not in alphabetical order ("neutral" is last),
        # which would swap the sadness / surprise / neutral scores.
        scores_by_label = {entry["label"]: entry["score"] for entry in prediction}
        for label in emotion_labels:
            per_emotion_scores[label].append(scores_by_label[label])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [10]:
# Reset the accumulators so re-running this cell does not append duplicates.
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = list()
emotion_scores = {label: list() for label in emotion_labels}

# For every book: classify each "."-separated fragment of its description and
# keep, per emotion, the highest score any fragment received.
# NOTE(review): str.split(".") also yields empty fragments (e.g. after a
# trailing period) and these are classified too - confirm this is intended.
for i in tqdm(range(books.shape[0])):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label, collected in emotion_scores.items():
        collected.append(max_scores[label])

100%|██████████| 5197/5197 [13:45<00:00,  6.30it/s]


In [11]:
# One row per book: the seven per-emotion maximum scores plus the book's isbn13,
# which is appended as the last column and serves as the join key later on.
emotions_df = pd.DataFrame(emotion_scores).assign(isbn13=isbn)

In [12]:
# Attach the emotion scores to the book table (inner join on isbn13, the
# pandas default) and persist the result without the index column.
# NOTE(review): `books` is overwritten, so running this cell a second time
# merges the emotion columns again - re-run from the loading cell instead.
books = books.merge(emotions_df, on="isbn13")
books.to_csv(path_or_buf="books_with_emotions.csv", index=False)

We can see that our data has a long-tail problem with the categories. Most of the books belong to Fiction categories; however, there are a large number of categories that contain only a few books or even a single book, and some of the categories don't make sense or aren't useful.

One idea is to use LLMs to perform text classification and form reasonable categories. We are going to do this using the book description. However, the description needs to be informative and meaningful enough to classify the book. Therefore, we need to screen the data and remove uninformative descriptions. In our case, uninformative descriptions can be ones that are too short.