In [1]:
#NOTES ON SENTIMENT ANALYSIS AND FINE-TUNING TECHNIQUE
#because we are working with text data - we have the ability to extract sentiment from the data (not always possible with traditional data sets. The learning here is that the type of data and teh quality of data you are working with can open up different opportunities in data science projects.

#the plan in this section is to be able to classify our text into 7 different emotion categories: fear, anger, disgust, joy, sadness, surprise, neutral (for no emotional content).
# we can use an LLM to classify the dominant emotion from the text in our book description
# we are going to treat sentiment analysis as a text classification problem but instead of using zero-shot classification the technique we are using is called "fine-tuning" in order to get LLMs to define the emotion


In [2]:
import pandas as pd
books = pd.read_csv("books_with_categories.csv")


In [3]:
#initialise the fine-tuning model that has been trained on ekmans 6 emotions
# NOTE: use dataloop ai for information on models including accuracy: https://dataloop.ai/library/model/j-hartmann_emotion-english-distilroberta-base/
from transformers import pipeline
#this is going to test our model emotion classification using the model example snippet. It will score the 6 ekman emotions based on the string we pass to the classifier
classifier = pipeline("text-classification",
                      model="j-hartmann/emotion-english-distilroberta-base",
                      top_k= None,
                      device = "mps"
                      )
classifier("I love this!")

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use mps


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.00852868054062128},
  {'label': 'neutral', 'score': 0.005764597561210394},
  {'label': 'anger', 'score': 0.004419781267642975},
  {'label': 'sadness', 'score': 0.002092391485348344},
  {'label': 'disgust', 'score': 0.001611992483958602},
  {'label': 'fear', 'score': 0.00041385178337804973}]]

In [4]:
# because descriptions are long, and the emotional tone can vary by sentene, we can get more accuracy by passing in for analysis each sentence in a description rather than the whole description as one chunk. As we can see with an example looking at the first description passing in each sentence:
classifier(books["description"][0].split("."))

[[{'label': 'surprise', 'score': 0.7296027541160583},
  {'label': 'neutral', 'score': 0.1403856724500656},
  {'label': 'fear', 'score': 0.06816212832927704},
  {'label': 'joy', 'score': 0.04794240742921829},
  {'label': 'anger', 'score': 0.009156349115073681},
  {'label': 'disgust', 'score': 0.002628474263474345},
  {'label': 'sadness', 'score': 0.0021221607457846403}],
 [{'label': 'neutral', 'score': 0.449371337890625},
  {'label': 'disgust', 'score': 0.2735912799835205},
  {'label': 'joy', 'score': 0.10908260941505432},
  {'label': 'sadness', 'score': 0.09362735599279404},
  {'label': 'anger', 'score': 0.04047820344567299},
  {'label': 'surprise', 'score': 0.026970213279128075},
  {'label': 'fear', 'score': 0.006879056803882122}],
 [{'label': 'neutral', 'score': 0.6462153196334839},
  {'label': 'sadness', 'score': 0.24273410439491272},
  {'label': 'disgust', 'score': 0.04342260584235191},
  {'label': 'surprise', 'score': 0.028300518169999123},
  {'label': 'joy', 'score': 0.0142114330

In [20]:
# the question now is how we can we make sense of multiple emotions for a book?
# the solution we wil try is to add a column for each emotion to each book, and get one maximum score for each emotion for a book.
# the code we are writing next is to efficiently extract the max emotion probability for each emotion for each description
# this also requires re-ordering each output of emotion analysis to be ordered by label instead of score, so we always have the data in the same expected order

import numpy as np
# set up our emotions list to use as keys
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
# initialise empty isbn array that will be used later to merge our data back into the books df
isbn = []
# define an emotion score dictionary comprehension
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    per_emotion_scores = {label: [] for label in emotion_labels}
    print(f"Number of predictions: {len(predictions)}")
    for prediction in predictions:
        # sort predictions in label order instead of score order
        sorted_prediction = sorted(prediction, key=lambda x: x["label"])
        # for the index and label in emotion_labels array, append the predicted emotion score to the dictionary comprehension
        for index, label, in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_prediction[index]["score"])
    # and then return the max score for each of the predictions so that for each description we have a dictionary of emotions containing the max probablit score for each of the emotion labels
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [21]:
# now we want to apply this function for each of the books.
# FIRST , a test
for i in range(10):
    #append the isbn for our book selection to the empty isbn array
    isbn.append(books["isbn13"][i])
    if not books["description"][i]:
        print(f"Empty description for book {i}")
        continue

    sentences = books["description"][i].split(".")
    if not sentences:
        print(f"No sentences found for book {i}")
        continue

    predictions = classifier(sentences)
    # print(f"Book {i}: Got {len(predictions)} predictions")
    max_scores = calculate_max_emotion_scores(predictions)
    # print(f"Max emotion scores for book {i}: {max_scores}")
    # take resulting scores returned by the calculate max emotion scores dictionary and append it to the emotion score dictionary
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])


Number of predictions: 8
Number of predictions: 10
Number of predictions: 3
Number of predictions: 4
Number of predictions: 7
Number of predictions: 4
Number of predictions: 11
Number of predictions: 3
Number of predictions: 6
Number of predictions: 3


In [22]:
emotion_scores

{'anger': [0.06413356214761734,
  0.612618625164032,
  0.06413356214761734,
  0.35148370265960693,
  0.0814124196767807,
  0.23222501575946808,
  0.5381841063499451,
  0.06413356214761734,
  0.3006702959537506,
  0.06413356214761734],
 'disgust': [0.2735912799835205,
  0.3482849597930908,
  0.10400660336017609,
  0.15072238445281982,
  0.1844952255487442,
  0.727174699306488,
  0.15585507452487946,
  0.10400660336017609,
  0.2794807255268097,
  0.17792712152004242],
 'fear': [0.928168535232544,
  0.9425278306007385,
  0.9723208546638489,
  0.3607065975666046,
  0.09504325687885284,
  0.05136275663971901,
  0.7474278807640076,
  0.4044959247112274,
  0.9155239462852478,
  0.05136275663971901],
 'joy': [0.9327973127365112,
  0.7044211030006409,
  0.7672379612922668,
  0.2518810033798218,
  0.040564361959695816,
  0.04337586089968681,
  0.8725655674934387,
  0.040564361959695816,
  0.040564361959695816,
  0.040564361959695816],
 'sadness': [0.6462153196334839,
  0.887939453125,
  0.549477