In [1]:
import pandas as pd
books = pd.read_csv("books_with_categories.csv")

In [6]:
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
classifier("I love this book!")

[[{'label': 'joy', 'score': 0.9771487712860107},
  {'label': 'surprise', 'score': 0.010831079445779324},
  {'label': 'neutral', 'score': 0.004581673536449671},
  {'label': 'anger', 'score': 0.0037370002828538418},
  {'label': 'sadness', 'score': 0.002046084962785244},
  {'label': 'disgust', 'score': 0.0012559964088723063},
  {'label': 'fear', 'score': 0.0003995068836957216}]]

In [7]:
classifier(books.description[0])

[[{'label': 'fear', 'score': 0.6548408269882202},
  {'label': 'neutral', 'score': 0.16985218226909637},
  {'label': 'sadness', 'score': 0.11640916764736176},
  {'label': 'surprise', 'score': 0.020700693130493164},
  {'label': 'disgust', 'score': 0.01910068467259407},
  {'label': 'joy', 'score': 0.015161366201937199},
  {'label': 'anger', 'score': 0.003935147542506456}]]

In [8]:
#since description might have sentences that could convey different meanings run the prediction on each sentence
classifier(books.description[0].split("."))

[[{'label': 'surprise', 'score': 0.7296027541160583},
  {'label': 'neutral', 'score': 0.14038535952568054},
  {'label': 'fear', 'score': 0.06816219538450241},
  {'label': 'joy', 'score': 0.047942645847797394},
  {'label': 'anger', 'score': 0.009156349115073681},
  {'label': 'disgust', 'score': 0.002628469141200185},
  {'label': 'sadness', 'score': 0.002122162841260433}],
 [{'label': 'neutral', 'score': 0.449370801448822},
  {'label': 'disgust', 'score': 0.2735905647277832},
  {'label': 'joy', 'score': 0.10908355563879013},
  {'label': 'sadness', 'score': 0.09362763166427612},
  {'label': 'anger', 'score': 0.040478263050317764},
  {'label': 'surprise', 'score': 0.02697017416357994},
  {'label': 'fear', 'score': 0.00687905540689826}],
 [{'label': 'neutral', 'score': 0.6462150812149048},
  {'label': 'sadness', 'score': 0.24273422360420227},
  {'label': 'disgust', 'score': 0.04342268407344818},
  {'label': 'surprise', 'score': 0.028300533071160316},
  {'label': 'joy', 'score': 0.0142114646

In [10]:
sentences = books.description[0].split(".")
predictions = classifier(sentences)

In [13]:
sentences[3], predictions[3]

(' Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist',
 [{'label': 'fear', 'score': 0.9281689524650574},
  {'label': 'anger', 'score': 0.032190579921007156},
  {'label': 'neutral', 'score': 0.012808583676815033},
  {'label': 'sadness', 'score': 0.008756810799241066},
  {'label': 'surprise', 'score': 0.008597836829721928},
  {'label': 'disgust', 'score': 0.008431733585894108},
  {'label': 'joy', 'score': 0.001045580138452351}])

In [14]:
sorted(predictions[3], key=lambda x: x['label'])

[{'label': 'anger', 'score': 0.032190579921007156},
 {'label': 'disgust', 'score': 0.008431733585894108},
 {'label': 'fear', 'score': 0.9281689524650574},
 {'label': 'joy', 'score': 0.001045580138452351},
 {'label': 'neutral', 'score': 0.012808583676815033},
 {'label': 'sadness', 'score': 0.008756810799241066},
 {'label': 'surprise', 'score': 0.008597836829721928}]

In [15]:
import numpy as np

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        sorted_predictions = sorted(prediction, key=lambda x: x['label'])
        for index, label in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_predictions[index]['score'])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [16]:
for i in range(10):
    isbn.append(books.isbn13[i])
    sentences = books.description[i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

In [18]:
emotion_scores['anger'], emotion_scores['disgust']

([0.06413384526968002,
  0.6126174926757812,
  0.06413384526968002,
  0.3514835238456726,
  0.08141230791807175,
  0.23222479224205017,
  0.5381848812103271,
  0.06413384526968002,
  0.30066993832588196,
  0.06413384526968002],
 [0.2735905647277832,
  0.34828636050224304,
  0.10400692373514175,
  0.15072260797023773,
  0.18449553847312927,
  0.7271748781204224,
  0.15585485100746155,
  0.10400692373514175,
  0.27948155999183655,
  0.1779262125492096])

In [19]:
from tqdm import tqdm

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(books))):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 5197/5197 [11:54<00:00,  7.27it/s]


In [20]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df['isbn13'] = isbn
emotions_df

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,isbn13
0,0.064134,0.273591,0.928169,0.932798,0.646215,0.967158,0.729603,9780002005883
1,0.612617,0.348286,0.942528,0.704423,0.887940,0.111690,0.252546,9780002261982
2,0.064134,0.104007,0.972321,0.767238,0.549476,0.111690,0.078766,9780006178736
3,0.351484,0.150723,0.360706,0.251882,0.732686,0.111690,0.078766,9780006280897
4,0.081412,0.184496,0.095043,0.040565,0.884390,0.475880,0.078766,9780006280934
...,...,...,...,...,...,...,...,...
5192,0.148209,0.030643,0.919165,0.255171,0.853721,0.980877,0.030656,9788172235222
5193,0.064134,0.114383,0.051363,0.400263,0.883198,0.111690,0.227765,9788173031014
5194,0.009997,0.009929,0.339217,0.947779,0.375754,0.066685,0.057625,9788179921623
5195,0.064134,0.104007,0.459271,0.759456,0.951104,0.368111,0.078766,9788185300535


In [21]:
books = pd.merge(books, emotions_df, on = "isbn13")

In [22]:
books.to_csv("books_with_emotions.csv", index = False)