In [1]:
import json
import pandas as pd

In [2]:
from torch import cuda
# Select the compute device name: 'cuda' when torch reports a usable CUDA
# runtime, otherwise fall back to 'cpu'.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the transformed books table that still carries the title, raw and
# cleaned description, and the one-hot genre columns.
# The path previously contained a doubled separator ('dataset//transformed');
# it is normalised here so it matches the other dataset path in this notebook.
books_out_df = pd.read_csv('../dataset/transformed/books_final_out.csv')
books_out_df.head(1)

Unnamed: 0,title,description,clean_description,genres,weighted_score,Fiction,Romance,Nonfiction,Children's,Young Adult,...,Crime,Thriller,Fantasy,Science Fiction,Horror,Drama,Poetry,Art,Humor,Religion
0,Harry Potter and the Sorcerer's Stone,Harry Potter's life is miserable. His parents ...,harry potter life miserable parent dead he stu...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",2.388,1,0,0,1,1,...,0,0,1,0,0,0,0,0,0,0


In [5]:
# Load the slimmer transformed table (cleaned description + genre indicator
# columns) and preview its first two rows.
books_df = pd.read_csv(
    filepath_or_buffer="../dataset/transformed/books_final_trans.csv",
)
books_df.head(n=2)

Unnamed: 0,clean_description,genres,Fiction,Romance,Nonfiction,Children's,Young Adult,Teen,Mystery,Crime,Thriller,Fantasy,Science Fiction,Horror,Drama,Poetry,Art,Humor,Religion
0,harry potter life miserable parent dead he stu...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0
1,winning mean fame fortune losing mean certain ...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",1,1,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0


In [6]:
# Read the genre catalogue and keep the ordered list of genre names that
# defines the label columns / label-vector positions used below.
# A context manager guarantees the file handle is closed after parsing
# (the previous `json.load(open(...))` left it open until garbage collection).
with open("../dataset/book_genres.json", "r") as genres_file:
    book_genres = json.load(genres_file)

# NOTE(review): the last catalogue entry is deliberately skipped by `[:-1]`;
# the reason is not visible in this notebook — confirm against the JSON file.
major_genres = [x['genre'] for x in book_genres[:-1]]
major_genres, len(major_genres)

(['Fiction',
  'Romance',
  'Nonfiction',
  "Children's",
  'Young Adult',
  'Teen',
  'Mystery',
  'Crime',
  'Thriller',
  'Fantasy',
  'Science Fiction',
  'Horror',
  'Drama',
  'Poetry',
  'Art',
  'Humor',
  'Religion'],
 17)

In [7]:
# Work on a copy so the frame loaded from disk stays untouched, then attach a
# 'labels' column holding one list per book: the values of the genre indicator
# columns, in the order given by `major_genres`.
books_with_label_df = books_out_df.copy()

# Select the genre columns once and convert the whole block to nested lists,
# instead of building a Series and running a Python lambda for every row.
# Wrapping in a Series with the frame's own index keeps one list per row.
books_with_label_df['labels'] = pd.Series(
    books_with_label_df[major_genres].to_numpy().tolist(),
    index=books_with_label_df.index,
)
books_with_label_df.head(2)

Unnamed: 0,title,description,clean_description,genres,weighted_score,Fiction,Romance,Nonfiction,Children's,Young Adult,...,Thriller,Fantasy,Science Fiction,Horror,Drama,Poetry,Art,Humor,Religion,labels
0,Harry Potter and the Sorcerer's Stone,Harry Potter's life is miserable. His parents ...,harry potter life miserable parent dead he stu...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",2.388,1,0,0,1,1,...,0,1,0,0,0,0,0,0,0,"[1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,The Hunger Games,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,winning mean fame fortune losing mean certain ...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",2.274822,1,1,0,1,1,...,0,1,1,0,0,0,0,0,0,"[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."


In [8]:
# Prefix each cleaned description with the book title to form the model input.
# The right-hand side reads the description from the untouched `books_out_df`
# (same index as its copy) rather than from the column being overwritten, so
# re-running this cell no longer prepends the title a second time.
books_with_label_df['clean_description'] = (
    books_with_label_df['title'] + ' ' + books_out_df['clean_description']
)
books_with_label_df.head(2)

Unnamed: 0,title,description,clean_description,genres,weighted_score,Fiction,Romance,Nonfiction,Children's,Young Adult,...,Thriller,Fantasy,Science Fiction,Horror,Drama,Poetry,Art,Humor,Religion,labels
0,Harry Potter and the Sorcerer's Stone,Harry Potter's life is miserable. His parents ...,Harry Potter and the Sorcerer's Stone harry po...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",2.388,1,0,0,1,1,...,0,1,0,0,0,0,0,0,0,"[1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,The Hunger Games,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,The Hunger Games winning mean fame fortune los...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",2.274822,1,1,0,1,1,...,0,1,1,0,0,0,0,0,0,"[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."


In [9]:
import math

# Fraction of rows that go to the training split.
TRAIN_FRACTION = 0.7

# Size the split from the frame that is actually sliced into train/test
# (`books_with_label_df`). It was previously derived from `books_df`, which is
# loaded from a different CSV and is not guaranteed to have the same row count.
train_size = math.floor(len(books_with_label_df) * TRAIN_FRACTION)

In [10]:
# Positional split: the first `train_size` rows become the training set and
# the remainder the test set. Only the model input text and the label vector
# are kept, and each split is an independent copy.
split_columns = ['clean_description', 'labels']
books_t_train = books_with_label_df.iloc[:train_size][split_columns].copy()
books_t_test = books_with_label_df.iloc[train_size:][split_columns].copy()

In [11]:
# Sanity check: show the label vector of the first test row.
books_t_test['labels'].iloc[0]

[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [12]:
# Fine-tune a RoBERTa multi-label classifier on the training split and
# evaluate it on the held-out split.

# Keep the transformers library quiet except for warnings and errors.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# The model is fed frames with the columns "text" and "labels". `set_axis`
# returns new frames, so `books_t_train` / `books_t_test` are no longer
# renamed in place through an alias and this cell can be re-run safely.
train_df = books_t_train.set_axis(["text", "labels"], axis=1)
eval_df = books_t_test.set_axis(["text", "labels"], axis=1)

# Optional model configuration.
# `manual_seed` fixes the RNG seeds so repeated runs are comparable.
model_args = MultiLabelClassificationArgs(
    num_train_epochs=5,
    overwrite_output_dir=True,
    manual_seed=42,
)

# Create a MultiLabelClassificationModel with one output per major genre.
# `use_cuda` follows the `device` detected earlier in the notebook; the
# library default (use_cuda=True) fails on machines without CUDA.
model = MultiLabelClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=len(major_genres),
    args=model_args,
    use_cuda=(device == "cuda"),
)

# Train the model
model.train_model(train_df)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(
    eval_df
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
56it [00:33,  1.67it/s]                        
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/5. Running Loss:    0.2289: 100%|██████████| 3444/3444 [07:01<00:00,  8.17it/s]
Epochs 2/5. Running Loss:    0.1473: 100%|██████████| 3444/3444 [07:03<00:00,  8.13it/s]
Epochs 3/5. Running Loss:    0.

In [13]:
# Display the evaluation metrics returned by `model.eval_model`.
result

{'LRAP': np.float64(0.9143690798778307), 'eval_loss': 0.26405196367692546}

In [None]:
# NOTE(review): unexecuted repeat of the `result` display in the cell above —
# candidate for removal.
result

In [14]:
import pickle

# Persist the trained model object to disk.
# The context manager flushes and closes the file deterministically; the
# previous `pickle.dump(model, open(...))` never closed the handle, which can
# leave a truncated file if the kernel dies before garbage collection.
# NOTE: only unpickle this file from a trusted location — loading a pickle
# executes arbitrary code.
with open('model-v33.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [15]:
# Sample book blurb used as a manual smoke test for the trained classifier.
# Written as adjacent string literals (one per sentence) purely for
# readability; the resulting string is a single line of text.
test_desc = (
    "John Form has found the perfect gift for his expectant wife, Mia - a beautiful, rare vintage doll in a pure white wedding dress. "
    "But Mia's delight with Annabelle doesn't last long. "
    "On one horrific night, their home is invaded by members of a satanic cult, who violently attack the couple. "
    "Spilled blood and terror are not all they leave behind. "
    "The cultists have conjured an entity so malevolent that nothing they did will compare to the sinister conduit to the damned that is now... Annabelle"
)

In [16]:
# Run inference on the single sample description (passed as a one-element
# list); the call's result is unpacked into `prediction` and `raw_outputs`.
prediction, raw_outputs = model.predict([test_desc])

1it [00:05,  5.40s/it]
  with amp.autocast():
100%|██████████| 1/1 [00:00<00:00,  8.90it/s]


In [None]:
# NOTE(review): unexecuted repeat of the prediction call in the cell above —
# candidate for removal.
prediction, raw_outputs = model.predict([test_desc])

In [17]:
# Translate the first prediction's per-genre flags back into genre names by
# pairing each name with the flag at the same position.
[genre for genre, is_predicted in zip(major_genres, prediction[0]) if is_predicted]

['Fiction', "Children's", 'Mystery', 'Crime', 'Thriller', 'Fantasy', 'Horror']