# Load Dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import shutil
import zipfile
import os

file_name = 'Dataset_TXA.zip'

# copying the dataset to the temporary workspace
print('Copying Dataset to temporary workspace...')
shutil.copyfile('drive/MyDrive/Dataset_TXA.zip', 'Dataset_TXA.zip')
print('Copied...')

# unzipping the dataset
path = 'Dataset'
with zipfile.ZipFile(file_name, 'r') as zip_ref:
    zip_ref.extractall(path)

#moving inside the Dataset folder
os.chdir(path)
os.getcwd()

Mounted at /content/drive
Copying Dataset to temporary workspace...
Copied...


'/content/Dataset'

# General Preprocessing

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('gr_training_set.csv', usecols = ['book_id', 'review_text', 'genre'])

df['n_review_per_book'] = df.groupby('book_id')['book_id'].transform(len)
average_rev = int(df[['book_id', 'n_review_per_book']].drop_duplicates(keep = 'first')['n_review_per_book'].mean())

keep_idx_bool = df['n_review_per_book'] <= average_rev
indexes_to_keep = df[keep_idx_bool].index
index_sampled = df[~keep_idx_bool].groupby('book_id').sample(average_rev, random_state = 42).index
new_indexes = sorted(indexes_to_keep.tolist()+index_sampled.tolist())

df = df.loc[new_indexes].reset_index(drop=True)

#Dropping NaN values
df.dropna(subset=['genre'], inplace=True)

# made it slightly more readable
genres_corresp = ((("Fantasy", "Superheroes", "Shapeshifters", "Science Fiction Fantasy"), #-->
                   ("Fantasy")),
                  (("Romance", "Erotica", "Polyamorous", "Category Romance"), #-->
                   ("Romance")),
                  (("Fiction", "Young Adult", "New Adult", "Womens Fiction", "Adult Fiction", 
                    "Christian Fiction", "Realistic Fiction", "Fan Fiction", "Magical Realism"), #-->
                   ("Fiction")),
                  (("Sequential Art", "Music", "Couture"), #-->
                   ("Art")),
                  (("Thriller", "Mystery", "Crime"), #-->
                   ("Thriller")),
                  (("Science Fiction", ), #just add a single comma to not loop in the string -->
                   ("Science Fiction")),
                  (("Horror", "Paranormal", "Dark", "Suspense"), #-->
                   ("Horror")),
                  (("Classics", "Contemporary", "Poetry", "Plays"), #-->
                   ("Literature")),
                  (("Nonfiction", "Autobiography", "Biography"), #-->
                   ("NonFiction")),
                  (("Historical", "History", "War", "Mythology"), #-->
                   ("History")))

genres_to_keep_dict = {k : v for ks, v in genres_corresp for k in ks}
df = df[df.genre.isin(genres_to_keep_dict.keys())].reset_index(drop=True)
df.genre = df.genre.map(lambda genere: genres_to_keep_dict[genere])

n = 2000
genres_w_more_than_n_reviews = df.groupby('genre')['genre'].transform(len) >= n
df = pd.concat((df[genres_w_more_than_n_reviews].groupby('genre').sample(n=n, random_state = 42), 
                df[~genres_w_more_than_n_reviews]), ignore_index=True)
train_df = df[['review_text', 'genre']]
del df
print(f'Number of Rows: {len(train_df)}')

Number of Rows: 20000


In [3]:
# loading and preprocessing the test set
test_df = pd.read_csv('gr_test_set.csv', usecols=['review_text', 'genre']).dropna().reset_index(drop=True)
test_df = test_df[test_df.genre.isin(genres_to_keep_dict.keys())].reset_index(drop=True)
test_df.genre = test_df.genre.map(lambda genere: genres_to_keep_dict[genere])

print(f'Number of Rows: {len(test_df)}')

Number of Rows: 373300


# Preprocessing for non pre-trained models

In [4]:
!pip install swifter
!pip install wordcloud
!pip install gensim
!pip install transformers
!pip install spacy
!pip install umap-learn
!spacy download en_core_web_sm

# to remove the output of the installation
from IPython.display import clear_output
clear_output(wait=True)
print('Set up complete')

Set up complete


In [5]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import swifter

default_params = mpl.rcParamsDefault

import re
import time
from collections import Counter

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# Pos tagging correspondence
from nltk.corpus import wordnet

# Stop words
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
# Compute bigrams.
from gensim.models import Phrases
# Utility to compute dictionary
from gensim.corpora import Dictionary

# Ner visualization
import spacy
from spacy import displacy

import torch

from wordcloud import WordCloud
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Topic Distribution with UMAP and tSNE
from yellowbrick.text import UMAPVisualizer
from yellowbrick.text import TSNEVisualizer

from sklearn.metrics import classification_report

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

stop_words = set(STOPWORDS)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [6]:
# cleaning different patterns
def clean_tokens(tokens):
    tokens = re.sub(r'[^\w\s]', '', tokens)
    tokens = re.sub(r'<.*?>', '', tokens) #removing HTMLS
    tokens = re.sub(r"\\", "", tokens) #removing \ character
    tokens = re.sub(r"\n", "", tokens)
    tokens = re.sub(r"http\S+", "", tokens) #removing HTTPS
    tokens = re.sub(r'\b\d+\b', '', tokens) #removing numbers
    
    return tokens.strip().lower()

In [7]:
#cleaning stopwords and words containing non alphanumeric characters
def remove_stop_words(column, pos = False):
    """
    It removes the stop words from a column or an iterable of tokens, 
    the pos parameter specify if the pos is present in the iterable.
    e.g. pos == True iff [(holy, JJ), (crap, NN), (awesome, NN) <-- [0] word, [1] pos

    """

    if pos == False:
        return [[token for token in review_text if (token not in stop_words) and token.isalpha()] for review_text in column]
    else: #token[0] because it is assumed the token is the first position of the tuple
        return [[token for token in review_text if (token[0] not in stop_words) and token[0].isalpha()] for review_text in column]

In [8]:
# to interpret the postag to wordnet lexicon

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [9]:
def lemmatize_row(row):
    lemmatizer = WordNetLemmatizer()
    lemmatized_row = list()
    
    for token, pos in row:
        if pos_mapper[pos] is None:
            lemma = lemmatizer.lemmatize(token)
        else:
            lemma = lemmatizer.lemmatize(token, pos = pos_mapper[pos])

        lemmatized_row.append(lemma) # we discarded the pos here

    return lemmatized_row

In [10]:
#cleaning review_text col
train_df.review_text = train_df.review_text.swifter.apply(lambda x: clean_tokens(x))
train_df["tokenized_text"] = train_df.review_text.swifter.apply(lambda x: word_tokenize(x))
train_df["postagged_text"] = train_df.tokenized_text.swifter.apply(lambda x: nltk.pos_tag(x))

list_of_tags = train_df.postagged_text.swifter.apply(lambda x: [el[1] for el in x if len(el) == 2]).tolist()
set_of_tags = set()

for ls in list_of_tags:
    for tag in ls:
        set_of_tags.add(tag)

pos_mapper = {k : get_wordnet_pos(k) for k in set_of_tags}

train_df["tokenized_text"] = remove_stop_words(train_df["tokenized_text"], pos = False)
train_df["postagged_text"] = remove_stop_words(train_df["postagged_text"], pos = True)
train_df["tokenized_text"] = train_df.tokenized_text.map(lambda x: [el for el in x if len(el) > 2])
train_df["postagged_text"] = train_df.postagged_text.map(lambda x: [el for el in x if len(el[0]) > 2])
train_df['lemmatized_text'] = train_df.postagged_text.swifter.apply(lambda x: lemmatize_row(x))

Pandas Apply:   0%|          | 0/20000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/20000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/20000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/20000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/20000 [00:00<?, ?it/s]

In [11]:
train_df

Unnamed: 0,review_text,genre,tokenized_text,postagged_text,lemmatized_text
0,governor gets his comeuppance and the gang fin...,Art,"[governor, gets, comeuppance, gang, way, woodb...","[(governor, NN), (gets, VBZ), (comeuppance, NN...","[governor, get, comeuppance, gang, way, woodbu..."
1,i originally read these graphic novels back in...,Art,"[originally, read, graphic, novels, got, delux...","[(originally, RB), (read, VB), (graphic, JJ), ...","[originally, read, graphic, novel, get, deluxe..."
2,i like that i hear all the character voices as...,Art,"[like, hear, character, voices, read, honestly...","[(like, IN), (hear, VBP), (character, NN), (vo...","[like, hear, character, voice, read, honestly,..."
3,i an not accurately describe how beautiful the...,Art,"[accurately, beautiful, illustrations, colors,...","[(accurately, RB), (beautiful, JJ), (illustrat...","[accurately, beautiful, illustration, color, b..."
4,all things must come to an end and unfortunate...,Art,"[things, come, end, unfortunately, sandman, bo...","[(things, NNS), (come, VB), (end, NN), (unfort...","[thing, come, end, unfortunately, sandman, boo..."
...,...,...,...,...,...
19995,continuing in the spirit of rereading i am rev...,Thriller,"[continuing, spirit, rereading, revisting, chr...","[(continuing, VBG), (spirit, NN), (rereading, ...","[continue, spirit, reread, revisting, christie..."
19996,edit just finished this one blew me away revi...,Thriller,"[edit, finished, blew, away, review, come, ant...","[(edit, NN), (finished, VBN), (blew, VBD), (aw...","[edit, finish, blow, away, review, come, antic..."
19997,spoiler alert major spoiler ...,Thriller,"[spoiler, alert, major, spoiler, seriously, vi...","[(spoiler, NN), (alert, NN), (major, JJ), (spo...","[spoiler, alert, major, spoiler, seriously, vi..."
19998,okay wow when i saw a review pop up on my fee...,Thriller,"[okay, wow, saw, review, pop, feed, khan, wasn...","[(okay, NN), (wow, NN), (saw, VBD), (review, N...","[okay, wow, saw, review, pop, feed, khan, wasn..."


# BERT

## Preprocessing

In [12]:
from IPython.display import clear_output

!pip install transformers datasets evaluate
clear_output(wait=True)
_

Unnamed: 0,review_text,genre,tokenized_text,postagged_text,lemmatized_text
0,governor gets his comeuppance and the gang fin...,Art,"[governor, gets, comeuppance, gang, way, woodb...","[(governor, NN), (gets, VBZ), (comeuppance, NN...","[governor, get, comeuppance, gang, way, woodbu..."
1,i originally read these graphic novels back in...,Art,"[originally, read, graphic, novels, got, delux...","[(originally, RB), (read, VB), (graphic, JJ), ...","[originally, read, graphic, novel, get, deluxe..."
2,i like that i hear all the character voices as...,Art,"[like, hear, character, voices, read, honestly...","[(like, IN), (hear, VBP), (character, NN), (vo...","[like, hear, character, voice, read, honestly,..."
3,i an not accurately describe how beautiful the...,Art,"[accurately, beautiful, illustrations, colors,...","[(accurately, RB), (beautiful, JJ), (illustrat...","[accurately, beautiful, illustration, color, b..."
4,all things must come to an end and unfortunate...,Art,"[things, come, end, unfortunately, sandman, bo...","[(things, NNS), (come, VB), (end, NN), (unfort...","[thing, come, end, unfortunately, sandman, boo..."
...,...,...,...,...,...
19995,continuing in the spirit of rereading i am rev...,Thriller,"[continuing, spirit, rereading, revisting, chr...","[(continuing, VBG), (spirit, NN), (rereading, ...","[continue, spirit, reread, revisting, christie..."
19996,edit just finished this one blew me away revi...,Thriller,"[edit, finished, blew, away, review, come, ant...","[(edit, NN), (finished, VBN), (blew, VBD), (aw...","[edit, finish, blow, away, review, come, antic..."
19997,spoiler alert major spoiler ...,Thriller,"[spoiler, alert, major, spoiler, seriously, vi...","[(spoiler, NN), (alert, NN), (major, JJ), (spo...","[spoiler, alert, major, spoiler, seriously, vi..."
19998,okay wow when i saw a review pop up on my fee...,Thriller,"[okay, wow, saw, review, pop, feed, khan, wasn...","[(okay, NN), (wow, NN), (saw, VBD), (review, N...","[okay, wow, saw, review, pop, feed, khan, wasn..."


In [13]:
from datasets import Dataset, DatasetDict, Value, Features, ClassLabel
import numpy as np

class_names = list(set(train_df['genre']))
class_labels = ClassLabel(names=class_names)
features = Features({'text': Value('string'), 'label': class_labels})

In [14]:
train_df = train_df[['review_text', 'genre']]
training_df = train_df.rename(columns={"review_text": "text", "genre": "label"})
training_set = Dataset.from_pandas(training_df[['text','label']], preserve_index = False, features = features)
training_set

Dataset({
    features: ['text', 'label'],
    num_rows: 20000
})

In [15]:
# 70% train, 30% test + validation
training_set = training_set.train_test_split(test_size=0.3)

In [16]:
training_set

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 14000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 6000
    })
})

In [17]:
#setting up the test set
test_df = test_df.rename(columns={"review_text": "text", "genre": "label"})

test_set = Dataset.from_pandas(test_df[['text','label']], preserve_index = False, features = features)
test_set

Dataset({
    features: ['text', 'label'],
    num_rows: 373300
})

In [18]:
# gather everyone if you want to have a single DatasetDict
dataset = DatasetDict({
    'train': training_set['train'],
    'valid': training_set['test'],
    'test': test_set})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 14000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 373300
    })
})

In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/374 [00:00<?, ?ba/s]

## Preparing the model and evaluation

In [20]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", 
                                                           num_labels=len(set(dataset['train']['label'])))

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [21]:
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 
            "f1 weighted": f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]}


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

## Set up the training

In [22]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"output_dir", 
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3
)

In [23]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
    compute_metrics=compute_metrics,
)

## Training the model

In [24]:
trainer.train()

***** Running training *****
  Num examples = 14000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10500
  Number of trainable parameters = 108317962
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Accuracy,F1 weighted
1,1.6233,1.514186,0.499333,0.495889
2,1.2784,1.388887,0.544667,0.545033
3,0.8384,1.565959,0.571167,0.572688


***** Running Evaluation *****
  Num examples = 6000
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to output_dir/checkpoint-3500
Configuration saved in output_dir/checkpoint-3500/config.json
Model weights saved in output_dir/checkpoint-3500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 6000
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to output_dir/checkpoint-7000
Configuration saved in output_dir/checkpoint-7000/config.json
Model weights saved in output_dir/checkpoint

TrainOutput(global_step=10500, training_loss=1.390285606747582, metrics={'train_runtime': 5087.8815, 'train_samples_per_second': 8.255, 'train_steps_per_second': 2.064, 'total_flos': 1.1051458080768e+16, 'train_loss': 1.390285606747582, 'epoch': 3.0})

## Evaluating the model

In [None]:
trainer.evaluate(tokenized_datasets['test'])

***** Running Evaluation *****
  Num examples = 373300
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
