In [13]:
import pandas as pd
# Load the film dataset (title, synopsis and tag columns) from the local CSV.
df = pd.read_csv('lib/task.csv')

In [14]:
# Show the loaded frame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,Title,Synopsis,Tag
0,0,I tre volti della paura,Note: this synopsis is for the orginal Italian...,cult
1,1,Mitt liv som hund,The action takes place in the years 1958-1959 ...,cult
2,2,The Brood,"At the Somafree Institute, Dr. Hal Raglan humi...",cult
3,3,The Haunted,This creepy and scary story centers around The...,paranormal
4,4,The Frozen Ground,The film opens in an Anchorage motel room in 1...,dramatic
...,...,...,...,...
1561,1561,Chuck & Buck,Buck O'Brien (Mike White) is a 27-year-old ama...,cult
1562,1562,The Manster,American foreign news correspondent Larry Stan...,cult
1563,1563,Le grand bleu,"Two children, Jacques Mayol (Jean-Marc Barr) a...",cult
1564,1564,You're a Big Boy Now,"Bernard Chanticleer (Peter Kastner), called ""B...",cult


In [15]:
# Class balance: how many non-null titles fall under each tag.
titles_by_tag = df.groupby('Tag')['Title']
titles_by_tag.count()

Tag
cult          1033
dramatic       167
paranormal     366
Name: Title, dtype: int64

In [16]:
def len_syn(l):
    """Return the number of characters in a row's ``Synopsis`` field.

    Args:
        l: A mapping-like row (for example the ``pandas.Series`` handed over by
            ``DataFrame.apply(..., axis=1)``) that has a ``'Synopsis'`` entry.

    Returns:
        int: ``len`` of the synopsis when it is a sized object such as a
        string; ``0`` when the value has no length (``None`` or a float NaN
        standing in for missing text).
    """
    synopsis = l['Synopsis']
    try:
        return len(synopsis)
    except TypeError:
        # Missing synopses are not sized objects; count them as empty instead
        # of aborting the whole row-wise apply.
        return 0

# Add a per-row synopsis length column (len_syn is applied to each row), then
# summarise its distribution.
synopsis_lengths = df.apply(len_syn, axis=1)
df["synopsis_length"] = synopsis_lengths
df["synopsis_length"].describe()

count     1566.000000
mean      5219.414432
std       4800.731210
min        781.000000
25%       2587.500000
50%       3909.000000
75%       5890.000000
max      48487.000000
Name: synopsis_length, dtype: float64

In [25]:
# Text of the longest synopsis: mask the rows whose length equals the column
# maximum and take the first match.
is_longest = df["synopsis_length"] == df["synopsis_length"].max()
df.loc[is_longest, "Synopsis"].iloc[0]

'It is November 16th, 1983 in Hawkins, Indiana. Inside Hawkins National Laboratory, a scientist bursts out a door, running from something that isn\'t there. He runs to the elevator, tapping the floor button repeatedly, looking back and forth. He then gets in and taps the button again. He looks forward then up slowly, as he hears and sees something that we can\'t see. He is then pulled up as the elevator doors close, yelling loudly.We then see the Wheeler residence, as four friends; Mike Wheeler, Will Byers, Dustin Henderson, and Lucas Sinclair, play Dungeons & Dragons. We then see that the Demogorgon is played, as Will is pressured to cast Fireball. He rolls the dice but it lands on the floor, with the group scrambling for it. Mike is then called by his mother, Nancy, to end the game. Meanwhile, the trio find it, but it is a 7, as only a 13 or higher can cast a Fireball, which means Will was eaten. Will then leaves, after telling Mike that it was a 7.Will rides his bike through the woo

In [26]:
from transformers import AutoTokenizer

# Tokenizer matching the DistilBERT checkpoint used for classification.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Pick the synopsis whose recorded length equals the column maximum.
is_longest = df.synopsis_length == max(df.synopsis_length)
longest_synopsis = df.loc[is_longest, ["Synopsis"]].values[0][0]

# With return_overflowing_tokens the text past max_length is not dropped: the
# tokenizer returns it as additional chunks of at most 512 tokens each.
chunking_options = dict(
    truncation=True,
    max_length=512,
    return_overflowing_tokens=True,
    return_length=True,
)
outputs = tokenizer(longest_synopsis, **chunking_options)

# Report the number of chunks, the size of each, and the sample each came from.
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 22
Input chunk lengths: [512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 176]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [88]:
# Hold out 10 randomly chosen films that the model never sees. The fixed
# random_state makes the draw repeatable after a kernel restart.
unseen = df.sample(10, random_state=42)

# Keep only the rows whose index label is not in the hold-out.
# NOTE: this rebinds `df`, so running the cell again removes 10 more rows.
held_out = df.index.isin(unseen.index)
df = df.loc[~held_out, :]

In [95]:
# List the titles of the held-out films (printed with their index labels).
print(f"Unseen by the model for you to test:\n {unseen['Title']}")

Unseen by the model for you to test:
 679               Revenge of the Ninja
1070                              Ring
568                           The Blob
790                   Midnight Madness
882                            Admiral
801                        Bob Roberts
756              Rock Around the Clock
1373                        It's Alive
567                         Shark Tale
1437    All Quiet on the Western Front
Name: Title, dtype: object


In [89]:
# Show the working frame again, after the held-out rows were filtered out.
df

Unnamed: 0.1,Unnamed: 0,Title,Synopsis,Tag
0,0,I tre volti della paura,Note: this synopsis is for the orginal Italian...,cult
1,1,Mitt liv som hund,The action takes place in the years 1958-1959 ...,cult
2,2,The Brood,"At the Somafree Institute, Dr. Hal Raglan humi...",cult
3,3,The Haunted,This creepy and scary story centers around The...,paranormal
4,4,The Frozen Ground,The film opens in an Anchorage motel room in 1...,dramatic
...,...,...,...,...
1561,1561,Chuck & Buck,Buck O'Brien (Mike White) is a 27-year-old ama...,cult
1562,1562,The Manster,American foreign news correspondent Larry Stan...,cult
1563,1563,Le grand bleu,"Two children, Jacques Mayol (Jean-Marc Barr) a...",cult
1564,1564,You're a Big Boy Now,"Bernard Chanticleer (Peter Kastner), called ""B...",cult


In [3]:
# Inspect the row at position 0 to see the fields of a single record.
df.iloc[0]

Unnamed: 0                                                    0
Title                                   I tre volti della paura
Synopsis      Note: this synopsis is for the orginal Italian...
Tag                                                        cult
Name: 0, dtype: object

In [97]:
# Count the distinct tags; this is passed to the classifier as num_labels.
distinct_tags = df['Tag'].unique()
n_labels = len(distinct_tags)
print(n_labels)

3


In [4]:
# Encode tags: each distinct tag gets the integer position at which
# `unique()` returns it.
encode_tag = {tag: i for i, tag in enumerate(df['Tag'].unique())}

# Store instances
data = []

# (label, text) pairs already stored. Set membership keeps de-duplication
# linear; comparing each new dict against the whole list rescans every long
# synopsis for every row.
seen = set()

# Walk the two needed columns in step instead of building a Series per row
# with .iloc.
for tag, text in zip(df["Tag"], df["Synopsis"]):
    # Skip rows whose tag or synopsis is missing or empty. A plain truthiness
    # test is not enough because a float NaN is truthy.
    if not (isinstance(tag, str) and isinstance(text, str) and tag and text):
        continue

    label = encode_tag[tag]
    key = (label, text)
    if key in seen:
        continue

    seen.add(key)
    data.append({'label': label, 'text': text})

# Number of unique instances kept.
count_instances = len(data)
print(count_instances)


1553


In [13]:
# Inverse of encode_tag: integer label id -> tag name.
decode_label = {label_id: tag_name for tag_name, label_id in encode_tag.items()}

In [5]:
import numpy as np 

# Pick 10 distinct positions in `data` to hold out for manual testing. A
# seeded Generator makes the selection identical on every run, so the
# train/test membership does not change between kernel restarts.
rng = np.random.default_rng(42)
test = rng.choice(len(data), size=10, replace=False)
test

array([ 845,  873,  173,  499, 1296,  692, 1286,  295,  486, 1262])

In [6]:
# Split `data` by position: the sampled positions form test_set, everything
# else forms train_set.
held_out_positions = {int(position) for position in test}
train_set = [
    instance
    for position, instance in enumerate(data)
    if position not in held_out_positions
]
test_set = [data[position] for position in test]

In [33]:
from datasets import Dataset, DatasetDict
# Split the training instances 80/20 into "train" and "test" subsets.
# The Dataset is split directly: the former DatasetDict wrapper was created
# only to be indexed with ['train'] on the next line. The fixed seed keeps the
# shuffle, and therefore the reported evaluation metrics, repeatable.
ds = Dataset.from_list(train_set).train_test_split(test_size=0.2, seed=42)


In [8]:
# Show the splits with their features and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1234
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 309
    })
})

In [72]:
from transformers import AutoTokenizer

# Tokenize text and truncate sequences
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: A batch handed over by ``Dataset.map(..., batched=True)``;
            its ``"text"`` entry holds the synopses to encode.

    Returns:
        The tokenizer output for the batch, with every synopsis truncated to
        at most 512 tokens (special tokens included).

    Padding is deliberately not applied here. Padding inside a batched map
    pads every example to the longest one in the whole map batch, which stores
    padding in the dataset and makes every training batch full length. The
    DataCollatorWithPadding passed to the Trainer pads each training batch to
    its own longest sequence instead.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        add_special_tokens=True,
    )


tokenized_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/1234 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

In [73]:
# Chunking demo on the first two training synopses: return_overflowing_tokens
# makes the tokenizer return the text past max_length as extra chunks rather
# than dropping it.
first_two_synopses = ds["train"][:2]["text"]
chunking_options = dict(
    truncation=True,
    max_length=512,
    return_overflowing_tokens=True,
    return_length=True,
)
outputs = tokenizer(first_two_synopses, **chunking_options)

# Report the number of chunks, the size of each, and the sample each came from.
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 6
Input chunk lengths: [512, 120, 512, 512, 512, 23]
Chunk mapping: [0, 0, 1, 1, 1, 1]


In [74]:
import os

from transformers import DataCollatorWithPadding

# The tokenizers library reads this setting from the process environment; a
# plain Python variable of the same name has no effect. The variable is kept
# so the chosen value stays visible in one place and is exported from it.
TOKENIZERS_PARALLELISM = True
os.environ["TOKENIZERS_PARALLELISM"] = str(TOKENIZERS_PARALLELISM).lower()

# Dynamically pad the sentences to the longest length in a batch during collation
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [75]:
import evaluate
# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            predictions hold one score per class along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against the reference labels.
    """
    class_scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [76]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the DistilBERT checkpoint with a sequence-classification head sized to
# the number of tags, and register both label mappings on the model.
label_mappings = dict(id2label=decode_label, label2id=encode_tag)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=n_labels,
    **label_mappings,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
# Hyperparameters and bookkeeping options for the fine-tuning run: one epoch,
# evaluated and checkpointed at the end of each epoch.
training_options = dict(
    output_dir="models/bert_film_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=False,
)
training_args = TrainingArguments(**training_options)

# Wire the model, the tokenized splits, the padding collator and the accuracy
# callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

  0%|          | 0/78 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.9105373024940491, 'eval_accuracy': 0.6213592233009708, 'eval_runtime': 6.7718, 'eval_samples_per_second': 45.631, 'eval_steps_per_second': 2.953, 'epoch': 1.0}
{'train_runtime': 101.4515, 'train_samples_per_second': 12.163, 'train_steps_per_second': 0.769, 'train_loss': 0.8444587511894031, 'epoch': 1.0}


TrainOutput(global_step=78, training_loss=0.8444587511894031, metrics={'train_runtime': 101.4515, 'train_samples_per_second': 12.163, 'train_steps_per_second': 0.769, 'train_loss': 0.8444587511894031, 'epoch': 1.0})

In [78]:
# Persist the fine-tuned model through the Trainer. No explicit path is given
# here, so the destination is whatever the Trainer was configured with.
trainer.save_model()

In [79]:
# Show the held-out instances (label id and synopsis text) as the cell output.
test_set

[{'label': 0,
  'text': 'The movie is about three childhood friends, Akash Malhotra (Aamir Khan), Sameer Mulchandani (Saif Ali Khan), and Siddharth "Sid" Sinha (Akshaye Khanna). Akash does not believe in the concept of love and does not engage in relationships lasting more than two weeks. Sameer is a genial, well-meaning, desperately romantic, but confused guy who is prone to romantic infatuations and believes to have found true love whenever he gets attracted to a girl. Sid, an artist by profession and the most mature of the three, is not interested in trivial romances and is dedicated to his work.\nAkash, who is a cad in his personal life, proposes to a girl named Shalini (Preity Zinta) in jest, without being aware that she is engaged to Rohit. He also engineers a breakup between Sameer and his then girlfriend Priya (Suchitra Pillai).\nThe three friends then go to Goa for a vacation. There Sameer falls in love with a foreign lady and while Akash and Sid return, Sameer stays with the 

In [86]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the fine-tuned tokenizer and model from the directory configured as
# the Trainer's output_dir. The bare name "bert_film_classifier" does not
# match that path and would only resolve if a separate copy existed.
MODEL_DIR = "models/bert_film_classifier"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# One held-out instance: show its synopsis and its true label id.
sample = test_set[6]
text = sample['text']
print(text)
print(sample['label'])

# Encode the synopsis the same way as in training (truncated to 512 tokens).
inputs = tokenizer(text,
                   return_tensors="pt",
                   truncation=True,
                   padding=True,
                   max_length=512,
                   add_special_tokens = True)

# Inference only: no gradients are needed.
with torch.no_grad():
    logits = model(**inputs).logits

# Take the argmax over the class dimension rather than over the flattened
# tensor, so the result is a class id by construction.
predicted_class_id = logits.argmax(dim=-1).item()
print('tag: ', model.config.id2label[predicted_class_id], 'label: ', predicted_class_id)

Five years later, a local theater is showing The Dark Beneath for the first time since the murders. The theater's staff, Bridget (Rebekah Brandes), Rachael (Brea Grant), and Kenny (Shaun Ausmus) welcome a small group of customers, including a biker couple, Harley (Stan Ellsworth) and Babe (Melissa Steach), Dr. Wayne and Detective Barrons (Jon Briddell), who both believe Radford will appear, and Bridget's boyfriend Josh (Daniel Bonjour), who is accompanied by his friend Mario (Greg Cirulnick), Mario's girlfriend Samantha (Mandell Maughan) and their awkward friend Sully (Michael Schwartz). Bridget's younger brother Timmy (Justin Baric) also arrives, but is sent home due to his age. As the movie is about to begin, Josh convinces Bridget to allow Kenny to be in charge so she can watch the movie with him.
However, after a while Bridget becomes unsettled by the movie and enters the lobby, where it is revealed she was abused by her father as a child. At this time, Kenny enters the basement to