In [294]:
import pandas as pd
import numpy as np
import zipfile
import torch
from os.path import exists
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm

# Module-level tokenizer for the 'bert-base-cased' checkpoint; the Dataset class
# defined later in this notebook reads it as a global.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

loading file vocab.txt from cache at /Users/max/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/max/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tokenizer_config.json
loading configuration file config.json from cache at /Users/max/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_

In [295]:
# Load the plot/genre table from whichever artefact is available locally.
prep_dataset = None
if exists('merged_dataset.csv'):
    # Merged dump: drop the columns that are not used downstream.
    dataset = pd.read_csv('merged_dataset.csv').drop(columns=['Plot', 'Title', 'Unnamed: 0'])
    prep_dataset = dataset
elif exists('prepared.zip'):
    # Pre-processed archive: unpack it next to the notebook, then read the CSV.
    with zipfile.ZipFile('prepared.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
    prep_dataset = pd.read_csv('prepared.csv')
else:
    # Fail with an explicit message instead of an AttributeError on None below.
    raise FileNotFoundError(
        "Neither 'merged_dataset.csv' nor 'prepared.zip' was found in the working directory"
    )

# 'Genre' is a comma-separated string; turn every cell into a list of tags.
# NOTE: whitespace around the commas is preserved, so every tag after the first
# keeps its leading space (e.g. ' horror'); the label-encoding cell relies on that.
# `.str.strip()` cannot be used afterwards because the cells are lists, not strings.
prep_dataset['Genre'] = prep_dataset['Genre'].str.split(',')

In [296]:
# Preview the first rows; 'Genre' is shown as a list of tags per row.
prep_dataset.head()

Unnamed: 0.1,Unnamed: 0,Genre,Processed plot
0,0,"[cult, horror, gothic, murder, atmospheric]",note : thi synopsi is for the orgin italian re...
1,1,[violence],"two thousand year ago , nhagruul the foul , a ..."
2,2,[romantic],"matuschek ' s , a gift store in budapest , is ..."
3,3,"[inspiring, romantic, stupid, feel-good]","glenn holland , not a morn person by anyon ' s..."
4,4,"[cruelty, murder, dramatic, cult, violence...","in may 1980 , a cuban man name toni montana ( ..."


In [297]:
# Turn each element of the 'Genre' list into its own row; the other columns
# (and the original index value) are repeated for every genre of a plot.
# NOTE: this rebinds prep_dataset, so the cell is meant to be run once per load.
prep_dataset = prep_dataset.explode("Genre")
prep_dataset.head()

Unnamed: 0.1,Unnamed: 0,Genre,Processed plot
0,0,cult,note : thi synopsi is for the orgin italian re...
0,0,horror,note : thi synopsi is for the orgin italian re...
0,0,gothic,note : thi synopsi is for the orgin italian re...
0,0,murder,note : thi synopsi is for the orgin italian re...
0,0,atmospheric,note : thi synopsi is for the orgin italian re...


In [298]:
# Number of distinct values in 'Genre' (a missing value, if present, counts as one).
unique_genres_count = len(prep_dataset['Genre'].unique())

In [299]:
# Report the distinct-genre count (the message text is in Russian).
print(f'Количество уникальных жанров в датасете: {unique_genres_count}')

Количество уникальных жанров в датасете: 1754


In [300]:
# Number of rows currently in prep_dataset (displayed as the cell output).
prep_dataset.shape[0]

175321

In [301]:
# Rows with missing values cannot be encoded or tokenized.
prep_dataset = prep_dataset.dropna()

# Genre string -> integer class id, numbered in order of first appearance.
# Building the mapping once replaces a linear np.where scan per row.
labels_to_codes = {
    label: code for code, label in enumerate(prep_dataset['Genre'].unique())
}


def get_label_index(label: str) -> int:
    """Return the integer class id assigned to ``label``.

    The lookup goes through ``labels_to_codes`` so the function keeps working
    after ``labels`` is rebound to the dict at the end of this cell.

    Raises:
        KeyError: if ``label`` is not one of the genres seen in the dataset.
    """
    return labels_to_codes[label]


# Sanity check tied to the current data: tags after the first keep the leading
# space left by the comma split, and ' horror' is the second genre encountered.
assert 1 == get_label_index(' horror'), 'get_label_index doesn\'t work as expected'

prep_dataset['GenreCoded'] = prep_dataset['Genre'].map(labels_to_codes)
# 'Unnamed: 0' is already gone when the data came from merged_dataset.csv,
# so a missing column must not abort the cell.
prep_dataset = prep_dataset.drop(columns=['Unnamed: 0'], errors='ignore')
# explode() left duplicated index values; renumber the rows 1..N.
prep_dataset.index = range(1, len(prep_dataset) + 1)

# From here on `labels` is the genre -> class id dictionary.
labels = labels_to_codes

In [302]:
print(f'Количество уникальных жанров: {len(labels)}')

Количество уникальных жанров: 1753


##### Объявляем датасет PyTorch для работы с подготовленными данными

In [303]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized plots with integer genre codes.

    Reads the module-level ``labels`` mapping (genre -> code) and the
    module-level ``tokenizer``. All texts are tokenized eagerly in ``__init__``.
    """

    def __init__(self, df):
        """Encode every row of ``df``.

        ``df`` must provide a 'Genre' column (keys of ``labels``) and a
        'Processed plot' column (text passed to the tokenizer).
        """
        genre_codes = []
        for genre in df['Genre']:
            genre_codes.append(labels[genre])
        self.labels = genre_codes

        encoded_plots = []
        for plot in df['Processed plot']:
            # Pad/truncate every plot to 512 tokens and return PyTorch tensors.
            encoded = tokenizer(
                plot,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            encoded_plots.append(encoded)
        self.texts = encoded_plots

    def classes(self):
        """Return the list of integer labels, one per sample."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, label_array)`` for ``idx``."""
        features = self.get_batch_texts(idx)
        target = self.get_batch_labels(idx)
        return features, target

##### Разбиваем исходные данные на обучающую, валидационную и тестовую выборки

In [304]:
np.random.seed(112)
# Shuffle once with a fixed seed, then cut at the 80% and 90% marks to obtain
# train / validation / test. Plain positional slicing is used instead of
# np.split, which goes through the deprecated DataFrame.swapaxes.
shuffled = prep_dataset.sample(frac=1, random_state=42)
train_end, val_end = int(.8 * len(prep_dataset)), int(.9 * len(prep_dataset))
df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]
print(len(df_train),len(df_val), len(df_test))
df_test.head()


138505 17313 17314


Unnamed: 0,Genre,Processed plot,GenreCoded
97505,Music,a new - wave extravaganza in which a young wom...,1744
78644,romantic comedy,the stori revolv around bharathi ( udaya ) and...,171
60047,drama,"chiyo sakamoto ( suzuka ohgo ), a young girl f...",147
1317,cult,the film begin with a flashback set to the fir...,13
157900,Horror,after an african dinosaur ancestor of the croc...,1742


##### Наконец строим простенькую модель

In [305]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: dropout probability applied to the pooled BERT output.
        num_classes: size of the output layer. When omitted it defaults to
            ``len(labels)`` (the module-level genre -> code mapping), so the
            head always covers every class id the dataset can produce.
            The previous fixed size of 5 was smaller than the label set, which
            makes the targets invalid for ``nn.CrossEntropyLoss``.
    """

    def __init__(self, dropout=0.5, num_classes=None):
        super(BertClassifier, self).__init__()
        if num_classes is None:
            num_classes = len(labels)
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the loaded checkpoint instead of assuming 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Return raw class logits, one row per input sequence.

        Args:
            input_id: token ids passed to BERT as ``input_ids``.
            mask: attention mask passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; the second item is the
        # pooled output used for classification.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        # No activation here: nn.CrossEntropyLoss expects unnormalized logits,
        # and a ReLU would clamp every negative logit to zero.
        return self.linear(dropout_output)

##### Цикл обучения

In [306]:
df = prep_dataset

# Pick the best available accelerator instead of assuming Apple's MPS backend:
# MPS first (the original target), then CUDA, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune ``model`` and print per-epoch train/validation metrics.

    Args:
        model: classifier called as ``model(input_id, mask)`` and returning logits.
        train_data: DataFrame wrapped in ``Dataset`` for training.
        val_data: DataFrame wrapped in ``Dataset`` for validation.
        learning_rate: learning rate for the Adam optimizer.
        epochs: number of passes over the training data.
        batch_size: samples per batch for both loaders (default 2, as before).

    The model and the loss are moved to the module-level ``device``.
    The model is left in evaluation mode when the function returns.
    """
    # Distinct names so the datasets do not shadow this function's own name.
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)
    print('train & val dataloaders ready')

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    model.to(device)
    criterion.to(device)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        # Enable dropout for the optimisation pass.
        model.train()
        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device).long()
            # The tokenizer returns (1, L) tensors per sample, so collated
            # batches are (B, 1, L); drop the extra axis on both inputs.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            batch_loss.backward()
            optimizer.step()

            total_loss_train += batch_loss.item()
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

        total_acc_val = 0
        total_loss_val = 0

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device).long()
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        # The criterion already averages within a batch, so the epoch loss is
        # the mean over batches; accuracy is the fraction of correct samples.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataloader): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_dataloader): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  

In [307]:
def evaluate(model, test_data, batch_size=2):
    """Print the accuracy of ``model`` on ``test_data``.

    Args:
        model: classifier called as ``model(input_id, mask)`` and returning logits.
        test_data: DataFrame wrapped in ``Dataset`` for evaluation.
        batch_size: samples per batch (default 2, as before).

    The model is moved to the module-level ``device`` and switched to
    evaluation mode for the pass; its previous train/eval mode is restored
    afterwards.
    """
    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)

    # Inputs are sent to `device`, so the model has to live there as well.
    model.to(device)
    was_training = model.training
    # Dropout must be off, otherwise the reported accuracy is noisy and too low.
    model.eval()

    total_acc_test = 0
    try:
        with torch.no_grad():

            for test_input, test_label in test_dataloader:

                test_label = test_label.to(device)
                # Collated tokenizer outputs are (B, 1, L); drop the extra axis.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                acc = (output.argmax(dim=1) == test_label).sum().item()
                total_acc_test += acc
    finally:
        model.train(was_training)

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [308]:
np.random.seed(112)
# Same 80/10/10 shuffle-and-cut as the earlier split cell, applied to `df`.
# Positional slicing replaces np.split, which relies on the deprecated
# DataFrame.swapaxes when given a DataFrame.
shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(.8 * len(df)), int(.9 * len(df))
df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

print(len(df_train),len(df_val), len(df_test))

138505 17313 17314


In [309]:
# Training hyper-parameters.
EPOCHS = 5
LR = 1e-6

# Build a fresh classifier and fine-tune it on the train/validation split.
model = BertClassifier()
train(model, train_data=df_train, val_data=df_val, learning_rate=LR, epochs=EPOCHS)

loading configuration file config.json from cache at /Users/max/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /Users/max/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/pytorch_model.bin
Some weights of the model

train & val dataloaders ready


  0%|          | 2/69253 [04:34<2669:31:29, 138.77s/it]

In [None]:
# Print the accuracy of the model on the held-out test split.
evaluate(model, df_test)