<a href="https://colab.research.google.com/github/maxt/ds-practise/blob/final/final/final_tuned_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers

In [2]:
import pandas as pd
import numpy as np
import zipfile
import torch
from os.path import exists
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Mounted at /content/drive


Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
prep_dataset = None
if exists('merged_dataset.csv'):
    dataset = pd.read_csv('merged_dataset.csv')
    dataset = dataset.drop(columns=['Plot','Title', 'Unnamed: 0'])
    dataset.head()
    prep_dataset = dataset
elif exists('/content/drive/My Drive/prepared.zip'):
    with zipfile.ZipFile('/content/drive/My Drive/prepared.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
        prep_dataset = pd.read_csv('prepared.csv')
prep_dataset.head()
prep_dataset['Genre'] = prep_dataset['Genre'].str.split(',')
# prep_dataset['Genre'] = prep_dataset['Genre'].str.strip()

In [4]:
prep_dataset.head()

Unnamed: 0.1,Unnamed: 0,Genre,Processed plot
0,0,"[cult, horror, gothic, murder, atmospheric]",note : thi synopsi is for the orgin italian re...
1,1,[violence],"two thousand year ago , nhagruul the foul , a ..."
2,2,[romantic],"matuschek ' s , a gift store in budapest , is ..."
3,3,"[inspiring, romantic, stupid, feel-good]","glenn holland , not a morn person by anyon ' s..."
4,4,"[cruelty, murder, dramatic, cult, violence...","in may 1980 , a cuban man name toni montana ( ..."


In [5]:
prep_dataset = prep_dataset.explode("Genre")
prep_dataset.head()

Unnamed: 0.1,Unnamed: 0,Genre,Processed plot
0,0,cult,note : thi synopsi is for the orgin italian re...
0,0,horror,note : thi synopsi is for the orgin italian re...
0,0,gothic,note : thi synopsi is for the orgin italian re...
0,0,murder,note : thi synopsi is for the orgin italian re...
0,0,atmospheric,note : thi synopsi is for the orgin italian re...


In [6]:
prep_dataset = prep_dataset.replace('Drama', 'drama')
prep_dataset = prep_dataset.replace(' Drama', 'drama')
prep_dataset = prep_dataset.replace('Drama ', 'drama')

prep_dataset = prep_dataset.replace('Comedy', 'comedy')
prep_dataset = prep_dataset.replace(' Comedy', 'comedy')


count_df = prep_dataset.groupby(['Genre'])['Genre'].count().reset_index(name='count').sort_values(['count'], ascending=False)
genres_to_filter = count_df.head(10)['Genre'].to_numpy()
unique_genres_count = prep_dataset['Genre'].unique().shape[0]

In [7]:
prep_dataset = prep_dataset.loc[prep_dataset['Genre'].isin(genres_to_filter)]

In [8]:
print(f'Количество уникальных жанров в датасете: {unique_genres_count}')

Количество уникальных жанров в датасете: 1750


In [9]:
prep_dataset.shape[0]

80869

In [10]:
prep_dataset = prep_dataset.dropna()
prep_dataset = prep_dataset.iloc[:10000,:]
prep_dataset = prep_dataset.drop_duplicates(subset = 'Processed plot', keep = 'first')
labels = prep_dataset['Genre'].unique()
def get_label_index(label: str):
    return np.where(labels == label)[0][0]


# assert 1 == get_label_index( ' horror') , 'get_labels_index doesn\'t work as expected'
prep_dataset['GenreCoded'] = prep_dataset['Genre'].apply(get_label_index)
prep_dataset = prep_dataset.drop(columns=['Unnamed: 0'])
prep_dataset.reset_index()
prep_dataset.index = [x for x in range(1, len(prep_dataset.values)+1)]
prep_dataset.head()


labels_to_codes = dict()
for a in labels:
    # print(f'add label {a} into dict ')
    labels_to_codes[a] = get_label_index(a)
labels = labels_to_codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prep_dataset['GenreCoded'] = prep_dataset['Genre'].apply(get_label_index)


In [11]:
print(f'Количество уникальных жанров: {len(labels)}')

Количество уникальных жанров: 5


##### Объявляем датасет pytorch для работы с подготовленными данными

In [12]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['Genre']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['Processed plot']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

##### Разбиваем исходные данные на обучающие и тестовую выборку

In [13]:
np.random.seed(112)
df_train, df_val, df_test = np.split(prep_dataset.sample(frac=1, random_state=42), 
                                     [int(.8*len(prep_dataset)), int(.9*len(prep_dataset))])
print(len(df_train),len(df_val), len(df_test))
df_test.head()


5875 734 735


Unnamed: 0,Genre,Processed plot,GenreCoded
2994,comedy,the evil witch mother malkin ( juliann moor ) ...,2
5513,murder,"somewher in southeast asia , the present day ....",0
6314,drama,a choru girl get bad advic from her fellow cho...,4
1192,murder,colleg student alex and her friend go to the w...,0
5773,comedy,befor head out to a basebal game at a nearbi b...,2


##### Наконец строим простенькую модель

In [14]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, len(labels))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

##### Цикл обучения

In [15]:
df = prep_dataset
device = torch.device("cuda")

def train_f(model, train_dataloader, val_dataloader, learning_rate, epochs):

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    model.cuda()
    criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0
            
            print(f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(df_train): .3f} | Train Accuracy: {total_acc_train / len(df_train): .3f} ')
                  

In [16]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [17]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))]) # 0.8, 0.9 для train

print(len(df_train),len(df_val), len(df_test))

5875 734 735


In [18]:
train, val = Dataset(df_train), Dataset(df_val)
train_dataloader = torch.utils.data.DataLoader(train, batch_size=1, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val, batch_size=1)

In [19]:
EPOCHS = 10
model = BertClassifier()
LR = 1e-6
print(f'cuda available: {torch.cuda.is_available()}, device count: {torch.cuda.device_count()}, current: {torch.cuda.current_device()}')
train_f(model, train_dataloader, val_dataloader, LR, EPOCHS)
torch.save(model, '/content/drive/My Drive/max-bert-model.txt')

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


cuda available: True, device count: 1, current: 0


100%|██████████| 5875/5875 [12:45<00:00,  7.68it/s]


Epochs: 1 | Train Loss:  1.309 | Train Accuracy:  0.492 


100%|██████████| 5875/5875 [12:44<00:00,  7.69it/s]


Epochs: 2 | Train Loss:  1.139 | Train Accuracy:  0.554 


100%|██████████| 5875/5875 [12:44<00:00,  7.68it/s]


Epochs: 3 | Train Loss:  0.993 | Train Accuracy:  0.601 


100%|██████████| 5875/5875 [12:44<00:00,  7.68it/s]


Epochs: 4 | Train Loss:  0.868 | Train Accuracy:  0.661 


100%|██████████| 5875/5875 [12:43<00:00,  7.69it/s]


Epochs: 5 | Train Loss:  0.710 | Train Accuracy:  0.747 


100%|██████████| 5875/5875 [12:44<00:00,  7.68it/s]


Epochs: 6 | Train Loss:  0.545 | Train Accuracy:  0.807 


100%|██████████| 5875/5875 [12:44<00:00,  7.69it/s]


Epochs: 7 | Train Loss:  0.394 | Train Accuracy:  0.873 


100%|██████████| 5875/5875 [12:43<00:00,  7.69it/s]


Epochs: 8 | Train Loss:  0.266 | Train Accuracy:  0.930 


100%|██████████| 5875/5875 [12:44<00:00,  7.69it/s]


Epochs: 9 | Train Loss:  0.169 | Train Accuracy:  0.961 


100%|██████████| 5875/5875 [12:43<00:00,  7.69it/s]


Epochs: 10 | Train Loss:  0.109 | Train Accuracy:  0.979 


In [20]:
evaluate(model, df_test)

Test Accuracy:  0.562
