## Loading libraries and dataset

In [1]:
import argparse
import ast
import gc
import itertools
import os
import pickle
import random
import re
import warnings
from pathlib import Path

import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm import tqdm

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set TOKENIZERS_PARALLELISM to "false" in this process's environment
# before any tokenizer is created.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [3]:
# Read the preprocessed training CSV from the dataset directory and show
# its (rows, columns) shape as the cell output.
data_dir = './dataset/'
df = pd.read_csv(f"{data_dir}processed_train.csv")
df.shape

(159571, 2)

* Use a 2000-row sample to make sure the code runs end to end

In [4]:
# Draw a small subset for fast iteration. The sample is seeded so that a
# Restart & Run All reproduces the same rows (the train/validation split
# further down is seeded as well), and the size is capped at len(df) so a
# frame with fewer than 2000 rows does not raise.
dfs = df.sample(n=min(2000, len(df)), random_state=42).reset_index(drop=True)
dfs.head(5)

Unnamed: 0,comment_text,labels
0,"Hannibal \n\nLike I said earlier, the only th...","[0, 0, 0, 0, 0, 0]"
1,"""\n\n some old news about second expansion \n\...","[0, 0, 0, 0, 0, 0]"
2,Rude C*nt \n\nDon't delete the streak on Jack ...,"[1, 0, 0, 0, 0, 0]"
3,The standard offer \n\nUser:Tokyogirl79\nWhat ...,"[0, 0, 0, 0, 0, 0]"
4,"""\n\nI think it should be removed, yes. Could ...","[0, 0, 0, 0, 0, 0]"


In [5]:
# Display the raw text of the second sampled comment (position 1).
dfs["comment_text"].iloc[1]

'"\n\n some old news about second expansion \n\n2nd expansion, Experimentals, should be called ""The Experimentals""? at least CT mentioned like that and in official GPG forum we all call it TE, so i guess better keep the word ""The"".\nAnother thing, TE now canceled, as seen in recent(not really, some mth ago) interview, CT talked about this. Can check supcom fan site for this news or forum old topic."'

Split into train and validation sets

In [6]:
# 80% of the sampled rows (seeded) go to training; every row whose index
# label was not selected becomes the validation set, re-indexed from 0.
train_df = dfs.sample(frac=0.8, random_state=42)
val_df = dfs.loc[~dfs.index.isin(train_df.index)].reset_index(drop=True)

## Creating Dataset, DataLoader

In [7]:
# Exploratory IPython docstring lookup (`?` suffix) for the tokenizer's
# encode_plus method.
# NOTE(review): `tokenizer` is only assigned in a later cell, so on a fresh
# top-to-bottom run this lookup reports that the object is not found.
tokenizer.encode_plus?

Object `tokenizer.encode_plus` not found.


In [8]:
class MyDataset(Dataset):
    """Map-style dataset that tokenizes comments and returns their label vector.

    Args:
        df: DataFrame with a ``comment_text`` column (raw strings) and a
            ``labels`` column. Each label entry may be a list/array of
            numbers or its string form (e.g. ``"[0, 1, 0, 0, 0, 0]"``, which
            is how a list column comes back from ``pd.read_csv``).
        tokenizer: Tokenizer exposing ``encode_plus``. The annotation is a
            string so it is not evaluated when the class is defined.
        max_len: Every sample is padded/truncated to exactly this many tokens.

    Each item is a dict with 1-D tensors ``input_ids``, ``attention_mask``,
    ``token_type_ids`` (length ``max_len``) and a float tensor ``labels``.
    """

    def __init__(self, df, tokenizer: "transformers.BertTokenizer", max_len = 128):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Cache the columns as arrays so __getitem__ indexes by position,
        # independent of the DataFrame's index labels.
        self.text = self.df.comment_text.values
        self.label = self.df.labels.values

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    @staticmethod
    def _parse_label(label):
        """Convert one raw label entry into a float tensor."""
        if isinstance(label, str):
            # Labels loaded from CSV are the textual repr of a list;
            # literal_eval parses it without executing arbitrary code.
            label = ast.literal_eval(label)
        # Float dtype so the targets can be fed to losses such as
        # BCEWithLogitsLoss without a cast.
        return torch.as_tensor(label, dtype=torch.float)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors together with its labels."""
        text = self.text[idx]
        label = self.label[idx]

        encoded = self.tokenizer.encode_plus(
            text = text,
            text_pair = None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length = self.max_len,
            return_tensors='pt',
            return_token_type_ids=True
        )
        # return_tensors='pt' yields tensors of shape (1, max_len). Drop only
        # the leading batch axis: a bare squeeze() would also remove the
        # sequence axis when max_len == 1.
        return {
            'input_ids' : encoded.input_ids.squeeze(0),
            'attention_mask' : encoded.attention_mask.squeeze(0),
            'token_type_ids' : encoded.token_type_ids.squeeze(0),
            # Previously the label was read but never returned, so batches
            # carried no training targets.
            'labels' : self._parse_label(label)
        }
# dataset = MyDataset(dfs, tokenizer, 100) # 512 is tokenizer.model_max_length
# dataset[100]

In [9]:
# Load the tokenizer from the local ./bert directory and wrap both splits,
# padding/truncating every sample to 512 tokens.
tokenizer = AutoTokenizer.from_pretrained('./bert')
train = MyDataset(df=train_df, tokenizer=tokenizer, max_len=512)
val = MyDataset(df=val_df, tokenizer=tokenizer, max_len=512)

In [10]:
# Inspect the first training sample (index 0) to sanity-check the tokenized output.
train[0]

{'input_ids': tensor([  101,  1000,  1024,  1024,  1024,  2054,  1005,  1055, 26316,  2003,
          2008,  1996,  1000,  1000,  3296,  1000,  1000, 10086,  2038,  2468,
          1037,  4568, 15083,  1999,  5606,  1997, 11744,  1012,  2074,  2079,
          1037,  3945,  2006,  1000,  1000,  7955,  4009, 12619,  2154,  2003,
          2019,  3296,  2724,  1012,  1000,  1000,  2017,  1005,  2222,  2131,
          2055,  1015,  1010,  2199,  4978,  1010,  1998,  2471,  2035,  2024,
         27394, 16948,   999,   999,  1011,  1007,  2026,  5440,  2003,  1037,
          2678,  2011,  8505,  2546,  8889,  2102,  1012,  2175,  2000,  1017,
          1024,  2539,  1997,  1000,  1000,  5255,  2431,  1037,  2454, 12849,
          5521,  2015,  1012,  1000,  1000,  2009,  1005,  1055,  1037, 12117,
         12326,  1997, 16948,  4760,  1996,  3296, 12034,  2401,  3351,  1012,
          2293,  2009,   999,  1011,  1007,  1000,   102,     0,     0,     0,
             0,     0,     0,     0,   

In [11]:
# Print the CPU count reported by the OS.
print(f"{os.cpu_count()}")

8


In [12]:
# Batch both splits with 4 worker processes each; only the training loader
# reshuffles its samples.
train_dataloader = DataLoader(dataset=train, batch_size=32, num_workers=4, shuffle=True)
val_dataloader = DataLoader(dataset=val, batch_size=32, num_workers=4, shuffle=False)

## Model

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The fallback was previously "gpu", which is not a valid torch device
# string and would fail as soon as something was moved to DEVICE.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [14]:
# optimizer = torch.optim.AdamW(params=model.parameters(), lr=3e-5)

In [15]:
# Instantiate the pretrained model stored in the local ./bert directory.
model = AutoModel.from_pretrained(pretrained_model_name_or_path='./bert')

In [16]:
# Pull a single batch from the training DataLoader for a quick smoke test.
# The iterator is not bound to a name, so only this first batch is consumed.
batch = next(iter(train_dataloader))

In [17]:
# Show the token-id tensor of the fetched batch.
batch["input_ids"]

tensor([[ 101, 2123, 1005,  ...,    0,    0,    0],
        [ 101, 1045, 9530,  ...,    0,    0,    0],
        [ 101, 2045, 2052,  ...,    0,    0,    0],
        ...,
        [ 101, 1000, 1045,  ...,    0,    0,    0],
        [ 101, 1000, 1045,  ...,    0,    0,    0],
        [ 101, 2092, 1045,  ...,    0,    0,    0]])

In [None]:
# Smoke-test a forward pass on one batch. No backward pass follows, so run
# it under no_grad(): otherwise autograd retains every intermediate
# activation of the 32 x 512 batch, costing a lot of memory for nothing.
with torch.no_grad():
    outputs = model(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
    )
outputs