In [1]:
# Only do this install once, for experimenting with hugging face's transformers
# CPU usage, just to ensure that model architecture will be working.
# Will later likely need to use GPUs using a virtual environment,
# IF: our dataset is too big, possibly not the case.
#pip install transformers[torch]

# Same here, run once
#!pip install datasets

In [65]:
# imports need necessary installs found above

from naiveModel import NBLangIDModel
from BERTModel import BERTGenreClassification, train_model
import pandas as pd
from sklearn.model_selection import train_test_split
from util import get_dataloader

# don't need cuda until using virtual machine
from torch import manual_seed#, cuda

In [3]:
%load_ext autoreload
%autoreload 2

In [30]:
naiveBayes = NBLangIDModel()

In [66]:
# initialize model

# change later, if using virtual machine
device ='cpu' #"cuda" if cuda.is_available() else "cpu"

# seed the model before initializing weights so that your code is deterministic
manual_seed(457)

freeze_bert = False # change later
batch_size = 16 # default
epochs = 5 # default
learning_rate = 1e-2 # default

model = BERTGenreClassification(freeze_bert = freeze_bert).to(device)

In [32]:
# load data, train test split
descriptions = pd.read_csv("cleanedData.csv")

print("Shape before dropping NaN values:", descriptions.shape)
descriptions = descriptions.dropna()
print("Shape after dropping NaN values:", descriptions.shape)

#TESTING THIS
# print("Rows to be dropped:")
# print(descriptions[descriptions['description'].isna() | ~descriptions['description'].apply(lambda x: isinstance(x, str))])
# #drop the 511 rows where description column is NaN or not a string
# descriptions = descriptions.dropna(subset=['description'])
# descriptions = descriptions[descriptions['description'].apply(lambda x: isinstance(x, str))]

train, test = train_test_split(descriptions, test_size=0.2)

Shape before dropping NaN values: (42661, 5)
Shape after dropping NaN values: (37478, 5)


In [33]:
train = train.drop("Unnamed: 0", axis= 1)
test = test.drop("Unnamed: 0", axis= 1)


In [34]:
train_X = train['description']
train_y1 = train['genre1']
train_y2 = train['genre2']
train_y3 = train['genre3']

In [35]:
# fit the NB model 
naiveBayes.fit(train_X.tolist(), train_y1.tolist())

In [36]:
#train['text'] = pd.Series()
#train['label'] = None
train['genre1'] = train['genre1'].str.strip()
train['genre2'] = train['genre2'].str.strip()
train['genre3'] = train['genre3'].str.strip()

In [38]:
from transformers import AutoTokenizer

label_vocab = naiveBayes.labels

label_as_id = {l:k  for k, l in enumerate(label_vocab)}
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [39]:
bert_train, bert_val = train_test_split(train, test_size= 0.2)
#bert_train.drop("Unnamed: 0", axis= 1)
#bert_val.drop("Unnamed: 0", axis= 1)


In [73]:
from datasets import Dataset

raw_bert_train = Dataset.from_pandas(bert_train)
raw_bert_val = Dataset.from_pandas(bert_val)

ds = {'train': raw_bert_train, 'validation': raw_bert_val}

In [74]:
#latest test
from util import BERT_preprocess
from torch.utils.data import DataLoader

label_vocab = naiveBayes.labels
id2label = {k:l  for k, l in enumerate(label_vocab)}

for split in ds:
    ds[split] = ds[split].map(lambda x: BERT_preprocess(x, id2label, tokenizer), remove_columns= ['description', 'genre1', 'genre2', 'genre3'])
    ds[split] = DataLoader(ds[split], batch_size=batch_size)

Map: 100%|██████████| 26983/26983 [00:24<00:00, 1090.08 examples/s]
Map: 100%|██████████| 2999/2999 [00:02<00:00, 1074.63 examples/s]


In [83]:
next(iter(ds['train']))['input_ids']

[tensor([101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
         101, 101]),
 tensor([ 2241,  2035,  1000,  7763,  2037, 11750,  2013,  5006,  1996,  1996,
          1999,  2014,  1045,  1996,  2005,  2054]),
 tensor([ 2006, 10707,  2054,  2706, 12168, 26775,  1996,  3960,  2190,  3496,
          1996,  2540,  5807,  3810,  8141,  6433]),
 tensor([ 2995,  5671,  2065,  3283,  2757, 20755,  6674,  2038, 23836,  1997,
          3467,  2038,  1521,  2391,  1997,  2043]),
 tensor([2824, 2359, 8101, 1010, 1998, 2094, 1024, 1037, 2075, 2198, 1997, 2000,
         1056, 2005, 1047, 1996]),
 tensor([ 1010,  2001,  2020, 15669,  2037,  2003,  5472,  3291,  3166, 20635,
         12104,  3338,  2031,  1996, 15238,  2200]),
 tensor([2021, 1037, 1037, 4904, 2188, 1037, 9956, 2007, 1997, 9065, 2683, 2077,
         4622, 2972, 2094, 5932]),
 tensor([ 4406,  2166,  5272,  5736,  2439,  2186,  2078,  2334,  2893,  5008,
          1010,  2009,  2032,  2732,  7570, 12530]),
 tensor(

In [82]:
train_model(model= model, train_dataloader= ds['train'], 
            dev_dataloader= ds['validation'], epochs= epochs, learning_rate= learning_rate)

AttributeError: 'list' object has no attribute 'size'

In [75]:
emb_length    = len(ds['train'][0]['input_ids'])
output_length = len(ds['train'][0]['label'])

print(f'Length of input embeddings: {emb_length}')
print(f'Length of output layer: {output_length}')

TypeError: 'DataLoader' object is not subscriptable

In [None]:
train_model()

In [13]:
'''
from util import BERT_preprocess
bert_train = BERT_preprocess(bert_train, label_as_id)
bert_val = BERT_preprocess(bert_val, label_as_id)
'''

In [20]:
'''
bert_val.head()
print(sum(bert_val.iloc[3, 6]))
'''

3


In [26]:
# BERT data loaders and training
#from util import BERT_preprocess
#bert_train = BERT_preprocess(bert_train, label_as_id)
#bert_val = BERT_preprocess(bert_val, label_as_id)

# have to change these to implement from df, not from csv
train_dataloader = get_dataloader(bert_train, batch_size=batch_size)
val_dataloader = get_dataloader(bert_val, batch_size=batch_size)


# currently, embedding dimensions are wrong
#train_model(model, train_dataloader, dev_dataloader, epochs, learning_rate)

TypeError: <lambda>() got an unexpected keyword argument 'batched'