In [None]:
!mkdir ./data
!mkdir ckpts
!mkdir .vector_cache
!mkdir ./outputs

!pip install --upgrade wandb
!pip install pytorch-lightning

In [1]:
import torch

#handling text data
from torchtext import data
import random

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SEED = 2020

# Seed every RNG this notebook relies on in one place: Python's `random`
# (used for the train/valid split) and torch.
random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Hyperparameters and run settings; later cells read values from this dict
# (e.g. 'data_path' and 'batch_size').
args = {'data_path': './data/IMDB Dataset.csv',
        'embedding_dim': 100,
        'dropout': 0.5,
        'lr': 5e-4,
        'batch_size': 256,
        'epochs': 50,
        'gpus': 0,
        'progress_bar_refresh_rate': 25,
        'wandb_log_step': 10,
        'wandb_run_name': 'run_1',
        'wandb_project_name': 'deep_dream',
        'model_ckpt_path': './ckpts/model.ckpt'}


In [2]:
# Review text: tokenized with spaCy; batches come out batch-first and, because
# include_lengths=True, together with the sequence lengths.
TEXT = data.Field(
    tokenize='spacy', batch_first=True, include_lengths=True)

# Sentiment label: a single non-sequential value per example.
# NOTE(review): a plain Field presumably keeps an <unk> entry in the label
# vocab, giving one more class index than there are labels; confirm the
# model expects that before changing it.
LABEL = data.Field(batch_first=True, sequential=False)

# Column order must match the CSV: text first, label second.
fields = [('text', TEXT), ('label', LABEL)]

print(f"Loading file: {args['data_path']}")
training_data = data.TabularDataset(
    path=args['data_path'], format='csv', fields=fields, skip_header=True)

#print preprocessed text
# print(vars(training_data.examples[0]))

print("Splitting the data!")
# `random.seed()` returns None, so passing its result as `random_state` only
# worked through the side effect of the call. Seed explicitly and hand the
# resulting generator state to `split` so the 70/30 split is reproducible.
random.seed(SEED)
train_data, valid_data = training_data.split(
    split_ratio=0.7, random_state=random.getstate())

print("Building Vocab!")
# The vocabulary is built from the training split only; tokens seen fewer than
# 3 times are left out, and GloVe 100-d vectors are attached to the vocab.
TEXT.build_vocab(train_data, min_freq=3, vectors = "glove.6B.100d")
LABEL.build_vocab(train_data)

# Bucketed iterators: batches group examples of similar length and are sorted
# by text length within each batch.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size = args['batch_size'],
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

Loading file: ./data/IMDB Dataset.csv
Splitting the data!
Building Vocab!
100%|█████████▉| 399096/400000 [00:41<00:00, 9378.90it/s]

In [4]:
from model import TextClassifier

# Restore the trained classifier from the checkpoint path in `args`.
# `load_from_checkpoint` is a classmethod that constructs the model itself, so
# it is called on the class instead of on a throwaway instance; the keyword
# arguments are passed on to the model's constructor.
#
# NOTE(review): the embedding table appears to be sized by the TEXT vocabulary,
# so TEXT must carry the same vocab that was used when the checkpoint was
# written. If the vocab differs (different split, tokenizer version or
# min_freq), loading fails with a size mismatch on `embedding.weight`.
model = TextClassifier.load_from_checkpoint(
    args['model_ckpt_path'],
    TEXT=TEXT,
    LABEL=LABEL,
    train_iterator=train_iterator,
    valid_iterator=valid_iterator,
)

RuntimeError: Error(s) in loading state_dict for TextClassifier:
	size mismatch for embedding.weight: copying a param with shape torch.Size([57775, 100]) from checkpoint, the shape in current model is torch.Size([57774, 100]).