In [1]:
# Move up one directory; presumably so the relative `haberrspd/...` paths used below resolve — TODO confirm.
# NOTE: not idempotent — re-running this cell climbs one more level each time.
cd ..

/home/neil/cloud/habitual_errors_NLP


In [2]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline
from haberrspd.__init__ import *
from haberrspd.__init_paths import data_root

# Go to the correct folder, just for practicality's sake

### **Convolutional Neural Networks for Sentence Classification**, adapted for use with character embeddings

In [8]:
import torchtext.data as data
import haberrspd.charCNN.mydatasets as mydatasets
import torch # To set the device
import datetime
import os
from haberrspd.charCNN.model import CNN_Text
import haberrspd.charCNN.train as train
from haberrspd.pdnet_flair import Config

In [9]:
# Hyper-parameters and run options for the word-level CNN below.
args = Config(
    # -- optimisation --
    batch_size=64,  # training batch size used when the iterators are built
    lr=0.001,
    epochs=10,
    # -- network shape (presumably read by CNN_Text; its source is not shown here) --
    kernel_sizes=[3, 4, 5],
    kernel_num=100,
    embed_dim=128,
    dropout=0.5,
    # -- runtime --
    cuda=torch.cuda.is_available(),  # True only when a CUDA device is present
    snapshot=None,
    static=False,
    # -- mode switches checked by the train/predict cell --
    predict=None,
    test=False,
    # -- logging / checkpointing --
    log_interval=1,
    test_interval=100,
    early_stop=1000,
    save_best=True,
    save_interval=500,
    save_dir='snapshot',  # a timestamped sub-folder is appended after data loading
)

In [10]:
# In this example, words are embedded
def mr(text_field, label_field, **kargs):
    """Build the train/dev iterators for the MR dataset.

    Splits ``mydatasets.MR`` into a train and a dev set, builds the
    vocabulary of both fields from the two splits together, and wraps the
    splits in iterators.

    Args:
        text_field: field for the text column; its vocab is built here.
        label_field: field for the label column; its vocab is built here.
        **kargs: forwarded unchanged to ``data.Iterator.splits``.

    Returns:
        The pair ``(train_iter, dev_iter)``. Training batches use the
        notebook-level ``args.batch_size``; the dev split is served as a
        single batch holding every dev example.
    """
    train_set, dev_set = mydatasets.MR.splits(text_field, label_field)

    # Both vocabularies are built over train + dev, text first, then labels.
    for field in (text_field, label_field):
        field.build_vocab(train_set, dev_set)

    sizes = (args.batch_size, len(dev_set))
    train_iter, dev_iter = data.Iterator.splits((train_set, dev_set),
                                                batch_sizes=sizes,
                                                **kargs)
    return train_iter, dev_iter

# load data
text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
# Data passed to model here.
# Batches are placed on the device the model will use: `args.cuda` records
# whether a GPU is available, so fall back to the CPU instead of
# unconditionally requesting 'cuda' (which fails on CPU-only machines).
train_iter, dev_iter = mr(text_field,
                          label_field,
                          device=torch.device('cuda' if args.cuda else 'cpu'),
                          repeat=False)

# update args with values derived from the loaded data
args.set("embed_num", len(text_field.vocab))
# presumably the -1 discounts a non-class entry (e.g. an unknown token) in the label vocab — TODO confirm
args.set("class_num", len(label_field.vocab) - 1)
# NOTE: this extends the *current* save_dir, so re-running the cell nests a
# new timestamp folder inside the previous one.
args.set("save_dir", os.path.join(args.save_dir,
                                  datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))

downloading
extracting


In [None]:
# In this example, each character is embedded


### Example data point (task: classify as positive or negative)

In [19]:
# Peek at the token list of the first training example.
train_iter.dataset.examples[0].text

['his',
 'best',
 'film',
 'remains',
 'his',
 'shortest',
 ',',
 'the',
 'hole',
 ',',
 'which',
 'makes',
 'many',
 'of',
 'the',
 'points',
 'that',
 'this',
 'film',
 'does',
 'but',
 'feels',
 'less',
 'repetitive',
 '']

In [6]:
# Display the full configuration, including the entries set after data loading.
args

{'batch_size': 64,
 'lr': 0.001,
 'epochs': 10,
 'kernel_sizes': [3, 4, 5],
 'kernel_num': 100,
 'embed_dim': 128,
 'dropout': 0.5,
 'cuda': True,
 'snapshot': None,
 'static': False,
 'predict': None,
 'test': False,
 'log_interval': 1,
 'test_interval': 100,
 'early_stop': 1000,
 'save_best': True,
 'save_interval': 500,
 'save_dir': 'snapshot/2019-06-05_15-00-47',
 'embed_num': 21114,
 'class_num': 2}

In [7]:
# Instantiate the network from the config and move it to the GPU when `args.cuda` is set.
cnn = CNN_Text(args)
cnn = cnn.cuda() if args.cuda else cnn

In [8]:
# Dispatch on the configured mode: single prediction, evaluation, or training.
if args.predict is not None:
    # Classify the one text given in `args.predict` and print it with its label.
    label = train.predict(args.predict, cnn, text_field, label_field, args.cuda)
    print('\n[Text]  {}\n[Label] {}\n'.format(args.predict, label))
elif args.test:
    try:
        train.eval(test_iter, cnn, args)
    except NameError:
        # `test_iter` is not defined in the cells shown here; report that
        # case only, and let genuine evaluation errors propagate instead of
        # hiding them behind this message.
        print("\nSorry. The test dataset doesn't exist.\n")
else:
    print()
    try:
        train.train(train_iter, dev_iter, cnn, args)
    except KeyboardInterrupt:
        # Allow a manual interrupt to end training without a traceback.
        print('\n' + '-' * 89)
        print('Exiting from training early')


Batch[100] - loss: 0.694505  acc: 57.0000%(37/64)
Evaluation - loss: 0.670083  acc: 55.0000%(593/1066) 

Batch[101] - loss: 0.678370  acc: 59.0000%(38/64)



Batch[200] - loss: 0.465207  acc: 87.0000%(56/64)
Evaluation - loss: 0.618228  acc: 64.0000%(689/1066) 

Batch[300] - loss: 0.485058  acc: 78.0000%(47/60)
Evaluation - loss: 0.621354  acc: 65.0000%(702/1066) 

Batch[400] - loss: 0.305611  acc: 90.0000%(58/64)
Evaluation - loss: 0.599315  acc: 69.0000%(740/1066) 

Batch[500] - loss: 0.102855  acc: 100.0000%(64/64)
Evaluation - loss: 0.618641  acc: 68.0000%(725/1066) 

Batch[600] - loss: 0.068792  acc: 100.0000%(60/60)
Evaluation - loss: 0.625644  acc: 69.0000%(741/1066) 

Batch[700] - loss: 0.045885  acc: 98.0000%(63/64)
Evaluation - loss: 0.668458  acc: 69.0000%(743/1066) 

Batch[800] - loss: 0.012878  acc: 100.0000%(64/64)
Evaluation - loss: 0.686670  acc: 70.0000%(754/1066) 

Batch[900] - loss: 0.014656  acc: 100.0000%(60/60)
Evaluation - loss: 0.712205  acc: 70.0000%(749/1066) 

Batch[1000] - loss: 0.008831  acc: 100.0000%(64/64)
Evaluation - loss: 0.725757  acc: 70.0000%(756/1066) 

Batch[1100] - loss: 0.004772  acc: 100.0000%(64/6

## **Character-level Convolutional Networks for Text Classification**

In [3]:
from haberrspd.pdnet_flair import Config, make_train_test_dev
from haberrspd.charCNN.auxiliary import make_MJFF_data_loader
from haberrspd.charCNN.train import run
import torch
import pandas as pd

In [4]:
# For deploy version, all of this will be placed in the parser
args = Config(
    # -- data locations --
    val_path='haberrspd/charCNN/data/dev.csv',
    test_path='haberrspd/charCNN/data/test.csv',
    train_path='haberrspd/charCNN/data/train.csv',
    alphabet_path='haberrspd/charCNN/alphabet.json',  # where our alphabet lives
    # -- sample loading --
    load_very_long_sentences=False,  # temporary fix to remove unreasonably long sequences
    max_sample_length=8192,  # maximum length allowed for each training example
    num_workers=1,
    batch_size=32,
    # -- optimisation --
    lr=0.001,  # learning rate
    epochs=5,  # number of epochs
    max_norm=400,
    optimizer='Adam',
    class_weight=None,
    dynamic_lr=False,  # dynamic learning rate, used for all but Adam
    milestones=[5, 10, 15],
    decay_factor=0.5,  # rate by which we reduce the learning rate
    # -- network and batching options --
    kernel_sizes=[3, 4, 5],
    kernel_num=100,
    embed_dim=128,
    shuffle=False,
    dropout=0.5,
    # -- runtime --
    cuda=True,  # we have a GPU, so let's use it
    verbose=False,
    # -- checkpointing --
    continue_from='',  # if we already trained a model we can continue from the stored one
    checkpoint=False,
    checkpoint_per_batch=10000,
    save_folder='haberrspd/charCNN/run_results',
    # -- logging --
    log_config=False,
    log_result=False,
    log_interval=2,
    val_interval=200,
    save_interval=1,
)

In [5]:
# Fail fast, with an explanatory message, when the config asks for a GPU that
# is not available; a config with cuda=False is allowed to proceed.
assert not args.cuda or torch.cuda.is_available(), \
    "args.cuda is True but no CUDA device is available"
# Launch the character-CNN run with the configuration above.
run(args)

Loading data from haberrspd/charCNN/data/train.csv
Loading data from haberrspd/charCNN/data/dev.csv

Number of training samples: 834
	Label 0:           583
	Label 1:           251

Number of developing samples: 110
	Label 0:            67
	Label 1:            43

 Directory for saving results, already exists.

Configuration:
	Alphabet path:          haberrspd/charCNN/alphabet.json
	Batch size:             32
	Checkpoint:             False
	Checkpoint per batch:   10000
	Class weight:           None
	Continue from:          
	Cuda:                   True
	Decay factor:           0.5
	Dropout:                0.5
	Dynamic lr:             False
	Embed dim:              128
	Epochs:                 5
	Kernel num:             100
	Kernel sizes:           [3, 4, 5]
	Load very long sentences:False
	Log config:             False
	Log interval:           1
	Log result:             False
	Lr:                     0.001
	Max norm:               400
	Max sample length:      8192
	Milestones:       




Evaluation - loss: 0.603630  lr: 0.00100  acc: 67.000% (65/96) 
Label: [31m  0[0m	Prec: [32m 66.3[0m% (57/86)      Recall: [32m 96.6[0m% (57/59)      F-Score: [32m 78.6[0m%[0m
Label: [31m  1[0m	Prec: [32m 80.0[0m% (8/10)       Recall: [32m 21.6[0m% (8/37)       F-Score: [32m 34.0[0m%[0m


=> found better validated model, saving to haberrspd/charCNN/run_results/CharCNN_best.pth.tar


Epoch[2] Batch[1] - loss: 0.557843  lr: 0.00100  acc: 81.000% (26/32)
Epoch[2] Batch[2] - loss: 0.469927  lr: 0.00100  acc: 81.000% (26/32)
Epoch[2] Batch[3] - loss: 0.519208  lr: 0.00100  acc: 68.000% (22/32)
Epoch[2] Batch[4] - loss: 0.478063  lr: 0.00100  acc: 78.000% (25/32)
Epoch[2] Batch[5] - loss: 0.482974  lr: 0.00100  acc: 78.000% (25/32)
Epoch[2] Batch[6] - loss: 0.478311  lr: 0.00100  acc: 81.000% (26/32)
Epoch[2] Batch[7] - loss: 0.430139  lr: 0.00100  acc: 84.000% (27/32)
Epoch[2] Batch[8] - loss: 0.405333  lr: 0.00100  acc: 84.000% (27/32)
Epoch[2] Batch[9] - loss: 0.394905 

In [11]:
import torch
import torch.nn as nn

In [12]:
# Shape experiment: push a tensor of ones through Conv1d -> ReLU -> MaxPool1d
# and print the resulting size.
# NOTE(review): nn.Conv1d reads its input as (N, C_in, L), so below the 2**13
# axis acts as the batch, `batch_size` (70) as the channel count and 50 as the
# convolved length — confirm this layout is what was intended.
batch_size = 70
x = torch.ones(2**13, batch_size, 50) # Input: 8192 x 70 x 50
m = nn.Sequential(
    nn.Conv1d(batch_size, 64, kernel_size=16, stride=1), # 70 -> 64 channels, length 50 -> 35
    nn.ReLU(),
    nn.MaxPool1d(kernel_size=3,stride=3) # length 35 -> 11
)
out = m(x)
print(out.size())
# print(m)

torch.Size([8192, 64, 11])


In [25]:
# Flatten every axis after the first into a single one and show the resulting shape.
# NOTE(review): the recorded output (448 columns) does not match the
# [8192, 64, 11] tensor printed above (64 * 11 = 704); it appears to come from
# a different run of the previous cell — re-run top to bottom to confirm.
out.view(out.size(0),-1).shape

torch.Size([8192, 448])