In [0]:
!wget "https://www.dropbox.com/s/w03mdyw4kqimwgt/data.zip?dl=0"
!unzip data.zip\?dl\=0

--2019-08-13 06:38:46--  https://www.dropbox.com/s/w03mdyw4kqimwgt/data.zip?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.1, 2620:100:6032:1::a27d:5201
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/w03mdyw4kqimwgt/data.zip [following]
--2019-08-13 06:38:47--  https://www.dropbox.com/s/raw/w03mdyw4kqimwgt/data.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uceedcc6ef572d05d73085e27cdf.dl.dropboxusercontent.com/cd/0/inline/Amhj8zPwrSCdDZq1hZUSdo6Bd4LQGBchQvzIw5oT9ybBIh6wioutPGptPtoofGARAZ_NHeliDvh_XSq8w660XZUIrBEQbH2RcKgougyZ1QyHtbwK7y49bQJdE_RPDIe6k28/file# [following]
--2019-08-13 06:38:47--  https://uceedcc6ef572d05d73085e27cdf.dl.dropboxusercontent.com/cd/0/inline/Amhj8zPwrSCdDZq1hZUSdo6Bd4LQGBchQvzIw5oT9ybBIh6wioutPGptPtoofGARAZ_NHeliDvh_XSq8w660XZUIrBEQbH2RcKgougyZ1QyHtbwK7y49bQ

In [0]:
import csv
import torch
import itertools
from torch.utils.data import Dataset, DataLoader

class Voc:
    def __init__(self, size=4, lines=[]):
        assert size >= 4
        self.index2word = {0: "SOS", 1: "EOS", 2:"PAD", 3:"UNK"}
        word2count = {}
        for l in lines:
            for word in l.split(' '):
                if word not in word2count:
                    word2count[word] = 1
                else:
                    word2count[word] += 1
        word2count = list(word2count.items())
        word2count.sort(key=lambda x: x[1], reverse=True)
        size = min(size, len(word2count))
        if len(lines):
            print("{} words trimmed to {} words".format(len(word2count), size))
        for i in range(size-4):
            self.index2word[i+4] = word2count[i][0]
        self.word2index = {v: k for k, v in self.index2word.items()}

    def getIndex(self, word):
        if word in self.word2index:
            return self.word2index[word]
        else:
            return self.word2index["UNK"]

    def save2file(self, path):
        with open(path, 'w') as f:
            yaml.dump(self.index2word, f, default_flow_style=False, allow_unicode=True)

    def load_file(self, path):
        with open(path, 'r') as f:
            self.index2word = yaml.load(f)
            self.word2index = {v: k for k, v in self.index2word.items()}
            self.size = len(self.index2word)

class Twit_dataset(Dataset):
    def __init__(self, csv_file, voc):
        with open(csv_file) as f:
            reader = csv.reader(f)
            self.data = list(reader)
        self.data = [(int(d[0]), [voc.getIndex(w) for w in d[1]]) for d in self.data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def collate_fn(batch):
    def pad(seqs, fillvalue=2):
        tmp = list(itertools.zip_longest(*seqs, fillvalue=fillvalue))
        return torch.LongTensor(tmp)
    ys, xs = list(zip(*batch))
    xs = pad(xs)
    return torch.LongTensor(xs), torch.FloatTensor(ys)

seqs = []
size = 20000
with open('data/train.csv') as f:
    reader = csv.reader(f)
    for r in reader:
        seqs.append(r[1])
voc = Voc(size, seqs)
train_dataset = Twit_dataset('data/train.csv', voc)
valid_dataset = Twit_dataset('data/test.csv', voc)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=1, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False, num_workers=1, collate_fn=collate_fn)

82945 words trimmed to 20000 words


In [0]:
import torch.nn as nn

class My_model(nn.Module):
    def __init__(self, n_layers, voc_size, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(voc_size, 256)
        self.rnn = nn.GRU(256, 256, n_layers, dropout=dropout)
        self.fc = nn.Linear(256, 1)

    def forward(self, x):
        emb = self.embedding(x)
        output, hidden = self.rnn(emb)
        output = self.fc(output[-1])
        return torch.sigmoid(output)

In [0]:
device = 'cuda'
n_epoch = 5
model = My_model(2, 20000).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
print_loss = 0.0

In [0]:
for e in range(n_epoch):
    print('============= Epoch: {} ============='.format(e+1))
    for idx, (x, y) in enumerate(train_loader):
        x = x.to(device)
        y = y.to(device)
        o = model(x)
        loss = criterion(o.squeeze(), y)
        print_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (idx+1) % 100 == 0:
            print('loss:', print_loss/100)
            print_loss = 0.0


loss: 0.5047378954291344
loss: 0.3942653985321522
loss: 0.3876168209314346
loss: 0.40268398225307467
loss: 0.4069484996795654
loss: 0.39989668875932693
loss: 0.39441015258431433
loss: 0.4004122844338417
loss: 0.3985528263449669
loss: 0.41054607152938843
loss: 0.40015999257564544
loss: 0.4140382158756256
loss: 0.4008857062458992
loss: 0.40533855006098746
loss: 0.38861322045326235
loss: 0.42293769747018817
loss: 0.3954067088663578
loss: 0.4058258217573166
loss: 0.4146680372953415
loss: 0.41023378729820253
loss: 0.40644326478242876
loss: 0.4084769403934479
loss: 0.4092845797538757
loss: 0.41397729963064195
loss: 0.39755660921335223
loss: 0.40115255922079085
loss: 0.4120865029096603
loss: 0.4165258812904358
loss: 0.3999462580680847
loss: 0.4145052808523178
loss: 0.40048198208212854
loss: 0.48043680131435396
loss: 0.3858619250357151
loss: 0.4034507718682289
loss: 0.39701565831899643
loss: 0.3904173263907433
loss: 0.40349579572677613
loss: 0.40099132925271985
loss: 0.3928798070549965
loss: 0