In [0]:
import os

import pandas as pd
import numpy as np

import nltk
import torch.nn
import torch
import torch.optim

from torchtext import data
from torch import device

import torch.nn.functional as F

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm_notebook
from sklearn.datasets import fetch_20newsgroups

from sklearn.datasets import fetch_20newsgroups
from google.colab import drive
from torchtext import data

import json
import random

In [0]:
def _train_epoch(model, iterator, optimizer, curr_epoch, device):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        
        loss = model(batch.text.to(device), batch.pos.to(device), batch.neg.to(device)).sum()
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, device):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with torch.no_grad():
        for batch in iterator:
            
            loss = model(batch.text.to(device), batch.pos.to(device), batch.neg.to(device)).sum()
            #print(loss, loss.shape)
            epoch_loss += loss.data.cpu().detach().item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, optimizer, device, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, epoch, device)
        valid_loss = _test_epoch(model, valid_iterator, device)
        scheduler.step(valid_loss)


        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)
    return history

def encode_as_vecs(batch_iter, model,  device, texts_fn, labels_fn):
  texts = open(texts_fn, 'ab')
  labels = open(labels_fn, 'ab')

  n_batches = len(batch_iter)
  iterator = tqdm_notebook(batch_iter, total = n_batches)

  all_vecs = []
  all_labels = []

  for batch in iterator:
    vecs = model.branch(batch.text.to(device))
    vecs = vecs.data.cpu().numpy()
    labels = batch.label.data.numpy()
    for vec in vecs:
      all_vecs.append(vec)
    for label in labels:
      all_labels.append(label)
    
  all_vecs = np.array(all_vecs)
  all_labels = np.array(all_labels)

  np.save(texts_fn, all_vecs)
  np.save(labels_fn, all_labels)
  #return out

In [0]:
def dump_vocab(vocab, dir_name):
  os.makedirs(dir_name, exist_ok=True)
  with open(os.path.join(dir_name, 'freqs.json'), 'w', encoding='utf-8') as foutp:
    json.dump(vocab.freqs, foutp, ensure_ascii=False, indent=2)
  with open(os.path.join(dir_name, 'itos.json'), 'w', encoding='utf-8') as foutp:
    json.dump(vocab.itos, foutp, ensure_ascii=False, indent=2)
  with open(os.path.join(dir_name, 'stoi.json'), 'w', encoding='utf-8') as foutp:
    json.dump(vocab.stoi, foutp, ensure_ascii=False, indent=2)

def dump_model_weights(model, fname):
  torch.save(model.state_dict(), fname)

def load_model(lm, state_dict_path):
  state_dict = torch.load(state_dict_path)
  lm.load_state_dict(state_dict)
  return lm

def load_vocab(dir_name):
  freqs_path = os.path.join(dir_name, 'freqs.json')
  itos_path = os.path.join(dir_name, 'itos.json')
  stoi_path = os.path.join(dir_name, 'stoi.json')

  with open(freqs_path, 'r', encoding='utf-8') as finp:
    freqs = Counter(json.load(finp))
  with open(itos_path, 'r', encoding='utf-8') as finp:
    itos = json.load(finp)
  with open(stoi_path, 'r', encoding='utf-8') as finp:
    stoi = json.load(finp)
  
  vocab = Vocab(freqs)
  vocab.itos = itos
  vocab.stoi = stoi

  return vocab

In [8]:
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/hw7')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
dataset = fetch_20newsgroups(subset='train', download_if_missing=True)

Посчитаем примерную длину:

In [0]:
pd.Series([len(i.split()) for i in dataset.data]).quantile([i*0.1 for i in range(10)])

0.0     14.0
0.1     69.0
0.2     96.0
0.3    120.0
0.4    148.0
0.5    176.0
0.6    211.0
0.7    260.0
0.8    335.0
0.9    508.0
dtype: float64

Возьмём максимальную длину 300 слов

In [0]:
newsgroups = [[] for i in range(20)]

In [0]:
for i in range(len(dataset.data)):
  newsgroups[dataset.target[i]].append(dataset.data[i])

In [0]:
import random

In [0]:
def get_triplets(grouped_examples, triplets_per_example=10):
  data = []
  for group_id, group in enumerate(grouped_examples):
    for text_id, text in enumerate(group):
      ##text = word_tokenize(text)
      pos_examples = random.sample([group[i] for i in range(len(group)) if i!=text_id], k=triplets_per_example)
      neg_examples = []
      for i in range(triplets_per_example):
        other_group = random.choice([grouped_examples[i] for i in range(len(grouped_examples)) if i!=group_id])
        neg_examples.append(random.choice(other_group))
      for pos_example, neg_example in zip(pos_examples, neg_examples):
        data.append((text, pos_example, neg_example)
  return data

In [0]:
triplet_data = get_triplets(newsgroups, 1)

In [0]:
triplets = pd.DataFrame(triplet_data, columns = ['text', 'pos', 'neg'])

In [0]:
triplets.to_csv("newsgroups_triplets1.csv")

In [0]:
TEXT = data.Field(
    batch_first = True,
    tokenize = word_tokenize,
    lower = True,
    fix_length=300,
    include_lengths=False,
    stop_words = stopwords.words("english")
)

dataset = data.TabularDataset('newsgroups_triplets1.csv',
                              format = 'csv',
                              fields = [(None, None),
                                        ('text', TEXT),
                                        ('pos', TEXT),
                                        ('neg', TEXT)],
                              skip_header = True)

In [0]:
TEXT.build_vocab(dataset, min_freq=5)

In [0]:
random.seed(42)
train, val = dataset.split(0.8, random_state=random.getstate())

In [0]:
batch_size = 128

train_iter, val_iter = data.BucketIterator.splits((train, val),
                                                  batch_sizes=(batch_size,batch_size),
                                                  sort_key=lambda x: len(x.text))

Embed vectors taken from: http://vectors.nlpl.eu/repository/#


In [4]:
os.listdir()

['__pycache__',
 'news-commentary-v13.ru-en.en',
 'news-commentary-v13.ru-en.ru',
 'src_vocab',
 'tgt_vocab',
 'weights.pt',
 'transformer.py',
 'data',
 'bpe_en.model',
 'bpe_en.vocab',
 'bpe_ru.model',
 'bpe_ru.vocab',
 'ru_en.csv',
 'data.csv',
 'newsgroups_triplets.csv',
 'news_texts.txt',
 'bpe.model',
 'bpe.vocab',
 'newsgroups_triplets1.csv',
 '6.zip',
 '6.zip.1']

In [0]:
def triplet_loss(anchor_embed, pos_embed, neg_embed):
    return F.cosine_similarity(anchor_embed, neg_embed) - F.cosine_similarity(anchor_embed, pos_embed)
    
    
class Tripletnet(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, output_size, pad_idx,
                 use_pretrained=False, embed_vecs=None):
        super(Tripletnet, self).__init__()
        self.vocab_size, self.embed_dim, self.hidden_size, self.pad_idx = vocab_size, embed_dim, hidden_size, pad_idx
        self.embed = torch.nn.Embedding(vocab_size, embed_dim, pad_idx)
        if use_pretrained:
          self.embed = self.embed.from_pretrained(embed_vecs)
        self.dropout1 = torch.nn.Dropout(p=0.2)
        self.rnn = torch.nn.LSTM(input_size=embed_dim,
                                 hidden_size=hidden_size,
                                 bidirectional=True,
                                 batch_first=True)
        self.dropout2 = torch.nn.Dropout(p=0.2)
        self.fc = torch.nn.Linear(hidden_size*2, output_size)
        
    def branch(self, x):
        batch_size = x.size(0)
        x = self.embed(x)
        x = self.dropout1(x)
        memory, x = self.rnn(x)
        x = x[0]
        x = x.contiguous().view(batch_size, self.hidden_size*2)
        x = self.dropout2(x)
        x = self.fc(x)
        return x

    def forward(self, anchor, pos, neg):
        
        anchor = self.branch(anchor)
        pos = self.branch(pos)
        neg = self.branch(neg)
        
        return triplet_loss(anchor, pos, neg)

In [0]:
dev1 = device('cuda')
model = Tripletnet(vocab_size=len(TEXT.vocab.stoi),
                   embed_dim=300, hidden_size=128,
                   output_size=100,
                   pad_idx=TEXT.vocab.stoi["<pad>"]).to(dev1)

In [0]:
optim = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim)

In [24]:
history = nn_train(model, train_iter, val_iter, optim, dev1, n_epochs=20, scheduler=scheduler)
dump_model_weights(model, 'triplet_weights.pt')
dump_vocab(TEXT.vocab, 'triplet-vocab')

HBox(children=(IntProgress(value=0, description='epoch 0', max=708, style=ProgressStyle(description_width='ini…

validation loss 0.03557


HBox(children=(IntProgress(value=0, description='epoch 1', max=708, style=ProgressStyle(description_width='ini…

validation loss 0.03197


HBox(children=(IntProgress(value=0, description='epoch 2', max=708, style=ProgressStyle(description_width='ini…

validation loss 4.42779


HBox(children=(IntProgress(value=0, description='epoch 3', max=708, style=ProgressStyle(description_width='ini…

validation loss 6.28216


HBox(children=(IntProgress(value=0, description='epoch 4', max=708, style=ProgressStyle(description_width='ini…

validation loss 5.83671


HBox(children=(IntProgress(value=0, description='epoch 5', max=708, style=ProgressStyle(description_width='ini…

validation loss 6.71228


HBox(children=(IntProgress(value=0, description='epoch 6', max=708, style=ProgressStyle(description_width='ini…

validation loss 7.25557


HBox(children=(IntProgress(value=0, description='epoch 7', max=708, style=ProgressStyle(description_width='ini…

validation loss 7.46541


HBox(children=(IntProgress(value=0, description='epoch 8', max=708, style=ProgressStyle(description_width='ini…

validation loss 8.36165


HBox(children=(IntProgress(value=0, description='epoch 9', max=708, style=ProgressStyle(description_width='ini…

validation loss 7.48165


HBox(children=(IntProgress(value=0, description='epoch 10', max=708, style=ProgressStyle(description_width='in…

validation loss 7.73642


HBox(children=(IntProgress(value=0, description='epoch 11', max=708, style=ProgressStyle(description_width='in…

validation loss 9.38528


HBox(children=(IntProgress(value=0, description='epoch 12', max=708, style=ProgressStyle(description_width='in…

validation loss 10.17932


HBox(children=(IntProgress(value=0, description='epoch 13', max=708, style=ProgressStyle(description_width='in…

validation loss 10.44265


HBox(children=(IntProgress(value=0, description='epoch 14', max=708, style=ProgressStyle(description_width='in…

validation loss 9.88893


HBox(children=(IntProgress(value=0, description='epoch 15', max=708, style=ProgressStyle(description_width='in…

validation loss 9.88740


HBox(children=(IntProgress(value=0, description='epoch 16', max=708, style=ProgressStyle(description_width='in…

validation loss 10.20629


HBox(children=(IntProgress(value=0, description='epoch 17', max=708, style=ProgressStyle(description_width='in…

validation loss 10.21989


HBox(children=(IntProgress(value=0, description='epoch 18', max=708, style=ProgressStyle(description_width='in…

validation loss 10.54216


HBox(children=(IntProgress(value=0, description='epoch 19', max=708, style=ProgressStyle(description_width='in…

validation loss 10.54809


In [0]:
LABEL = data.LabelField(dtype=torch.int64)

In [0]:
fields = (('text', TEXT), ('label', LABEL))

In [0]:
train_dataset = [data.Example.fromlist(x, fields) for x in zip(dataset.data, dataset.target)]

In [0]:
train_dataset = data.Dataset(train_dataset, fields)

In [0]:
LABEL.build_vocab(train_dataset)

In [0]:
train_dataset = data.BucketIterator(train_dataset, batch_size=batch_size)

In [81]:
encode_as_vecs(train_dataset, model, dev1, 'X_train.npy', 'y_train.npy')

HBox(children=(IntProgress(value=0, max=89), HTML(value='')))

In [86]:
test_dataset = fetch_20newsgroups(subset='test', download_if_missing=True)
test_dataset = [data.Example.fromlist(x, fields) for x in zip(test_dataset.data, test_dataset.target)]
test_dataset = data.Dataset(test_dataset, fields)
test_dataset = data.BucketIterator(test_dataset, batch_size=batch_size)
encode_as_vecs(test_dataset, model, dev1, 'X_test.npy', 'y_test.npy')

HBox(children=(IntProgress(value=0, max=59), HTML(value='')))

## Sklearn kNN

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
X_train, y_train, X_test, y_test = np.load('X_train.npy'), np.load('y_train.npy'), np.load('X_test.npy'), np.load('y_test.npy')

In [0]:
clf = KNeighborsClassifier()

In [90]:
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [91]:
clf.score(X_train, y_train)

0.28619409581050026

In [92]:
clf.score(X_test, y_test)

0.055098247477429634