# LANGUAGE MODELS

**Setup: we will include a bunch of libraries**

In [None]:
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy

In [None]:
# Seed every RNG that is already imported at this point so a fresh "Run All"
# is reproducible. NumPy is seeded too because later cells draw from np.random.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)

In [None]:
!wget https://www.dropbox.com/s/nh63vo9ysiura2w/dinos.txt?dl=0 && mv dinos.txt?dl=0 dinos.txt

## Dinosaurus Island Dataset

Inspired by [Kulbear](https://github.com/Kulbear/deep-learning-coursera/blob/master/Sequence%20Models/Dinosaurus%20Island%20--%20Character%20level%20language%20model%20final%20-%20v3.ipynb).

### Dataset and Preprocessing

Read the dataset of dinosaur names, create a list of unique characters (such as a-z), and compute the dataset and vocabulary size.

In [None]:
# Read one dinosaur name per line (stripped, lower-cased).
# The context manager closes the file handle instead of leaking it.
with open('dinos.txt') as dinos_file:
    names = [name.strip().lower() for name in dinos_file]
print(names[:10])

### Bigrams + NLTK

In [None]:
# Flatten all names into one long character sequence and count every character.
chars = list(''.join(names))
freq = nltk.FreqDist(chars)

print(list(freq))

In [None]:
# Count, for every character, which characters follow it.
# `chars` is one flat list over all names, so bigrams also span the boundary between consecutive names.
cfreq = nltk.ConditionalFreqDist(nltk.bigrams(chars))
# Show the follower counts for 'a'.
print(cfreq['a'])

In [None]:
# Turn the bigram counts into maximum-likelihood distributions, one per preceding character.
cprob = nltk.ConditionalProbDist(cfreq, nltk.MLEProbDist)
# cprob[prev].prob(nxt) is conditional on `prev`, so it is labelled p(nxt | prev)
# rather than as a joint probability p(prev, nxt).
for nxt in 'abu':
    print('p(%s | a) = %1.4f' % (nxt, cprob['a'].prob(nxt)))

In [None]:
# Total number of counted characters.
l = sum(freq.values())


def unigram_prob(char):
    """Return the relative frequency of `char` among all counted characters."""
    return freq[char] / l


print('p(a) = %1.4f' % unigram_prob('a'))

In [None]:
# Draw one follower of 'a' from its conditional distribution.
cprob['a'].generate()

Write an NLTK version that generates a random DINOSAUR NAME of length $n$.

In [None]:
from string import ascii_lowercase # all the letters in lowercase

def generate(c, n):
    """Greedily extend the seed `c` into a string with `n - 1` extra characters.

    At each step the lowercase letter with the highest bigram probability after
    the current last character is appended (the alphabetically first letter wins
    ties). If no letter has a positive probability, 'z' is appended.
    """
    name = "" + c
    for _ in range(n - 1):
        followers = cprob[name[-1]]
        # max() returns the first maximal element, matching a strict '>' scan.
        best = max(ascii_lowercase, key=followers.prob)
        name += best if followers.prob(best) > 0 else 'z'
    return name

In [None]:
# Generate a 100-character string starting from 'd'.
generate('d', 100)

Does it really depend on $n$ characters??...




## RNN VERSION

# Character-Level RNN

<img src="http://karpathy.github.io/assets/rnn/charseq.jpeg"  width="500" />

An example RNN with 4-dimensional input and output layers, and a hidden layer of 3 units (neurons). This diagram shows the activations in the forward pass when the RNN is fed the characters "hell" as input. The output layer contains confidences the RNN assigns for the next character (vocabulary is "h,e,l,o"); We want the green numbers to be high and red numbers to be low.

Source: [karpathy blog](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)

### DINO Names generation

#### Data Preprocessing

The characters are a-z (26 characters) plus the "\n" (newline) character, which in this assignment plays a role similar to the `<EOS>` ("End of sentence") token discussed in the lecture; here it marks the end of a dinosaur name rather than the end of a sentence. In the cell below, we create a Python dictionary (i.e., a hash table) that maps each character to an index from 0 to 26.

In [None]:
# Read the whole corpus; the context manager closes the file handle instead of leaking it.
with open('dinos.txt', 'r') as dinos_file:
    data = dinos_file.read().lower()

# Sort once so the vocabulary order, and therefore both mappings, is deterministic.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}
print(ix_to_char)

Convert it into a dataset.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class DinosDataset(Dataset):
    """Character-level dataset over the lines of ``dinos.txt``.

    Item ``i`` is a pair ``(x, y)`` for the i-th name:
      * ``x`` - float tensor of shape ``(len(name), vocab_size)`` holding the
        one-hot encoding of every character of the name;
      * ``y`` - long tensor of shape ``(len(name),)`` holding the index of the
        following character, with ``'\\n'`` as the target after the last one.
    """

    def __init__(self):
        super().__init__()
        with open('dinos.txt') as f:
            content = f.read().lower()
        # '\n' is always a target in __getitem__, so make sure it is in the
        # vocabulary even when the file has no line break at all.
        self.vocab = sorted(set(content) | {'\n'})
        self.vocab_size = len(self.vocab)
        self.lines = content.splitlines()

        # Build the lookup tables from this dataset's own vocabulary, not from
        # a notebook-level variable, so the class does not depend on which
        # cells were executed before it.
        self.ch_to_idx = {ch: i for i, ch in enumerate(self.vocab)}
        self.idx_to_ch = {i: ch for i, ch in enumerate(self.vocab)}

    def __getitem__(self, index):
        """Return the one-hot inputs and next-character targets for one name."""
        line = self.lines[index]
        x_str = line
        # Targets are the inputs shifted left by one, closed by the end-of-name marker.
        y_str = line[1:] + '\n'
        x = torch.zeros([len(x_str), self.vocab_size], dtype=torch.float)
        y = torch.empty(len(x_str), dtype=torch.long)
        for i, (x_ch, y_ch) in enumerate(zip(x_str, y_str)):
            x[i][self.ch_to_idx[x_ch]] = 1
            y[i] = self.ch_to_idx[y_ch]

        return x, y

    def __len__(self):
        """Return the number of names (lines) in the file."""
        return len(self.lines)

In [None]:
# Instantiate the dataset; the constructor reads dinos.txt.
dino_dataset = DinosDataset()

### Build Model

##### Setup

In [None]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import pdb
from torch.utils.data import Dataset, DataLoader

# Load the autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

# Allow wider lines when printing tensors.
torch.set_printoptions(linewidth=200)
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
class RNN(nn.Module):
    """One step of a vanilla RNN with dropout applied before the tanh.

    `forward` takes the previous hidden state and the current input and
    returns the new hidden state together with the output scores.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Wh maps the concatenated [hidden, input] to the next hidden state,
        # Ws maps the hidden state to the output scores.
        concat_size = hidden_size + input_size
        self.Wh = nn.Linear(concat_size, hidden_size)
        self.Ws = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, h_prev, x):
        joined = torch.cat((h_prev, x), dim=1)
        pre_activation = self.Wh(joined)
        h = self.dropout(pre_activation).tanh()
        return h, self.Ws(h)

### Train


In [None]:
# No batch_size is passed, so the loader uses DataLoader's default batching; names come in shuffled order.
train_dino_dataloader = DataLoader(dino_dataset, shuffle=True)
# Peek at a single (x, y) batch.
next(iter(train_dino_dataloader))

In [None]:
input_size = dino_dataset.vocab_size  # the notebook text expects 27 here: a-z plus '\n'
hidden_size = 50

# The output layer has the same size as the input: one score per vocabulary character.
model = RNN(input_size, hidden_size, input_size).to(device)
cross_entropy = nn.CrossEntropyLoss()
sgd = optim.SGD(model.parameters(), lr=1e-2)  # alternatives: Adam or RMSprop

In [None]:
def train_one_epoch(model, objective, optimizer):
    """Run one pass over `train_dino_dataloader`, updating `model` after every batch.

    Args:
        model: module called as ``model(h_prev, x_t)`` and returning ``(h, y_pred)``.
        objective: loss callable applied to ``(y_pred, y_t)`` at each time step.
        optimizer: optimizer over the model's parameters.

    The per-step losses of a sequence are summed and back-propagated once per
    batch. Reads the notebook-level `train_dino_dataloader`, `hidden_size`
    and `device`.
    """
    model.train()
    for x, y in train_dino_dataloader:
        seq_len = x.shape[1]
        if seq_len == 0:
            # No time steps (e.g. a blank line): `loss` would stay the int 0
            # and `loss.backward()` would raise, so skip the batch.
            continue

        # zero the gradient
        optimizer.zero_grad()

        # move the batch to the target device and start from a zero hidden state
        x, y = x.to(device), y.to(device)
        h_prev = torch.zeros([x.shape[0], hidden_size], dtype=torch.float, device=device)

        # feed the input into the model symbol by symbol, accumulating the loss
        loss = 0
        for t in range(seq_len):
            h_prev, y_pred = model(h_prev, x[:, t])
            loss += objective(y_pred, y[:, t])

        # create gradients & perform optimization step
        loss.backward()
        optimizer.step()


In [None]:
def train(toe_f, model, objective, optimizer, dataset='dinos', epochs=1):
    """Call `toe_f(model, objective, optimizer)` once per epoch.

    The 1-based epoch number is printed before each call. `dataset` is
    accepted but not used by this function.
    """
    for done in range(epochs):
        print('Epoch:' + str(done + 1))
        toe_f(model, objective, optimizer)

In [None]:
# Train the RNN for 50 epochs with cross-entropy loss and the SGD optimizer.
train(train_one_epoch, model, cross_entropy, sgd, epochs=50)

### Sample

In [None]:
def sample(model, max_len=50):
    """Sample one name from `model` and return it as a list of character indices.

    Args:
        model: module called as ``model(h_prev, x)`` and returning ``(h, y_pred)``.
        max_len: maximum number of characters sampled after the random start
            character (default 50).

    Returns:
        A list of vocabulary indices that always ends with exactly one
        end-of-name ('\\n') index.

    Reads the notebook-level `dino_dataset`, `hidden_size` and `device`, and
    switches `model` to eval mode.
    """
    model.eval()
    vocab_size = dino_dataset.vocab_size
    newline_idx = dino_dataset.ch_to_idx['\n']
    with torch.no_grad():
        h_prev = torch.zeros([1, hidden_size], dtype=torch.float, device=device)
        # Start from any character except the end-of-name marker.
        current_idx = random.choice([i for i in range(vocab_size) if i != newline_idx])
        indices = [current_idx]

        while current_idx != newline_idx and len(indices) <= max_len:
            # Feed back the character that was actually emitted, so the hidden
            # state follows the generated name rather than the argmax path.
            x = h_prev.new_zeros([1, vocab_size])
            x[0, current_idx] = 1
            h_prev, y_pred = model(h_prev, x)

            # np.random.choice checks that p sums to 1 at float64 precision;
            # float32 softmax output can miss that, so renormalise in float64.
            probs = torch.softmax(y_pred, dim=1).cpu().numpy().ravel().astype(np.float64)
            probs /= probs.sum()
            current_idx = int(np.random.choice(vocab_size, p=probs))
            indices.append(current_idx)

        # Terminate names that hit the length limit, without doubling the marker.
        if indices[-1] != newline_idx:
            indices.append(newline_idx)
    return indices

In [None]:
def print_sample(sample):
    """Print the characters for the indices in `sample`, adding no separator and no trailing newline."""
    print(''.join(dino_dataset.idx_to_ch[idx] for idx in sample), end='')

In [None]:
# Sample one name from the model and print it.
name = sample(model)
print_sample(name)

Add some kind of logging: at the very least print a generated sample; better still, also log the loss.

In [None]:
# train_one_epoch variant that periodically logs the loss and a generated sample
def train_one_epoch_vis(model, objective, optimizer, log_every=100):
    """Run one training epoch, logging progress every `log_every` batches.

    Args:
        model: module called as ``model(h_prev, x_t)`` and returning ``(h, y_pred)``.
        objective: loss callable applied to ``(y_pred, y_t)`` at each time step.
        optimizer: optimizer over the model's parameters.
        log_every: print the summed sequence loss and one sampled name after
            every `log_every`-th batch (default 100).

    Reads the notebook-level `train_dino_dataloader`, `hidden_size` and
    `device`, and uses `sample` / `print_sample` for the generated name.
    """
    model.train()
    for line_num, (x, y) in enumerate(train_dino_dataloader):
        seq_len = x.shape[1]
        if seq_len == 0:
            # No time steps: there is no tensor loss to back-propagate.
            continue

        optimizer.zero_grad()
        x, y = x.to(device), y.to(device)
        h_prev = torch.zeros([x.shape[0], hidden_size], dtype=torch.float, device=device)

        loss = 0
        # A separate index keeps the batch counter `line_num` intact.
        for t in range(seq_len):
            h_prev, y_pred = model(h_prev, x[:, t])
            loss += objective(y_pred, y[:, t])

        loss.backward()
        optimizer.step()

        if (line_num + 1) % log_every == 0:
            print('[{}] loss: {:.4f}'.format(line_num + 1, loss.item()))
            print_sample(sample(model))
            # sample() switches the model to eval mode; re-enable dropout for training.
            model.train()

In [None]:
# Train for another 50 epochs, this time with the logging variant of the epoch function.
train(train_one_epoch_vis, model, cross_entropy, sgd, epochs = 50)

# Hometask. Part 1. 
(1.5 points)

Implement LSTM.

Evaluate on another set of data -> text generation.


In [None]:
# Hometask placeholder: the `raise` sits directly in the class body, so it fires
# as soon as this cell is executed, not when the class is instantiated.
class LSTM(nn.Module):
    raise NotImplementedError

## FUN PART

inspired by [MelLain](https://github.com/MelLain/hse-nlp/blob/master/seminars/sem5_LMs/Dinosaur%20Island%20LM.ipynb)

In [None]:
import numpy as np
import pandas as pd
from os import listdir
from lxml.html import fromstring

from matplotlib import pyplot as plt

%matplotlib inline

In [None]:
with open('mytext.txt', 'w') as file:
  file.write("""Dursley pretended she didn’t have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbours would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn’t want Dudley mixing with a child like that.
When Mr and Mrs Dursley woke up on the dull, grey Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr Dursley hummed as he picked out his most boring tie for work and Mrs Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair.
None of them noticed a large tawny owl flutter past the window.
At half past eight, Mr Dursley picked up his briefcase, pecked Mrs Dursley on the cheek and tried to kiss Dudley goodbye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. ‘Little tyke,’ chortled Mr Dursley as he left the house. He got into his car and backed out of number four’s drive.
It was on the corner of the street that he noticed the first sign of something peculiar – a cat reading a map. For a second, Mr Dursley didn’t realise what he had seen – then he jerked his head around to look again. There was a tabby cat standing
2
THE BOY WHO LIVED
on the corner of Privet Drive, but there wasn’t a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr Dursley blinked and stared at the cat. It stared back. As Mr Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive – no, looking at the sign; cats couldn’t read maps or signs. Mr Dursley gave himself a little shake and put the cat out of his mind. As he drove towards town he thought of nothing except a large order of drills he was hoping to get that day.
But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn’t help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr Dursley couldn’t bear people who dressed in funny clothes – the get-ups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr Dursley was enraged to see that a couple of them weren’t young at all; why, that man had to be older than he was, and wearing an emerald- green cloak! The nerve of him! But then it struck Mr Dursley that this was probably some silly stunt – these people were obviously collecting for something ... yes, that would be it. The traffic moved on, and a few minutes later, Mr Dursley arrived in the Grunnings car park, his mind back on drills.
Mr Dursley always sat with his back to the window in his office on the ninth floor. If he hadn’t, he might have found it
3

HARRY POTTER AND THE PHILOSOPHER’S STONE
harder to concentrate on drills that morning. He didn’t see the owls swooping past in broad daylight, though people down in the street did; they pointed and gazed open-mouthed as owl after owl sped overhead. Most of them had never seen an owl even at night-time. Mr Dursley, however, had a perfectly normal, owl-free morning. He yelled at five different people. He made several important telephone calls and shouted a bit more. He was in a very good mood until lunchtime, when he thought he’d stretch his legs and walk across the road to buy himself a bun from the baker’s opposite.
He’d forgotten all about the people in cloaks until he passed a group of them next to the baker’s. He eyed them angrily as he passed. He didn’t know why, but they made him uneasy. This lot were whispering excitedly, too, and he couldn’t see a single collecting tin. It was on his way back past them, clutching a large doughnut in a bag, that he caught a few words of what they were saying.
‘The Potters, that’s right, that’s what I heard –’
‘– yes, their son, Harry –’
Mr Dursley stopped dead. Fear flooded him. He looked
back at the whisperers as if he wanted to say something to them, but thought better of it.
He dashed back across the road, hurried up to his office, snapped at his secretary not to disturb him, seized his telephone and had almost finished dialling his home number when he changed his mind. He put the receiver back down and stroked his moustache, thinking no, he was being stupid. Potter wasn’t such an unusual name.""")

You can load any text in txt format, let's experiment here.

In [None]:
# Split the text into lower-cased "sentences" on every '.'.
# The context manager closes the file handle instead of leaking it.
with open('mytext.txt', 'r') as text_file:
    sents = [sent.lower() for sent in text_file.read().split('.')]
print(sents[:10])

In [None]:
# One row per sentence, stored in a single 'text' column.
sents_pd = pd.DataFrame(sents, columns=['text'])
sents_pd.head()

### Preprocess


Remove punctuation and stop words, and create lemmas using NLTK.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

from string import punctuation

In [None]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words('english'))

# The sample text uses typographic single quotes (‘ ’) for dialogue, so they
# are stripped from word edges together with the other punctuation marks.
punct = punctuation + '«»—…“”‘’*№–'

def lemmatize(text):
    """Return `text` lower-cased, stripped of edge punctuation and stop words, and lemmatized.

    Words are obtained by whitespace splitting. Each word has the characters in
    `punct` stripped from both ends. Empty results and stop words are dropped,
    and the remaining words are lemmatized and joined with single spaces.
    """
    words = (word.strip(punct) for word in text.lower().split())
    return " ".join(lemmatizer.lemmatize(word) for word in words if word and word not in stops)

In [None]:
# NOTE: the column names are the reverse of what they suggest: 'lemma' keeps
# the raw sentence, while 'text' is overwritten with the lemmatized sentence.
sents_pd['lemma'] = sents_pd.text
sents_pd.text = sents_pd.text.apply(lemmatize)
# Plain dropna() already drops rows containing any missing value; recent pandas
# raises TypeError when `how` and `thresh` are passed together.
sents_pd = sents_pd.dropna()
# Keep only sentences whose lemmatized text is longer than one character.
sents_pd = sents_pd.loc[sents_pd['text'].str.len() > 1]

In [None]:
# Inspect the first ten preprocessed rows.
sents_pd.head(10)

### Embeddings

Word2Vec from `gensim`.

In [None]:
# One token list per preprocessed sentence.
vocab = [nltk.word_tokenize(sen) for sen in sents_pd.text]

print(vocab)

In [None]:
from gensim.models import Word2Vec, FastText

# NOTE: this rebinds the notebook-level name `model` (it held the RNN until here) to a Word2Vec model.
model = Word2Vec(vocab, size=300, window=5, min_count=5, workers=4)
model.save('word2v.model')
# Query through the KeyedVectors in `model.wv`: calling most_similar on the
# model object itself is deprecated and no longer exists in gensim 4.
model.wv.most_similar(u'dursley')

Compare with FastText from the same library `gensim`

In [None]:
# Train FastText on the same token lists; min_n=5 sets the shortest character n-gram length.
ft_model = FastText(vocab, size=300, min_n=5)
ft_model.save('ft.model')
# Query through the KeyedVectors in `ft_model.wv`: calling most_similar on the
# model object itself is deprecated and no longer exists in gensim 4.
ft_model.wv.most_similar('dursley')

# Hometask 2.

(1 point)

Find fun pretrained W2V (w2v on poetry, nature, basically whatever)

In [None]:
# Placeholder for Hometask 2: this cell raises until an implementation replaces it.
raise NotImplementedError

## Sentence Embeddings

+ mean Word2Vec

In [None]:
class MeanEmbeddingVectorizer(object):
    """Embed each sentence as the plain average of its word vectors.

    Args:
        word2vec: non-empty mapping ``word -> 1-D vector``; all vectors are
            expected to have the same length.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Peek at one vector to learn the dimensionality without removing
        # anything from the mapping that was passed in.
        first_vector = next(iter(word2vec.values()), None)
        if first_vector is None:
            raise ValueError("word2vec must contain at least one vector")
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn style chaining."""
        return self

    def transform(self, X):
        """Return an array of shape ``(len(X.text), dim)``.

        `X` must expose a `text` attribute that iterates over
        whitespace-separated strings (e.g. a DataFrame with a 'text' column).
        Unknown words count as zero vectors; a sentence without any words
        maps to the zero vector.
        """
        rows = []
        for text in X.text:
            vectors = [self.word2vec.get(word, np.zeros(self.dim)) for word in text.split()]
            # np.mean of an empty list would yield NaN and break np.stack below.
            rows.append(np.mean(vectors, axis=0) if vectors else np.zeros(self.dim))
        return np.stack(rows)

In [None]:
from gensim.models import Word2Vec, FastText

# Reload the saved Word2Vec model and build a plain {word: vector} dict from it.
model = Word2Vec.load("word2v.model") 
# NOTE(review): `index2word` / `syn0` look like pre-4.0 gensim attribute names; confirm against the installed gensim version.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

mean_w2v = MeanEmbeddingVectorizer(w2v)
# Embed the first ten sentences.
mean_w2v.transform(sents_pd[:10])

+ weighted mean Word2Vec

weights from TfIdfVectorizer

In [None]:
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

class TfidfEmbeddingVectorizer(object):
    """Embed each sentence as the mean of its IDF-weighted word vectors.

    Args:
        word2vec: non-empty mapping ``word -> 1-D vector``; all vectors are
            expected to have the same length.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        # Peek at one vector to learn the dimensionality without removing
        # anything from the mapping that was passed in.
        first_vector = next(iter(word2vec.values()), None)
        if first_vector is None:
            raise ValueError("word2vec must contain at least one vector")
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """Learn one IDF weight per word from the documents in `X`.

        `X` is an iterable of documents, each either a whitespace-separated
        string or an already tokenized sequence of words.
        """
        # Split raw strings into words; an identity analyzer on a string would
        # iterate over its characters and produce per-character weights.
        tfidf = TfidfVectorizer(
            analyzer=lambda doc: doc.split() if isinstance(doc, str) else doc)
        tfidf.fit(X)
        # A word never seen during fit is at least as rare as the rarest known
        # word, so it falls back to the largest IDF.
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array of shape ``(len(X.text), dim)``.

        `X` must expose a `text` attribute that iterates over
        whitespace-separated strings. Each word vector is multiplied by the
        word's weight and the results are averaged; unknown words count as
        zero vectors and a sentence without any words maps to the zero vector.

        Raises:
            RuntimeError: if called before `fit`.
        """
        if self.word2weight is None:
            raise RuntimeError("fit() must be called before transform()")
        rows = []
        for text in X.text:
            weighted = [self.word2vec.get(word, np.zeros(self.dim)) * self.word2weight[word]
                        for word in text.split()]
            # np.mean of an empty list would yield NaN and break np.stack below.
            rows.append(np.mean(weighted, axis=0) if weighted else np.zeros(self.dim))
        return np.stack(rows)

In [None]:
weighted_mean_w2v = TfidfEmbeddingVectorizer(w2v)
# Fit the weights on all sentences, then embed the first ten.
weighted_mean_w2v.fit(sents_pd.text.values, sents_pd.text.values).transform(sents_pd[:10])

## Doc2Vec

Unlike the Word2Vec model, which we have already trained, the Doc2Vec model still needs to be trained. We will train it on our own text.


In [None]:
# Import only the names this notebook uses instead of star-importing the module.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# TaggedDocument expects a list of tokens; a raw string would be consumed
# character by character, so split every sentence into words first.
splitted_texts = [text.split() for text in sents_pd.text]
idx = [str(i) for i in range(len(splitted_texts))]

# One tagged document per sentence, tagged with its position as a string.
docs = [TaggedDocument(words, [tag]) for words, tag in zip(splitted_texts, idx)]

# dm=0 selects the distributed bag-of-words (PV-DBOW) training mode.
model = Doc2Vec(size=300, dbow_words=0, window=5, min_count=5, workers=4, alpha=0.025, min_alpha=0.01, dm=0)
model.build_vocab(docs)
model.train(docs, total_examples=len(docs), epochs=50)

In [None]:
# Persist the trained Doc2Vec model to disk.
model.save("doc2v.model")

Build a vectorizer

In [None]:
class Doc2VecVectorizer(object):
    """Embed sentences with a trained Doc2Vec model via `infer_vector`.

    Args:
        d2v_model: object exposing ``infer_vector(list_of_tokens)``.
    """

    def __init__(self, d2v_model):
        self.d2v_model = d2v_model

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn style chaining."""
        return self

    def transform(self, X):
        """Return one inferred vector per entry of `X.text` as a 2-D array.

        Each entry is either a whitespace-separated string or an already
        tokenized sequence of words.
        """
        # infer_vector works on a list of tokens; a raw string would be
        # treated as a sequence of single characters.
        return np.array([
            self.d2v_model.infer_vector(text.split() if isinstance(text, str) else list(text))
            for text in X.text
        ])

In [None]:
# Reload the saved Doc2Vec model and wrap it in the vectorizer.
d2v_model = Doc2Vec.load("doc2v.model") 

d2v = Doc2VecVectorizer(d2v_model)
# Embed the first ten sentences.
d2v.transform(sents_pd[:10])

# Hometask 3

(3 points)

Find a different type of context-based embedding to use on your text and later compare it to mean W2V, D2V, etc. Any model, pretrained or trained by you on some kind of data, should provide you with sentence embeddings. A good start: ELMo, BERT, or whatever you like.

In [None]:
# Placeholder for Hometask 3: this cell raises until an implementation replaces it.
raise NotImplementedError

# Hometask 4

(4.5 points)

Create the same visualization for weighted mean W2V, mean W2V and weighted W2V for FastText/other pretrained W2V, Doc2Vec and your own other embedding: ELMo, etc.

## Visualization 

In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)

def get_tsne_vectors(text, labels=None):
    """Project sentence embeddings to 2-D with t-SNE.

    Args:
        text: 2-D array-like of embeddings, one row per sentence.
        labels: optional sequence with one label per row of `text`; defaults
            to the notebook-level `sents_pd.text`, so by default `text` must
            hold an embedding for every sentence in `sents_pd`.

    Returns:
        DataFrame with columns 'x_coord' and 'y_coord', indexed by the labels
        (index name 'text').

    Raises:
        ValueError: if the number of labels differs from the number of rows.
    """
    if labels is None:
        labels = sents_pd.text
    embeddings = np.asarray(text)
    if len(labels) != len(embeddings):
        raise ValueError(
            'got %d labels for %d embedding rows' % (len(labels), len(embeddings)))
    coords = tsne.fit_transform(embeddings)
    return pd.DataFrame(coords,
                        index=pd.Index(labels, name='text'),
                        columns=[u'x_coord', u'y_coord'])

In [None]:
from bokeh.models import ColumnDataSource, LabelSet, HoverTool
from bokeh.plotting import figure, show, output_file
from bokeh.core.properties import value
from bokeh.io import output_notebook
output_notebook()

def tsne_embed_viz(vectors, title):
    """Show an interactive Bokeh scatter plot of 2-D points.

    Args:
        vectors: data handed to ``ColumnDataSource``. The glyphs and the hover
            tooltip refer to the fields 'x_coord', 'y_coord' and 'text'
            (presumably a DataFrame whose index is named 'text'; verify against the caller).
        title: plot title.
    """
    # add our DataFrame as a ColumnDataSource for Bokeh
    plot_data = ColumnDataSource(vectors)

    # create the plot and configure the
    # title, dimensions, and tools
    tsne_plot = figure(title=title,
                      plot_width = 800,
                      plot_height = 800,
                      tools= (u'pan, wheel_zoom, box_zoom,'
                              u'box_select, reset'),
                      active_scroll=u'wheel_zoom')

    # add a hover tool to display the sentence and its coordinates on roll-over
    tsne_plot.add_tools( HoverTool(tooltips = [('sentence', u'@text'),
                                              ("(x,y)", "(@x_coord, @y_coord)")] ))
    # (disabled) permanent text labels next to every point:
    #labels = LabelSet(x=u'x_coord', y=u'y_coord', text=u'word', y_offset=6,
                      #text_font_size=u'8pt', text_color=u'#555555',
                      #source=plot_data, text_align='center')
    #tsne_plot.add_layout(labels)

    # draw the points as circles on the plot
    tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                    color=u'orange', line_alpha=0.6, fill_alpha=0.3,
                    size=5, hover_line_color=u'black')

    # configure visual elements of the plot
    tsne_plot.title.text_font_size = value(u'16pt')
    tsne_plot.xaxis.visible = True
    tsne_plot.yaxis.visible = True
    # engage!
    show(tsne_plot);


In [None]:
# Placeholder for the Hometask 4 visualizations: this cell raises until an implementation replaces it.
raise NotImplementedError