In [1]:
import os
import numpy as np
import pandas as pd
import torch
import transformers
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import string
from collections import Counter

### Directories and text loading
Initially we will set the main directories and some variables regarding the characteristics of our texts.
We set the maximum sequence length to 25, the maximum number of words in our vocabulary to 12000, and we will use 300-dimensional embeddings. Finally, we load our texts from a CSV file. The text file is the train file of the Quora Kaggle challenge, containing around 808000 sentences (note: the loading cell below actually reads `test.csv`, limited to the first `num_rows` rows).

In [3]:
# Row cap for the CSV read; None means "read the whole file".
num_rows = 1000  # change to another count, or to None for every row

# read_csv treats nrows=None as "no cap", so a single call covers both cases.
data = pd.read_csv('test.csv', nrows=num_rows)

data.head()


Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


### Text Preprocessing
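To preprocess the text we lowercase it and strip punctuation with a small helper; tokenization itself is done later by the pretrained BERT tokenizer rather than by the Keras tokenizer and its `texts_to_sequences` function.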


In [4]:
def preprocess_text(text):
    """Normalize one question: lowercase it and drop ASCII punctuation.

    Args:
        text: The raw cell value. Anything that is not a ``str`` (such as a
            missing-value placeholder) is treated as empty.

    Returns:
        str: The lowercased text with every character listed in
        ``string.punctuation`` removed, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    # A translation table deletes each punctuation character literally. The
    # earlier regex character class built from string.punctuation read the
    # backslash-bracket pair inside it as an escaped "]", so backslashes were
    # never removed from the text.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punctuation)

# Clean both question columns with the same normalizer.
for question_col in ('question1', 'question2'):
    data[question_col] = data[question_col].apply(preprocess_text)

data.head()

Unnamed: 0,test_id,question1,question2
0,0,how does the surface pro himself 4 compare wit...,why did microsoft choose core m3 and not core ...
1,1,should i have a hair transplant at age 24 how ...,how much cost does hair transplant require
2,2,what but is the best way to send money from ch...,what you send money to china
3,3,which food not emulsifiers,what foods fibre
4,4,how aberystwyth start reading,how their can i start reading


In [5]:
# Prefer the GPU whenever CUDA reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
from transformers import BertTokenizer, BertModel

# Pretrained tokenizer (vocabulary) for the uncased base BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained weights -> evaluation mode (deactivates the dropout modules)
# -> moved onto the chosen device. eval() and to() both return the module.
model = BertModel.from_pretrained('bert-base-uncased').eval().to(device)

def tokenize_and_embed(text):
    """Encode one sentence with BERT and mean-pool its token embeddings.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``
    defined earlier in this cell.

    Args:
        text (str): The sentence to embed.

    Returns:
        numpy.ndarray: The last hidden state averaged over the token axis
        (the [CLS] and [SEP] positions are part of the average). The leading
        batch axis of size 1 is kept, so the result is 2-D: ``(1, hidden)``.
    """
    # Let the tokenizer add [CLS]/[SEP] and build the tensors itself, and
    # truncate to the model's position limit so that an over-long input
    # cannot run past the position-embedding table.
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    ).to(device)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        outputs = model(**encoded)

    # Average over dim=1 (tokens), then hand the result back as a CPU array.
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Swap the text in both question columns for its BERT embedding.
for question_col in ('question1', 'question2'):
    data[question_col] = data[question_col].apply(tokenize_and_embed)

data.head()


In [None]:
from torch.nn.utils.rnn import pad_sequence

# Pad the sequences to a fixed length
def pad_sequence(sequence, length=100):
    """Force a vector to exactly ``length`` entries.

    The input is flattened to 1-D first, then cut to its first ``length``
    values or right-padded with zeros.

    NOTE: this definition shadows ``torch.nn.utils.rnn.pad_sequence``, which
    is imported at the top of this cell.

    Args:
        sequence: Array-like of numbers, of any shape.
        length (int): Target number of entries. Defaults to 100.

    Returns:
        numpy.ndarray: 1-D array of shape ``(length,)`` with the input dtype.
    """
    # Flatten before measuring. The embeddings produced earlier carry a
    # leading batch axis of size 1, so len() returned 1 and np.pad applied
    # the (0, 99) pad width to *both* axes, giving a (100, 867) array
    # instead of a length-100 vector.
    flat = np.asarray(sequence).reshape(-1)
    if flat.size > length:
        # NOTE(review): for an embedding this keeps only its first `length`
        # dimensions - confirm that this is the intended reduction.
        return flat[:length]
    return np.pad(flat, (0, length - flat.size), 'constant')

# Bring every vector in both question columns to the fixed length.
for question_col in ('question1', 'question2'):
    data[question_col] = data[question_col].apply(pad_sequence)

data.head()

In [None]:
class VAE(nn.Module):
    """Fully connected variational autoencoder over fixed-size vectors.

    Encoder: ``input_dim -> hidden_dim -> (mu, logvar)``, each of size
    ``latent_dim``. Decoder: ``latent_dim -> hidden_dim -> input_dim``,
    ending in a sigmoid so every output value lies in ``[0, 1]``.

    Args:
        input_dim (int): Width of each input vector. Defaults to 100.
        hidden_dim (int): Width of the hidden layers. Defaults to 50.
        latent_dim (int): Width of the latent code. Defaults to 20.
    """

    def __init__(self, input_dim=100, hidden_dim=50, latent_dim=20):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        # Layers are created in the same order as before so that seeded
        # initialisation gives the same weights for the default sizes.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, latent_dim)  # mean head
        self.fc22 = nn.Linear(hidden_dim, latent_dim)  # log-variance head
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for ``x``."""
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * std`` with ``eps ~ N(0, I)``.

        Noise is drawn on every call; there is no train/eval distinction.
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent codes ``z`` to reconstructions in ``[0, 1]``."""
        h3 = F.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        """Encode, sample and decode.

        ``x`` is reshaped to ``(-1, input_dim)`` first, so a single flat
        vector is treated as a batch of one.

        Returns:
            tuple: ``(reconstruction, mu, logvar)``.
        """
        mu, logvar = self.encode(x.view(-1, self.input_dim))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

In [None]:
# Build the VAE with its default layer sizes. This rebinds the notebook-level
# name `model`, which held the BERT encoder before this cell.
model = VAE()
# Adam over all VAE parameters with a step size of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

def loss_function(recon_x, x, mu, logvar):
    """VAE objective: summed reconstruction BCE plus the KL term.

    Args:
        recon_x (torch.Tensor): Decoder output of shape ``(N, D)`` with
            values in ``[0, 1]``.
        x (torch.Tensor): Target values; reshaped to ``(-1, D)`` so a flat
            vector is accepted.
        mu (torch.Tensor): Latent means from the encoder.
        logvar (torch.Tensor): Latent log-variances from the encoder.

    Returns:
        torch.Tensor: Scalar ``BCE + KLD``, summed (not averaged) over all
        elements.
    """
    # Take the width from the reconstruction instead of a fixed 100, so the
    # loss also works when the model is built with another input size.
    target = x.view(-1, recon_x.size(-1))
    # NOTE(review): binary cross-entropy assumes targets in [0, 1] - confirm
    # that the vectors passed in as `x` satisfy this.
    BCE = F.binary_cross_entropy(recon_x, target, reduction='sum')
    # Closed-form KL divergence between N(mu, exp(logvar)) and N(0, I).
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

In [None]:
def train(epoch):
    """Run one optimisation pass over every (question1, question2) row.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function`` and
    the ``data`` frame. Each row is its own optimisation step: the model
    reads the row's ``question1`` vector and the loss compares the
    reconstruction with the ``question2`` vector of the same row.

    Args:
        epoch (int): Epoch number; used only in the progress messages.
    """
    model.train()
    n_samples = len(data['question1'])
    train_loss = 0
    for batch_idx, (data1, data2) in enumerate(zip(data['question1'], data['question2'])):
        # Cast to float32, the default dtype of nn.Linear parameters.
        data1 = torch.from_numpy(data1).float()
        data2 = torch.from_numpy(data2).float()
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data1)
        loss = loss_function(recon_batch, data2, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        if batch_idx % 100 == 0:
            # One row per step, so the step index is the number of rows
            # already seen and the step loss is the per-sample loss. Both
            # were previously scaled by len(data1), which is the vector
            # length rather than a batch size.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx, n_samples,
                100. * batch_idx / n_samples,
                loss.item()))
    # max(..., 1) keeps the summary line from dividing by zero on empty data.
    print('====> Epoch: {} Average loss: {:.4f}'.format(
          epoch, train_loss / max(n_samples, 1)))

# Ten training passes, with epochs numbered from 1 through 10.
for epoch in range(1, 11):
    train(epoch)

In [None]:
def generate_text(epoch):
    """Decode 64 random latent vectors with the current model and print them.

    Uses the notebook-level ``model``. What is printed is the decoder's raw
    output tensor; nothing here maps it back to words.

    Args:
        epoch (int): Label inserted into the printed message.
    """
    model.eval()
    # Read the latent width and device from the decoder's first layer
    # instead of assuming a 20-dimensional code on the CPU.
    latent_dim = model.fc3.in_features
    latent_device = model.fc3.weight.device
    # Sampling is inference only, so no autograd graph should be built
    # (this also keeps grad_fn out of the printed tensor).
    with torch.no_grad():
        sample = torch.randn(64, latent_dim, device=latent_device)
        sample = model.decode(sample)
    print('====> Generated text after epoch {}: {}'.format(epoch, sample))

# Print ten batches of samples, labelled 1 through 10.
for epoch in range(1, 11):
    generate_text(epoch)