# 11. Approaching Text Classification/Regression 2

In NLP, every document, text or a string (text object in short) signify an instance which is a row in our dataset representation as a matrix. 

As previously explained, bag of words approach tokenizes the text object and uses every token as a feature. Token counts of the text object are accepted as feature value. Naturally, size is dependent on the number of tokens unsless limited beforehand. 

Representing a word with a vector is called **word embedding**. At the end, we have a dictionary with keys corresponding to words and values corresponding to vectors. The representation of whole document is found by adding all word vectors present in the document. Important part is to find the embedding vectors by reconstructing input sentences.

**Concepts**

- [Word Embeddings]()

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import pandas as pd



In [2]:
# Dataset with folds determined
df = pd.read_csv(os.path.join('data/imdb_folds.csv'))

In [3]:
from nltk.tokenize import word_tokenize

def sentence_to_vec(s, embedding_dict, stop_words, tokenizer):
    
    words = str(s).lower()
    words = tokenizer(words)
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    
    M = []
    
    for w in words:
        if w in embedding_dict:
            M.append(embedding_dict[w])
            
    if len(M) == 0:
        M = np.zeroes(300)
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v**2).sum())


# Let's concoct an example

embedding_dict = {'new': [1,0,0,3],
                 'example': [2,0,1,0]}

s = ' This is the new example'

sentence_to_vec(s, embedding_dict, [], word_tokenize)

array([0.6882472 , 0.        , 0.22941573, 0.6882472 ])

In order for this code to work, we need `wiki-news-300d-1M.vec` embeddings which is added to input folder. Additionaly, we cannot work with the entire embeddings due to RAM limitations. That's why we limit the number embedding vectors as opposed to the code given in the book.

In [4]:
# fasttext.py

import io
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn import linear_model
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer

limit = 10_000 # Number of words in Embedding dictionary

def load_vectors(fname, limit):
    count = 0
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')

    n, d = map(int, fin.readline().split())
    data = {}

    for line in fin:
        count += 1
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))

        if count == limit:
            break
            
    return data

print('Loading embeddings')

embedding_dict = load_vectors('data/wiki-news-300d-1M.vec', limit)

print('Creating sentence vectors')

vectors = []

for review in df.review.values:
    vectors.append(
        sentence_to_vec(s=review, 
                        embedding_dict=embedding_dict, 
                        stop_words=[], 
                        tokenizer=word_tokenize
                        )
        )

vectors = np.array(vectors)
y = df.sentiment.values

kf = model_selection.StratifiedKFold(n_splits=5)

for fold_, (train_, validate_) in enumerate(kf.split(X=vectors, y=y)):

    print(f'Training fold {fold_}')
    X_train = vectors[train_]
    y_train = y[train_]

    X_test = vectors[validate_]
    y_test = y[validate_]
    model = linear_model.LogisticRegression(solver='sag')
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, preds)
    print(f'Accuracy {accuracy}')
    print('')

Loading embeddings
Creating sentence vectors
Training fold 0
Accuracy 0.8172

Training fold 1
Accuracy 0.8159

Training fold 2
Accuracy 0.8133

Training fold 3
Accuracy 0.8231

Training fold 4
Accuracy 0.8209



We are working in notebook so `config` file will be coded as an object such that attributes can be called.

In [5]:
# config.py

class Config:
    def __init__(self, MAX_LEN, 
                 TRAIN_BATCH_SIZE,
                 VALID_BATCH_SIZE,
                 EPOCHS):
        
        self.MAX_LEN = MAX_LEN
        self.TRAIN_BATCH_SIZE = TRAIN_BATCH_SIZE
        self.VALID_BATCH_SIZE = VALID_BATCH_SIZE
        self.EPOCHS = EPOCHS

config = Config(MAX_LEN = 128, 
                TRAIN_BATCH_SIZE = 16, 
                VALID_BATCH_SIZE = 8, 
                EPOCHS = 1 )

EMBEDDING_WORD_LIMIT = 10_000

`dataset` file code

In [6]:
# dataset

import torch

class IMDBDataset:

    def __init__(self, reviews, targets):
        self.reviews = reviews
        self.targets = targets

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = self.reviews[item,:]
        target = self.targets[item]
        
        return {'review': torch.tensor(review, dtype=torch.long),
                'target': torch.tensor(target, dtype=torch.float)}
    
EMBEDDING_WORD_LIMIT = 10_000

`lstm` file

In [7]:
#lstm.py

import torch
import torch.nn as nn


class LSTM(nn.Module):


    def __init__(self, embedding_matrix):

        super(LSTM, self).__init__()
        num_words = embedding_matrix.shape[0]
        embed_dim = embedding_matrix.shape[1]

        self.embedding = nn.Embedding(
            num_embeddings=num_words, 
            embedding_dim=embed_dim
        )

        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32))
        
        self.embedding.weight.requires_grad = False

        self.lstm = nn.LSTM(
            embed_dim,
            128,
            bidirectional=True,
            batch_first=True
        )

        self.out = nn.Linear(512, 1)


    def forward(self, x):

        x = self.embedding(x)
        x, _ = self.lstm(x)
        
        avg_pool = torch.mean(x, 1)
        max_pool, _ = torch.max(x, 1)

        out = torch.cat((avg_pool, max_pool), 1)
        out = self.out(out)

        return out

`enigne.py` file

In [8]:
# engine.py

import torch
import torch.nn as nn

def train(data_loader, model, optimizer, device):

    model.train()

    for data in data_loader:

        reviews = data['review']
        targets = data['target']

        reviews = reviews.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        predictions = model(reviews)

        loss = nn.BCEWithLogitsLoss()(
            predictions,
            targets.view(-1, 1)
        )
        loss.backward()

        optimizer.step()

def evaluate(data_loader, model, device):

    final_predictions = []
    final_targets = []
    model.eval()

    with torch.no_grad():

        for data in data_loader:
            
            reviews = data['review']
            targets = data['target']
            
            reviews = reviews.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            predictions = model(reviews)
            predictions = predictions.cpu().numpy().tolist()
            targets = data['target'].cpu().numpy().tolist()

            final_predictions.extend(predictions)
            final_targets.extend(targets)

    return final_predictions, final_targets

`train.py` file

In [9]:
# train.py

import io
import torch
import numpy as np
import pandas as pd

import tensorflow as tf
from sklearn import metrics


def load_vectors(fname, limit):
    count = 0
    fin = io.open(fname, 'r', encoding='utf-8', 
                  newline='\n', errors='ignore')

    n, d = map(int, fin.readline().split())
    data = {}

    for line in fin:

        count += 1
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))

        if count > limit:
            break
            
    return data


def create_embedding_matrix(word_index, embedding_dict):
    embedding_matrix = np.zeros((len(word_index) + 1, 300))

    for word, i in word_index.items():
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]

    return embedding_matrix


def run(df, fold):

    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    print('Fitting Tokenizer')

    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    X_train = tokenizer.texts_to_sequences(train_df.review.values)
    X_test = tokenizer.texts_to_sequences(valid_df.review.values)

    X_train = tf.keras.preprocessing.sequence.pad_sequences(
        X_train, maxlen=config.MAX_LEN
    )

    X_test = tf.keras.preprocessing.sequence.pad_sequences(
        X_test, maxlen=config.MAX_LEN
    )


    train_dataset = IMDBDataset(reviews=X_train,
                                        targets=train_df.sentiment.values)
    
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=2)

    valid_dataset = IMDBDataset(reviews=X_test,
                                        targets=valid_df.sentiment.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=1)

    print('Loading Embeddings'),
    
    embedding_dict= load_vectors('data/wiki-news-300d-1M.vec', EMBEDDING_WORD_LIMIT)
    
    embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict)

    device = torch.device('cuda')
    model = LSTM(embedding_matrix)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    print('Training model')
    best_accuracy = 0
    early_stopping_counter = 0

    for epoch in range(config.EPOCHS):

        train(train_data_loader, model, optimizer, device)
        outputs, targets = evaluate(valid_data_loader, model, device)

        outputs = np.array(outputs) >= 0.5

        accuracy = metrics.accuracy_score(targets, outputs)
        print(f'Fold: {fold}, Epoch:{epoch}, Accuracy: {accuracy}')

        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1

        if early_stopping_counter > 2:
            break

for fold in range(5):
    run(df, fold)

Fitting Tokenizer
Loading Embeddings


AssertionError: Torch not compiled with CUDA enabled

In [10]:
def load_embeddings(word_index, embedding_file, vector_length=300):

    max_features = len(word_index) + 1
    words_to_find = list(word_index.keys())
    more_words_to_find = []

    for wtf in words_to_find:
        
        more_words_to_find.append(wtf)
        more_words_to_find.append(str(wtf).capitalize())

    more_words_to_find = set(more_words_to_find)

    def get_coefs(word, *arr):
        return word, np.assarray(arr, dtype='float32')

    embeddings_index = dict(
        get_coefs(*o.strip().split(' '))
        for o in open(embedding_file)
        if o.split(' ')[0]
        in more_words_to_find
        and len(0) > 100
    )

    embedding_matrix = np.zeros((max_features, vector_length))

    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)

        if embedding_vector is None:
            embedding_vector = embeddings_index.get(str(word).capitalize())

        if embedding_vector is None:
            embedding_vector = embeddings_index.get(str(word).upper())

        if (embedding_vector is not None and len(embedding_vector) == vector_length):
            embedding_matrix[i] = embedding_vector

    return embedding_matrix
