# Playing with PyTorch and NLP

In [1]:
import string
from collections import Counter

import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [2]:
# Toy frame: five names, each tagged with an integer class label.
df = pd.DataFrame(
    data={
        'LABEL': [1, 2, 1, 1, 2],
        'NAME': ['ram', 'shyam', 'jadu', 'madhu', 'ganesh'],
    },
    columns=['LABEL', 'NAME'],
)

In [3]:
class MyDataset(Dataset):
    """Map-style dataset that serves the rows of a DataFrame one at a time.

    Input:
        df (pandas.DataFrame): frame exposing a ``NAME`` and a ``LABEL``
            column; rows are addressed by integer position.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        """
        Return:
            (int): the number of rows in the wrapped DataFrame
        """
        # Needed so ``len(dataset)`` works and samplers can size the dataset.
        return len(self.df)

    def __getitem__(self, index):
        """
        Input:
            index (int): positional row index (looked up with ``iloc``)
        Return:
            (dict): ``{'x_data': <NAME value>, 'y_target': <LABEL value>}``
        """
        row = self.df.iloc[index]
        return {'x_data': row.NAME, 'y_target': row.LABEL}

In [4]:
# Wrap the toy DataFrame in the Dataset class defined in the previous cell.
mydata = MyDataset(df)

In [5]:
# Fetch the sample stored at position 3 (subscription dispatches to __getitem__).
mydata[3]

{'x_data': 'madhu', 'y_target': 1}

In [6]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Tokens receive consecutive indices in insertion order. When ``add_unk``
    is true, an unknown-token entry is registered at construction time and
    its index is returned for any token that is not in the mapping.
    """

    def __init__(self, token_to_index=None, add_unk=True, unk_token="<UNK>"):
        """
        Input:
          token_to_index (dict): a predefined mapping from token
                           to index
          add_unk (bool): flag to indicate whether the UNK token needs to
                           be added
          unk_token (str): the unk token to add into Vocabulary
        """

        if token_to_index is None:
            token_to_index = {}
        self._token_to_index = token_to_index
        self._idx_to_token = {
            idx: token
            for token, idx in self._token_to_index.items()
        }
        self._add_unk = add_unk
        self._unk_token = unk_token

        # -1 marks "no UNK entry"; replaced by the real index when add_unk.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """
        Return: Dictionary that can be serialized; its keys match the
            keyword arguments of ``__init__``.
        """
        return {
            'token_to_index': self._token_to_index,
            'add_unk': self._add_unk,
            # The attribute is stored privately as ``_unk_token``.
            'unk_token': self._unk_token
        }

    @classmethod
    def from_serializable(cls, contents):
        """
        Input:
            contents (dict): a dictionary as produced by
                ``to_serializable``
        Return:
            Instantiates the Vocabulary
            from a serializable dictionary
        """
        return cls(**contents)

    def add_token(self, token):
        """
        Input:
            token (str): the item to be added into the Vocabulary
        Return:
            index (int): the integer corresponding to the token
                (the existing index if the token is already present)
        """

        if token in self._token_to_index:
            return self._token_to_index[token]

        # New tokens take the next free position.
        index = len(self._token_to_index)
        self._token_to_index[token] = index
        self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """
        Retrieve the index associated with the token or the
        UNK index if the token isn't present

        Input:
            token(str): the token to look up
        Return:
            index (int):  the index corresponding to the
            token
        Raises:
            KeyError: if the token is absent and the Vocabulary was
            built with ``add_unk=False``
        Notes:
            `unk_index` needs to be >=0 (having added into the
            Vocabulary) for the UNK functionality
        """

        if self._add_unk:
            return self._token_to_index.get(token, self.unk_index)
        return self._token_to_index[token]

    def lookup_index(self, index):
        """
        Input:
            index (int): the index to look up
        Return:
            token (str): the token stored at that index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """

        if index not in self._idx_to_token:
            raise KeyError(
                "the index {} is not in the Vocabulary".format(index))

        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size={})>".format(len(self))

    def __len__(self):
        return len(self._token_to_index)

In [8]:
class ReviewVectorizer(object):
    """
    The Vectorizer class coordinates with the Vocabulary class and
    puts them into use: one vocabulary for review words and one for
    rating labels.
    """

    def __init__(self, review_vocab, rating_vocab):
        """
        Input:
            review_vocab (Vocabulary): maps words to integers
            rating_vocab (Vocabulary): maps class labels to integers
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review):
        """
        create collapsed one hot vector for the review
        input:
            review (str): the review
        return:
            one_hot (np.ndarray): the collapsed one hot encoding, a
            float32 vector of length len(review_vocab) holding 1 at the
            index of every token found in the review
        """

        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)

        for token in review.split(" "):
            # Substring test against string.punctuation: drops punctuation
            # tokens and the empty strings produced by repeated spaces.
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1

        return one_hot

    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        """
        Instantiate the vectorizer from the dataset dataframe
        input:
            review_df (pandas.DataFrame): the review dataset; its
            `review` and `rating` columns are read
            cutoff (int): the parameter for frequency based
            filtering; only words seen more than `cutoff` times are kept
        return:
            an instance of ReviewVectorizer
        """

        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # add ratings (sorted so the label indices are deterministic)
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        # count every non-punctuation word across all reviews
        word_counts = Counter()

        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1

        # add top words if count > provided cutoff
        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """
        Instantiate a ReviewVectorizer from a serializable
        dictionary

        input:
            contents (dict): the serializable dictionary with
            'review_vocab' and 'rating_vocab' entries
        return:
            an instance of the ReviewVectorizer class
        """

        review_vocab = Vocabulary.from_serializable(
            contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(
            contents['rating_vocab'])

        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)

    def to_serializable(self):
        """
        Create the serializable dictionary for caching
        return:
            contents (dict): the serializable dictionary
        """

        return {
            'review_vocab': self.review_vocab.to_serializable(),
            'rating_vocab': self.rating_vocab.to_serializable()
        }

In [9]:
def generate_batches(dataset,
                     batch_size,
                     shuffle=True,
                     drop_last=True,
                     device='cpu'):
    """
    A generator function which wraps the PyTorch DataLoader.
    It will ensure each tensor is on the right device location

    Input:
        dataset: the dataset handed to the DataLoader; each batch it
            produces must be a dict
        batch_size (int): number of samples per batch
        shuffle (bool): forwarded to the DataLoader
        drop_last (bool): forwarded to the DataLoader
        device (str): target passed to `.to()` for every batch value
    Yield:
        (dict): one dict per batch with the same keys as the collated
        batch; values that expose `.to` are moved to `device`, any other
        value is passed through unchanged
    """

    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last)

    for data_dict in dataloader:
        # Collated string fields come back as plain lists without `.to`,
        # so only device-movable values are transferred.
        yield {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in data_dict.items()
        }

In [7]:
# Sample values: a small dict and three ways of building 'hello world'.
x = dict(zip(('a', 'b', 'c'), (37, 42, 927)))

y = ('hello '
     'world')  # implicit concatenation of adjacent literals
z = ''.join(['hello ', 'world'])
a = str.format('hello {}', 'world')


class foo(object):
    """Tiny sample class with two trivial methods."""

    def f(self):
        """Return 37 multiplied by the constant -+2."""
        factor = -+2
        product = 37 * factor
        return product

    def g(self, x, y=42):
        """Return `y` (42 by default); `x` is accepted but not used."""
        result = y
        return result


def f(a):
    """Return 37 plus the negated slice ``a[42 - x:y**3]``.

    The slice bounds are computed from the module-level names ``x`` and
    ``y``, not from arguments.
    """
    window = a[42 - x:y**3]
    negated = -(+window)
    return 37 + negated