<a href="https://colab.research.google.com/github/namratagulati/textingwithcnn/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import re

In [2]:
def clean(string):
    """Normalize a raw sentence into lowercase, space-separated tokens.

    Processing steps, in order:

    1. Every character that is not an ASCII letter, a digit, or one of
       ``( ) , ! ? ' `` ` is replaced with a space.
    2. Common English contraction suffixes are split from the preceding
       word (``don't`` -> ``do n't``, ``it's`` -> ``it 's``).
    3. Commas, exclamation marks, parentheses and question marks are
       padded with spaces so they become standalone tokens.
    4. Runs of whitespace are collapsed to one space, and the result is
       stripped and lowercased.

    Args:
        string: The raw text to clean.

    Returns:
        The cleaned, lowercased text.
    """
    # Replace every character outside the allowed set with a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Split contraction suffixes off the word they are attached to. The order
    # matches the original sequence of substitutions.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = re.sub(suffix, " " + suffix, string)

    # Pad punctuation with spaces. A back-reference re-emits the matched
    # character itself; the previous replacement strings (" \( ", " \) ",
    # " \? ") leaked a literal backslash into the output and relied on
    # invalid string escape sequences.
    string = re.sub(r"([(),!?])", r" \1 ", string)

    # Collapse runs of whitespace introduced by the substitutions above.
    string = re.sub(r"\s{2,}", " ", string)

    return string.strip().lower()

In [3]:
def load(positive_data, negative_data):
    """Load positive and negative examples, clean them and build labels.

    Each file is read as UTF-8 text with one example per line. Every line
    is stripped of surrounding whitespace and passed through ``clean``.

    Args:
        positive_data: Path to the file containing positive examples.
        negative_data: Path to the file containing negative examples.

    Returns:
        A two-element list ``[x_text, y]`` where ``x_text`` is the list of
        cleaned sentences (all positives first, then all negatives) and
        ``y`` is an integer array of shape ``(len(x_text), 2)`` holding
        ``[0, 1]`` for each positive and ``[1, 0]`` for each negative
        example.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Context managers guarantee the file handles are closed, even if
    # reading or decoding fails part-way through.
    with open(positive_data, "r", encoding='utf-8') as pos_file:
        positive_eg = [line.strip() for line in pos_file]
    with open(negative_data, "r", encoding='utf-8') as neg_file:
        negative_eg = [line.strip() for line in neg_file]

    # Clean every sentence, positives first so the order matches the labels.
    x_text = [clean(sent) for sent in positive_eg + negative_eg]

    # One-hot labels: [0, 1] = positive, [1, 0] = negative. The explicit
    # dtype and reshape keep the result at shape (n, 2) even when one or
    # both files are empty, where concatenating an empty list with a 2-D
    # list would raise.
    labels = [[0, 1]] * len(positive_eg) + [[1, 0]] * len(negative_eg)
    y = np.array(labels, dtype=int).reshape(-1, 2)

    return [x_text, y]

In [4]:
def batch_iteration(data, batch_size, no_epochs, shuffle=True):
    """Yield mini-batches of ``data`` for a number of epochs.

    ``data`` is converted to a NumPy array once. For every epoch the array
    is optionally reshuffled and then sliced into consecutive batches of
    ``batch_size`` items; the last batch of an epoch may be smaller. Empty
    ``data`` produces no batches at all.

    Args:
        data: Sequence of examples; anything ``np.array`` accepts.
        batch_size: Maximum number of examples per batch; must be positive.
        no_epochs: Number of full passes over ``data``.
        shuffle: If true, draw a fresh random permutation of the examples
            at the start of every epoch.

    Yields:
        A NumPy array holding the examples of one batch.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if batch_size <= 0:
        raise ValueError(
            "batch_size must be positive, got {!r}".format(batch_size))

    data = np.array(data)
    data_size = len(data)
    # Integer ceiling division: avoids float rounding and, unlike
    # int((n - 1) / batch_size) + 1, yields zero batches for empty data.
    batches_per_epoch = -(-data_size // batch_size)

    for _ in range(no_epochs):
        if shuffle:
            # New permutation each epoch so batches differ between epochs.
            shuffle_indices = np.random.permutation(np.arange(data_size))
            epoch_data = data[shuffle_indices]
        else:
            epoch_data = data
        for batch_num in range(batches_per_epoch):
            start = batch_num * batch_size
            end = min(start + batch_size, data_size)
            yield epoch_data[start:end]