<center><h1>Bag-Of-Words Embedding</center></h1>
<center> Exploring TF and TF-IDF methods </center>

# Import

In [279]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag
from nltk.corpus import stopwords as sw, wordnet as wn
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import string 

# BOW from scratch

We'll use South Park scripts as our database for messages : https://github.com/BobAdamsEE/SouthParkData

### Preprocessing, split per sentence

In [177]:
def preprocess(document, max_features = 150, max_sentence_len = 300):
    """
    Returns a normalized, lemmatized list of tokens from a document by
    applying segmentation (breaking into sentences), then word/punctuation
    tokenization, and finally part of speech tagging. It uses the part of
    speech tags to look up the lemma in WordNet, and returns the lowercase
    version of all the words, removing stopwords and punctuation.
    """
    
    def lemmatize(token, tag):
        """
        Converts the tag to a WordNet POS tag, then uses that
        tag to perform an accurate WordNet lemmatization.
        """
        tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return WordNetLemmatizer().lemmatize(token, tag)

    def vectorize(doc, max_features, max_sentence_len):
        """
        Converts a document into a sequence of indices of length max_sentence_len retaining only max_features unique words
        """
        tokenizer = Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(doc)
        doc = tokenizer.texts_to_sequences(doc)
        doc_pad = pad_sequences(doc, padding = 'pre', truncating = 'pre', maxlen = max_sentence_len)
        return np.squeeze(doc_pad), tokenizer.word_index

    # Clean the text using a few regular expressions
    document = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", document)
    document = re.sub(r"what's", "what is ", document)
    document = re.sub(r"\'", " ", document)
    document = re.sub(r"@", " ", document)
    document = re.sub(r"\'ve", " have ", document)
    document = re.sub(r"can't", "cannot ", document)
    document = re.sub(r"n't", " not ", document)
    document = re.sub(r"i'm", "i am ", document)
    document = re.sub(r"\'re", " are ", document)
    document = re.sub(r"\'d", " would ", document)
    document = re.sub(r"\'ll", " will ", document)
    document = re.sub(r"(\d+)(k)", r"\g<1>000", document)
    document = document.replace("\n", " ")
    
    cleaned_document = []
    vocab = []
    
    # Break the document into sentences
    for sent in sent_tokenize(document):
        lemmatized_tokens = []

        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):

            # Apply preprocessing to the tokens
            token = token.lower()
            token = token.strip()
            token = token.strip('_')
            token = token.strip('*')

            # If punctuation ignore token and continue
            if all(char in set(string.punctuation) for char in token): #token in set(sw.words('english')) or 
                continue

            # Lemmatize the token
            lemma = lemmatize(token, tag)
            lemmatized_tokens.append(lemma)
            vocab.append(lemma)
            
        cleaned_document.append(lemmatized_tokens)
    
    vocab = sorted(list(set(vocab)))
    
    return cleaned_document, vocab

### Preprocessing, split per document

In [205]:
def preprocess(document, max_features = 150, max_sentence_len = 300):
    """
    Returns a normalized, lemmatized list of tokens from a document by
    applying segmentation (breaking into sentences), then word/punctuation
    tokenization, and finally part of speech tagging. It uses the part of
    speech tags to look up the lemma in WordNet, and returns the lowercase
    version of all the words, removing stopwords and punctuation.
    """
    
    def lemmatize(token, tag):
        """
        Converts the tag to a WordNet POS tag, then uses that
        tag to perform an accurate WordNet lemmatization.
        """
        tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return WordNetLemmatizer().lemmatize(token, tag)

    def vectorize(doc, max_features, max_sentence_len):
        """
        Converts a document into a sequence of indices of length max_sentence_len retaining only max_features unique words
        """
        tokenizer = Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(doc)
        doc = tokenizer.texts_to_sequences(doc)
        doc_pad = pad_sequences(doc, padding = 'pre', truncating = 'pre', maxlen = max_sentence_len)
        return np.squeeze(doc_pad), tokenizer.word_index
    
    cleaned_document = []
    vocab = []
    
    # Break the document into sentences
    for sent in document:
        
        # Clean the text using a few regular expressions
        sent = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", sent)
        sent = re.sub(r"what's", "what is ", sent)
        sent = re.sub(r"\'", " ", sent)
        sent = re.sub(r"@", " ", sent)
        sent = re.sub(r"\'ve", " have ", sent)
        sent = re.sub(r"can't", "cannot ", sent)
        sent = re.sub(r"n't", " not ", sent)
        sent = re.sub(r"i'm", "i am ", sent)
        sent = re.sub(r"\'re", " are ", sent)
        sent = re.sub(r"\'d", " would ", sent)
        sent = re.sub(r"\'ll", " will ", sent)
        sent = re.sub(r"(\d+)(k)", r"\g<1>000", sent)
        sent = sent.replace("\n", " ")
    
        lemmatized_tokens = []

        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):

            # Apply preprocessing to the tokens
            token = token.lower()
            token = token.strip()
            token = token.strip('_')
            token = token.strip('*')

            # If punctuation ignore token and continue
            if all(char in set(string.punctuation) for char in token): #token in set(sw.words('english')) or 
                continue

            # Lemmatize the token
            lemma = lemmatize(token, tag)
            lemmatized_tokens.append(lemma)
            vocab.append(lemma)
            
        cleaned_document.append(lemmatized_tokens)
    
    vocab = sorted(list(set(vocab)))
    
    return cleaned_document, vocab

### Apply it to the data

In [218]:
df = pd.read_csv('All-Seasons.csv')['Line'][:1000]
df.head()

0           You guys, you guys! Chef is going away. \n
1                          Going away? For how long?\n
2                                           Forever.\n
3                                    I'm sorry boys.\n
4    Chef said he's been bored, so he joining a gro...
Name: Line, dtype: object

In [219]:
df, vocab = preprocess(list(df))

### Bag-Of-Words

In [220]:
def generate_bow(allsentences):    
    bag_vector = np.zeros((len(allsentences), len(vocab)))
    for j in range(len(allsentences)):
        for w in allsentences[j]:
            for i,word in enumerate(vocab):
                if word == w: 
                    bag_vector[j,i] += 1
    return bag_vector

In [221]:
bow = generate_bow(df)
bow.shape

(1000, 1589)

# BOW in Sk-learn

### TF

In [268]:
df = pd.read_csv('All-Seasons.csv')['Line'][:1000]

In [269]:
def preprocess(document, max_features = 150, max_sentence_len = 300):
    """
    Returns a normalized, lemmatized list of tokens from a document by
    applying segmentation (breaking into sentences), then word/punctuation
    tokenization, and finally part of speech tagging. It uses the part of
    speech tags to look up the lemma in WordNet, and returns the lowercase
    version of all the words, removing stopwords and punctuation.
    """
    
    def lemmatize(token, tag):
        """
        Converts the tag to a WordNet POS tag, then uses that
        tag to perform an accurate WordNet lemmatization.
        """
        tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return WordNetLemmatizer().lemmatize(token, tag)

    def vectorize(doc, max_features, max_sentence_len):
        """
        Converts a document into a sequence of indices of length max_sentence_len retaining only max_features unique words
        """
        tokenizer = Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(doc)
        doc = tokenizer.texts_to_sequences(doc)
        doc_pad = pad_sequences(doc, padding = 'pre', truncating = 'pre', maxlen = max_sentence_len)
        return np.squeeze(doc_pad), tokenizer.word_index

    cleaned_document = []
    vocab = []
    
    # Break the document into sentences
    for sent in document:
        
        lemmatized_tokens = []
        # Clean the text using a few regular expressions
        sent = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", sent)
        sent = re.sub(r"what's", "what is ", sent)
        sent = re.sub(r"\'", " ", sent)
        sent = re.sub(r"@", " ", sent)
        sent = re.sub(r"\'ve", " have ", sent)
        sent = re.sub(r"can't", "cannot ", sent)
        sent = re.sub(r"n't", " not ", sent)
        sent = re.sub(r"i'm", "i am ", sent)
        sent = re.sub(r"\'re", " are ", sent)
        sent = re.sub(r"\'d", " would ", sent)
        sent = re.sub(r"\'ll", " will ", sent)
        sent = re.sub(r"(\d+)(k)", r"\g<1>000", sent)
        sent = sent.replace("\n", " ")

        for token, tag in pos_tag(wordpunct_tokenize(sent)):

            # Apply preprocessing to the tokens
            token = token.lower()
            token = token.strip()
            token = token.strip('_')
            token = token.strip('*')

            # If punctuation ignore token and continue
            if all(char in set(string.punctuation) for char in token): #token in set(sw.words('english')) or 
                continue

            # Lemmatize the token
            lemma = lemmatize(token, tag)
            lemmatized_tokens.append(lemma)
            
        cleaned_document.append(' '.join(lemmatized_tokens))
    
    return cleaned_document

In [270]:
df = preprocess(list(df))

In [276]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df).toarray()

In [277]:
X.shape

(1000, 1568)

### TF-IDF

In [280]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df).toarray()