# Basic Text Processing with TensorFlow

A playground for experimenting with different feedforward neural networks is available at:

https://playground.tensorflow.org/

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Only show TensorFlow log records at ERROR level or above.
# Uses the TF2 logger accessor rather than the TF1 compatibility shim
# (tf.compat.v1.logging); both set the level on the same "tensorflow" logger.
tf.get_logger().setLevel("ERROR")

In [3]:
# Toy corpus: three short sentences used to demonstrate tokenization and padding.
# NOTE(review): "communte" looks like a typo for "commute"; it is kept verbatim
# because the recorded vocabulary output in this notebook contains that token.
sentences = [
    "We live in the Bay Area.",
    "We communte to work in San Francisco.",
    "We don't like commuting.",
]

In [4]:
# Upper bound passed to the tokenizer as `num_words`; the toy corpus has far
# fewer distinct words than this.
MAX_VOCAB_SIZE = 10_000

# Build the vocabulary from the corpus, then encode each sentence as a list of
# integer word indices using that vocabulary.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [5]:
# Show the encoded corpus: one list of integer word indices per sentence.
# The lists have different lengths (6, 7 and 4 tokens in the recorded output),
# which is why padding is needed afterwards.
print(sequences)

[[1, 3, 2, 4, 5, 6], [1, 7, 8, 9, 2, 10, 11], [1, 12, 13, 14]]


In [6]:
# Word -> integer index mapping built by fit_on_texts. In the recorded output
# the indices start at 1, words are lowercased with sentence punctuation
# removed, and the most frequent words ("we", "in") get the smallest indices.
tokenizer.word_index

{'we': 1,
 'in': 2,
 'live': 3,
 'the': 4,
 'bay': 5,
 'area': 6,
 'communte': 7,
 'to': 8,
 'work': 9,
 'san': 10,
 'francisco': 11,
 "don't": 12,
 'like': 13,
 'commuting': 14}

In [8]:
# Pad using the defaults. In the recorded output every row is brought to the
# length of the longest sequence (7) and the zeros are inserted at the front.
data = pad_sequences(sequences)
print(data)

[[ 0  1  3  2  4  5  6]
 [ 1  7  8  9  2 10 11]
 [ 0  0  0  1 12 13 14]]


In [9]:
# Fix the target length explicitly instead of relying on the longest sequence.
# 7 equals the longest sequence here, so the recorded output is identical to
# the one produced with the defaults.
MAX_SEQUENCE_LENGTH = 7
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data)

[[ 0  1  3  2  4  5  6]
 [ 1  7  8  9  2 10 11]
 [ 0  0  0  1 12 13 14]]


In [10]:
# padding="post" appends the zeros after the tokens instead of inserting them
# in front; the target length is still MAX_SEQUENCE_LENGTH.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="post",
)
print(data)

[[ 1  3  2  4  5  6  0]
 [ 1  7  8  9  2 10 11]
 [ 1 12 13 14  0  0  0]]


In [11]:
# A maxlen larger than the longest sequence (8 > 7): every row, including the
# longest one, receives leading zeros in the recorded output.
data = pad_sequences(sequences, maxlen=8)
print(data)

[[ 0  0  1  3  2  4  5  6]
 [ 0  1  7  8  9  2 10 11]
 [ 0  0  0  0  1 12 13 14]]


In [12]:
# A maxlen shorter than some sequences (5 < 6, 7): in the recorded output the
# long rows lose tokens from the FRONT and keep their last five indices, while
# the short row is still padded with leading zeros.
data = pad_sequences(sequences, maxlen=5)
print(data)

[[ 3  2  4  5  6]
 [ 8  9  2 10 11]
 [ 0  1 12 13 14]]


In [13]:
# With truncating="post", sequences longer than maxlen lose tokens from the
# END and keep their first five indices. Padding is unaffected: the short row
# is still padded with leading zeros.
data = pad_sequences(
    sequences,
    maxlen=5,
    truncating="post",
)
print(data)

[[ 1  3  2  4  5]
 [ 1  7  8  9  2]
 [ 0  1 12 13 14]]
