# Preprocessing

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Three toy sentences used to demonstrate the Tokenizer.
sentence = [
    'Hello from other side',
    'Hello from this way',
    'hi',
]

# Create a tokenizer configured with '<OOV>' as its out-of-vocabulary token,
# then learn the vocabulary from the toy sentences.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentence)

In [None]:
# Show the word -> index mapping learned from the toy sentences.
tokenizer.word_index

{'<OOV>': 1,
 'from': 3,
 'hello': 2,
 'hi': 8,
 'other': 4,
 'side': 5,
 'this': 6,
 'way': 7}

In [None]:
# Replace every word of the toy sentences with its index, then pad the
# resulting lists (pad_sequences is called with its default settings).
sen_seq = tokenizer.texts_to_sequences(sentence)
sen_padded = pad_sequences(sen_seq)

# Sarcasm Dataset

In [None]:
# Download sarcasm.json and save it as /tmp/sarcasm.json.
# NOTE(review): --no-check-certificate turns off TLS certificate validation;
# confirm it is still needed for this host.
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

--2020-07-22 20:21:45--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.204.128, 64.233.188.128, 64.233.189.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.204.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2020-07-22 20:21:45 (84.0 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [None]:
import json

# Parse the downloaded JSON file, then keep only the 'headline' field of
# every record it contains.
with open('/tmp/sarcasm.json', 'r') as json_file:
  dataset = json.load(json_file)

headline = [record['headline'] for record in dataset]

In [None]:
# Fit a new vocabulary on the sarcasm headlines, then encode and pad them.
# NOTE: this rebinds `tokenizer`, replacing the one fitted on the toy sentences.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(headline)
headline_seq = tokenizer.texts_to_sequences(headline)
headline_padded = pad_sequences(headline_seq)

In [None]:
# Show the word -> index mapping fitted on the headlines.
tokenizer.word_index

In [None]:
# Check the type of a single padded headline row.
type(headline_padded[0])

numpy.ndarray

In [None]:
# (number of headlines, padded sequence length)
headline_padded.shape

(26709, 40)

In [None]:
# First padded headline; the zeros at the front are padding.
headline_padded[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,   308, 15115,   679,  3337,  2298,    48,   382,  2576,
       15116,     6,  2577,  8434], dtype=int32)

# BBC Dataset

In [None]:
# Download bbc-text.csv and save it as /tmp/bbc-text.csv.
# NOTE(review): --no-check-certificate turns off TLS certificate validation;
# confirm it is still needed for this host.
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv \
    -O /tmp/bbc-text.csv

  
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Stop-word list taken from
# https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
# Every entry is lowercase; contractions keep their apostrophe.
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
    "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
    "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
    "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
    "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
    "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
    "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
    "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
    "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
    "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
    "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves",
]

--2020-07-22 20:21:52--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.204.128, 64.233.189.128, 108.177.97.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.204.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5057493 (4.8M) [application/octet-stream]
Saving to: ‘/tmp/bbc-text.csv’


2020-07-22 20:21:52 (115 MB/s) - ‘/tmp/bbc-text.csv’ saved [5057493/5057493]



In [None]:
import csv

# Parse the BBC CSV into two parallel lists: column 0 of each row goes to
# `labels`, column 1 (with stop words removed) goes to `sentences`.
labels = []
sentences = []
# newline='' is what the csv module asks for, so that newlines embedded in
# quoted fields reach the reader untouched.
with open('/tmp/bbc-text.csv', 'r', newline='') as file:
  reader = csv.reader(file, delimiter=',')
  # Discard the first row (presumably the column header); the default keeps
  # an empty file from raising StopIteration.
  next(reader, None)
  for row in reader:
    labels.append(row[0])
    # Stop-word matching is exact and case-sensitive. The filtered words get
    # their own local name so the `sentence` list from the first example is
    # not overwritten by this loop.
    kept_words = [word for word in row[1].split() if word not in stopwords]
    sentences.append(' '.join(kept_words))

In [None]:
# sentences..
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='pre')

In [None]:
# (number of articles, padded sequence length)
padded.shape

(2225, 2438)

In [None]:
# label..
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(labels)
sequences = tokenizer.texts_to_sequences(labels)
padded = pad_sequences(sequences, padding='pre')

In [None]:
# Compare the first five encoded labels with their original strings.
padded[:5], labels[:5]

(array([[5],
        [3],
        [2],
        [2],
        [6]], dtype=int32),
 ['tech', 'business', 'sport', 'sport', 'entertainment'])

# IMDB Review

In [1]:
import tensorflow_datasets as tfds
# Load the 'imdb_reviews' dataset together with its metadata object.
# as_supervised=True is what lets the loops below unpack each example as a
# two-element (text, label) pair.
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

In [18]:
def _decode_examples(dataset):
  """Collect the (text, label) pairs of `dataset` into two parallel lists.

  Each text is converted with .numpy() and decoded as UTF-8; each label is
  converted with .numpy().
  """
  texts, targets = [], []
  for text_tensor, label_tensor in dataset:
    texts.append(text_tensor.numpy().decode('utf8'))
    targets.append(label_tensor.numpy())
  return texts, targets


# Pull both splits out of the loaded dataset and turn them into Python lists.
train_dataset, test_dataset = imdb['train'], imdb['test']
train_sentences, train_labels = _decode_examples(train_dataset)
test_sentences, test_labels = _decode_examples(test_dataset)

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [41]:
# Tokenisation / padding settings for the IMDB reviews.
vocab_size = 10000    # passed to the Tokenizer as num_words
oov_token = '<OOV>'   # token used for out-of-vocabulary words
pad_type = 'pre'      # passed to pad_sequences as padding
trunc_type = 'post'   # passed to pad_sequences as truncating
max_length = 120      # passed to pad_sequences as maxlen

# The vocabulary is fitted on the training reviews only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=max_length)

# Encode the test reviews with the same fitted tokenizer. This previously
# encoded train_sentences again, which made the "test" arrays a copy of the
# training data.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding=pad_type, truncating=trunc_type, maxlen=max_length)