##### Copyright 2019 The TensorFlow Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# Download bbc-text.csv and save it as /tmp/bbc-text.csv (the -O target).
# NOTE(review): --no-check-certificate turns off TLS certificate validation
# for this download -- confirm it is still required.
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv \
    -O /tmp/bbc-text.csv

In [None]:
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Stopwords list from https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
# Kept as one whitespace-separated string and split into a list of words.
# Every entry is lowercase and contractions keep their apostrophe (e.g. "he'd").
stopwords = """
    a about above after again against all am an and
    any are as at be because been before being below
    between both but by could did do does doing down
    during each few for from further had has have having
    he he'd he'll he's her here here's hers herself him
    himself his how how's i i'd i'll i'm i've if
    in into is it it's its itself let's me more
    most my myself nor of on once only or other
    ought our ours ourselves out over own same she she'd
    she'll she's should so some such than that that's the
    their theirs them themselves then there there's these they they'd
    they'll they're they've this those through to too under until
    up very was we we'd we'll we're we've were what
    what's when when's where where's which while who who's whom
    why why's with would you you'd you'll you're you've your
    yours yourself yourselves
""".split()

In [None]:
# Load the downloaded CSV: column 0 of every row is kept as the label and
# column 1 as the article text, with stopwords stripped out of the text.
sentences = []
labels = []

# A stopword is only matched as a whole, space-delimited word, so each one is
# wrapped in spaces. The padded tokens do not depend on the row, so build them
# once up front instead of once per row.
stopword_tokens = [" " + word + " " for word in stopwords]

# Read from /tmp, which is where the wget cell saves the file (-O target).
# Opening "./bbc-text.csv" only worked if the working directory was /tmp.
with open("/tmp/bbc-text.csv", 'r') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=',')
    # Discard the first row (presumably the column header); the default keeps
    # an empty file from raising StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        labels.append(row[0])
        sentence = row[1]
        for token in stopword_tokens:
            sentence = sentence.replace(token, " ")
            # Collapse any double space left behind by the removal.
            sentence = sentence.replace("  ", " ")
        sentences.append(sentence)

print(len(sentences))
print(sentences[0])

# Expected output
# 2225
# tv future hands viewers home theatre systems plasma high-definition tvs digital video recorders moving living room 
# way people watch tv will radically different five years time. according expert panel gathered annual consumer electronics 
# show las vegas discuss new technologies will impact one favourite pastimes. us leading trend programmes content will 
# delivered viewers via home networks cable satellite telecoms companies broadband service providers front rooms portable 
# devices. one talked-about technologies ces digital personal video recorders (dvr pvr). set-top boxes like us s tivo uk s 
# sky+ system allow people record store play pause forward wind tv programmes want. essentially technology allows much 
# personalised tv. also built-in high-definition tv sets big business japan us slower take off europe lack high-definition 
# programming. not can people forward wind adverts can also forget abiding network channel schedules putting together a-la-carte 
# entertainment. us networks cable satellite companies worried means terms advertising revenues well brand identity viewer loyalty 
# channels. although us leads technology moment also concern raised europe particularly growing uptake services like sky+. happens 
# today will see nine months years time uk adam hume bbc broadcast s futurologist told bbc news website. likes bbc no issues lost 
# advertising revenue yet. pressing issue moment commercial uk broadcasters brand loyalty important everyone. will talking content 
# brands rather network brands said tim hanlon brand communications firm starcom mediavest. reality broadband connections anybody 
# can producer content. added: challenge now hard promote programme much choice. means said stacey jolna senior vice president tv 
# guide tv group way people find content want watch simplified tv viewers. means networks us terms channels take leaf google s book 
# search engine future instead scheduler help people find want watch. kind channel model might work younger ipod generation used 
# taking control gadgets play them. might not suit everyone panel recognised. older generations comfortable familiar schedules channel 
# brands know getting. perhaps not want much choice put hands mr hanlon suggested. end kids just diapers pushing buttons already - 
# everything possible available said mr hanlon. ultimately consumer will tell market want. 50 000 new gadgets technologies showcased 
# ces many enhancing tv-watching experience. high-definition tv sets everywhere many new models lcd (liquid crystal display) tvs 
# launched dvr capability built instead external boxes. one example launched show humax s 26-inch lcd tv 80-hour tivo dvr dvd recorder. 
# one us s biggest satellite tv companies directtv even launched branded dvr show 100-hours recording capability instant replay search 
# function. set can pause rewind tv 90 hours. microsoft chief bill gates announced pre-show keynote speech partnership tivo called 
# tivotogo means people can play recorded programmes windows pcs mobile devices. reflect increasing trend freeing multimedia people 
# can watch want want.

In [None]:
# Build the word vocabulary from the cleaned sentences.
# oov_token reserves an "<OOV>" entry in the word index; Keras substitutes it
# for words that are not in the vocabulary when texts are converted to sequences.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
# word_index maps each word to its integer index.
word_index = tokenizer.word_index
# Print the number of entries in the word index.
print(len(word_index))
# Expected output
# 29714

In [None]:
# Encode every sentence as a list of word indices using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)
# padding='post' appends the padding after each sequence rather than before
# it, so shorter sequences end in padding values.
padded = pad_sequences(sequences, padding='post')
# Show the first padded sequence, then the shape of the whole padded array.
print(padded[0], padded.shape, sep="\n")

# Expected output
# [  96  176 1158 ...    0    0    0]
# (2225, 2442)

In [None]:
# Labels get their own tokenizer so their indices are independent of the
# sentence vocabulary. No oov_token is set here.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
# Mapping from each label string to its integer index.
label_word_index = label_tokenizer.word_index
# Each label converted to a list of indices from label_word_index.
label_seq = label_tokenizer.texts_to_sequences(labels)
print(label_word_index)
# Expected Output
# {'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}