In [1]:
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:

import zipfile

# Extract the archive
local_zip = '/content/BBC News Train.csv.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('bbc-train')
zip_ref.close()

local_zip = '/content/BBC News Test.csv.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('bbc-test')
zip_ref.close()

In [5]:
with open('/content/bbc-train/BBC News Train.csv','r') as f:
  print(f'This is the header\n {f.readline()}')
  print(f'This is the first data entry \n {f.readline()}')

This is the header
 ArticleId,Text,Category

This is the first data entry 



#Removing Stopwords
One important step when working with text data is to remove the stopwords from it. These are the most common words in the language and they rarely provide useful information for the classification process.

Complete the remove_stopwords below. This function should receive a string and return another string that excludes all of the stopwords provided.

In [18]:
# GRADED FUNCTION: remove_stopwords
def remove_stopwords(sentence):
    # List of stopwords
    stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", 
                 "an", "and", "any", "are", "as", "at", "be", "because", "been", 
                 "before", "being", "below", "between", "both", "but", "by", "could", 
                 "did", "do", "does", "doing", "down", "during", "each", "few", "for", 
                 "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", 
                 "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", 
                 "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", 
                 "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", 
                 "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", 
                 "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some",
                 "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", 
                 "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", 
                 "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd",
                 "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
                 "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", 
                 "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
    
    # Sentence converted to lowercase-only
    sentence = sentence.lower()
    
    ### START CODE HERE
    words_list=sentence.split(' ')
    new_words_list=[val for val in words_list if val not in stopwords]
    sentence=' '.join(new_words_list)
    ### END CODE HERE
    return sentence

In [19]:
# Test your function
remove_stopwords("I am about to go to the store and get any snack")

'go store get snack'

#Reading the raw data
Now you need to read the data from the csv file. To do so, complete the parse_data_from_file function.

A couple of things to note:

You should omit the first line as it contains the headers 
and not data points.


There is no need to save the data points as numpy arrays, regular lists is fine.

To read from csv files use csv.reader by passing the appropriate arguments.

csv.reader returns an iterable that returns each row in every iteration. So the label can be accessed via row[0] and the text via row[1].

Use the remove_stopwords function in each sentence.





In [22]:
def parse_data_from_file(filename):
    sentences = []
    labels = []
    with open(filename, 'r') as csvfile:
        ### START CODE HERE
        next(csvfile)
        sentences=[]
        labels=[]

        csv_reader=csv.reader(csvfile,delimiter=',')
        for row in csv_reader:
          sentence=' '.join(row[1:-1])
          sentences.append(sentence)
          labels.append(row[-1])
        ### END CODE HERE
    return sentences, labels

In [26]:
# Test your function
sentences, labels = parse_data_from_file("/content/bbc-train/BBC News Train.csv")
val_sentences,val_labels=parse_data_from_file("/content/bbc-test/BBC News Test.csv")

print(f"There are {len(sentences)} sentences in the train dataset.\n")
print(f"There are {len(val_sentences)} sentences in the test dataset.\n")
print(f"First sentence of training has {len(sentences[0].split())} words (after removing stopwords).\n")
print(f"First sentence: {sentences[0]} \n ")
print(f"There are {len(labels)} labels in the train dataset.\n")
print(f"There are {len(val_labels)} labels in the test dataset.\n")
print(f"The first 5 train labels are {labels[:5]}")

There are 1490 sentences in the train dataset.

There are 735 sentences in the test dataset.

First sentence of training has 301 words (after removing stopwords).

 
There are 1490 labels in the train dataset.

There are 735 labels in the test dataset.

The first 5 train labels are ['business', 'business', 'business', 'tech', 'business']


#Using the Tokenizer
Now it is time to tokenize the sentences of the dataset.

Complete the fit_tokenizer below.

This function should receive the list of sentences as input and return a Tokenizer that has been fitted to those sentences. You should also define the "Out of Vocabulary" token as <OOV>.

In [27]:
def fit_tokenizer(sentences):
    ### START CODE HERE
    # Instantiate the Tokenizer class by passing in the oov_token argument
    tokenizer=Tokenizer(oov_token="<OOV>")
    # Fit on the sentences
    tokenizer.fit_on_texts(sentences)
    ### END CODE HERE
    return tokenizer

In [28]:
tokenizer=fit_tokenizer(sentences)
word_index=tokenizer.word_index

print(f"Vocabulary contains {len(word_index)} words\n")
print("<OOV> token included in vocabulary" if "<OOV>" in word_index else "<OOV> token NOT included in vocabulary")

Vocabulary contains 24984 words

<OOV> token included in vocabulary


In [30]:
def get_padded_sequences(tokenizer, sentences):
    
    ### START CODE HERE
    # Convert sentences to sequences
    sequences=tokenizer.texts_to_sequences(sentences)
    
    # Pad the sequences using the post padding strategy
    padded_sequences=pad_sequences(sequences,padding="post")
    ### END CODE HERE
    
    return padded_sequences

In [33]:
padded_sequences = get_padded_sequences(tokenizer, sentences)
print(f"First padded sequence looks like this: \n\n{padded_sequences[0]}\n")
print(f"Numpy array of all sequences has shape: {padded_sequences.shape}\n")
print(f"This means there are {padded_sequences.shape[0]} sequences in total and each one has a size of {padded_sequences.shape[1]}.")

First padded sequence looks like this: 

[1418 1275  693 ...    0    0    0]

Numpy array of all sequences has shape: (1490, 3356)

This means there are 1490 sequences in total and each one has a size of 3356.


In [36]:
def tokenize_labels(labels):
    ### START CODE HERE
    
    # Instantiate the Tokenizer class
    # No need to pass additional arguments since you will be tokenizing the labels
    tokenizer=Tokenizer()
    
    # Fit the tokenizer to the labels
    tokenizer.fit_on_texts(labels)
    
    # Save the word index
    label_word_index=tokenizer.word_index
    # Save the sequences
    label_sequences=tokenizer.texts_to_sequences(labels)

    ### END CODE HERE
    
    return label_sequences, label_word_index

In [37]:
label_sequences, label_word_index = tokenize_labels(labels)
print(f"Vocabulary of labels looks like this {label_word_index}\n")
print(f"First ten sequences {label_sequences[:10]}\n")

Vocabulary of labels looks like this {'sport': 1, 'business': 2, 'politics': 3, 'entertainment': 4, 'tech': 5}

First ten sequences [[2], [2], [2], [5], [2], [3], [1], [4], [2], [4]]

