#Deep Learning for Text Classification

In [1]:
!pip install wget==3.2
!pip install tensorflow

Collecting wget==3.2
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=a091cc079cd342c3bada88393ecd9414a76a1754360c2a5b45dd2d8093242616
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
#Make the necessary imports
import os
import sys
import numpy as np
import pandas as pd
import tarfile
import wget
import warnings
warnings.filterwarnings("ignore")
from zipfile import ZipFile
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant

In [3]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100

Loading and Preprocessing

In [4]:
try:
    from google.colab import files
    !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv
    !ls -lah DATAPATH
    data = pd.read_csv("DATAPATH/Full-Economic-News-DFE-839861.csv" , encoding = "ISO-8859-1" )

except ModuleNotFoundError:
    data = pd.read_csv("Data/Full-Economic-News-DFE-839861.csv" , encoding = "ISO-8859-1" )

--2024-02-16 16:37:39--  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12383529 (12M) [text/plain]
Saving to: ‘DATAPATH/Full-Economic-News-DFE-839861.csv’


2024-02-16 16:37:40 (38.0 MB/s) - ‘DATAPATH/Full-Economic-News-DFE-839861.csv’ saved [12383529/12383529]

total 12M
drwxr-xr-x 2 root root 4.0K Feb 16 16:37 .
drwxr-xr-x 1 root root 4.0K Feb 16 16:37 ..
-rw-r--r-- 1 root root  12M Feb 16 16:37 Full-Economic-News-DFE-839861.csv


In [5]:
display(data.shape) # Number of rows (instances) and columns in the dataset
data["relevance"].value_counts()/data.shape[0] # Class distribution in the dataset

(8000, 15)

no          0.821375
yes         0.177500
not sure    0.001125
Name: relevance, dtype: float64

In [6]:
# convert label to a numerical variable
data = data[data.relevance != "not sure"] # removing the data where we don't want relevance="not sure".
data.shape
data['relevance'] = data.relevance.map({'yes':1, 'no':0}) # relevant is 1, not-relevant is 0.
data = data[["text","relevance"]] # Let us take only the two columns we need.
data.shape

(7991, 2)

In [7]:
data.head()

Unnamed: 0,text,relevance
0,NEW YORK -- Yields on most certificates of dep...,1
1,The Wall Street Journal Online</br></br>The Mo...,0
2,WASHINGTON -- In an effort to achieve banking ...,0
3,The statistics on the enormous costs of employ...,0
4,NEW YORK -- Indecision marked the dollar's ton...,1


In [9]:
texts = data['text'].values.tolist()
y = data['relevance'].values.tolist()

In [15]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

mystopwords = set(stopwords.words("english"))
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# define text preprocessing steps
lemmatized = []
wn = WordNetLemmatizer()

def preprocess_corpus(texts):
    def remove_stops_digits(tokens):
        #Nested function that lowercases, removes stopwords and digits from a list of tokens
        tokens_new = [wn.lemmatize(word) for word in tokens ]
        return [token.lower() for token in tokens_new if token.lower() not in mystopwords and not token.isdigit()
               and token not in punctuation]
    #This return statement below uses the above function to process twitter tokenizer output further.
    return [remove_stops_digits(word_tokenize(text)) for text in texts]

In [17]:
# get preprossed data and take a look at some sample data
X = preprocess_corpus(texts)
print(len(y), len(X))
print(X[1])
print(y[1])

7991 7991
['wall', 'street', 'journal', 'online', '/br', '/br', 'morning', 'brief', 'look', 'day', "'s", 'biggest', 'news', 'emailed', 'subscriber', 'a.m.', 'every', 'business', 'day', 'sign', 'e-mail', 'here.', '/br', '/br', 'friday', 'evening', 'congress', 'town', 'summer', 'recess', 'americans', 'heading', 'mid-august', 'weekend', 'bush', 'administration', 'sent', 'message', 'state', 'federal', 'government', 'make', 'tougher', 'national', 'child', "'s", 'insurance', 'program', 'cover', 'offspring', 'middle-income', 'families.', '/br', '/br', 'state', 'children', "'s", 'health', 'insurance', 'program', 'wa', 'created', 'help', 'child', 'whose', 'family', 'could', "n't", 'afford', 'insurance', "n't", 'qualify', 'medicaid', 'administration', 'official', 'tell', 'new', 'york', 'times', 'change', 'aimed', 'returning', 'program', 'low-', 'income', 'focus', 'assuring', "n't", 'become', 'replacement', 'private', 'insurance', 'administration', 'point', 'man', 'dennis', 'smith', 'wrote', 'sta

In [21]:
from sklearn.model_selection import train_test_split

# split data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# further split the training data into training and test set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

Step 1: Tokenize the texts and convert them into word index vectors

In [22]:
#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer
#Tokenizer is fit on training data only, and that is used to tokenize both train and test data.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train)
train_sequences = tokenizer.texts_to_sequences(X_train) #Converting text to a vector of word indexes
test_sequences = tokenizer.texts_to_sequences(X_test)
val_sequences = tokenizer.texts_to_sequences(X_val)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 49414 unique tokens.


Step 2: Pad the text sequences so that all text vectors are of the same length.

In [23]:
#Converting this to sequences to be fed into neural network. Max seq. len is 1000 as set earlier
#initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
val_data = pad_sequences(val_sequences, maxlen=MAX_SEQUENCE_LENGTH)

train_labels = to_categorical(np.asarray(y_train))
test_labels = to_categorical(np.asarray(y_test))
val_labels = to_categorical(np.asarray(y_val))

Step 3: If we want to use pre-trained embeddings to convert the train and test data
into an embedding matrix like we did in the earlier examples with Word2vec and
fastText, we have to download them and use them to convert our data into the input
format for the neural networks. The following code snippet shows an example of how
to do this using GloVe embeddings

In [None]:
import gensim.downloader

#Load pre trained glove model from Gensim
w2v_model = gensim.downloader.load('glove-wiki-gigaword-100')

In [26]:
# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    if word in w2v_model:
      embedding_vector = w2v_model[word]
      if embedding_vector is not None:
          # words not found in embedding index will be all-zeros.
          embedding_matrix[i] = embedding_vector

Step 4: Use the output from Step 3 as the input to a neural network architecture.

In [27]:
# load these pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
print("Preparing of embedding matrix is done")

Preparing of embedding matrix is done


### (1) 1D CNN model with training your own embedding

In [31]:
print("Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings")
cnnmodel = Sequential()
cnnmodel.add(Embedding(MAX_NUM_WORDS, 128))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(GlobalMaxPooling1D())
cnnmodel.add(Dense(128, activation='relu'))
cnnmodel.add(Dense(2, activation='softmax'))

cnnmodel.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
#Train the model. Tune to validation set.
cnnmodel.fit(train_data, train_labels,
          batch_size=128,
          epochs=1, validation_data=(val_data, val_labels))

#Evaluate on test set:
score, acc = cnnmodel.evaluate(test_data, test_labels)
print('Test accuracy with CNN:', acc)

Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings
Test accuracy with CNN: 0.8298937082290649


### (2) 1D CNN Model with pre-trained embedding

In [29]:
print('Define a 1D CNN model.')

cnnmodel = Sequential()
cnnmodel.add(embedding_layer)
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(GlobalMaxPooling1D())
cnnmodel.add(Dense(128, activation='relu'))
cnnmodel.add(Dense(2, activation='softmax'))

cnnmodel.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
#Train the model. Tune to validation set.
cnnmodel.fit(train_data, train_labels,
          batch_size=128,
          epochs=1, validation_data=(val_data, val_labels))
#Evaluate on test set:
score, acc = cnnmodel.evaluate(test_data, test_labels)
print('Test accuracy with CNN:', acc)

Define a 1D CNN model.
Test accuracy with CNN: 0.8298937082290649


### (3) LSTM Model with training your own embedding

In [None]:
print("Defining and training an LSTM model, training embedding layer on the fly")

#model
rnnmodel = Sequential()
rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))
rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnnmodel.add(Dense(2, activation='sigmoid'))
rnnmodel.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print('Training the RNN')

rnnmodel.fit(train_data, train_labels,
          batch_size=32,
          epochs=1,
          validation_data=(val_data, val_labels))
score, acc = rnnmodel.evaluate(test_data, test_labels,
                            batch_size=32)
print('Test accuracy with RNN:', acc)

Defining and training an LSTM model, training embedding layer on the fly
Training the RNN
Test accuracy with RNN: 0.7940000295639038


### (4) LSTM Model using pre-trained Embedding Layer

In [32]:
print("Defining and training an LSTM model, using pre-trained embedding layer")

rnnmodel2 = Sequential()
rnnmodel2.add(embedding_layer)
rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnnmodel2.add(Dense(2, activation='sigmoid'))
rnnmodel2.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print('Training the RNN')

rnnmodel2.fit(train_data, train_labels,
          batch_size=32,
          epochs=1,
          validation_data=(val_data, val_labels))
score, acc = rnnmodel2.evaluate(test_data, test_labels,
                            batch_size=32)
print('Test accuracy with RNN:', acc)

Defining and training an LSTM model, using pre-trained embedding layer
Training the RNN
Test accuracy with RNN: 0.8298937082290649
