# Exercise 3 : Text classification on the Ohsumed dataset
## 1. Data loading and preprocessing


In [41]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from collections import defaultdict
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
import nltk
from nltk.corpus import stopwords   # to get collection of stopwords
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from tensorflow.keras.layers import Bidirectional, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re
import matplotlib.pyplot as plt


In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maelkerichard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def get_info(path: str):
    data = list(os.walk(path))[1:]
    files = []
    for d in data:
        folder_name = d[0]
        for file in d[2]:
            files.append((folder_name.split('/')[-1], os.path.join(folder_name, file)))

    d = defaultdict(int)
    texts = defaultdict(list)
    for (cate, file) in files:
        with open(file, 'r') as outfile:
            text = outfile.read()
            texts[cate].append(text)
            words = text_to_word_sequence(text)
            for word in words:
                d[word] += 1
    words = sorted(d.items(), key=lambda x: x[1], reverse=True)
    return (texts, words)

In [18]:
english_stops = set(stopwords.words('english'))

In [31]:
def load_dataset(folder):

    # LOAD DATA
    path = '../ohsumed-first-20000-docs/' + folder
    texts, words = get_info(path)
    
    # CREATE DATAFRAME
    df = pd.DataFrame(columns=['category', 'article'])
    for cate in texts:
        for text in texts[cate]:
            df = pd.concat([df, pd.DataFrame({'category': [cate], 'article': [text]})], ignore_index=True)
    
    # PRE-PROCESS DATA
    df['article'] = df['article'].replace({'<.*?>': ''}, regex = True)          # remove html tag
    df['article'] = df['article'].replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    df['article'] = df['article'].apply(lambda article: [w for w in article.split() if w not in english_stops])  # remove stop words
    df['article'] = df['article'].apply(lambda article: [w.lower() for w in article])   # lower case
    
    # ENCODE CATEGORY
    df['category'] = df['category'].replace('C01', 0)
    df['category'] = df['category'].replace('C02', 1)
    df['category'] = df['category'].replace('C03', 2)
    df['category'] = df['category'].replace('C04', 3)
    df['category'] = df['category'].replace('C05', 4)
    df['category'] = df['category'].replace('C06', 5)
    df['category'] = df['category'].replace('C07', 6)
    df['category'] = df['category'].replace('C08', 7)
    df['category'] = df['category'].replace('C09', 8)
    df['category'] = df['category'].replace('C10', 9)
    df['category'] = df['category'].replace('C11', 10)
    df['category'] = df['category'].replace('C12', 11)
    df['category'] = df['category'].replace('C13', 12)
    df['category'] = df['category'].replace('C14', 13)
    df['category'] = df['category'].replace('C15', 14)
    df['category'] = df['category'].replace('C16', 15)
    df['category'] = df['category'].replace('C17', 16)
    df['category'] = df['category'].replace('C18', 17)
    df['category'] = df['category'].replace('C19', 18)
    df['category'] = df['category'].replace('C20', 19)
    df['category'] = df['category'].replace('C21', 20)
    df['category'] = df['category'].replace('C22', 21)
    df['category'] = df['category'].replace('C23', 22)

    x_data = df['article']
    
    y_data = df['category']
    return x_data, y_data

In [32]:
x_train, y_train = load_dataset('training')
x_test, y_test = load_dataset('test')

In [35]:
def get_max_length():
    review_length = []
    for article in x_train:
        review_length.append(len(article))

    return int(np.ceil(np.mean(review_length)))

In [39]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[  361   242    96 ...   119    96   354]
 [14808   668   464 ...     0     0     0]
 [ 1210    74    97 ...     0     0     0]
 ...
 [  669  1092  4038 ...  2897    21   669]
 [   39   240   132 ...     0     0     0]
 [    7  1270   254 ...   276  1859 13205]] 

Encoded X Test
 [[   74   837   711 ...     0     0     0]
 [ 1410   552    74 ...   552   240    45]
 [ 1037  1622    77 ...     0     0     0]
 ...
 [  128    21  1422 ...    30  3395  1536]
 [21723   470 12121 ...     0     0     0]
 [ 3106  5827   449 ...     0     0     0]] 

Maximum review length:  112


## Build Architecture/Model

In [42]:

# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 112, 32)           905888    
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 930785 (3.55 MB)
Trainable params: 930785 (3.55 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [46]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [47]:
history = model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])


Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.01514, saving model to models/LSTM.h5
Epoch 2/5
 2/82 [..............................] - ETA: 5s - loss: -228.3922 - accuracy: 0.0273

  saving_api.save_model(


Epoch 2: accuracy did not improve from 0.01514
Epoch 3/5
Epoch 3: accuracy did not improve from 0.01514
Epoch 4/5
Epoch 4: accuracy did not improve from 0.01514
Epoch 5/5
Epoch 5: accuracy did not improve from 0.01514
