# Exercise 3 : Text classification on the Ohsumed dataset
## 1. Data loading and preprocessing


In [89]:
import os
from collections import defaultdict
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
import nltk
from nltk.corpus import stopwords   # to get collection of stopwords
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from tensorflow.keras.layers import Bidirectional, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import GRU
import re
import matplotlib.pyplot as plt
from keras.src.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, SimpleRNN
from tensorflow.keras import Model, Input

In [90]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maelkerichard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [91]:
def get_info(path: str):
    data = list(os.walk(path))[1:]
    files = []
    for d in data:
        folder_name = d[0]
        for file in d[2]:
            files.append((folder_name.split('/')[-1], os.path.join(folder_name, file)))

    d = defaultdict(int)
    texts = defaultdict(list)
    for (cate, file) in files:
        with open(file, 'r') as outfile:
            text = outfile.read()
            texts[cate].append(text)
            words = text_to_word_sequence(text)
            for word in words:
                d[word] += 1
    words = sorted(d.items(), key=lambda x: x[1], reverse=True)
    return (texts, words)

In [92]:
english_stops = set(stopwords.words('english'))

In [93]:
def load_dataset(folder):

    # LOAD DATA
    path = '../ohsumed-first-20000-docs/' + folder
    texts, words = get_info(path)
    
    # CREATE DATAFRAME
    df = pd.DataFrame(columns=['category', 'article'])
    for cate in texts:
        for text in texts[cate]:
            df = pd.concat([df, pd.DataFrame({'category': [cate], 'article': [text]})], ignore_index=True)
    
    # PRE-PROCESS DATA
    df['article'] = df['article'].replace({'<.*?>': ''}, regex = True)          # remove html tag
    df['article'] = df['article'].replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    df['article'] = df['article'].apply(lambda article: [w for w in article.split() if w not in english_stops])  # remove stop words
    df['article'] = df['article'].apply(lambda article: [w.lower() for w in article])   # lower case
    
    # ENCODE CATEGORY
    df['category'] = df['category'].replace('C01', 0)
    df['category'] = df['category'].replace('C02', 1)
    df['category'] = df['category'].replace('C03', 2)
    df['category'] = df['category'].replace('C04', 3)
    df['category'] = df['category'].replace('C05', 4)
    df['category'] = df['category'].replace('C06', 5)
    df['category'] = df['category'].replace('C07', 6)
    df['category'] = df['category'].replace('C08', 7)
    df['category'] = df['category'].replace('C09', 8)
    df['category'] = df['category'].replace('C10', 9)
    df['category'] = df['category'].replace('C11', 10)
    df['category'] = df['category'].replace('C12', 11)
    df['category'] = df['category'].replace('C13', 12)
    df['category'] = df['category'].replace('C14', 13)
    df['category'] = df['category'].replace('C15', 14)
    df['category'] = df['category'].replace('C16', 15)
    df['category'] = df['category'].replace('C17', 16)
    df['category'] = df['category'].replace('C18', 17)
    df['category'] = df['category'].replace('C19', 18)
    df['category'] = df['category'].replace('C20', 19)
    df['category'] = df['category'].replace('C21', 20)
    df['category'] = df['category'].replace('C22', 21)
    df['category'] = df['category'].replace('C23', 22)

    # Join words in the articles to form a single string per article
    df['article'] = df['article'].apply(lambda x: ' '.join(x))

    x_data = df['article'].tolist()
    y_data = df['category']
    return x_data, y_data

In [94]:
x_train, y_train = load_dataset('training')
x_test, y_test = load_dataset('test')

In [95]:
def get_max_length():
    review_length = []
    for article in x_train:
        review_length.append(len(article))

    return int(np.ceil(np.mean(review_length)))

In [96]:
# Hyperparameters (adjust as needed)
EMBEDDING_DIM = 100  # Embedding dimensions
MAX_SEQUENCE_LENGTH = 500  # Adjust based on your data
NUM_CLASSES = 23  # Total number of classes
LEARNING_RATE = 0.001  # Start with this, adjust if necessary
BATCH_SIZE = 64
EPOCHS = 5
LSTM_OUT = 64

In [101]:
# Tokenization and Padding
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(x_train)
sequences_train = tokenizer.texts_to_sequences(x_train)
sequences_test = tokenizer.texts_to_sequences(x_test)

max_length = get_max_length()

x_train_padded = pad_sequences(sequences_train, maxlen=max_length, padding='post', truncating='post')
x_test_padded = pad_sequences(sequences_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(tokenizer.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum article length: ', max_length)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [102]:
y_train

0         7
1         7
2         7
3         7
4         7
         ..
10428    10
10429    10
10430    10
10431    10
10432    10
Name: category, Length: 10433, dtype: int64

In [98]:
# One-hot encode the labels
y_train_encoded = to_categorical(y_train, num_classes=NUM_CLASSES)
y_test_encoded = to_categorical(y_test, num_classes=NUM_CLASSES)

## Build Architecture/Model

In [99]:
def create_lstm_model():
    model = Sequential()
    model.add(Embedding(total_words, EMBEDDING_DIM, input_length=max_length))
    model.add(LSTM(LSTM_OUT))
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_gru_model():
    model = Sequential()
    model.add(Embedding(total_words, EMBEDDING_DIM, input_length=max_length))
    model.add(GRU(LSTM_OUT))
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def create_rnn_model():
    model = Sequential()
    model.add(Embedding(total_words, EMBEDDING_DIM, input_length=max_length))
    model.add(SimpleRNN(LSTM_OUT))
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def create_cnn_model():
    model = Sequential()
    model.add(Embedding(total_words, EMBEDDING_DIM, input_length=max_length))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def create_lstm_cnn_model():
    model = Sequential()
    model.add(Embedding(total_words, EMBEDDING_DIM, input_length=max_length))
    model.add(Conv1D(64, 5, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(LSTM(LSTM_OUT))
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def create_stacked_lstm_model():
    model = Sequential()
    model.add(Embedding(total_words, EMBEDDING_DIM, input_length=max_length))
    model.add(LSTM(LSTM_OUT, return_sequences=True))
    model.add(LSTM(LSTM_OUT))
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [100]:
models = [create_lstm_model(), create_gru_model(), create_cnn_model(), create_lstm_cnn_model(), create_stacked_lstm_model()]

model_histories = []

for model in models:
    print(model.summary())
    history = model.fit(x_train_padded, y_train_encoded, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(x_test_padded, y_test_encoded))
    model_histories.append(history)

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 909, 100)          2830900   
                                                                 
 lstm_24 (LSTM)              (None, 64)                42240     
                                                                 
 dense_18 (Dense)            (None, 23)                1495      
                                                                 
Total params: 2874635 (10.97 MB)
Trainable params: 2874635 (10.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/5

KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(12, 6))

for history in model_histories:
    val_acc = history.history['val_accuracy']
    epochs = range(1, len(val_acc) + 1)
    plt.plot(epochs, val_acc, label='Validation accuracy of model')

plt.title('Comparison of Different Models')
plt.xlabel('Epochs')
plt.ylabel('Validation Accuracy')
plt.legend()
plt.show()


En conclusion de ce TP sur la classification de texte avec le jeu de données Ohsumed, plusieurs étapes cruciales ont été mises en œuvre. Nous avons commencé par le chargement et la préparation des données, incluant la suppression des mots inutiles, le nettoyage et la normalisation des textes. Différents modèles ont été construits et évalués, notamment LSTM, GRU, RNN, CNN et une combinaison de LSTM et CNN.

Chacun de ces modèles a été entraîné avec des hyperparamètres ajustés pour améliorer les performances. Les résultats des expérimentations montrent des différences dans l'efficacité de chaque modèle, comme observé dans les courbes de précision de validation. Ces résultats soulignent l'importance de choisir le bon modèle et les bons hyperparamètres pour un problème de classification de texte spécifique.