In [None]:
#Reuters-21578 text classification with Gensim and Keras
#https://www.bonaccorso.eu/2016/08/02/reuters-21578-text-classification-with-gensim-and-keras/

In [1]:
import re
import xml.sax.saxutils as saxutils

from bs4 import BeautifulSoup

from gensim.models.word2vec import Word2Vec

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM

from multiprocessing import cpu_count

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

from pandas import DataFrame

from sklearn.cross_validation import train_test_split

import numpy as np

Using TensorFlow backend.


In [4]:
# Set Numpy random seed
import random
random.seed(1000)

# Newsline folder and format
data_folder = '/Users/nhu2000/projects/SimilarSearch/keras/reuters21578/'

sgml_number_of_files = 22
sgml_file_name_template = 'reut2-NNN.sgm'

# Category files
category_files = {
    'to_': ('Topics', 'all-topics-strings.lc.txt'),
    'pl_': ('Places', 'all-places-strings.lc.txt'),
    'pe_': ('People', 'all-people-strings.lc.txt'),
    'or_': ('Organizations', 'all-orgs-strings.lc.txt'),
    'ex_': ('Exchanges', 'all-exchanges-strings.lc.txt')
}

num_features = 500
# Limit each newsline to a fixed number of words
document_max_num_words = 100
# Selected categories
selected_categories = ['pl_usa']

In [5]:
#Prepare documents and categories

In [6]:
# Create category dataframe

# Read all categories
category_data = []

for category_prefix in category_files.keys():
    with open(data_folder + category_files[category_prefix][1], 'r') as file:
        for category in file.readlines():
            category_data.append([category_prefix + category.strip().lower(), 
                                  category_files[category_prefix][0], 
                                  0])

# Create category dataframe
news_categories = DataFrame(data=category_data, columns=['Name', 'Type', 'Newslines'])

In [7]:
def update_frequencies(categories):
    for category in categories:
        idx = news_categories[news_categories.Name == category].index[0]
        f = news_categories.get_value(idx, 'Newslines')
        news_categories.set_value(idx, 'Newslines', f+1)
    
def to_category_vector(categories, target_categories):
    vector = np.zeros(len(target_categories)).astype(float)
    
    for i in range(len(target_categories)):
        if target_categories[i] in categories:
            vector[i] = 1.0
    
    return vector

In [8]:
# Parse SGML files
document_X = {}
document_Y = {}

def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)

# Iterate all files
for i in range(sgml_number_of_files):
    if i < 10:
        seq = '00' + str(i)
    else:
        seq = '0' + str(i)
        
    file_name = sgml_file_name_template.replace('NNN', seq)
    print('Reading file: %s' % file_name)
    
    with open(data_folder + file_name, 'r') as file:
        content = BeautifulSoup(file.read().lower())
        
        for newsline in content('reuters'):
            document_categories = []
            
            # News-line Id
            document_id = newsline['newid']
            
            # News-line text
            document_body = strip_tags(str(newsline('text')[0].body)).replace('reuter\n&#3;', '')
            document_body = unescape(document_body)
            
            # News-line categories
            topics = newsline.topics.contents
            places = newsline.places.contents
            people = newsline.people.contents
            orgs = newsline.orgs.contents
            exchanges = newsline.exchanges.contents
            
            for topic in topics:
                document_categories.append('to_' + strip_tags(str(topic)))
                
            for place in places:
                document_categories.append('pl_' + strip_tags(str(place)))
                
            for person in people:
                document_categories.append('pe_' + strip_tags(str(person)))
                
            for org in orgs:
                document_categories.append('or_' + strip_tags(str(org)))
                
            for exchange in exchanges:
                document_categories.append('ex_' + strip_tags(str(exchange)))
                
            # Create new document    
            update_frequencies(document_categories)
            
            document_X[document_id] = document_body
            document_Y[document_id] = to_category_vector(document_categories, selected_categories)

Reading file: reut2-000.sgm




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html5lib")

  markup_type=markup_type))


Reading file: reut2-001.sgm
Reading file: reut2-002.sgm
Reading file: reut2-003.sgm
Reading file: reut2-004.sgm
Reading file: reut2-005.sgm
Reading file: reut2-006.sgm
Reading file: reut2-007.sgm
Reading file: reut2-008.sgm
Reading file: reut2-009.sgm
Reading file: reut2-010.sgm
Reading file: reut2-011.sgm
Reading file: reut2-012.sgm
Reading file: reut2-013.sgm
Reading file: reut2-014.sgm
Reading file: reut2-015.sgm
Reading file: reut2-016.sgm
Reading file: reut2-017.sgm
Reading file: reut2-018.sgm
Reading file: reut2-019.sgm
Reading file: reut2-020.sgm
Reading file: reut2-021.sgm


In [9]:
#Top 20 categories (by number of newslines)
news_categories.sort_values(by='Newslines', ascending=False, inplace=True)
news_categories.head(20)

Unnamed: 0,Name,Type,Newslines
161,pl_usa,Places,12542
533,to_earn,Topics,3987
498,to_acq,Topics,2448
158,pl_uk,Places,1489
84,pl_japan,Places,1138
31,pl_canada,Places,1104
571,to_money-fx,Topics,801
526,to_crude,Topics,634
543,to_grain,Topics,628
167,pl_west-germany,Places,567


In [None]:
#Tokenize newsline documents

In [10]:
# Load stop-words
stop_words = set(stopwords.words('english'))

# Initialize tokenizer
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Tokenized document collection
newsline_documents = []

In [11]:
def tokenize(document):
    words = []

    for sentence in sent_tokenize(document):
        tokens = [lemmatizer.lemmatize(t.lower()) for t in tokenizer.tokenize(sentence) if t.lower() not in stop_words]
        words += tokens

    return words

# Tokenize
for key in document_X.keys():
    newsline_documents.append(tokenize(document_X[key]))

number_of_documents = len(document_X)


In [12]:
# Create new Gensim Word2Vec model
w2v_model = Word2Vec(newsline_documents, size=num_features, min_count=1, window=10, workers=cpu_count())
w2v_model.init_sims(replace=True)
w2v_model.save(data_folder + 'reuters.word2vec')

In [13]:
#Word2Vec Model

In [14]:
# Load an existing Word2Vec model
w2v_model = Word2Vec.load(data_folder + 'reuters.word2vec')

In [15]:
#Vectorize each document

In [16]:
num_categories = len(selected_categories)
X = np.zeros(shape=(number_of_documents, document_max_num_words, num_features)).astype(float)
Y = np.zeros(shape=(number_of_documents, num_categories)).astype(float)

empty_word = np.zeros(num_features).astype(float)

for idx, document in enumerate(newsline_documents):
    for jdx, word in enumerate(document):
        if jdx == document_max_num_words:
            break
            
        else:
            if word in w2v_model:
                X[idx, jdx, :] = w2v_model[word]
            else:
                X[idx, jdx, :] = empty_word

for idx, key in enumerate(document_Y.keys()):
    Y[idx, :] = document_Y[key]

In [17]:
#Split training and test sets

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [19]:
#Create Keras model

In [20]:
model = Sequential()

model.add(LSTM(int(document_max_num_words*1.5), input_shape=(document_max_num_words, num_features)))
model.add(Dropout(0.3))
model.add(Dense(num_categories))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
#Train and evaluate model

In [21]:
# Train model
model.fit(X_train, Y_train, batch_size=128, epochs=5, validation_data=(X_test, Y_test))

# Evaluate model
score, acc = model.evaluate(X_test, Y_test, batch_size=128)
    
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)



Train on 15104 samples, validate on 6474 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score: 0.6788
Accuracy: 0.5846
