# Text classification 

In [1]:
import re
import xml.sax.saxutils as saxutils

#from BeautifulSoup import BeautifulSoup

from gensim.models.word2vec import Word2Vec

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM
from keras.layers.convolutional import Conv1D, MaxPooling1D, AveragePooling1D
from keras.layers.core import Flatten #, Dense, Dropout
from keras.optimizers import Adam

#from multiprocessing import cpu_count

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

from pandas import DataFrame

from sklearn.cross_validation import train_test_split

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 760 (CNMeM is disabled, cuDNN not available)


# Setup constants 

In [2]:
# Seed the random number generators so repeated runs are comparable.
# Seeding the stdlib generator does not affect NumPy's global generator,
# so both are seeded explicitly with the same value.
import os
import random

import numpy

random_seed = 1000
random.seed(random_seed)
numpy.random.seed(random_seed)

# Data and model folder. The empty last component makes the path end with the
# platform's separator, so file names can be appended by plain concatenation
# (on Windows this is exactly '.\\data\\').
data_folder = os.path.join('.', 'data', '')

# Word2Vec number of features (dimensionality of each word vector)
num_features = 500
# Limit each document to a fixed number of words; longer ones are truncated
document_max_num_words = 100

# Prepare content

In [3]:
import json


def _read_json(file_name):
    """Open ``file_name`` inside ``data_folder`` and return the decoded JSON object."""
    with open(data_folder + file_name, 'r') as handle:
        return json.load(handle)


# Document texts and their label vectors; both are looked up by document id below.
content = _read_json('content.json')
label = _read_json('label.json')

# Show one sample document: its label vector (with a legend for the five
# positions) followed by its raw text.
print('%s %s' % (label['1001'], 'topic, place, people, organization, exchanges'))
print(content['1001'])

[0, 1, 0, 0, 0] topic, place, people, organization, exchanges
sandoz ag said it planned a joint venture
to produce herbicides in the soviet union.
    the company said it had signed a letter of intent with the
soviet ministry of fertiliser production to form the first
foreign joint venture the ministry had undertaken since the
soviet union allowed western firms to enter into joint ventures
two months ago.
    the ministry and sandoz will each have a 50 pct stake, but
a company spokeswoman was unable to give details of the size of
investment or planned output.
 


# Tokenize contents

In [4]:
# Collection that will hold one token list per document
content_documents = []

# English stop-word list, stored as a set so membership tests are cheap
stop_words = set(stopwords.words('english'))

# Regex tokenizer: a token is any run of ASCII letters and apostrophes
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# WordNet lemmatizer applied to every token that survives stop-word filtering
lemmatizer = WordNetLemmatizer()

In [6]:
import nltk

# Make sure the 'punkt' NLTK data package is available locally before
# tokenizing; presumably it backs sent_tokenize, which tokenize() calls
# (TODO confirm against the installed NLTK version).
nltk.download('punkt')

def tokenize(document):
    """Convert a raw document string into a flat list of normalized words.

    The document is split into sentences with ``sent_tokenize``; each sentence
    is split into tokens by the module-level ``tokenizer``. Every token is
    lower-cased, dropped if it appears in ``stop_words``, and otherwise
    lemmatized with the module-level ``lemmatizer``.

    Returns the surviving words in document order (sentence boundaries are
    not kept).
    """
    words = []
    for sentence in sent_tokenize(document):
        for raw_token in tokenizer.tokenize(sentence):
            lowered = raw_token.lower()
            # Stop-word filtering is done on the lower-cased form, before lemmatizing.
            if lowered in stop_words:
                continue
            words.append(lemmatizer.lemmatize(lowered))

    return words

# Rebuild the tokenized corpus from scratch so re-running this cell does not
# append the documents a second time.
content_documents = []

# Row index of sample document '1001' within content_documents; it stays 0
# if that document id is not present.
key1001 = 0
for key in content.keys():
    # Tokenize each document exactly once and reuse the result for both the
    # debug print and the corpus.
    document_words = tokenize(content[key])
    if key == '1001':
        key1001 = len(content_documents)
        print(key1001)
        print(document_words)
    content_documents.append(document_words)

number_of_documents = len(content)

[nltk_data] Downloading package punkt to C:\Anaconda2\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
17743
[u'sandoz', u'ag', u'said', u'planned', u'joint', u'venture', u'produce', u'herbicide', u'soviet', u'union', u'company', u'said', u'signed', u'letter', u'intent', u'soviet', u'ministry', u'fertiliser', u'production', u'form', u'first', u'foreign', u'joint', u'venture', u'ministry', u'undertaken', u'since', u'soviet', u'union', u'allowed', u'western', u'firm', u'enter', u'joint', u'venture', u'two', u'month', u'ago', u'ministry', u'sandoz', u'pct', u'stake', u'company', u'spokeswoman', u'unable', u'give', u'detail', u'size', u'investment', u'planned', u'output']


# Word2Vec Model

In [7]:
# Train a fresh Gensim Word2Vec model on the tokenized corpus.
# Hyper-parameters are gathered in one place: vector length comes from the
# notebook-level num_features constant.
w2v_settings = dict(size=num_features, window=10, min_count=1)
w2v_model = Word2Vec(content_documents, **w2v_settings)

# NOTE(review): init_sims(replace=True) is understood to normalize the vectors
# in place and discard the raw ones -- confirm for the installed gensim version.
w2v_model.init_sims(replace=True)

# Vectorize content

In [8]:
import numpy

num_categories = 5

# X holds one (document_max_num_words x num_features) matrix per document,
# Y one label vector of length num_categories per document.
# Allocate directly as float32: zeros(...).astype(float32) would first build a
# float64 array of twice the size and then copy it.
X = numpy.zeros(shape=(number_of_documents, document_max_num_words, num_features), dtype=numpy.float32)
Y = numpy.zeros(shape=(number_of_documents, num_categories), dtype=numpy.float32)

empty_word = numpy.zeros(num_features, dtype=numpy.float32)

for idx, document in enumerate(content_documents):
    # Only the first document_max_num_words words are used; shorter documents
    # keep the zero rows X was initialised with as padding.
    for jdx, word in enumerate(document[:document_max_num_words]):
        if word in w2v_model:
            X[idx, jdx, :] = w2v_model[word]
        # Words missing from the model keep their all-zero row.

# content_documents was built by iterating content.keys(), so iterate the same
# dict here and look each label up by key. Iterating label.keys() instead is
# not guaranteed to visit the keys in the same order, which would pair
# documents with the wrong labels.
for idx, key in enumerate(content.keys()):
    Y[idx, :] = label[key]

# Show the vectorized form of sample document '1001'.
print(X[key1001])

[[-0.02157213 -0.04568234  0.00146626 ..., -0.00972886 -0.01741979
  -0.00713177]
 [-0.03147293 -0.0149641   0.03508525 ..., -0.03686867 -0.00689081
  -0.02585261]
 [-0.02510744 -0.00800088 -0.00946434 ...,  0.08622478  0.01505131
  -0.05790702]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
