In [178]:
#https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/
#https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

import os, pandas as pd
import re

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from nltk.corpus import stopwords

In [179]:
# Directory holding the gzip-compressed pickle shards that make up the training frame.
TRAIN_DIR = '../../dataset/df_train.pkl.gz/'
file_list = os.listdir(TRAIN_DIR)

# NOTE(review): unpickling can execute arbitrary code; only load shards that
# were produced by a trusted pipeline.
# Read every shard first and concatenate once. Growing df_train with pd.concat
# inside the loop re-copies all previously loaded rows for each new shard.
df_train = pd.concat(
    [pd.read_pickle(os.path.join(TRAIN_DIR, file), compression='gzip')
     for file in file_list]
)

# Combined text feature: the `mission` and `prgrm_dsc` columns joined with '; '.
df_train['mission_prgrm'] = df_train['mission'] + '; ' + df_train['prgrm_dsc']

len(df_train['mission_prgrm'])

229472

In [180]:
# Rows usable for training: both the mission text and the NTEE1 label are present.
# The filter does not depend on the random draw, so it is computed once rather
# than on every retry of the loop below.
labelled = df_train[df_train.mission.notna() & df_train.NTEE1.notna()]

small_num = 0
while small_num < 100:  # Make sure each category has at least 100 records.
    # Unseeded draw: a different 60000-row sample is produced on every run.
    trainDF = labelled.sample(60000)
    # Size of the rarest NTEE1 category in this draw (non-null EIN count per category).
    small_num = trainDF.groupby('NTEE1')['EIN'].count().min()

small_num

123

In [181]:
def clean_str(string):
    """Normalize one raw text record.

    Strips leading and trailing whitespace, then lower-cases the result.

    Args:
        string: the text to normalize.

    Returns:
        The trimmed, lower-cased text.
    """
    trimmed = string.strip()
    return trimmed.lower()

# Pull both columns out as arrays once, then build the parallel lists.
mission_texts = trainDF['mission_prgrm'].values
ntee_letters = trainDF['NTEE1'].values

# Cleaned text for every sampled record.
texts = [clean_str(raw_text) for raw_text in mission_texts]
# Each label is the code point of the record's NTEE1 value (ord() requires a
# single character).
labels = [ord(letter) for letter in ntee_letters]

In [182]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

MAX_NB_WORDS = 2000         # vocabulary cap handed to the tokenizer
MAX_SEQUENCE_LENGTH = 1000  # length every sequence is padded/truncated to

# `num_words` is the Keras 2 name of this argument. The Keras 1 spelling
# `nb_words` is deprecated, and the rest of this notebook already uses the
# Keras 2 API (filters=, kernel_size=, units=, epochs=).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word_index covers every token seen while fitting, not only the capped
# vocabulary, which is why the count printed here can exceed MAX_NB_WORDS.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Fixed-width integer matrix: one row per text, MAX_SEQUENCE_LENGTH columns.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)



Found 84204 unique tokens.


In [184]:
#labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)

# One-hot encode the labels. `labels` holds ord() of each NTEE1 letter, so
# subtracting ord('A') maps 'A'..'Z' onto columns 0..25.
NUM_CLASSES = 26
class_ids = np.asarray(labels, dtype=int) - ord('A')
# Validate before indexing: a code below 'A' would otherwise wrap around
# silently through negative indexing and mark the wrong class. This also
# catches re-running the cell after `labels` has already been one-hot encoded.
if class_ids.ndim != 1 or (
        class_ids.size
        and (class_ids.min() < 0 or class_ids.max() >= NUM_CLASSES)):
    raise ValueError(
        "labels must be a 1-D sequence of ord('A')..ord('Z') codes; "
        "re-run the cell that builds `labels` before this one")
# Row k of the identity matrix is the one-hot vector for class k.
labels = np.eye(NUM_CLASSES, dtype=int)[class_ids]
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)  # unseeded: the split differs on every run
data = data[indices]
labels = labels[indices]

# NOTE: despite the names, VALIDATION_SPLIT is the fraction of rows used for
# TRAINING and nb_validation_samples is the number of training rows. The
# first 70% goes to x_train/y_train and the remaining 30% to x_val/y_val.
VALIDATION_SPLIT = 0.7
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:nb_validation_samples]
y_train = labels[:nb_validation_samples]
x_val = data[nb_validation_samples:]
y_val = labels[nb_validation_samples:]

Shape of data tensor: (60000, 1000)
Shape of label tensor: (60000, 26)


In [186]:
# Peek at the first row of the training split.
# NOTE(review): the saved output for this cell is a stale AttributeError about
# 'head' from an earlier version of the cell; re-run to refresh it.
x_train[0]

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [187]:
# Map each word in the GloVe file to its embedding vector.
embeddings_index = {}
# `with` closes the file even if a line fails to parse. The explicit UTF-8
# encoding keeps reading independent of the platform's default encoding.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is: <word> <coef_1> ... <coef_n>, whitespace-separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

EMBEDDING_DIM = 100

# One row per tokenizer index, plus one extra row so that index 0 is addressable.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))

for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # Tokens without a pretrained vector keep their all-zero row.
        continue
    embedding_matrix[row] = vector

from keras.layers import Embedding

# Embedding lookup initialised from embedding_matrix; trainable=False keeps
# those weights fixed during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

Found 400000 word vectors.


In [192]:
from keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from keras.models import Model

# Imported here because only this cell needs it.
from keras.layers import GlobalMaxPooling1D

# Text CNN: frozen embeddings -> three Conv1D stages -> dense classifier.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(filters=256, kernel_size=5, activation='relu')(embedded_sequences)
x = MaxPooling1D(pool_size=5)(x)
x = Conv1D(filters=256, kernel_size=5, activation='relu')(x)
x = MaxPooling1D(pool_size=5)(x)
x = Conv1D(filters=256, kernel_size=5, activation='relu')(x)
# Global max pooling over the remaining time steps. This replaces
# MaxPooling1D(pool_size=35) + Flatten(), which was only "global" because 35
# happens to be the sequence length left when MAX_SEQUENCE_LENGTH is 1000.
# The output values and parameter counts are unchanged, and the model now
# keeps working if MAX_SEQUENCE_LENGTH is changed.
x = GlobalMaxPooling1D()(x)
x = Dense(units=256, activation='relu')(x)
x = Dense(units=128, activation='relu')(x)
# One softmax output per one-hot label column.
preds = Dense(units=26, activation='softmax')(x)

model = Model(sequence_input, preds)

model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=20, batch_size=500)

#128,5 ; 5; 128,5; 5; 128; 5; 35; 128; 26 : 67.64
# 3,,,25,: 29.61

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_26 (InputLayer)        (None, 1000)              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 1000, 100)         8420500   
_________________________________________________________________
conv1d_69 (Conv1D)           (None, 996, 256)          128256    
_________________________________________________________________
max_pooling1d_68 (MaxPooling (None, 199, 256)          0         
_________________________________________________________________
conv1d_70 (Conv1D)           (None, 195, 256)          327936    
_________________________________________________________________
max_pooling1d_69 (MaxPooling (None, 39, 256)           0         
_________________________________________________________________
conv1d_71 (Conv1D)           (None, 35, 256)           327936    
__________

In [157]:
# Evaluate on the held-out split. With the loss and the single 'acc' metric
# compiled above, the result is [loss, accuracy].
score = model.evaluate(x=x_val, y=y_val, batch_size=500, verbose=1)

score




[2.0775147477785745, 0.431999996304512]