### Doc to Vec Classifier 

This notebook create a document to vector classifier on the new group 20 data sets (build into sklearn)

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from keras.models import Sequential, Model
from keras.layers import Embedding, Dropout, Reshape, Activation, Input,GRU, Flatten, Dense, Dot, Conv1D, Bidirectional
import keras.backend as K
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
from keras.utils import multi_gpu_model
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from keras.layers import GlobalAveragePooling1D
import tensorflow as tf

import re
import nltk
from nltk.corpus import stopwords

import plotly.express as px
import os 
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
print('GPU avialible: {}'.format(tf.config.list_physical_devices('GPU')))


# gets the stop word list 
try:
    stop_words = set(stopwords.words('english'))
except:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Define the preprocessing function
def pre_processor(text, stopwords=stop_words):
    '''
    
    Basic Semi universial Text Preprocessing
    tokenizes dates,webaddress, emails, phone numbers and numbers
    
    param: text string of text 
    param: stopwords = list of strings of words to remove
    '''
    def mask_dates(text, token=' <date> '):
        text = re.sub("[\d]{1,2} [ADFJMNOS]\w* [\d]{4}", token, text)
        text = re.sub("[\d]{1,2}-[\d]{1,2}-[\d]{2}", token, text)
        text = re.sub("[\d]{1,2}/[\d]{1,2}/[\d]{4}", token, text)
        text = re.sub("[\d]{4}/[\d]{1,2}/[\d]{1,2}", token, text)
        text = re.sub("([\d]{1,2}\s(January|February|March|April|May|June|July|August|September|October|November|December)\s[\d]{4})", token, text)
        return text

    def mask_webaddr(text, token=' <webaddr> '):
        text = re.sub('http[s]?[:][^\s]+', token, text) # remove web address
        text =  re.sub('www[.][^\s]+', token, text) #remove 
        return text

    def mask_phone_numbers(text, token=' <phone> '):
        pattern = '[^A-Za-z]{0,1}[0-9]{3}[^A-Za-z]{0,1}-[0-9]{3}-[0-9]{4}'
        return re.sub(pattern, token, text)

    def mask_email(text, token='<email>'):
        pattern = '\S+@\S+[.]\S{3}'
        return re.sub(pattern, token, text)
    
    if isinstance(text, type(None)):
        return ''
    else:
        text = str(text)

    text = mask_phone_numbers(text)
    text = mask_webaddr(text)
    text = mask_dates(text)
    text = mask_email(text)
    text = re.sub('[0-9]+', ' <num> ', text)
    text = re.sub('[^a-zA-Z<>.]+', ' ', text)
    if isinstance(stopwords, type(None)):
        return text
    
    else:
        stopwords = set(stopwords)
        return ' '.join([word for word in text.split(' ') if word not in stopwords ])
        
pre_processor('this 1 is a  &^# 123 test https://www.google.com this is a \n  \
2012-1-12 test. www.google.com with sim numbers 12 and a phone (800)-111-2222 and 1-800-222-1252 and \
my email is chiefOfStaff@whitehouse.gov')


GPU avialible: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


'<num> <num> test <webaddr> <num> <date> test. <webaddr> sim numbers <num> phone <phone> <num> <phone> email <email>'

In [2]:
d = fetch_20newsgroups()
docs = list(map(pre_processor, d['data']))
labels = [d['target_names'][i] for i in d['target']]

docs[0][0:100], labels[1]

('From <email> thing Subject WHAT car Nntp Posting Host rac <num> .wam.umd.edu Organization University',
 'comp.sys.mac.hardware')

#### Tokenizeration
A tokenizer is used to convert the words to integers

In [3]:

# fit the tokenizer
num_words = 5000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(docs)

vocab_size = min([len(tokenizer.word_index.items()), num_words])
print('tokenizer vocab_size: {}'.format(vocab_size))


tokenizer vocab_size: 5000


In [33]:
tokenizer_labels = Tokenizer()
labels_split = [d.split('.') for d in labels]
tokenizer_labels.fit_on_texts(labels_split)

labels_array = pd.DataFrame(tokenizer_labels.texts_to_matrix(labels_split), 
                            columns= [''] + list(tokenizer_labels.word_index.keys()), index=labels)
print(labels[0:5])
labels_array.head()

['rec.autos', 'comp.sys.mac.hardware', 'comp.sys.mac.hardware', 'comp.graphics', 'sci.space']


Unnamed: 0,Unnamed: 1,comp,rec,sci,misc,talk,politics,sport,sys,hardware,...,electronics,ibm,pc,forsale,graphics,mac,mideast,guns,alt,atheism
rec.autos,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comp.sys.mac.hardware,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
comp.sys.mac.hardware,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
comp.graphics,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
sci.space,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Data Splitting

In [38]:

# split the data inot training and test sets 
X_train, X_test, y_train, y_test = train_test_split(docs, labels_array, test_size=.25) 


print('train_labels: {0} test_labels: {1}'.format(y_train.shape, y_test.shape))

train_labels: (8485, 34) test_labels: (2829, 34)


####  Data Generation Function

A data generator is used to create random samples of documents and labels
This is used to generate training and test samples from documents and labels

In [34]:
def random_data_gen(docs, labels_array,  n_samples=10):    
    n_docs = len(docs)
    index = np.arange(n_docs)
    for _ in range(n_samples):
        i = np.random.choice(index, 1)[0]
        x = np.reshape(tokenizer.texts_to_sequences([docs[i]])[0], (1,-1))
        y = np.reshape(labels_array[i, :], (1, -1))
        
        yield x, y
g = random_data_gen(docs, labels_array.values)
x, y = next(g)
print(x.shape, y.shape)

(1, 105) (1, 34)


#### Model Definition
Here a basic embedding model is created, this uses the skip grams to classifier whether the word and the context are from the document, or from a negative sample

This model is throw away, only the word_embedding weights are used (for word tom vec)

In [35]:
n_outputs = y.shape[1]
n_embedding_dims = 25

def get_model_d2v():
        inputs = Input(shape=(None,), name= 'variable-sequence-len-input')
        emb_word = Embedding(vocab_size + 1,
                         output_dim=n_embedding_dims,
                         embeddings_initializer='glorot_uniform',
                         input_length=None,
                         name = 'embedding', 
                         trainable=True)(inputs)
        conv = Conv1D(20, kernel_size=2)(emb_word)
        #lstm = Bidirectional(GRU(10, return_sequences=True))(conv)
        avg = GlobalAveragePooling1D(name='averaging')(conv)
        outputs = Dense(n_outputs, activation='sigmoid')(avg)
        model = Model(inputs=inputs, outputs=outputs, name='skipgram-classifier')
        model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])    
        return model
model = get_model_d2v()
model.summary()



Model: "skipgram-classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
variable-sequence-len-input  (None, None)              0         
_________________________________________________________________
embedding (Embedding)        (None, None, 25)          125025    
_________________________________________________________________
conv1d_10 (Conv1D)           (None, None, 20)          1020      
_________________________________________________________________
averaging (GlobalAveragePool (None, 20)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 34)                714       
Total params: 126,759
Trainable params: 126,759
Non-trainable params: 0
_________________________________________________________________


#### Model Fitting
Call backs are used to reduce learning rate on loss reduction platue, as well as save only the best model weights, and stop early if model fails to improve

In [39]:
weights_path = "weights_best.hdf5"

# save only the best weights
checkpoint = ModelCheckpoint(weights_path ,mode='max' ,monitor='val_accuracy', verbose=1, save_best_only=True)

learning_rate = .001

#update the optimizer learning rate
K.set_value(model.optimizer.lr,learning_rate)

lrPlateauReductionFactor = .5
lrMin = 0.000001

# reduces learning rate on performance platu
lrCheckPoint = ReduceLROnPlateau(monitor = 'val_loss', factor=lrPlateauReductionFactor, min=lrMin)

# stops training whenmodel fails to improve
esm =  EarlyStopping(patience=5, monitor='val_accuracy',mode='max')

#establish how many steps in a epoch and how many epoch
n_steps_per_epoch = 500
n_epochs = 20

# sets up the training generator
train_gen = random_data_gen(X_train, y_train.values, n_steps_per_epoch * n_epochs )
test_gen = random_data_gen(X_test, y_test.values, n_steps_per_epoch * n_epochs )
# fit the model 

with tf.device('/CPU:0'):
    history = model.fit(train_gen,
                        epochs=n_epochs, 
                        steps_per_epoch= n_steps_per_epoch, 
                        validation_data=test_gen,
                        validation_steps=n_steps_per_epoch,
                        callbacks=[esm, lrCheckPoint, checkpoint], shuffle=True, verbose=0) 



Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.




Epoch 00001: val_accuracy improved from -inf to 0.21800, saving model to weights_best.hdf5

Epoch 00002: val_accuracy improved from 0.21800 to 0.33800, saving model to weights_best.hdf5

Epoch 00003: val_accuracy improved from 0.33800 to 0.44600, saving model to weights_best.hdf5

Epoch 00004: val_accuracy did not improve from 0.44600

Epoch 00005: val_accuracy did not improve from 0.44600

Epoch 00006: val_accuracy did not improve from 0.44600

Epoch 00007: val_accuracy did not improve from 0.44600

Epoch 00008: val_accuracy improved from 0.44600 to 0.51600, saving model to weights_best.hdf5

Epoch 00009: val_accuracy improved from 0.51600 to 0.61000, saving model to weights_best.hdf5

Epoch 00010: val_accuracy improved from 0.61000 to 0.61400, saving model to weights_best.hdf5

Epoch 00011: val_accuracy improved from 0.61400 to 0.64600, saving model to weights_best.hdf5

Epoch 00012: val_accuracy improved from 0.64600 to 0.65000, saving model to weights_best.hdf5

Epoch 00013: val_a

#### Extracting out the word embeddings

## Create a Document Classifier 
uses Global Average pooling and embedding layer to create document to vectorizer.

This essententally uses the the word vectors of a particular document and averages across.  This could be trainable as a classifier (multi class in the case of topic modeling)

In [40]:

def doc_2_vect(docs):
    if isinstance(docs, str):
        docs = [docs]
    gen = tokenizer.texts_to_sequences_generator(docs)
    output = np.zeros((len(docs), model.output.shape[1]))
    for i in range(len(docs)):
        temp_array = np.reshape(next(gen), (1, -1))
        with tf.device('/CPU:0'):
            preds = model.predict(temp_array).flatten()
        output[i, :] = preds
    return output
with tf.device('/CPU:0'):
    print(doc_2_vect(['this is a test', 'this is another test']))
    

[[3.06219961e-14 5.32610109e-03 2.40701149e-04 6.72304630e-03
  1.27896888e-03 1.97153008e-06 4.53112443e-06 1.44648020e-05
  2.33357330e-03 2.50182045e-03 2.38946825e-07 5.62631703e-06
  9.23468804e-08 9.80955832e-08 2.97179417e-04 2.14598422e-05
  1.46448874e-04 4.07984015e-04 6.35742734e-04 7.91340019e-04
  1.25063409e-04 1.12805123e-04 4.12828813e-04 6.26836380e-04
  3.32174893e-03 8.35678482e-04 9.56432486e-04 5.68333955e-04
  3.14941932e-03 9.09978058e-04 9.00453642e-07 6.01947795e-06
  1.08869131e-07 1.76104194e-07]
 [2.25090353e-17 1.10203139e-02 3.80553538e-05 2.06308323e-03
  2.10928483e-04 1.10102349e-09 4.93113328e-09 1.26797784e-06
  7.14323856e-03 8.27598386e-03 1.36267914e-10 6.31333080e-07
  4.20491766e-11 4.63296727e-11 4.77659778e-05 1.20628488e-06
  7.61990123e-06 6.03162953e-05 7.70900660e-05 1.26313433e-04
  2.97109145e-05 2.30717415e-05 5.24354167e-04 9.46007494e-04
  1.15057558e-03 1.55068538e-03 1.37000508e-03 3.16808204e-04
  4.18218039e-03 2.06008041e-03 6.733

#### PCA combined with TSNE as Dimension Reduction
PCA and TSNE are used here to reduce the dimensions to 3d so that the embedding space may be visualized

This is used for vizualization of the document to vector process.

If this works corretly, then documents of the same topic should show up in the same region of the TSNE

In [43]:
steps = [('pca', PCA(10)), ('tsne', TSNE(2))]
pipe = Pipeline(steps=steps)


document_vectors = doc_2_vect(X_test)

colors = [w.split('.')[0] for w in y_test.index]
comps = pipe.fit_transform(document_vectors)
comps_df = pd.DataFrame(comps, index=y_test.index)
comps_df.head()


Unnamed: 0,0,1
comp.os.ms-windows.misc,-8.240011,-52.959072
talk.politics.mideast,58.0145,-1.68608
talk.religion.misc,40.51442,12.686135
sci.space,-10.475912,31.431793
comp.sys.mac.hardware,-7.937212,-22.115822


#### 3D Scatter Plot with Plottly 

In [44]:
fig = px.scatter(comps_df.reset_index(), x=0, y=1, color=colors, hover_data=['index'])
fig.show()