# Word2Vec + NN

__Author__: Patrick Steiner

__Created__: 31.03.2020

__Version__: 2

__Description__: Creates sentence embedding based on tf-idf and word-embedding and feeds this in a NN




|Change ID | Date       |  Author             |Change Description                   |
|----------|------------|---------------------|--------------------------------------|
|#SK01     | 03.04.2020 | Severin Kappeler    |Re-define corpus with split by sentence rather than by document | 
|#SK02     | 03.04.2020 | Severin Kappeler    |Embedding matrix weighted by TF-IDF score for each word-embedding vector. Using here the "global" TF-IDF. For a given word the "global" TF-IDF score is the average of all the TF-IDF scores for that word over all documents.  | 
|#SK03     | 03.04.2020 | Severin Kappeler    |Apply tokenize.texts_to_sequence to sentence list rather than document list| 
|#SK04     | 03.04.2020 | Severin Kappeler    |Converting label (target) to be on sentence level rather than document level.      | 
|#SK05     | 03.04.2020 | Severin Kappeler    |Adding split to sentence part for features |


In [87]:
# see also 06-Natural-Language_processing notebooks. This one focuses only on doc2vec.
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk
import os
import smart_open
import collections
import scipy.stats as stats
import tensorflow as tf
import gensim
import tqdm
import warnings

from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from gensim.test.utils import get_tmpfile

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

from nltk import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
#from sklearn.model_selection import GridSearchCV, RandomSearch
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import xgboost

# Preprocessing

## Import

In [3]:
#os.chdir('/Users/patrickrs/Documents/GitLab/patrick-steiner/Exercises')
target = pd.read_csv('.. ../../../../severin-kappeler/06-NLP/data/train_target.csv', index_col = 0)
features = pd.read_csv('../../../../severin-kappeler/06-NLP/data/train_features.csv', index_col = 0)

In [4]:
features = features[0:1000]
target   = target[0:1000]


## Cleaning

In [5]:
#TO DO: Clean the columns (removing missing values)
nltk.download('stopwords')
STOP_WORDS = nltk.corpus.stopwords.words('english')

def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = re.sub("xxxx", "", sentence)
    sentence = re.sub("xxx", "", sentence)
    sentence = re.sub("xx", "", sentence)
    sentence = re.sub("\s\s+", " ", sentence)
       
    # stemming of words (seems not to affect accuracy, but should make things faster
    porter = PorterStemmer()
    words = word_tokenize(sentence)
    sentence = " ".join([porter.stem(word) for word in words])
      
    sentence = sentence.split(" ")
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)  
    sentence = " ".join(sentence)
    
    return sentence

#SK05
#splitting data into sentences
def tokenize_to_sentences(data):
    output_data = data.copy()
    output_data = output_data.apply(nltk.sent_tokenize)
        
    return output_data

[nltk_data] Downloading package stopwords to /home/sevi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
#STOP_WORDS

In [7]:
#runtime: 3'
#create list of all document strings
documents = features['Consumer complaint narrative']
#create list of document strings split by sentence (a list for each document where each document is represented by a list of sentences (strings))
doc_sent = tokenize_to_sentences(documents)
#create list of document strings split by cleaned sentences
doc_sent_clean = [[clean_sentence(sentence) for sentence in document] for document in doc_sent]

In [8]:
documents[0:2]

0    I have an installment loan out with Oklahoma C...
1    I was told by the Department of Education to c...
Name: Consumer complaint narrative, dtype: object

In [9]:
doc_sent[0:2]

0    [I have an installment loan out with Oklahoma ...
1    [I was told by the Department of Education to ...
Name: Consumer complaint narrative, dtype: object

In [10]:
doc_sent_clean[0:2]

[['instal loan oklahoma credit ok call cell phone numer time unknownblock number',
  'ask stop simpl fact answer call call result know tri get reach regard loan',
  'know cfpb regul compani hide ident'],
 ['wa told depart educ contact thi collect compani',
  'told depart educ collect compani loan belong becaus name chang debt seven year old',
  'accord rule fcra debt suppos delet system becaus inaccur']]

In [11]:
#'patiencei'  [sentence for doc in doc_sent_clean for sentence in doc]

In [12]:
#Build Corpus: #SK01
#runtime: 15''
def build_corpus(data):
    return [nltk.word_tokenize(sentence) for doc in data for sentence in doc]

corpus = build_corpus(doc_sent_clean)

#Old version:
# def build_corpus(data):
#     "Creates a list of lists containing words from each sentence"
#     corpus = []
#     for content in data:
#         corpus_temp = nltk.word_tokenize(content)
#         corpus.append(corpus_temp)
#     return corpus

# corpus = build_corpus(features)

In [13]:
#Number of sentences overall (can contain duplicates!):
len(corpus)

10960

In [14]:
#corpus[0:2]

# Embedding

## Word2Vec (Pretrained Google News)

In [15]:
# Importing pre-trained model, updating vocab 
# and training the model (takes long to run)
#runtime without google: 18''
model_2 = Word2Vec(size=300, min_count=1, window = 5)
model_2.build_vocab(corpus)
total_examples = model_2.corpus_count

# #for testing commented out:
# model_pt = gensim.models.KeyedVectors.load_word2vec_format('/home/sevi/Desktop/Work/PropulsionDS/Pretrained/GoogleNews-vectors-negative300.bin', binary=True)
# model_2.build_vocab([list(model_pt.vocab.keys())], update=True)
# model_2.intersect_word2vec_format('/home/sevi/Desktop/Work/PropulsionDS/Pretrained/GoogleNews-vectors-negative300.bin', binary=True, lockf=1.0)

# intersect_word2vec_format() will let you bring vectors from an external file into a model that's already had its own vocabulary initialized
# see https://tedboy.github.io/nlps/generated/generated/gensim.models.Word2Vec.intersect_word2vec_format.html
model_2.train(corpus, total_examples=total_examples, epochs=model_2.iter)

  from ipykernel import kernelapp as app


(470346, 548965)

In [16]:
#model_2.wv.vectors contains 26'668 vectors of length 300
print(len(model_2.wv.vectors))
print(len(model_2.wv.vectors[0]))
print(len(model_2.wv.vocab))


5586
300
5586


## TF-IDF

In [54]:
doc_clean = [' '.join(document) for document in doc_sent_clean]

In [110]:
#SK02
#this function creates the (for now only global) TF-IDF weights for each word.
def calc_tf_idf(cleaned_doc_str = None, global_word = True):
    #Calculate TF-IDF score for all words in the corpus
    tfidf = TfidfVectorizer(
          sublinear_tf=True
        , min_df=1
        , norm='l2'
        , encoding='latin-1'
        , ngram_range=(1,1)  #considering only 1-grams for now
        , token_pattern= '(?u)\\b\\w+\\b' #changed from default in order to also include single letters in case they appear in the corpus.
        , tokenizer=nltk.word_tokenize #we pass the corpus as words already.
        , stop_words=None #we already removed stopwords
    ) 
    tfidf_matrix = tfidf.fit_transform(cleaned_doc_str)#.toarray()
    
    if(global_word):
        #average all TF-idfs for all documents:
        tfidf_matrix = pd.DataFrame(data = tfidf_matrix.mean(axis = -0)[0]
                                    , columns = tfidf.get_feature_names()
                                    , index = ['TFIDF']
                                    )
    if(not(len(tfidf.get_feature_names()) == len(model_2.wv.vocab.keys()))):
        warnings.warn('The count of vocabulary from the word embedding does not coincide with the TF-IDF Vectorizer features.')
    
    return tfidf_matrix
    #None
    #tfidf_matrix contains TF-IDF scores for each document (in the rows) and word (in the columns)
    #tfidf.get_feature_names() gives you the words in the order 

In [98]:
global_word_tfidf = calc_tf_idf(doc_clean, global_word = True)
#global_word_tfidf

In [102]:
len(global_word_tfidf.columns)

5586

## Combine Word2Vec & TF-IDF

In [108]:
#loop through tfidf feature_names (the model vocab can be much larger since pretrained. But we are only interested in our own words from the corpus.)
embedding_matrix = np.zeros((len(global_word_tfidf.columns) + 1, 300))
for i, token in enumerate(global_word_tfidf.columns):
  embedding_matrix[i] = model_2.wv.get_vector(token) * global_word_tfidf[token].values[0]
#embedding_matrix is 26'668 x 300 matrix

# Active Learning

## Input Preparation

In [None]:
sent_clean = [sentence for document in doc_sent_clean for sentence in document]

In [118]:
#SK03
feature_size = 500
tokenizer = Tokenizer(num_words = feature_size, split = ' ')
# fit the tokenizer on our text
tokenizer.fit_on_texts(sent_clean) #SK03 changed to be applied son sent_clean instead of features (which is grouped by docs)

In [124]:
# put the tokens in a matrix
X = tokenizer.texts_to_sequences(sent_clean)
X = pad_sequences(X)
#len(X) = number of sentences in all docs

In [155]:
#SK04
def calc_target_per_sent(sentence_per_doc = None, target_original = None):
    
    current_target_idx = 0
    new_target = pd.DataFrame()
    for doc in sentence_per_doc:
        for sentence in doc:
            new_target = new_target.append(target.iloc[current_target_idx:current_target_idx+1])
            
        current_target_idx += 1

        
    return new_target.reset_index(drop = True)

In [157]:
target = calc_target_per_sent(doc_sent_clean, target)

In [160]:
# prepare the labels
y = pd.get_dummies(target)

In [161]:
print(len(X))
print(len(y))

10960
10960


In [162]:
# split in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

## Define and Run Model

In [163]:
# init model
model = Sequential()
# emmbed word vectors
model.add(Embedding(len(model_2.wv.vocab) + 1 ,
                    300,
                    input_length  = X.shape[1],
                    weights = [embedding_matrix],
                    trainable=False))
# learn the correlations
model.add(LSTM(300,return_sequences=False))
model.add(Dense(12,activation="softmax")) 
# output model skeleton
model.summary()
model.compile(optimizer="nadam",loss="categorical_crossentropy",metrics=['acc'])

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 300)          1676100   
_________________________________________________________________
lstm (LSTM)                  (None, 300)               721200    
_________________________________________________________________
dense (Dense)                (None, 12)                3612      
Total params: 2,400,912
Trainable params: 724,812
Non-trainable params: 1,676,100
_________________________________________________________________


In [164]:
batch = 64
epochs = 50
model.fit(X_train,y_train,batch,epochs)

Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50

KeyboardInterrupt: 

In [None]:
model.evaluate(X_test,y_test)