# Word2Vec and Deep Learning Article Views

https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456

In [1]:
import numpy as np
import pandas as pd
import os
import re
import time
import math

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from tqdm import tqdm
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

tqdm.pandas()

Using TensorFlow backend.
  from pandas import Panel


In [2]:
# Report whether TensorFlow can see at least one GPU.
# `tf.test.is_gpu_available()` is deprecated; TensorFlow's own warning recommends
# `tf.config.list_physical_devices('GPU')`. `bool(...)` keeps the True/False output.
bool(tf.config.list_physical_devices('GPU'))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [2]:
# Load the training set from `train.csv` (path is relative to the working directory).
train = pd.read_csv('train.csv')

In [3]:
# Copy the `content` column out of the DataFrame as a plain Python list.
content = train['content'].tolist()

## Preprocessing

In [4]:
def preprocessing(content_list):
    """Tokenize and clean every document in ``content_list``.

    Each document is split with ``word_tokenize``; every token is lower-cased
    and stripped of ``string.punctuation`` characters; tokens that are not
    purely alphabetic afterwards, or that appear in the English stopword list,
    are dropped.

    Parameters
    ----------
    content_list : iterable
        Documents to clean; each item is passed to ``word_tokenize`` as-is.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per input document, in input order.
    """
    # Both lookups are identical for every document, so build them once here
    # instead of once per loop iteration.
    table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    processed_list = []

    for line in tqdm(content_list):
        # Lower-case first, then strip punctuation (same order as before).
        cleaned = (w.lower().translate(table) for w in word_tokenize(line))
        # `isalpha()` also rejects tokens that became empty after stripping.
        processed_list.append(
            [w for w in cleaned if w.isalpha() and w not in stop_words]
        )

    return processed_list

In [8]:
# Preprocessing - remove non-alphabetic symbols, then drop length-1 words.
# NOTE(review): this cell redefines `preprocessing`. When the notebook is run top to
# bottom it shadows the earlier token-list version and returns strings instead of
# lists of tokens - confirm which definition the later cells are meant to use.
def preprocessing(titles_array):
    """Strip non-alphabetic characters and single-letter words from each title.

    Parameters
    ----------
    titles_array : iterable of str
        Raw text strings to clean.

    Returns
    -------
    list of str
        One cleaned string per input, with the remaining words joined by a
        single space.
    """
    processed_array = []

    for title in tqdm(titles_array):

        # Keep only letters and whitespace. `\s` (rather than a literal space) keeps
        # tabs/newlines as separators, so words on adjacent lines are not glued together.
        processed = re.sub(r'[^a-zA-Z\s]', '', title)

        words = processed.split()

        # keep words that have length of more than 1 (e.g. gb, bb), remove those with length 1.
        processed_array.append(' '.join([word for word in words if len(word) > 1]))

    return processed_array

In [5]:
# Run `preprocessing` over the `content` column and store the result next to the
# raw data in a new `processed_content` column.
train['processed_content'] = preprocessing(train['content'])

100%|███████████████████████████████████████████████████████████████████████████| 16772/16772 [01:19<00:00, 210.16it/s]


## Training the Word2Vec Model

Word2Vec is a static word embedding: each word maps to a single vector, regardless of the context it appears in.

**Getting from Word Embedding to Doc Embedding**

- https://towardsdatascience.com/nlp-performance-of-different-word-embeddings-on-text-classification-de648c6262b
- 3 methods:
1. Simple Averaging of Word Embedding
2. TF-IDF Weighted Averaging on Word Embedding
3. Directly leverage Doc2Vec

In [46]:
# Dimensionality of each word vector; passed to Word2Vec as `size` and used as the
# column count of the embedding matrix built further down.
EMBEDDING_DIM = 100

In [47]:
# Train Word2Vec on the preprocessed articles, producing EMBEDDING_DIM-dimensional
# vectors with a context window of 5 and 4 worker threads.
model = Word2Vec(
    sentences = train['processed_content'],
    size = EMBEDDING_DIM,
    window = 5,
    min_count = 1,
    workers = 4,
)

In [48]:
# Vocab size: list every word in the trained model's vocabulary and count them
words = list(model.wv.vocab)
print('Vocabulary size: {}'.format(len(words)))

Vocabulary size: 111788


In [53]:
# Testing the model: list the words whose vectors are most similar to 'bear'
model.wv.most_similar('bear')

[('erode', 0.7844998836517334),
 ('defy', 0.7674049139022827),
 ('distorting', 0.7603688836097717),
 ('arrogance', 0.7515872120857239),
 ('induce', 0.7422409653663635),
 ('useless', 0.7407965660095215),
 ('indulge', 0.7380375862121582),
 ('conceive', 0.7355968356132507),
 ('unchecked', 0.7346436381340027),
 ('virtue', 0.7324776649475098)]

In [54]:
# Doing some math on the word vectors: analogy-style query where 'father' and
# 'female' contribute positively and 'male' negatively
model.wv.most_similar_cosmul(positive = ['father', 'female'], negative = ['male'])

[('daughter', 0.8678452372550964),
 ('son', 0.8435013890266418),
 ('remembered', 0.8427660465240479),
 ('sheldon', 0.8286240100860596),
 ('haymes', 0.8157896399497986),
 ('hackney', 0.811972439289093),
 ('stormin', 0.8090441226959229),
 ('husband', 0.807654082775116),
 ('wife', 0.805396556854248),
 ('dad', 0.8004140853881836)]

In [32]:
# Save model: write the trained word vectors to disk in word2vec text format
# (`binary = False`) so they can be reloaded without retraining
filename = 'word2vec_train2.txt'
model.wv.save_word2vec_format(filename, binary = False)

## Using Word2Vec Model for Training Classification Model

In [6]:
# Extract the embeddings from the stored file
# Embedding is size 111k (# words) x 100 (dimensions)
import os 

# Map each word to its embedding vector, read from the word2vec text file.
embeddings_index = {}
# `with` guarantees the file is closed even if parsing a line fails.
with open(os.path.join('', 'word2vec_train2.txt'), encoding = 'utf-8') as f:
    for line_no, line in enumerate(f):
        values = line.split()
        # The word2vec text format starts with a "<vocab_size> <vector_size>" header;
        # skip it so it is not stored as a word. (A real entry has 1 + EMBEDDING_DIM fields.)
        if line_no == 0 and len(values) == 2:
            continue
        # Tolerate blank lines instead of failing on values[0].
        if not values:
            continue
        word = values[0]
        # Parse the coefficients as numbers rather than keeping an array of strings.
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

In [7]:
# Vectorize the text samples into sequences of integer token ids
tokenizer_obj = Tokenizer()
# Fit the tokenizer on the text (builds the word -> integer index mapping)
tokenizer_obj.fit_on_texts(train['processed_content'])
# Generate the sequence of tokens: one list of integer ids per article
sequences = tokenizer_obj.texts_to_sequences(train['processed_content'])

In [8]:
# Length of the longest tokenized article (5587 in the recorded run).
# Measured on `sequences` - the data that actually gets padded - so the value is a
# token count even if `processed_content` holds strings rather than token lists.
max_length = max(len(seq) for seq in sequences)
# Get vocab size (+1 because the tokenizer's word indices start at 1)
vocab_size = len(tokenizer_obj.word_index) + 1

- `sequences` has length 16772 (one entry per article)
- `review_pad` has shape (16772, 5587)
- The vocabulary contains 111788 unique tokens

In [9]:
# Pad the sequences to a common length of `max_length` so they form one 2D array
review_pad = pad_sequences(sequences, maxlen = max_length)

# Mapping from each word to its integer index, as learned by the tokenizer
word_index = tokenizer_obj.word_index

In [12]:
# One row per tokenizer index; +1 because `word_index` values start at 1, leaving row 0 unused.
num_words = len(word_index) + 1
words_not_found = []
# Create the embedding matrix - map embeddings from the word2vec model for each word
# and create a matrix of word vectors (row i holds the vector of the word with index i)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Bounds guard: valid rows are 0 .. num_words - 1, so i == num_words must be skipped too.
    if i >= num_words:
        continue

    embedding_vector = embeddings_index.get(word)

    if embedding_vector is not None:
        # Assign the ith row of the embedding matrix to the embedding of that word,
        # converting explicitly in case the stored vector is not numeric yet.
        embedding_matrix[i] = np.asarray(embedding_vector, dtype = embedding_matrix.dtype)
    else:
        words_not_found.append(word)

# Count rows that are entirely zero. Testing the row sum instead would also count
# a non-zero row whose components happen to cancel out.
print('number of null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

number of null word embeddings: 1


In [13]:
# Sanity check: expect (num_words, EMBEDDING_DIM)
embedding_matrix.shape

(111789, 100)