# Word2Vec and Deep Learning Article Views

https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456

In [10]:
import numpy as np
import pandas as pd
import os
import re
import time
import math

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from tqdm import tqdm
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

tqdm.pandas()

In [2]:
# `tf.test.is_gpu_available()` is deprecated; TensorFlow's own warning points to
# `tf.config.list_physical_devices('GPU')`, which returns the list of visible GPUs.
# bool(...) turns that list into a plain True/False for display.
bool(tf.config.list_physical_devices('GPU'))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [11]:
# Load the training set; 'train.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
train = pd.read_csv('train.csv')

In [12]:
# Plain Python list holding the raw 'content' column.
# NOTE(review): `content` is not referenced by any later cell visible here
# (preprocessing reads train['content'] directly) — confirm it is still needed.
content = train['content'].tolist()

## Preprocessing

In [13]:
def preprocessing(content_list):
    """Tokenise and clean every document in ``content_list``.

    Each document is split with ``word_tokenize``, lower-cased, stripped of
    the characters in ``string.punctuation``, reduced to purely alphabetic
    tokens, and filtered against NLTK's English stop-word list.

    Parameters
    ----------
    content_list : iterable of str
        Raw documents, one string per document.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per input document, in input order.
    """
    # Both lookups are the same for every document, so build them once here
    # instead of on every loop iteration.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    processed_list = []

    for line in tqdm(content_list):
        # Convert to lower case
        tokens = [w.lower() for w in word_tokenize(line)]
        # Delete punctuation characters inside each token
        # (e.g. a token "e-mail" becomes "email")
        stripped = [w.translate(punctuation_table) for w in tokens]
        # Keep alphabetic tokens only, and drop stop words
        words = [w for w in stripped if w.isalpha() and w not in stop_words]

        processed_list.append(words)

    return processed_list

In [5]:
# Preprocessing - keep only letters, and remove length-1 words.
# NOTE(review): this cell reuses the name `preprocessing`, which this notebook also
# gives to the tokenising function that returns a list of tokens per document.
# Whichever cell runs last wins. This version returns one *string* per title, so it
# must not be the active definition when `train['processed_content']` is built for
# Word2Vec — consider giving it a distinct name.
def preprocessing(titles_array):
    """Strip non-letters and one-character words from each title.

    Parameters
    ----------
    titles_array : iterable of str
        Raw title strings.

    Returns
    -------
    list of str
        One cleaned string per input title, in input order, with the
        surviving words joined by single spaces.
    """
    # Delete everything that is neither an ASCII letter nor whitespace.
    # All whitespace is preserved (not just ' ') so that words separated by a
    # tab or newline are not glued together before the split below.
    non_alpha = re.compile(r'[^a-zA-Z\s]')

    processed_array = []

    for title in tqdm(titles_array):
        words = non_alpha.sub('', title).split()

        # keep words that have length of more than 1 (e.g. gb, bb), remove those with length 1.
        processed_array.append(' '.join(word for word in words if len(word) > 1))

    return processed_array

In [14]:
# Preprocessing the words
# The result is stored as a new column and is later passed to Word2Vec as
# `sentences`, so `preprocessing` here needs to be the variant that returns a
# list of tokens per document (not the one returning a joined string).
train['processed_content'] = preprocessing(train['content'])

100%|███████████████████████████████████████████████████████████████████████████| 16772/16772 [01:15<00:00, 223.61it/s]


## Training the Word2Vec Model

Word2Vec is a static word embedding: each word is assigned a single vector, regardless of the context in which it appears.

**Getting from Word Embedding to Doc Embedding**

Reference: https://towardsdatascience.com/nlp-performance-of-different-word-embeddings-on-text-classification-de648c6262b

Three methods:

1. Simple averaging of the word embeddings
2. TF-IDF-weighted averaging of the word embeddings
3. Using Doc2Vec directly

In [15]:
# Dimensionality of each word vector; passed to Word2Vec as `size`.
EMBEDDING_DIM = 100

In [16]:
# Train Word2Vec on the tokenised documents.
# NOTE(review): `size` is the gensim < 4.0 keyword (4.0 renamed it `vector_size`).
model = Word2Vec(
    sentences=train['processed_content'],
    size=EMBEDDING_DIM,
    min_count=1,  # keep every word, including those seen only once
    window=5,
    workers=4,
)

In [17]:
# Vocab size
# list() over `model.wv.vocab` collects the words the model learned vectors for;
# its length is the vocabulary size reported below.
words = list(model.wv.vocab)
print('Vocabulary size: {}'.format(len(words)))

Vocabulary size: 111788


In [42]:
# Testing the model
# Sanity check: list the words the model considers most similar to 'huntsman',
# each paired with its similarity score.
model.wv.most_similar('huntsman')

[('steinbergdietrich', 0.7880867123603821),
 ('houston', 0.7732806205749512),
 ('stiteler', 0.7369509935379028),
 ('tangen', 0.7138106822967529),
 ('meyerson', 0.7133698463439941),
 ('golkin', 0.7085722088813782),
 ('fagin', 0.7020806074142456),
 ('fisherbennett', 0.700007975101471),
 ('gathwala', 0.6967608332633972),
 ('skirkanich', 0.6869189143180847)]

In [30]:
# Doing math on the word vectors
# Analogy-style query: words that are similar to 'fraternity' and 'female'
# while being dissimilar to 'male'.
model.wv.most_similar_cosmul(positive = ['fraternity', 'female'], negative = ['male'])

[('ifc', 0.9987518191337585),
 ('sorority', 0.9928473830223083),
 ('mgc', 0.9732818603515625),
 ('panhellenic', 0.9669802188873291),
 ('sororities', 0.9545149207115173),
 ('greek', 0.9544658064842224),
 ('panhel', 0.9450474381446838),
 ('chapter', 0.9426577091217041),
 ('chapters', 0.9415575265884399),
 ('recruitment', 0.9344643354415894)]

In [32]:
# Save model
# Only the word vectors (model.wv) are written, in the word2vec text format
# (binary = False), to a file in the current working directory.
filename = 'word2vec_train2.txt'
model.wv.save_word2vec_format(filename, binary = False)