In [1]:
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
import gensim.downloader as gensim_downloader
import logging
from gensim.models.word2vec import Word2Vec
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import numpy as np
from tqdm import tqdm

In [2]:
try:
    emma = nltk.corpus.gutenberg.sents(nltk.corpus.gutenberg.fileids()[0])
    print("Nbr Sentences:", len(emma))
except LookupError:
    nltk.download('gutenberg')
    nltk.download('punkt')
    emma = nltk.corpus.gutenberg.sents(nltk.corpus.gutenberg.fileids()[0])
    print("Nbr Sentences:", len(emma))

Nbr Sentences: 7752


In [3]:
emma = [[word.lower() for word in sent if word not in string.punctuation] for sent in emma]
emma_joined_sents = [' '.join([word for word in sent]) for sent in emma]
emma_joined_sents = [sent for sent in emma_joined_sents if sent != '']

In [4]:
vectorizer = CountVectorizer()
emma_count_vectors = vectorizer.fit_transform(emma_joined_sents)
print("25 Samples from Vocabulary:\n", vectorizer.get_feature_names_out()[-25:])
print(f"Nbr Sentences: {emma_count_vectors.shape[0]} Nbr Words: {emma_count_vectors.shape[1]}")

25 Samples from Vocabulary:
 ['yards' 'ye' 'year' 'years' 'yellow' 'yeomanry' 'yes' 'yesterday' 'yet'
 'yield' 'yielded' 'yielding' 'york' 'yorkshire' 'you' 'young' 'younger'
 'youngest' 'your' 'yours' 'yourself' 'youth' 'youthful' 'zeal' 'zigzags']
Nbr Sentences: 7721 Nbr Words: 7239


In [5]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)
w2v_model = Word2Vec(emma, vector_size=100, min_count=2)
print(f"25 Samples from Vocabulary:\n{list(w2v_model.wv.key_to_index.keys())[-25:]}")
print(f"Most similar words to 'house':\n{w2v_model.wv.most_similar(['house'])}")
print(f"Word2Vec Embedding for 'house':\n{w2v_model.wv['house']}")

2021-11-13 00:11:40,021 : INFO : collecting all words and their counts
2021-11-13 00:11:40,022 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-11-13 00:11:40,066 : INFO : collected 7322 word types from a corpus of 167069 raw words and 7752 sentences
2021-11-13 00:11:40,068 : INFO : Creating a fresh vocabulary
2021-11-13 00:11:40,089 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 4435 unique words (60.570882272603114%% of original 7322, drops 2887)', 'datetime': '2021-11-13T00:11:40.089544', 'gensim': '4.1.2', 'python': '3.9.6 (tags/v3.9.6:db3ff76, Jun 28 2021, 15:26:21) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2021-11-13 00:11:40,091 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 164182 word corpus (98.2719714608934%% of original 167069, drops 2887)', 'datetime': '2021-11-13T00:11:40.091544', 'gensim': '4.1.2', 'python': '3.9.6 (tags/v3.9.6:db3ff76,

25 Samples from Vocabulary:
['eyebrows', 'presents', ':"--', 'sparkling', 'lengths', 'fairest', 'stout', 'falsehood', '_own_', '_home_', 'hereabouts', 'needs', 'faster', 'respecting', 'admissible', 'displayed', 'contradict', 'likenesses', 'eleven', 'faithful', 'augur', 'wore', '?\'"', 'unconscious', 'poultry']
Most similar words to 'house':
[('room', 0.9979537129402161), ('wife', 0.997846782207489), ('family', 0.9972323775291443), ('part', 0.9971916079521179), ('mind', 0.9970117211341858), ('subject', 0.996946394443512), ('first', 0.9965338110923767), ('brought', 0.9964548349380493), ('visit', 0.996389627456665), ('having', 0.9963793158531189)]
Word2Vec Embedding for 'house':
[-0.29812935  0.2564647   0.1489685   0.20134875 -0.09541447 -0.55086327
  0.2629776   0.4896287  -0.37229693 -0.14831354 -0.00734899 -0.6023056
  0.2280103  -0.08894787  0.2480529  -0.4374347   0.18319055 -0.25876698
 -0.26572463 -0.4848458   0.19794495  0.05376895  0.4569192  -0.2980309
 -0.33282393  0.11609851 

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode(emma_joined_sents[0]))[None, :]
bert_embedding_layer = model(input_ids)[0]
print("Flattened BERT Embedding:\n{np.mean(bert_embedding_layer, axis=1}")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Flattened BERT Embedding:
{np.mean(bert_embedding_layer, axis=1}


In [7]:
def tokenize(sentences, tokenizer):
    input_ids, input_masks, input_segments = [],[],[]
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            returntoken_type_ids=True
        )
        
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])
    return (
        np.asarray(input_ids, dtype='int32'),
        np.asarray(input_masks, dtype='int32'),
        np.asarray(input_segments, dtype='int32')
    )
tokens, masks, segments = tokenize(sentences=emma_joined_sents, tokenizer=tokenizer)
bert_embedding_layer = model(tokens)[0]

Linear Regression - Don't overfit polynomial
Logistic Regression - Binary classification (using sigmoid function)
Feature engineering
Multi layer perception