In [20]:
import pandas as pd
import numpy as np
import re
import PIL
from nltk.tokenize import word_tokenize

In [None]:
# dictionary_url = "https://www.mit.edu/~ecprice/wordlist.10000"

In [21]:
def read_glove_vecs(glove_file):
    """
    Loads word vectors from a GloVe-style text file.

    Every non-empty line is split on whitespace: the first field is the word,
    the remaining fields are parsed as the components of its vector.

    Arguments:
    glove_file -- path of the text file to read

    Returns:
    words_to_index -- dictionary mapping each word to its 1-based position in the sorted vocabulary
    index_to_words -- dictionary mapping each 1-based position back to its word
    word_to_vec_map -- dictionary mapping each word to a numpy array (dtype float64) of its vector
    """
    word_to_vec_map = {}

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so tokens containing non-ASCII characters load the same way on every OS.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank lines (e.g. a trailing newline at the end of the file) carry no word.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # Indices start at 1 and follow the sorted order of the vocabulary.
    words_to_index = {}
    index_to_words = {}
    for position, word in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[word] = position
        index_to_words[position] = word

    return words_to_index, index_to_words, word_to_vec_map


In [22]:
# Load the GloVe vectors together with the word <-> index lookup tables.
(word_to_index,
 index_to_word,
 word_to_vec_map) = read_glove_vecs(glove_file='../data/glove.6B.50d.txt')

In [16]:
# Sanity check: length of one word vector, then the vocabulary size.
print(len(word_to_vec_map["sister"]), len(word_to_index), sep="\n")


50
400000


In [40]:
def embed_sentence(sentence, word_to_vec_map, Ty):
    """
    Converts a sentence (string) into a list of lower-case tokens, looks up the vector of each token
    and returns the stacked vectors as a fixed-size array.

    Arguments:
    sentence -- string, one training example from X
    word_to_vec_map -- dictionary mapping every word in a vocabulary to its vector representation.
                       It must contain an entry for "." (used as the <EOS> element) whenever the
                       sentence has to be truncated.
    Ty -- the max length of the sentence. The sentence is truncated in case its size is greater than Ty.

    Returns:
    embedding -- numpy array of shape (Ty, d), where d is the length of the vectors stored in
                 word_to_vec_map (50 when the map is empty). When the sentence has fewer than Ty
                 known tokens, the remaining rows are zero vectors.

    Raises:
    KeyError -- if the sentence must be truncated and "." is missing from word_to_vec_map.
    """

    # Step 1: Make sure the sentence ends with ".". It is our <EOS> element for now // FIX ME
    # Trailing whitespace is stripped first so that "Hello. " does not receive a second ".".
    sentence = sentence.rstrip()
    if not sentence.endswith("."):
        sentence += "."

    # Step 2: Split the sentence into a list of lower case tokens
    words = [token.lower() for token in word_tokenize(sentence)]

    # Infer the vector size from the map instead of hard-coding it; fall back to 50 for an empty map.
    sample_vector = next(iter(word_to_vec_map.values()), None)
    n_dims = 50 if sample_vector is None else len(sample_vector)

    # Initialize the embedding.
    embedding = np.zeros((Ty, n_dims))
    if Ty == 0:
        # No row is available, not even for the <EOS> element.
        return embedding

    # Step 3: Loops over the token list. `row` only advances for tokens found in the map.
    row = 0
    for w in words:
        # The sentence is truncated when larger than Ty. The <EOS> character "." is added back
        # in the last row.
        if row == Ty - 1:
            embedding[row][:] = word_to_vec_map["."]
            break
        try:
            vector = word_to_vec_map[w]
        except KeyError:
            # Ignores unknown words for now
            continue
        embedding[row][:] = vector
        row += 1

    return embedding

In [41]:
# Embed a sample sentence using at most 5 rows.
embed_sentence(
    sentence="What a wonderful story, can't wait",
    word_to_vec_map=word_to_vec_map,
    Ty=5,
)


array([[ 4.5323e-01,  5.9811e-02, -1.0577e-01, -3.3300e-01,  7.2359e-01,
        -8.7170e-02, -6.1053e-01, -3.7695e-02, -3.0945e-01,  2.1805e-01,
        -4.3605e-01,  4.7318e-01, -7.6866e-01, -2.7130e-01,  1.1042e+00,
         5.9141e-01,  5.6962e-01, -1.8678e-01,  1.4867e-01, -6.7292e-01,
        -3.4672e-01,  5.2284e-01,  2.2959e-01, -7.2014e-02,  9.3967e-01,
        -2.3985e+00, -1.3238e+00,  2.8698e-01,  7.5509e-01, -7.6522e-01,
         3.3425e+00,  1.7233e-01, -5.1803e-01, -8.2970e-01, -2.9333e-01,
        -5.0076e-01, -1.5228e-01,  9.8973e-02,  1.8146e-01, -1.7420e-01,
        -4.0666e-01,  2.0348e-01, -1.1788e-02,  4.8252e-01,  2.4598e-02,
         3.4064e-01, -8.4724e-02,  5.3240e-01, -2.5103e-01,  6.2546e-01],
       [ 2.1705e-01,  4.6515e-01, -4.6757e-01,  1.0082e-01,  1.0135e+00,
         7.4845e-01, -5.3104e-01, -2.6256e-01,  1.6812e-01,  1.3182e-01,
        -2.4909e-01, -4.4185e-01, -2.1739e-01,  5.1004e-01,  1.3448e-01,
        -4.3141e-01, -3.1230e-02,  2.0674e-01, -7.

In [None]:
# Imports needed by this cell (defaultdict, itertools and TfidfModel were used without being imported)
import itertools
from collections import defaultdict

from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel

# NOTE(review): `articles` is not defined in the visible cells. It is presumably a list of
# tokenized documents (each a list of string tokens) with at least five entries -- confirm.

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles)
print(dictionary)

# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("computer")

# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))

# Convert every article into a bag-of-words vector (a list of (token_id, count) pairs): corpus
corpus = [dictionary.doc2bow(article) for article in articles]

# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])

# Save the fifth document: doc
doc = corpus[4]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)

# Accumulate the count of every word id over the whole corpus: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

# Calculate the tfidf weights of doc: tfidf_weights
tfidf_weights = tfidf[doc]

# Print the first five weights
print(tfidf_weights[:5])

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 5 weighted words
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)