1) a function named preprocess() that takes in a pandas.Series() of a corpus of text data as an argument. This function should output an indexed vocabulary and preprocessed tokens.
2) a function named encode() that takes two arguments: (a) a pandas.Series() (or the preprocessed token output of the preprocess() function), and (b) the name of the encoding method to apply. The function must include implementations for Bag-of-Words, TF-IDF, and Word2Vec.


In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# import pandas as pd

# Download the 'punkt' tokenizer data for NLTK (only needed the first time).
# NOTE(review): this call needs network access; the recorded run further down
# printed a download error, so confirm the data is already cached locally
# before relying on word_tokenize().
nltk.download('punkt')

def preprocess(corpus):
    """Tokenize every document in *corpus* and collect the token vocabulary.

    Args:
        corpus: The documents to tokenize. Either a mapping-like object with
            an ``items()`` method whose values are the document strings
            (e.g. a ``pandas.Series``) or any plain iterable of strings.

    Returns:
        A ``(tokenized_corpus, vocab)`` tuple. ``tokenized_corpus`` is a list
        with one list of tokens per document, in input order; ``vocab`` is
        the set of distinct tokens found across all documents.
    """
    # Keep the original Series/mapping behaviour (values of ``items()``) but
    # also accept plain lists or tuples of strings.
    if hasattr(corpus, "items"):
        documents = [doc for _, doc in corpus.items()]
    else:
        documents = list(corpus)

    tokenized_corpus = [nltk.word_tokenize(doc) for doc in documents]
    print(tokenized_corpus)
    print([word for doc in tokenized_corpus for word in doc])

    # set.update() iterates its argument, so passing a single token would add
    # that token's characters. Pass the whole token list instead so complete
    # words end up in the vocabulary.
    vocab = set()
    for doc in tokenized_corpus:
        vocab.update(doc)
    return tokenized_corpus, vocab

def encode(corpus, encoding_method):
    """Encode *corpus* with the requested text-encoding method.

    Args:
        corpus: Iterable of documents (e.g. a ``pandas.Series``). Each
            document is either a raw string or an already tokenized list of
            strings such as the token lists produced by ``preprocess()``.
        encoding_method: One of ``'Bag-of-Words'``, ``'TF-IDF'`` or
            ``'Word2Vec'``.

    Returns:
        For ``'Bag-of-Words'`` and ``'TF-IDF'``: the dense document-term
        array returned by the vectorizer's ``fit_transform(...).toarray()``.
        For ``'Word2Vec'``: the trained ``Word2Vec`` model.

    Raises:
        ValueError: If ``encoding_method`` is not one of the supported names.
    """
    supported_methods = ('Bag-of-Words', 'TF-IDF', 'Word2Vec')
    # Fail loudly instead of silently returning None for a misspelled method.
    if encoding_method not in supported_methods:
        raise ValueError(
            f"Unsupported encoding method {encoding_method!r}; "
            f"expected one of {supported_methods}"
        )

    documents = list(corpus)

    if encoding_method == 'Word2Vec':
        # Word2Vec trains on token lists: tokenize raw strings, and use
        # documents that are already tokenized as they are.
        sentences = [
            word_tokenize(doc) if isinstance(doc, str) else list(doc)
            for doc in documents
        ]
        return Word2Vec(sentences, vector_size=20, window=5, min_count=1, workers=4)

    # The scikit-learn vectorizers expect raw strings, so re-join documents
    # that were handed over as token lists.
    texts = [doc if isinstance(doc, str) else ' '.join(doc) for doc in documents]

    if encoding_method == 'Bag-of-Words':
        vectorizer = CountVectorizer()
    else:  # 'TF-IDF'
        vectorizer = TfidfVectorizer()

    matrix = vectorizer.fit_transform(texts)
    print(vectorizer.get_feature_names_out())
    return matrix.toarray()



[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


In [32]:

# Demo cell: run preprocess() and every encode() method on a small corpus.
# pandas is imported here because the cell builds a Series and the import in
# the cell above is commented out.
import pandas as pd

CORPUS = pd.Series([
    "The quick brown fox jumps over the lazy dog",
    "A king's strength also includes his allies",
    "History is written by the victors",
    "An apple a day keeps the doctor away",
    "Nothing happens until something moves"
    ])
TARGET = 'apple'

# preprocess() returns (tokenized_corpus, vocab) in that order, so unpack the
# tokens first; the previous order bound each name to the other's value.
tokenized_corpus, vocabulary = preprocess(CORPUS)

# Encode using Bag-of-Words
encoded_bow = encode(CORPUS, 'Bag-of-Words')
print("Bag-of-Words Encoding:")
print(encoded_bow)

# Encode using TF-IDF
encoded_tfidf = encode(CORPUS, 'TF-IDF')
print("\nTF-IDF Encoding:")
print(encoded_tfidf)

# Encode using Word2Vec; encode() returns the trained model, so query its
# word vectors for the nearest neighbours of TARGET.
encoded_word2vec = encode(CORPUS, 'Word2Vec')
print("\nWord2Vec Encoding:")
print(encoded_word2vec)
print(encoded_word2vec.wv.most_similar(TARGET, topn=5))

[['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'], ['A', 'king', "'s", 'strength', 'also', 'includes', 'his', 'allies'], ['History', 'is', 'written', 'by', 'the', 'victors'], ['An', 'apple', 'a', 'day', 'keeps', 'the', 'doctor', 'away'], ['Nothing', 'happens', 'until', 'something', 'moves']]
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'A', 'king', "'s", 'strength', 'also', 'includes', 'his', 'allies', 'History', 'is', 'written', 'by', 'the', 'victors', 'An', 'apple', 'a', 'day', 'keeps', 'the', 'doctor', 'away', 'Nothing', 'happens', 'until', 'something', 'moves']
['allies' 'also' 'an' 'apple' 'away' 'brown' 'by' 'day' 'doctor' 'dog'
 'fox' 'happens' 'his' 'history' 'includes' 'is' 'jumps' 'keeps' 'king'
 'lazy' 'moves' 'nothing' 'over' 'quick' 'something' 'strength' 'the'
 'until' 'victors' 'written']
Bag-of-Words Encoding:
[[0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 2 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0