In [None]:
import re
import contractions
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from preprocess import normalization

In [20]:
def preprocess(text):
    # Expand contractions
    expanded_text = contractions.fix(text)

    # make sure all text is lowercase
    expanded_text = expanded_text.lower()

    # # Remove punctuations and special characters
    just_text = re.sub(r'[^a-zA-Z\s]', '', expanded_text)

    # Remove stopwords and trim white space
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(just_text)
    filtered_words = [w for w in word_tokens if not w.lower() in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in filtered_words]

    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text


In [21]:
text = "I can't believe it's already 2021, I'm so excited for the new year."
print(preprocess(text))

believe already excited new year


In [18]:
def encode(text_data, method="bag_of_words", embedding_dim=100, window=5, min_count=1):
     if method == "bag_of_words":
        vectorizer = CountVectorizer()
        bow_data = vectorizer.fit_transform(text_data) 
        features = vectorizer.get_feature_names_out()
        return bow_data.toarray(), features

     elif method == "tfidf":
         # TF-IDF encoding
         vectorizer = TfidfVectorizer()
         tfidf_data = vectorizer.fit_transform(text_data)
         features = vectorizer.get_feature_names_out()
         return tfidf_data.toarray(), features

     elif method == "word2vec":
         tokenized_docs = [word_tokenize(doc.lower()) for doc in text_data]
         model = Word2Vec(tokenized_docs, vector_size=10, window=5, min_count=1, workers=4)
         # Get Word2Vec embeddings for each word
         embeddings = {word: model.wv[word] for word in model.wv.index_to_key}

         return embeddings

     else:
         raise ValueError("Invalid encoding method. Choose 'bag_of_words', 'tfidf', or 'word2vec'.")



In [19]:

data = ["I can't believe it's already 2021. I'm so excited for the new year.",
            "I like apples. I also like bananas.",
            "I like apples and bananas. I also like grapes."]

preprocessed_data = [normalization(sentence) for sentence in data]
print(preprocessed_data)

# Bag-of-Words encoding
encoded_data, feature_names = encode(preprocessed_data, method='bag_of_words')
print("Bag-of-Words Encoding:")
print(encoded_data)
print("Feature Names:")
print(feature_names)

encoded_data, feature_names = encode(preprocessed_data, method="tfidf")
print("\nTF-IDF Encoding:")
print(encoded_data)
print("Feature Names:")
print(feature_names)

# Word2Vec encoding
word2vec_embeddings = encode(preprocessed_data, method="word2vec")
print("\nWord2Vec Embeddings:")
for word, embedding in word2vec_embeddings.items():
    print(f"Word: {word}, Embedding: {embedding}")


['believe already excited new year', 'like apple also like banana', 'like apple banana also like grape']
Bag-of-Words Encoding:
[[1 0 0 0 1 1 0 0 1 1]
 [0 1 1 1 0 0 0 2 0 0]
 [0 1 1 1 0 0 1 2 0 0]]
Feature Names:
['already' 'also' 'apple' 'banana' 'believe' 'excited' 'grape' 'like'
 'new' 'year']

TF-IDF Encoding:
[[0.4472136  0.         0.         0.         0.4472136  0.4472136
  0.         0.         0.4472136  0.4472136 ]
 [0.         0.37796447 0.37796447 0.37796447 0.         0.
  0.         0.75592895 0.         0.        ]
 [0.         0.33846987 0.33846987 0.33846987 0.         0.
  0.44504721 0.67693975 0.         0.        ]]
Feature Names:
['already' 'also' 'apple' 'banana' 'believe' 'excited' 'grape' 'like'
 'new' 'year']

Word2Vec Embeddings:
Word: like, Embedding: [-0.00536227  0.00236431  0.0510335   0.09009273 -0.0930295  -0.07116809
  0.06458873  0.08972988 -0.05015428 -0.03763372]
Word: banana, Embedding: [ 0.07380505 -0.01533471 -0.04536613  0.06554051 -0.0486016  -