In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.models import Word2Vec

In [None]:
# Fetch the NLTK data packages this notebook relies on: the tokenizer
# models, the WordNet database, the POS tagger model and the stopword lists.
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)


In [None]:
# Load the whole input file as a single document of the corpus.
corpus = []
# Pin the encoding so the text decodes the same way on every platform
# instead of depending on the machine's locale default.
# NOTE(review): assumes foo.txt is UTF-8 encoded -- confirm.
with open('foo.txt', encoding='utf-8') as f:
    corpus.append(f.read())

In [None]:
# Module-level lemmatizer instance; preprocess_text() uses it for every token.
lemmatizer = WordNetLemmatizer()

# Function to convert NLTK POS tags to WordNet POS tags
def nltk_pos_to_wordnet_pos(nltk_pos_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    Only the first character of the tag is inspected: 'J' maps to
    ``wordnet.ADJ``, 'V' to ``wordnet.VERB``, 'N' to ``wordnet.NOUN`` and
    'R' to ``wordnet.ADV``. The comparison is case-sensitive.

    Args:
        nltk_pos_tag: POS tag string, e.g. as produced by ``nltk.pos_tag``.

    Returns:
        The corresponding ``wordnet`` POS constant, or ``None`` when the tag
        starts with none of the four recognised letters.
    """
    # (tag prefix, name of the wordnet attribute), checked in order. The
    # attribute is looked up only when a prefix actually matches.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_table:
        if nltk_pos_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

# Function to preprocess text
def preprocess_text(text):
    """Lowercase, tokenize, stopword-filter and lemmatize a piece of text.

    Args:
        text: Raw text string to process.

    Returns:
        A list of lemmatized tokens in their original order, with English
        stopwords removed.
    """
    # Build the stopword set once per call. Evaluating
    # stopwords.words('english') inside the filter would rebuild the list
    # and scan it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))

    # Convert text to lowercase, tokenize, and drop stopwords.
    tokens = [
        word for word in word_tokenize(text.lower())
        if word not in english_stopwords
    ]

    # POS-tag the remaining tokens so each one can be lemmatized according
    # to its word class.
    lemmatized_tokens = []
    for word, tag in nltk.pos_tag(tokens):
        wordnet_pos = nltk_pos_to_wordnet_pos(tag)
        if wordnet_pos is None:
            # No WordNet equivalent for this tag: use the lemmatizer's
            # default part of speech.
            lemmatized_token = lemmatizer.lemmatize(word)
        else:
            lemmatized_token = lemmatizer.lemmatize(word, pos=wordnet_pos)
        lemmatized_tokens.append(lemmatized_token)
    return lemmatized_tokens

# Preprocess the corpus: one list of lemmatized tokens per document.
preprocessed_corpus = [preprocess_text(doc) for doc in corpus]

# Display preprocessed corpus
print(preprocessed_corpus)

# gensim's optimized training routines truncate any single text longer than
# 10,000 tokens, so a whole file passed as one "sentence" would be mostly
# ignored. Split each document into chunks of at most that many tokens so
# every token contributes to training. Shorter documents pass through as-is.
max_sentence_len = 10000
training_sentences = []
for doc_tokens in preprocessed_corpus:
    for start in range(0, len(doc_tokens), max_sentence_len):
        training_sentences.append(doc_tokens[start:start + max_sentence_len])

# Train the Word2Vec model
model = Word2Vec(sentences=training_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the model
model.save("word2vec.model")

# Load and test the model
model = Word2Vec.load("word2vec.model")
query_word = 'technology'
# most_similar() raises KeyError for a word that is not in the vocabulary,
# so check membership first.
if query_word in model.wv.key_to_index:
    print(model.wv.most_similar(query_word))
else:
    print(f"'{query_word}' is not in the model vocabulary")
