In [None]:
import io
import time
from datetime import timedelta
import gensim

if __name__ == '__main__':
    start_time = time.time()
    print('Streaming wiki...')
    id_wiki = gensim.corpora.WikiCorpus(
        './datasets/idwiki-latest-pages-articles.xml.bz2',
        dictionary={}, lower=True
    )
    
    article_count = 0
    with io.open('./datasets/idwiki_new_lower.txt', 'w', encoding='utf-8') as wiki_txt:
        for text in id_wiki.get_texts():

            wiki_txt.write(" ".join(text) + '\n')
            article_count += 1

            if article_count % 10000 == 0:
                print('{} articles processed'.format(article_count))
        print('total: {} articles'.format(article_count))

    finish_time = time.time()
    print('Elapsed time: {}'.format(timedelta(seconds=finish_time-start_time)))

### Training Word2Vec

In [7]:
import time
import multiprocessing
from datetime import timedelta
from gensim.models import word2vec

if __name__ == '__main__':
    start_time = time.time()
    print('Training Word2Vec Model...')
    sentences = word2vec.LineSentence('./datasets/idwiki_new_lower.txt')
    id_w2v = word2vec.Word2Vec(sentences, vector_size=200, workers=multiprocessing.cpu_count()-1, sg=1, window=2)
    id_w2v.save('model/idwiki_word2vec_200_skip-gram_window_2_new_lower.model')
    finish_time = time.time()

    print('Finished. Elapsed time: {}'.format(timedelta(seconds=finish_time-start_time)))


Training Word2Vec Model...
Finished. Elapsed time: 0:15:18.877293


### Model Testing

In [10]:
from gensim.models import Word2Vec

# Load the trained Word2Vec model
model = Word2Vec.load('model/idwiki_word2vec_200_skip-gram_window_2_new_lower.model')

# Get word vectors for user input
user_input = input("Enter your custom text: ")
user_tokens = user_input.split()

# Initialize an empty list to store word vectors
word_vectors = []

# Iterate through the tokens in the user input
for word in user_tokens:
    if word in model.wv:
        word_vectors.append(model.wv[word])
    else:
        print(f"Word '{word}' not in vocabulary")

# Perform operations on word vectors
if word_vectors:
    # Find words similar to the words in the user input
    similar_words = model.wv.most_similar(positive=word_vectors, topn=5)
    
    print("\nWords similar to user input:")
    for word, similarity in similar_words:
        print(f"{word}: {similarity}")
else:
    print("No valid words found in the vocabulary for similarity analysis.")



Words similar to user input:
dosen: 1.0
pengajar: 0.7999383807182312
profesor: 0.7081513404846191
almamaternya: 0.6806699633598328
akademikus: 0.6767548322677612


### Model Checks

In [9]:
from gensim.models import Word2Vec

# Load the Word2Vec model
model = Word2Vec.load('model/idwiki_word2vec_200_skip-gram_window_2_new_lower.model')

# Check the 'sg' parameter
if model.sg == 1:
    print("The model was trained using Skip-gram.")
    print(f"Windows size: {model.window}")
else:
    print("The model was trained using CBOW.")


The model was trained using Skip-gram.
Windows size: 2
