In [None]:
import io
import time
from datetime import timedelta
import gensim

if __name__ == '__main__':
    start_time = time.time()
    print('Streaming wiki...')
    id_wiki = gensim.corpora.WikiCorpus(
        './datasets/idwiki-latest-pages-articles.xml.bz2',
        dictionary={}, lower=True
    )
    
    article_count = 0
    with io.open('./datasets/idwiki_new_lower.txt', 'w', encoding='utf-8') as wiki_txt:
        for text in id_wiki.get_texts():

            wiki_txt.write(" ".join(text) + '\n')
            article_count += 1

            if article_count % 10000 == 0:
                print('{} articles processed'.format(article_count))
        print('total: {} articles'.format(article_count))

    finish_time = time.time()
    print('Elapsed time: {}'.format(timedelta(seconds=finish_time-start_time)))

### Training Word2Vec

In [1]:
import time
import multiprocessing
from datetime import timedelta
from gensim.models import word2vec

if __name__ == '__main__':
    start_time = time.time()
    print('Training Word2Vec Model...')
    sentences = word2vec.LineSentence('./datasets/idwiki_new_lower.txt')
    id_w2v = word2vec.Word2Vec(sentences, vector_size=200, workers=multiprocessing.cpu_count()-1, sg=1, window=5)
    id_w2v.save('model/windows_5/idwiki_word2vec_200_skip-gram_window_5_new_lower.model')
    finish_time = time.time()

    print('Finished. Elapsed time: {}'.format(timedelta(seconds=finish_time-start_time)))


Training Word2Vec Model...
Finished. Elapsed time: 0:08:14.968403


### Model Testing

In [10]:
from gensim.models import Word2Vec

# Load the trained Word2Vec model
model = Word2Vec.load('model/idwiki_word2vec_200_skip-gram_window_2_new_lower.model')

# Get word vectors for user input
user_input = input("Enter your custom text: ")
user_tokens = user_input.split()

# Initialize an empty list to store word vectors
word_vectors = []

# Iterate through the tokens in the user input
for word in user_tokens:
    if word in model.wv:
        word_vectors.append(model.wv[word])
    else:
        print(f"Word '{word}' not in vocabulary")

# Perform operations on word vectors
if word_vectors:
    # Find words similar to the words in the user input
    similar_words = model.wv.most_similar(positive=word_vectors, topn=5)
    
    print("\nWords similar to user input:")
    for word, similarity in similar_words:
        print(f"{word}: {similarity}")
else:
    print("No valid words found in the vocabulary for similarity analysis.")



Words similar to user input:
dosen: 1.0
pengajar: 0.7999383807182312
profesor: 0.7081513404846191
almamaternya: 0.6806699633598328
akademikus: 0.6767548322677612


### Model Checks

In [2]:
from gensim.models import Word2Vec

# Load the Word2Vec model
model = Word2Vec.load('./model/windows_5/idwiki_word2vec_200_skip-gram_window_5_new_lower.model')

# Check the 'sg' parameter
if model.sg == 1:
    print("The model was trained using Skip-gram.")
    print(f"Windows size: {model.window}")
else:
    print("The model was trained using CBOW.")


The model was trained using Skip-gram.
Windows size: 5


### Model Compare

#### Model Architecture Compare

In [3]:
from gensim.models import Word2Vec

# Load your pre-trained Word2Vec models
model1 = Word2Vec.load("../skripsi/pre-trained-model/word2vec_wikipedia_id_model/wiki.id.case.model")
model2 = Word2Vec.load("./model/windows_5/idwiki_word2vec_200_skip-gram_window_5_new_lower.model")

# Compare the architectures of the two models
def compare_architecture(model1, model2):
    architecture_comparison = {}

    # Compare vector size (embedding dimension)
    architecture_comparison["Vector Size (Embedding Dimension)"] = {
        "Model 1": model1.vector_size,
        "Model 2": model2.vector_size,
    }

    # Compare context window size
    architecture_comparison["Context Window Size"] = {
        "Model 1": model1.window,
        "Model 2": model2.window,
    }

    # Compare training algorithm (Skip-gram or CBOW)
    architecture_comparison["Training Algorithm"] = {
        "Model 1": "Skip-gram" if model1.sg else "CBOW",
        "Model 2": "Skip-gram" if model2.sg else "CBOW",
    }

    # Compare corpus size (number of sentences)
    architecture_comparison["Corpus Size (Number of Sentences)"] = {
        "Model 1": model1.corpus_count,
        "Model 2": model2.corpus_count,
    }

    # Add more architecture comparisons as needed

    return architecture_comparison

# Compare the architectures of the two models
architecture_comparison_result = compare_architecture(model1, model2)

# Print the comparison results
for aspect, values in architecture_comparison_result.items():
    print(aspect)
    for model_name, value in values.items():
        print(f"- {model_name}: {value}")

# You can add more aspects to compare based on your requirements.


Vector Size (Embedding Dimension)
- Model 1: 400
- Model 2: 200
Context Window Size
- Model 1: 5
- Model 2: 5
Training Algorithm
- Model 1: CBOW
- Model 2: Skip-gram
Corpus Size (Number of Sentences)
- Model 1: 427949
- Model 2: 496134
