### Model Testing

In [None]:
from gensim.models import Word2Vec
# Interactive sanity check: ask for free text and list the vocabulary entries
# closest to the combined meaning of the recognised words.

# Load the trained Word2Vec model
# NOTE(review): another cell in this notebook loads the same file name from
# './model/windows_2/' -- confirm which location is current.
model = Word2Vec.load('model/idwiki_word2vec_200_skip-gram_window_2_new_lower.model')

# Read the query text and split it on whitespace
user_input = input("Enter your custom text: ")
user_tokens = user_input.split()

# In-vocabulary tokens, kept as words so they can be passed to most_similar as keys
known_words = []
# Matching raw vectors, still collected in case later cells inspect them
word_vectors = []

# Sort the tokens into known words and reported misses
for word in user_tokens:
    if word in model.wv:
        known_words.append(word)
        word_vectors.append(model.wv[word])
    else:
        print(f"Word '{word}' not in vocabulary")

# Perform operations on the recognised words
if known_words:
    # Query by key rather than by raw vector: gensim only excludes the query
    # terms from the result list when they are supplied as keys, so passing
    # vectors made every typed word show up as its own nearest neighbour.
    similar_words = model.wv.most_similar(positive=known_words, topn=5)

    print("\nWords similar to user input:")
    for word, similarity in similar_words:
        print(f"{word}: {similarity}")
else:
    print("No valid words found in the vocabulary for similarity analysis.")


### Model Checks

In [None]:
from gensim.models import Word2Vec

# Restore the saved Word2Vec model that should be inspected
model = Word2Vec.load('./model/windows_2/idwiki_word2vec_200_skip-gram_window_2_new_lower.model')

# Report the training algorithm from the 'sg' flag; the window size is only
# printed on the Skip-gram branch
if model.sg != 1:
    print("The model was trained using CBOW.")
else:
    print("The model was trained using Skip-gram.")
    print(f"Windows size: {model.window}")

### Model Architecture Comparison

In [None]:
from gensim.models import Word2Vec

# Restore the two saved Word2Vec models that will be compared side by side
model1 = Word2Vec.load(
    "../skripsi/pre-trained-model/word2vec_wikipedia_id_model/wiki.id.case.model"
)
model2 = Word2Vec.load(
    "./model/windows_5/idwiki_word2vec_200_skip-gram_window_5_new_lower.model"
)

# Compare the architectures of the two models
def compare_architecture(model1, model2):
    """Collect the main training settings of two models side by side.

    Args:
        model1: Object exposing ``vector_size``, ``window``, ``sg`` and
            ``corpus_count`` attributes.
        model2: Second object exposing the same four attributes.

    Returns:
        dict: Maps each aspect label to ``{"Model 1": ..., "Model 2": ...}``,
        in the order vector size, window size, training algorithm, corpus size.
    """

    def paired(read):
        # Apply the same reader to both models, first model first
        return {"Model 1": read(model1), "Model 2": read(model2)}

    def algorithm_label(m):
        # A truthy 'sg' flag is reported as Skip-gram, anything else as CBOW
        if m.sg:
            return "Skip-gram"
        return "CBOW"

    # Add more aspects here as further entries of the literal
    return {
        "Vector Size (Embedding Dimension)": paired(lambda m: m.vector_size),
        "Context Window Size": paired(lambda m: m.window),
        "Training Algorithm": paired(algorithm_label),
        "Corpus Size (Number of Sentences)": paired(lambda m: m.corpus_count),
    }

# Build the side-by-side summary for the two loaded models
architecture_comparison_result = compare_architecture(model1, model2)

# Print each aspect as a heading line followed by one "- <model>: <value>" line per model
for aspect, values in architecture_comparison_result.items():
    detail_lines = [f"- {model_name}: {value}" for model_name, value in values.items()]
    print(aspect, *detail_lines, sep="\n")

# You can add more aspects to compare based on your requirements.


### Dataset Checks

In [1]:
import pandas as pd

# Location of the cleaned Wikipedia CSV export
csv_path = "./datasets/wikipedia_id-clean.csv"

# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_path)

# A bare expression on the last line makes the notebook render the frame
df

Unnamed: 0,id,parent_id,timestamp,hash,title,text
0,1,19472643.0,2022-04-24T00:35:51Z,oo2rvi3f9p4u9czu3opfvbsl0jp9aog,Asam deoksiribonukleat,"Asam deoksiribonukleat, lebih dikenal dengan s..."
1,3,19395623.0,2022-05-05T11:26:32Z,lq3uz0weougma29prpmkm1l5at48s4w,Anwar Sadat,Muhammad Anwar el-Sadat; ) adalah seorang poli...
2,4,14292725.0,2021-07-05T02:41:20Z,3h1z248a5ic1u3odrse27wg0o0slljl,Azhar Mansor,Datuk Azhar Mansor adalah orang Malaysia perta...
3,5,21368465.0,2022-09-26T04:25:33Z,ololfquhgvu4wzszfu1yaqo3amb8xnm,Arkeologi,Arkeologi atau ilmu kepurbakalaan adalah ilmu ...
4,6,21768805.0,2022-11-02T02:33:49Z,mkxdb4xfmgse21h6lvtoflnvwlepdb3,Antropologi,Antropologi adalah ilmu tentang manusia. Antro...
...,...,...,...,...,...,...
638793,3768433,22028503.0,2022-11-20T09:09:07Z,lqkp6ghxuwh3a5dbdq2agh6qd7a2mc1,"Minabua, Mimika Baru, Mimika",Minabua adalah kampung yang berada di distrik ...
638794,3768434,22028678.0,2022-11-20T09:10:55Z,1owb4thtivx78elqnynpzsq3fslh2ej,"Hangaitji, Mimika Baru, Mimika",Hangaitji adalah kampung yang berada di distri...
638795,3768435,,2022-11-20T08:26:08Z,9u51d11lp2p5awe9od987qvj00why54,Toyota GR010 Hybrid,Toyota GR010 Hybrid adalah mobil balap prototi...
638796,3768436,22029512.0,2022-11-20T09:35:03Z,qkp1lsltxrfbj9nvem5mn7aav2av993,Romaria Magdalena Simbolon,Romaria Magdalena Simbolon adalah seorang peny...
