### Datasets Merger

In [None]:
import shutil

# Input corpora; they are concatenated in this order.
file1_path = './datasets/2023/txt/idwiki_20230520.txt'
file2_path = './datasets/2023/txt/idwiki_new_lower_latest.txt'

# Destination of the merged corpus (overwritten if it already exists).
merged_file_path = './datasets/merged_dataset.txt'

# Stream the inputs into the output chunk by chunk instead of holding both
# files (plus their concatenation) in memory at once.
# Both inputs are opened before the output, so a missing input raises before
# an existing merged file is truncated.
with open(file1_path, 'r', encoding='utf-8') as file1, \
        open(file2_path, 'r', encoding='utf-8') as file2, \
        open(merged_file_path, 'w', encoding='utf-8') as merged_file:
    shutil.copyfileobj(file1, merged_file)
    # Single newline separator between the two corpora, so the last line of
    # the first file and the first line of the second never fuse together.
    merged_file.write('\n')
    shutil.copyfileobj(file2, merged_file)

print('Datasets merged and saved as', merged_file_path)


### Model Testing

In [None]:
from gensim.models import Word2Vec
# Load the trained Word2Vec model
model = Word2Vec.load('model/idwiki_word2vec_200_skip-gram_window_2_new_lower.model')

# Read free text from the user and split it on whitespace.
# NOTE(review): the model file name ends in "new_lower", so the training
# corpus was presumably lowercased -- confirm whether the input should be
# lowercased as well before the vocabulary lookup.
user_input = input("Enter your custom text: ")
user_tokens = user_input.split()

# In-vocabulary tokens (used as the similarity query) and their vectors
# (kept so that `word_vectors` is still available after this cell runs).
known_words = []
word_vectors = []

# Keep only the tokens the model knows; report the others.
for word in user_tokens:
    if word in model.wv:
        known_words.append(word)
        word_vectors.append(model.wv[word])
    else:
        print(f"Word '{word}' not in vocabulary")

if known_words:
    # Query by key rather than by raw vector: most_similar only leaves the
    # query words out of its results when they are passed as keys. With raw
    # vectors the user's own words come back as the top matches.
    similar_words = model.wv.most_similar(positive=known_words, topn=5)

    print("\nWords similar to user input:")
    for word, similarity in similar_words:
        print(f"{word}: {similarity}")
else:
    print("No valid words found in the vocabulary for similarity analysis.")


### Model Checks

In [None]:
from gensim.models import Word2Vec

# Load the saved model whose training settings are being inspected.
model = Word2Vec.load('./model/windows_2/idwiki_word2vec_200_skip-gram_window_2_new_lower.model')

# `sg == 1` is reported as Skip-gram; any other value is reported as CBOW.
# The window size is only printed in the Skip-gram case.
if model.sg != 1:
    print("The model was trained using CBOW.")
else:
    print("The model was trained using Skip-gram.")
    print(f"Windows size: {model.window}")

### Model Architecture Compare

In [None]:
from gensim.models import Word2Vec

# Load the two saved Word2Vec models that are compared below.
# model1 is read from a "pre-trained-model" directory outside this project
# tree (presumably a third-party Indonesian Wikipedia model -- confirm its
# provenance); model2 is read from this project's ./model/windows_5 directory.
model1 = Word2Vec.load("../skripsi/pre-trained-model/word2vec_wikipedia_id_model/wiki.id.case.model")
model2 = Word2Vec.load("./model/windows_5/idwiki_word2vec_200_skip-gram_window_5_new_lower.model")

# Compare the architectures of the two models
def compare_architecture(model1, model2):
    """Collect the training settings of two models side by side.

    Args:
        model1: Object exposing ``vector_size``, ``window``, ``sg`` and
            ``corpus_count`` attributes; reported under ``"Model 1"``.
        model2: Object exposing the same attributes; reported under
            ``"Model 2"``.

    Returns:
        dict: Maps each compared aspect (vector size, context window size,
        training algorithm, corpus size) to a ``{"Model 1": ..., "Model 2": ...}``
        dict. The training algorithm is ``"Skip-gram"`` when ``sg`` is truthy
        and ``"CBOW"`` otherwise.
    """

    def side_by_side(read):
        # Apply the same attribute reader to both models, model1 first.
        return {"Model 1": read(model1), "Model 2": read(model2)}

    def algorithm_name(m):
        return "Skip-gram" if m.sg else "CBOW"

    # Add more aspects here as needed; insertion order is the display order.
    return {
        "Vector Size (Embedding Dimension)": side_by_side(lambda m: m.vector_size),
        "Context Window Size": side_by_side(lambda m: m.window),
        "Training Algorithm": side_by_side(algorithm_name),
        "Corpus Size (Number of Sentences)": side_by_side(lambda m: m.corpus_count),
    }

# Build the side-by-side comparison for the two loaded models.
architecture_comparison_result = compare_architecture(model1, model2)

# One heading line per aspect, followed by one "- <model>: <value>" line
# for each model.
for aspect in architecture_comparison_result:
    values = architecture_comparison_result[aspect]
    print(aspect)
    for model_name in values:
        value = values[model_name]
        print(f"- {model_name}: {value}")

# You can add more aspects to compare based on your requirements.


### Datasets Checks

In [None]:
import pandas as pd

# Location of the CSV to inspect, relative to the notebook's working directory.
csv_path = "./datasets/wikipedia_id-clean.csv"

# Read the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(csv_path)

# Bare trailing expression: the notebook renders the frame as this cell's output.
df