In [4]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# Data

In [5]:
import json
import requests

def load_data_skill(filename):

    # resp = requests.get(filename)
    with open(filename, 'r') as f:
        data = json.load(f)

    return list(data)

filename = 'data/train/total_skill_augmentation_jds_sorted_relevant_combine_random_and_not.json'
total_skills_jds = load_data_skill(filename)

# Valid Function

In [6]:
def model_prediction(model, skills, topn=20):
    result = {}
    for skill in skills:
        topn_skills = [skill[0] for skill in model.wv.most_similar(skill, topn=topn)]

        result[skill] = topn_skills

    return result

def valid_skill(prediction, valid_data): # prediction: dict, valid_data: dict
    count = 0
    for skill in prediction.keys():
        for rlv_skill in prediction[skill]:
            if rlv_skill in valid_data[skill]:
                count += 1

    # find the number of skill in valid data
    total_skills = len(valid_data) * 20

    return count/total_skills


# Model training

In [24]:
# valid data
valid_data_path = 'https://raw.githubusercontent.com/nnccuong-tmabd/middle-intern-tma/main/data/valid/skills_valid.json'

# with open(valid_data_path, 'r') as f:
#     valid_data = json.load(f)

# f.close()

import requests

resp = requests.get(valid_data_path)
valid_data = json.loads(resp.text)
valid_data

# total valid skill
total_valid_skill = 0
for skill in valid_data.keys():
    total_valid_skill += len(valid_data[skill])

print(f'Total valid skill: {total_valid_skill}')

Total valid skill: 66547


In [26]:
import random

def calculate_acc(model):
  #pick random 200 skills in valid data

  prediction = model_prediction(model, valid_data.keys())

  accuracy = valid_skill(prediction, valid_data)


  return accuracy

# calculate_acc(model=model)
# print(f'Accuracy: {sum(accuracies)/100}')

In [27]:
import itertools

# create hyperparameter for training fasttext model and use grid search to find best hyperparameter
# Define hyperparameters
vector_size = [100]  # Can adjust this to your desired vector size
window_size = [3]    # Context window size
min_count = [2]      # Minimum word frequency
sg = [1]             # Skip-gram model (1 for skip-gram, 0 for CBOW)
hs = [0, 1]                # Using negative sampling (1 for HS, 0 for negative sampling)
negative = [5, 10, 15, 20]
ns_exponent = [0.75]    # The exponent used to shape the negative sampling distribution (0 for unigram, 1 for uniform)
seed = [0]
batch_size = [100]     # Number of words for each training batch
shrink_windows = [False] # Whether to shrink the window size as training goes on. This usually improves the accuracy of word vectors
alpha = [0.1]           # The initial learning rate

workers = [2]           # Number of worker threads to train the model
epochs = [100]          # Number of epochs to train the model

# Create combinations of hyperparameters
hyperparameter_combinations = list(itertools.product(vector_size, window_size, min_count, sg, hs, negative, ns_exponent, seed, batch_size, shrink_windows, alpha, workers, epochs))

# Print the total number of combinations
print(f"Total number of hyperparameter combinations: {len(hyperparameter_combinations)}")

Total number of hyperparameter combinations: 8


In [28]:
# function train model fasttext
def train_word2vec(skills_jds,
                   vector_size=100,
                   window_size=5,
                   min_count=3,
                   sg=1,
                   hs=1,
                   negative=0,
                   ns_exponent=0,
                   batch_size=100,
                   seed=seed,
                   shrink_windows=True,
                   alpha=0.025,
                   workers=2,
                   epochs=100):

    # Initialize and train the FastText model
    model = Word2Vec(
        vector_size=vector_size,
        window=window_size,
        min_count=min_count,
        sg=sg,
        hs=hs,
        negative=negative,
        alpha=alpha,
        ns_exponent=ns_exponent,
        seed=seed,
        batch_words=batch_size,
        shrink_windows=shrink_windows,
        workers=workers,
        epochs=2
    )

    # Build the vocabulary
    model.build_vocab(corpus_iterable=skills_jds)

    model.train(
        corpus_iterable=skills_jds,
        total_words = model.corpus_total_words,
        total_examples=len(skills_jds),
        epochs=model.epochs
    )

    pre_accuaracy = calculate_acc(model)
    print(f'Epoch {1}/{epochs} - Accuaracy: {pre_accuaracy}')

    threshold = 0.01
    num_epochs = epochs - 1

    for epoch in range(num_epochs):

        model.update_weights()

        model.train(
            corpus_iterable=skills_jds,
            total_words = model.corpus_total_words,
            total_examples=len(skills_jds),
            epochs=model.epochs
        )

        accuaracy = calculate_acc(model)

        print(f'Epoch {epoch+2}/{epochs} - Accuaracy: {accuaracy}')

        # early stopping
        if pre_accuaracy - accuaracy > threshold:
            break
        else:
            pre_accuaracy = accuaracy
        
    return model

In [51]:
import multiprocessing

# Define hyperparameters
vector_size = 200  # Can adjust this to your desired vector size
window_size = 5    # Context window size
min_count = 7      # Minimum word frequency
sg = 1             # Skip-gram model (1 for skip-gram, 0 for CBOW)
hs = 1                # Using negative sampling (1 for HS, 0 for negative sampling)
negative = 0
ns_exponent = 1    # The exponent used to shape the negative sampling distribution (0 for unigram, 1 for uniform)
batch_size = 300     # Number of words for each training batch
shrink_windows = True # Whether to shrink the window size as training goes on. This usually improves the accuracy of word vectors
alpha = 0.1           # The initial learning rate
workers=2
seed=0

epochs = 100          # Number of epochs to train the model

In [52]:
model = train_word2vec(skills_jds=total_skills_jds,
                       vector_size=vector_size,
                       window_size=window_size,
                       min_count=min_count,
                       workers=workers,
                       alpha=alpha,
                       sg=sg,
                       hs=hs,
                       negative=negative,
                       ns_exponent=ns_exponent,
                       batch_size=batch_size,
                       shrink_windows=shrink_windows,
                       epochs=epochs)

In [14]:
import os

save_folder = 'word2vecmodel/2/'

word_vectors = model.wv

vocab = list(word_vectors.index_to_key)

if not os.path.exists(save_folder):
    os.makedirs(save_folder)

tsv_file = save_folder + "vectors.tsv"

with open(tsv_file, "w", encoding="utf-8") as file:

    # Write word vectors
    for word in vocab:
        vector = "\t".join(str(val) for val in word_vectors[word])

        file.write(f"{vector}\n")

print(f"Word vectors saved to {tsv_file}")

tsv_metadata = save_folder + 'vocab.tsv'

with open (tsv_metadata, "w", encoding='utf-8') as file:

    for word in vocab:
        file.write(f"{word}\n")

print(f"Metadata saved to {tsv_metadata}")

model_name = save_folder + "word2vec_model"
model.save(model_name)

NameError: name 'model' is not defined

## Grid search

In [10]:
# use smaller data
import random

n = 1000
random_items = random.sample(total_skills_jds, n)

In [29]:
import os
import numpy as np

def grid_search(model_path = 'word2vecmodel', total_skill_jds=total_skills_jds):
    word2vec_models = []
    accuracy_models = []
    for idx, parameters in enumerate(hyperparameter_combinations):
        print(f"Training model with parameters: {parameters}")
        model = train_word2vec(total_skill_jds, parameters[0], parameters[1], parameters[2],
                    parameters[3], parameters[4], parameters[5],
                    parameters[6], parameters[7], parameters[8],
                    parameters[9], parameters[10], parameters[11], parameters[12])
        # fasttext_models.append(model)

        # valid model
        prediction = model_prediction(model, valid_data.keys())
        word2vec_models.append(model)

        accuracy = valid_skill(prediction, valid_data)
        accuracy_models.append(accuracy)

        print(f'Accuracy: {accuracy}')

        # save model
        if not os.path.exists(f'{model_path}/{idx}'):
            os.makedirs(f'{model_path}/{idx}')

        model.save(f'{model_path}/{idx}/word2vec.model')

        # export model
        vocab = model.wv.key_to_index

        with open(f'{model_path}/{idx}/vocab.tsv', "w", encoding='utf-8') as f:
            # f.write("word\n")
            for word in vocab:
                f.write(f"{word}\n")

        f.close()

        with open(f'{model_path}/{idx}/vectors.tsv', "w", encoding='utf-8') as f:
            for word in vocab:
                vector = "\t".join(str(val) for val in model.wv[word])

                f.write(f"{vector}\n")
        f.close()

        with open(f'{model_path}/{idx}/parameters.txt', 'w', encoding='utf-8') as f:
            f.write(str(parameters))
        f.close()


    # #find best model
    # best_model = fasttext_models[np.argmax(accuracy_models)]
    # best_accuracy = max(accuracy_models)

    # # save best model
    # if not os.path.exists(f'{model_path}/best_model'):
    #     os.makedirs(f'{model_path}/best_model')

    # best_model.save(f'{model_path}/best_model/fasttext_model.model')
    # # save best parameter
    # with open(f'{model_path}/best_model/best_parameter.txt', 'w') as f:
    #     f.write(f'Best parameter: {hyperparameter_combinations[np.argmax(accuracy_models)]}\n')
    #     f.write(f'Best accuracy: {best_accuracy}')

    return word2vec_models, accuracy_models

In [30]:
models = grid_search('word2vecmodel_3', total_skills_jds)

Training model with parameters: (100, 3, 2, 1, 0, 5, 0.75, 0, 100, False, 0.1, 2, 100)


KeyError: "Key 'gulp' not present in vocabulary"

# Test model

In [6]:
vector = model.wv['html']  # get numpy vector of a word
sims = model.wv.most_similar('hive', topn=10)  # get other similar words
sims

[('sap', 0.686120867729187),
 ('plant construction', 0.680181622505188),
 ('genomics', 0.6622642278671265),
 ('snowflake', 0.6429843306541443),
 ('scrum', 0.6354222297668457),
 ('obstetrics', 0.6297516822814941),
 ('mathematics', 0.6162903308868408),
 ('business intelligence', 0.5987536311149597),
 ('splunk', 0.5981040596961975),
 ('olap', 0.5967950820922852)]

# End