In [2]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import FastText

# Data

In [3]:
import json
import requests

def load_data_skill(filename):
    """Read a JSON file from disk and return its top-level value as a list.

    Parameters
    ----------
    filename : str or path-like
        Path of a local JSON file.

    Returns
    -------
    list
        ``list(...)`` of the decoded JSON value. If the top-level value is a
        JSON array this is a shallow copy of it; if it is a JSON object the
        result contains only its keys.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, matching how the rest of the notebook writes its text files.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return list(data)

# Training corpus location, relative to the notebook's working directory.
filename = 'data/train/total_skill_augmentation_jds_sorted_relevant_combine_random_and_not.json'
# Loaded corpus; later cells pass it to train_fasttext / grid_search as the
# training data. Presumably one list of skill tokens per job description —
# TODO confirm against the JSON file.
total_skills_jds = load_data_skill(filename)

# Validation Functions

In [4]:
def model_prediction(model, skills, topn=20):
    """Map every query skill to the names of its nearest neighbours.

    Parameters
    ----------
    model : object exposing ``model.wv.most_similar(key, topn=...)``
        The trained embedding model.
    skills : iterable
        Query skills; each one is looked up in turn.
    topn : int, default 20
        Number of neighbours requested per query.

    Returns
    -------
    dict
        ``{query: [neighbour_name, ...]}`` in the order the queries were
        given. Only the first element of each item returned by
        ``most_similar`` is kept.
    """
    def _neighbour_names(query):
        # Each hit is indexed at position 0 to keep the name and drop the rest.
        return [hit[0] for hit in model.wv.most_similar(query, topn=topn)]

    return {query: _neighbour_names(query) for query in skills}

def valid_skill(prediction, valid_data, topn=20): # prediction: dict, valid_data: dict
    """Score predictions as the fraction of prediction slots that are relevant.

    A predicted neighbour counts as a hit when it is contained in
    ``valid_data[skill]`` for the query ``skill`` it was predicted for. The
    hit count is divided by ``len(valid_data) * topn``, i.e. the number of
    prediction slots available if every validation skill received ``topn``
    predictions (a precision@topn-style score).

    Parameters
    ----------
    prediction : dict
        ``{skill: [predicted neighbour, ...]}``.
    valid_data : dict
        ``{skill: collection of relevant skills}``; must contain every key of
        ``prediction``.
    topn : int, default 20
        Number of predictions assumed per skill. The default reproduces the
        previously hard-coded denominator; pass the same value that was used
        to generate ``prediction``.

    Returns
    -------
    float
        Hits divided by ``len(valid_data) * topn``; ``0.0`` when that
        denominator is zero (empty validation data or ``topn == 0``).

    Raises
    ------
    KeyError
        If ``prediction`` holds a skill that is missing from ``valid_data``.
    """
    count = 0
    for skill, predicted in prediction.items():
        relevant = valid_data[skill]
        count += sum(1 for rlv_skill in predicted if rlv_skill in relevant)

    # Number of prediction slots, not the number of relevant skills.
    total_skills = len(valid_data) * topn
    if total_skills == 0:
        # Nothing to score against; avoid ZeroDivisionError.
        return 0.0

    return count / total_skills


# Model training

In [7]:
import itertools

# Hyperparameter grid for the FastText grid search.
# Every name below holds a LIST of candidate values; the Cartesian product of
# all lists (built at the bottom of this cell) is the search space.
# Define hyperparameters
vector_size = [100]  # Can adjust this to your desired vector size
window_size = [5]    # Context window size
min_count = [7]      # Minimum word frequency
sg = [1]             # Skip-gram model (1 for skip-gram, 0 for CBOW)
hs = [1]                # Hierarchical softmax switch (1 = hierarchical softmax; 0 = negative sampling, provided negative > 0)
negative = [0]          # Number of negative samples; 0 turns negative sampling off
# Exponent shaping the negative-sampling distribution. Per the gensim docs,
# 0 samples all words equally and 1 samples in proportion to word frequency.
# NOTE(review): negative is [0] above, so negative sampling is off and this
# axis presumably only doubles the number of runs without changing results — confirm.
ns_exponent = [0, 1]
batch_size = [100]     # Forwarded positionally to train_fasttext's batch_words (words per training batch)
# min_n = [2, 3, 4]             # Minimum character n-gram length # choose 2
# max_n = [5, 6, 7]             # Maximum character n-gram length # choose 5
min_n = [3, 5]             # Minimum character n-gram length
max_n = [7, 8]             # Maximum character n-gram length
bucket = [2000000, 2500000, 3000000]      # Character n-grams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model
# Per the gensim docs: True samples the effective window uniformly from
# [1, window] for each target word; False always uses the full window.
shrink_windows = [False]
alpha = [0.1]           # The initial learning rate

workers = [2]           # Number of worker threads to train the model
epochs = [100]          # Number of epochs to train the model

# Create combinations of hyperparameters
# The order of the lists here must match the positional parameter order of
# train_fasttext (after the corpus argument), because grid_search forwards
# each tuple by position.
hyperparameter_combinations = list(itertools.product(vector_size, window_size, min_count, sg, hs, negative, ns_exponent, batch_size, min_n, max_n, bucket, shrink_windows, alpha, workers, epochs))

# Print the total number of combinations
# NOTE(review): as written the lists give 2 (ns_exponent) * 2 (min_n) * 2 (max_n)
# * 3 (bucket) = 24 combinations; the recorded output of this cell reports 18,
# so that output appears to come from an earlier version of the grid.
print(f"Total number of hyperparameter combinations: {len(hyperparameter_combinations)}")

Total number of hyperparameter combinations: 18


In [8]:
# function train model fasttext
def train_fasttext(skills_jbs,
                   vector_size=100,
                   window_size=5,
                   min_count=3,
                   sg=1,
                   hs=0,
                   negative=0,
                   ns_exponent=0.75,
                   batch_words=100,
                   min_n=3,
                   max_n=6,
                   bucket=2000000,
                   shrink_windows=True,
                   alpha=0.025,
                   workers=2,
                   epochs=100):
    """Build the vocabulary for, and train, a gensim FastText model.

    Parameters
    ----------
    skills_jbs : iterable of token lists
        Training corpus. It is iterated more than once (vocabulary scan, then
        training), so it must be re-iterable; it does not need to support
        ``len()``.
    vector_size, min_count, sg, hs, negative, ns_exponent, batch_words,
    min_n, max_n, bucket, shrink_windows, alpha, workers, epochs
        Forwarded unchanged to the ``FastText`` constructor under the same
        names.
    window_size : int
        Forwarded to the ``FastText`` constructor as ``window``.

    Returns
    -------
    FastText
        The trained model.

    Notes
    -----
    The defaults ``hs=0`` and ``negative=0`` switch off both hierarchical
    softmax and negative sampling; a warning is emitted in that case because
    no training objective is selected.
    """
    if not hs and not negative:
        import warnings
        warnings.warn(
            "train_fasttext called with hs=0 and negative=0: neither "
            "hierarchical softmax nor negative sampling is enabled, so the "
            "vectors will not be meaningfully trained.",
            stacklevel=2,
        )

    # Initialize the FastText model (no corpus yet; vocab is built explicitly below)
    model = FastText(
        vector_size=vector_size,
        window=window_size,
        min_count=min_count,
        sg=sg,
        hs=hs,
        negative=negative,
        alpha=alpha,
        ns_exponent=ns_exponent,
        batch_words=batch_words,
        min_n=min_n,
        max_n=max_n,
        bucket=bucket,
        shrink_windows=shrink_windows,
        workers=workers,
        epochs=epochs
    )

    model.build_vocab(corpus_iterable=skills_jbs)

    model.train(
        corpus_iterable=skills_jbs,
        total_words=model.corpus_total_words,
        # corpus_count is recorded by build_vocab; using it instead of
        # len(skills_jbs) gives the same number for lists and also works for
        # corpora that are iterable but not sized.
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )

    return model


In [14]:
# Define hyperparameters
# Scalar settings for a single training run.
# NOTE(review): these assignments rebind names (vector_size, window_size,
# min_count, sg, hs, negative, ns_exponent, min_n, max_n, bucket, alpha, epochs)
# that the grid-search cell defines as lists; re-running the
# itertools.product cell after this one would fail on the scalar values.
# NOTE(review): the single training call in this notebook passes min_count=10,
# sg=1, hs=1 and epochs=100 as literals and does not pass negative,
# ns_exponent or batch_words, so the values of min_count and epochs set here
# are not the ones that call trains with — confirm which are intended.
vector_size = 300  # Can adjust this to your desired vector size
window_size = 10    # Context window size
min_count = 3      # Minimum word frequency
sg = 1             # Skip-gram model (1 for skip-gram, 0 for CBOW)
hs = 1                # Hierarchical softmax switch (1 = hierarchical softmax; 0 = negative sampling, provided negative > 0)
negative = 0          # Number of negative samples; 0 turns negative sampling off
ns_exponent = 1    # Exponent shaping the negative-sampling distribution (per gensim docs: 0 samples all words equally, 1 samples in proportion to frequency)
batch_words = 100     # Number of words for each training batch
min_n = 3             # Minimum character n-gram length
max_n = 6             # Maximum character n-gram length
bucket = 2000000      # Character n-grams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model
# shrink_windows = True # Per gensim docs: True samples the effective window uniformly from [1, window] per target word; False keeps it fixed
alpha = 0.1           # The initial learning rate

epochs = 1000          # Number of epochs to train the model

In [13]:
import multiprocessing

# Train one model on the full corpus.
# NOTE(review): min_count and epochs are passed as literals (10 / 100) rather
# than the min_count / epochs variables defined in the settings cell — confirm
# which values are intended.
model = train_fasttext(skills_jbs=total_skills_jds,
                       vector_size=vector_size,
                       window_size=window_size,
                       min_count=10,
                       # Leave two cores free for the rest of the system, but
                       # never request fewer than one worker thread (machines
                       # with <= 2 cores would otherwise get 0 or a negative count).
                       workers=max(1, multiprocessing.cpu_count() - 2),
                       min_n=min_n,
                       max_n=max_n,
                       bucket=bucket,
                       alpha=alpha,
                       sg=1,
                       hs=1,
                       epochs=100)

In [9]:
# valid data
# The validation set is downloaded from GitHub. To use a local copy instead,
# open the file with encoding='utf-8' and json.load() it.
valid_data_path = 'https://raw.githubusercontent.com/nnccuong-tmabd/middle-intern-tma/main/data/valid/skills_valid.json'

import requests

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. 404) here instead of as a
# confusing JSON decode error on an HTML error page.
resp = requests.get(valid_data_path, timeout=30)
resp.raise_for_status()
valid_data = json.loads(resp.text)

# total valid skill: number of relevant-skill entries summed over all query skills
total_valid_skill = sum(len(related) for related in valid_data.values())

print(f'Total valid skill: {total_valid_skill}')

Total valid skill: 36806


In [10]:
import random

def calculate_acc(model, n_iter=100, sample_size=200, valid=None):
    """Estimate validation accuracy by averaging over random subsamples.

    In each of ``n_iter`` rounds, ``sample_size`` validation skills are drawn
    without replacement, the model's neighbours are predicted for them with
    ``model_prediction`` and scored with ``valid_skill`` against the drawn
    subset. The mean of the per-round scores is returned.

    Parameters
    ----------
    model
        Model accepted by ``model_prediction``.
    n_iter : int, default 100
        Number of sampling rounds; must be at least 1.
    sample_size : int, default 200
        Skills drawn per round. Clamped to the size of the validation set so
        that small validation sets do not make ``random.sample`` raise.
    valid : dict, optional
        Validation mapping ``{skill: relevant skills}``. Defaults to the
        notebook-level ``valid_data``.

    Returns
    -------
    float
        Mean score over the ``n_iter`` rounds.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1 or the validation set is empty.
    """
    if n_iter < 1:
        raise ValueError("n_iter must be a positive integer")

    source = valid_data if valid is None else valid
    # Materialise once: the item list does not change between rounds.
    pool = list(source.items())
    if not pool:
        raise ValueError("validation data is empty")
    draw = min(sample_size, len(pool))

    accuracies = []
    for _ in range(n_iter):
        # Random subset of the validation skills for this round.
        random_items = dict(random.sample(pool, draw))
        prediction = model_prediction(model, random_items.keys())
        accuracies.append(valid_skill(prediction, random_items))

    return sum(accuracies) / n_iter

# calculate_acc(model=model)
# print(f'Accuracy: {sum(accuracies)/100}')

In [8]:
# model_path = 'fastText_model_augmentation'
# idx = 2

# import os
# # save model
# if not os.path.exists(f'{model_path}/{idx}'):
#     os.makedirs(f'{model_path}/{idx}')

# model.save(f'{model_path}/{idx}/fasttext_model.model')

# # export model
# vocab = model.wv.key_to_index
        
# with open(f'{model_path}/{idx}/vocab.tsv', "w", encoding='utf-8') as f:
#     # f.write("word\n")
#     for word in vocab:
#         f.write(f"{word}\n")

# f.close()

# with open(f'{model_path}/{idx}/vectors.tsv', "w", encoding='utf-8') as f:
#     for word in vocab:
#         vector = "\t".join(str(val) for val in model.wv[word])

#         f.write(f"{vector}\n")
# f.close()

# with open(f'{model_path}/{idx}/parameters.txt', 'w', encoding='utf-8') as f:
#     f.write(str(parameters))
# f.close()

## Grid search

In [12]:
# use smaller data
import random

n = 1000
# Cap the sample size at the corpus size so a corpus with fewer than n
# documents is used in full instead of making random.sample raise ValueError.
# NOTE(review): the grid search call in this notebook passes total_skills_jds,
# not this subsample — confirm whether random_items was meant to be used there.
random_items = random.sample(total_skills_jds, min(n, len(total_skills_jds)))

In [11]:
import os

def _export_run(model, run_dir, parameters):
    """Save one trained model and its vocab/vector/parameter text files into run_dir."""
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(run_dir, exist_ok=True)

    model.save(f'{run_dir}/fasttext_model.model')

    vocab = model.wv.key_to_index

    # One word per line, no header row, in the same order as vectors.tsv.
    with open(f'{run_dir}/vocab.tsv', "w", encoding='utf-8') as f:
        for word in vocab:
            f.write(f"{word}\n")

    # One tab-separated vector per line, aligned line-by-line with vocab.tsv.
    with open(f'{run_dir}/vectors.tsv', "w", encoding='utf-8') as f:
        for word in vocab:
            vector = "\t".join(str(val) for val in model.wv[word])
            f.write(f"{vector}\n")

    with open(f'{run_dir}/parameters.txt', 'w', encoding='utf-8') as f:
        f.write(str(parameters))


def grid_search(model_path = 'fastTextmodel', total_skill_jbs=total_skills_jds):
    """Train, score and export one FastText model per hyperparameter combination.

    Iterates over the notebook-level ``hyperparameter_combinations``. For each
    tuple it trains a model with ``train_fasttext``, predicts neighbours for
    every key of the notebook-level ``valid_data``, scores them with
    ``valid_skill``, and writes the model plus ``vocab.tsv``, ``vectors.tsv``
    and ``parameters.txt`` to ``{model_path}/{idx}``.

    Parameters
    ----------
    model_path : str, default 'fastTextmodel'
        Root output directory; one numbered sub-directory is created per run.
    total_skill_jbs : iterable of token lists
        Training corpus handed to ``train_fasttext``. The default is bound to
        ``total_skills_jds`` when this function is defined.

    Returns
    -------
    tuple(list, list)
        ``(fasttext_models, accuracy_models)``: the trained models and their
        scores, index-aligned with ``hyperparameter_combinations``. Selecting
        the best run is left to the caller.

    Notes
    -----
    Every trained model stays referenced in the returned list, so memory use
    grows with the number of combinations.
    """
    fasttext_models = []
    accuracy_models = []
    for idx, parameters in enumerate(hyperparameter_combinations):
        print(f"Training model with parameters: {parameters}")
        # Each tuple follows the positional parameter order of train_fasttext
        # after the corpus argument, so it can be unpacked directly.
        model = train_fasttext(total_skill_jbs, *parameters)
        fasttext_models.append(model)

        # valid model
        prediction = model_prediction(model, valid_data.keys())

        accuracy = valid_skill(prediction, valid_data)
        accuracy_models.append(accuracy)

        print(f'Accuracy: {accuracy}')

        # save and export model
        _export_run(model, f'{model_path}/{idx}', parameters)

    return fasttext_models, accuracy_models

In [12]:
# grid_search returns a (models, accuracies) pair, so `models` is that tuple;
# unpack it as well so both lists are available under descriptive names.
models = grid_search('fastTextmodel1', total_skills_jds)
fasttext_models, accuracy_models = models

Training model with parameters: (100, 5, 7, 1, 1, 0, 0, 100, 2, 5, 2000000, False, 0.1, 2, 100)
Accuracy: 0.29371289800323797
Training model with parameters: (100, 5, 7, 1, 1, 0, 0, 100, 2, 5, 2500000, False, 0.1, 2, 100)
Accuracy: 0.29341608202914193
Training model with parameters: (100, 5, 7, 1, 1, 0, 0, 100, 2, 5, 3000000, False, 0.1, 2, 100)
Accuracy: 0.29217485159201295
Training model with parameters: (100, 5, 7, 1, 1, 0, 0, 100, 2, 7, 2000000, False, 0.1, 2, 100)
Accuracy: 0.2928224500809498
Training model with parameters: (100, 5, 7, 1, 1, 0, 0, 100, 2, 7, 2500000, False, 0.1, 2, 100)
Accuracy: 0.2923097679438748
Training model with parameters: (100, 5, 7, 1, 1, 0, 0, 100, 2, 7, 3000000, False, 0.1, 2, 100)
Accuracy: 0.2936319481921209
Training model with parameters: (100, 5, 7, 1, 1, 0, 0, 100, 2, 8, 2000000, False, 0.1, 2, 100)
Accuracy: 0.29155423637344846
Training model with parameters: (100, 5, 7, 1, 1, 0, 0, 100, 2, 8, 2500000, False, 0.1, 2, 100)
Accuracy: 0.2933351322180

In [34]:
# model = fasttext_models[0]

# prediction = model_prediction(model, valid_data.keys())
    
# accuracy = valid_skill(prediction, valid_data, total_valid_skill)
# accuracy_models.append(accuracy)

# print(f'Accuracy: {accuracy}')

# # save model
# if not os.path.exists(f'{model_path}/{idx}'):
#     os.makedirs(f'{model_path}/{idx}')

# model.save(f'{model_path}/{idx}/fasttext_model.model')

# # export model
# vocab = model.wv.key_to_index
    
# with open(f'{model_path}/{idx}/vocab.txt', "w", encoding='utf-8') as f:
#     f.write("word")
#     for word in vocab:
#         f.write(f"{word}\n")

# f.close()

# with open(f'{model_path}/{idx}/vectors.txt', "w", encoding='utf-8') as f:
#     for word in vocab:
#         vector = "\t".join(str(val) for val in model.wv[word])

#         f.write(f"{vector}\n")
# f.close()

# with open(f'{model_path}/{idx}/parameters.txt', 'w', encoding='utf-8') as f:
#     f.write(str(parameters))
# f.close()
    

Accuracy: 0.20854021847070506


# Test model

In [6]:
# Quick spot check of the trained embeddings.
# `vector` is not used by the remaining lines of this cell.
vector = model.wv['html']  # get numpy vector of a word
# Ten nearest neighbours of 'hive' as (word, similarity) pairs.
# NOTE(review): the recorded neighbours below (e.g. 'sap', 'obstetrics') do not
# look related to 'hive'; this may indicate the displayed model was poorly
# trained or that the output is stale — confirm.
sims = model.wv.most_similar('hive', topn=10)  # get other similar words
sims

[('sap', 0.686120867729187),
 ('plant construction', 0.680181622505188),
 ('genomics', 0.6622642278671265),
 ('snowflake', 0.6429843306541443),
 ('scrum', 0.6354222297668457),
 ('obstetrics', 0.6297516822814941),
 ('mathematics', 0.6162903308868408),
 ('business intelligence', 0.5987536311149597),
 ('splunk', 0.5981040596961975),
 ('olap', 0.5967950820922852)]

# Save model

In [16]:
# Output directory for the exported vectors, metadata and saved model.
# The trailing slash is required: later cells build file paths by plain
# string concatenation (save_folder + "<file name>").
save_folder = 'fastText_newdata/1/'

In [17]:
import os

word_vectors = model.wv

vocab = list(word_vectors.index_to_key)

# Create the output folder up front so open(..., "w") below cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(save_folder, exist_ok=True)

tsv_file = save_folder + "word_vectors_fasttext.tsv"

with open(tsv_file, "w", encoding="utf-8") as file:

    # Write word vectors: one tab-separated vector per line, in vocab order
    for word in vocab:
        vector = "\t".join(str(val) for val in word_vectors[word])

        file.write(f"{vector}\n")

print(f"Word vectors saved to {tsv_file}")

tsv_metadata = save_folder + 'metadata_fasttext.tsv'

with open (tsv_metadata, "w", encoding='utf-8') as file:

    # No header row: the previous `file.write("word")` had no newline, so it
    # was glued onto the first vocabulary entry. A single-column metadata file
    # must have exactly one label per vector line, as in grid_search's vocab.tsv.
    for word in vocab:
        file.write(f"{word}\n")

print(f"Metadata saved to {tsv_metadata}")

Word vectors saved to fastText_newdata/1/word_vectors_fasttext.tsv
Metadata saved to fastText_newdata/1/metadata_fasttext.tsv


In [18]:
# Persist the trained model next to the exported TSV files; the target path
# is the save folder followed directly by the file stem.
model_name = f"{save_folder}fasttext_model"
model.save(model_name)

# End