In [1]:
from gensim.models import FastText

# Data

In [2]:
import json
import requests

def load_data_skill(filename):

    # resp = requests.get(filename)
    with open(filename, 'r') as f:
        data = json.load(f)

    return list(data)

# filename = 'data/train/total_skill_augmentation_jds_sorted_relevant_combine_random_and_not.json'
filename = 'data/train/total_skill_jds_augmentation.json'
total_skills_jds = load_data_skill(filename)
len(total_skills_jds)

175971

# Valid Function

In [3]:
# valid data
valid_data_path = 'data/valid/skills_valid_around.json'

with open(valid_data_path, 'r') as f:
    valid_data = json.load(f)

f.close()

# valid_data = json.loads(resp.text)

# total valid skill
total_valid_skill = 0
for skill in valid_data.keys():
    total_valid_skill += len(valid_data[skill])

print(f'Total valid skill: {total_valid_skill}')

Total valid skill: 67321


In [4]:
def model_prediction(model, valid_data):
    result = {}
    skills = valid_data.keys()
    for skill in skills:
        try:
            topn = len(valid_data[skill])
            topn_skills = [skill[0] for skill in model.wv.most_similar(skill, topn=topn)]
        except:
            continue

        result[skill] = topn_skills

    return result

def valid_skill(prediction, valid_data): # prediction: dict, valid_data: dict
    count = 0
    for skill in prediction.keys():
        for rlv_skill in prediction[skill]:
            if rlv_skill in valid_data[skill]:
                count += 1

    flattened_valid = []
    for skills in prediction.keys():
        flattened_valid += valid_data[skills]
        
    # find the number of skill in valid data
    total_skills = len(flattened_valid)

    return count/total_skills


In [5]:
def calculate_acc(model):
  #pick random 200 skills in valid data

  prediction = model_prediction(model, valid_data)

  accuracy = valid_skill(prediction, valid_data)


  return accuracy

# Model training

In [6]:
import itertools

# create hyperparameter for training fasttext model and use grid search to find best hyperparameter
# Define hyperparameters
vector_size = [100]  # Can adjust this to your desired vector size
window_size = [5]    # Context window size
min_count = [12]      # Minimum word frequency
sg = [1]             # Skip-gram model (1 for skip-gram, 0 for CBOW)
hs = [1]                # Using negative sampling (1 for HS, 0 for negative sampling)
negative = [0]
ns_exponent = [0]    # The exponent used to shape the negative sampling distribution (0 for unigram, 1 for uniform)
seed = [1]
batch_word = [100]     # Number of words for each training batch
# min_n = [2, 3, 4]             # Minimum character n-gram length # choose 2
# max_n = [5, 6, 7]             # Maximum character n-gram length # choose 5
min_n = [2, 3, 4]             # Minimum character n-gram length
max_n = [5, 6, 7]             # Maximum character n-gram length
bucket = [2000000]      # Character n-grams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model
shrink_windows = [False] # Whether to shrink the window size as training goes on. This usually improves the accuracy of word vectors
alpha = [0.025]           # The initial learning rate

workers = [2]           # Number of worker threads to train the model
epochs = [200]          # Number of epochs to train the model

# Create combinations of hyperparameters
hyperparameter_combinations = list(itertools.product(vector_size, window_size, min_count, sg, hs, negative, ns_exponent, seed, batch_word, min_n, max_n, bucket, shrink_windows, alpha, workers, epochs))

# Print the total number of combinations
print(f"Total number of hyperparameter combinations: {len(hyperparameter_combinations)}")

Total number of hyperparameter combinations: 9


In [7]:
# function train model fasttext
def train_fasttext(skills_jds,
                   vector_size=100,
                   window_size=5,
                   min_count=3,
                   sg=1,
                   hs=0,
                   negative=0,
                   ns_exponent=0.5,
                   seed=1,
                   batch_words=100,
                   min_n=3,
                   max_n=6,
                   bucket=2000000, 
                   shrink_windows=True,
                   alpha=0.025,
                   workers=2,
                   epochs=100):

    # Initialize and train the FastText model
    model = FastText(
        vector_size=vector_size,
        window=window_size,
        min_count=min_count,
        sg=sg,
        hs=hs,
        negative=negative,
        alpha=alpha,
        ns_exponent=ns_exponent,
        seed=seed,
        batch_words=batch_words,
        min_n=min_n,
        max_n=max_n,
        bucket=bucket,
        shrink_windows=shrink_windows,
        workers=workers,
        epochs=2
    )

    model.build_vocab(corpus_iterable=skills_jds)

    model.train(
        corpus_iterable=skills_jds,
        total_words = model.corpus_total_words,
        total_examples=len(skills_jds),
        epochs=model.epochs,   
    )

    pre_accuaracy = calculate_acc(model)
    print(f'Epoch {1}/{epochs} - Accuaracy: {pre_accuaracy}')

    # threshold = 0.002
    num_epochs = epochs - 1
    break_count = 3

    for epoch in range(num_epochs):

        if break_count <= 0:
            break

        model.update_weights()

        model.train(
            corpus_iterable=skills_jds,
            total_words = model.corpus_total_words,
            total_examples=len(skills_jds),
            epochs=model.epochs
        )

        accuaracy = calculate_acc(model)

        print(f'Epoch {epoch+2}/{epochs} - Accuaracy: {accuaracy}')

        # early stopping
        if pre_accuaracy - accuaracy > 0:
            break_count -= 1
        
        pre_accuaracy = accuaracy

    return model


In [9]:
vector_size = 100  # Can adjust this to your desired vector size
window_size = 5    # Context window size
min_count = 12      # Minimum word frequency
sg = 1             # Skip-gram model (1 for skip-gram, 0 for CBOW)
hs = 1                # Using negative sampling (1 for HS, 0 for negative sampling)
negative = 0
ns_exponent = 0    # The exponent used to shape the negative sampling distribution (0 for unigram, 1 for uniform)
seed=1
batch_words = 100     # Number of words for each training batch
# min_n = [2, 3, 4]             # Minimum character n-gram length # choose 2
# max_n = [5, 6, 7]             # Maximum character n-gram length # choose 5
min_n = 5             # Minimum character n-gram length
max_n = 7             # Maximum character n-gram length
bucket = 3000000     # Character n-grams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model
shrink_windows = True # Whether to shrink the window size as training goes on. This usually improves the accuracy of word vectors
alpha = 0.01           # The initial learning rate

workers = 2           # Number of worker threads to train the model
epochs = 200 

In [10]:
import multiprocessing

model = train_fasttext(skills_jds=total_skills_jds,
                       vector_size=vector_size,
                       window_size=window_size,
                       min_count=min_count,
                       workers=multiprocessing.cpu_count()-2,
                       negative=negative,
                       ns_exponent=ns_exponent,
                       seed=seed,
                       batch_words=batch_words,
                       min_n=min_n,
                       max_n=max_n,
                       bucket=bucket,
                       shrink_windows=shrink_windows,
                       alpha=alpha,
                       sg=sg,
                       hs=hs,
                       epochs=epochs)

Epoch 1/200 - Accuaracy: 0.05094992647168046
Epoch 2/200 - Accuaracy: 0.07586042988072073
Epoch 3/200 - Accuaracy: 0.09977570148987686
Epoch 4/200 - Accuaracy: 0.11914558607269649
Epoch 5/200 - Accuaracy: 0.1347870649574427
Epoch 6/200 - Accuaracy: 0.14665557552621025
Epoch 7/200 - Accuaracy: 0.15639993464149374
Epoch 8/200 - Accuaracy: 0.16393101706748267
Epoch 9/200 - Accuaracy: 0.1698281368369454
Epoch 10/200 - Accuaracy: 0.1747448790124924
Epoch 11/200 - Accuaracy: 0.17885949406574472
Epoch 12/200 - Accuaracy: 0.18214227358476553
Epoch 13/200 - Accuaracy: 0.185023989542639
Epoch 14/200 - Accuaracy: 0.1874006624975862
Epoch 15/200 - Accuaracy: 0.19004471115996494
Epoch 16/200 - Accuaracy: 0.19118848501953328
Epoch 17/200 - Accuaracy: 0.19304526076558579
Epoch 18/200 - Accuaracy: 0.19512484960116458
Epoch 19/200 - Accuaracy: 0.19635774869654343
Epoch 20/200 - Accuaracy: 0.19756093937998545
Epoch 21/200 - Accuaracy: 0.1987195674455222
Epoch 22/200 - Accuaracy: 0.19958111139169055
Epoc

In [11]:
import os

model_path = 'fasttext_simple'
idx = 0

log_parameters = {
            "vector_size": vector_size,
            "windows_size": window_size,
            "min_cout": min_count,
            "sg": sg,
            "hs": hs,
            "negative": negative,
            "ns_exponent": ns_exponent,
            "seed": seed,
            "batch_word": batch_word,
            "min_n": min_n,
            "max_n": max_n,
            "bucket": bucket,
            "shrink_windows": shrink_windows,
            "alpha": alpha,
            "workers": workers,
            "epochs": epochs
        }

accuracy = calculate_acc(model)

if not os.path.exists(f'{model_path}/{idx}'):
    os.makedirs(f'{model_path}/{idx}')

model.save(f'{model_path}/{idx}/word2vec.model')

# export model
vocab = model.wv.key_to_index

with open(f'{model_path}/{idx}/vocab.tsv', "w", encoding='utf-8') as f:
    # f.write("word\n")
    for word in vocab:
        f.write(f"{word}\n")

f.close()

with open(f'{model_path}/{idx}/vectors.tsv', "w", encoding='utf-8') as f:
    for word in vocab:
        vector = "\t".join(str(val) for val in model.wv[word])

        f.write(f"{vector}\n")
    f.close()

with open(f'{model_path}/{idx}/logmodel.txt', 'w', encoding='utf-8') as f:
    f.write(str(log_parameters) + '\n')
    f.write(str(accuracy))
f.close()

NameError: name 'batch_word' is not defined

## Grid search

In [8]:
import os

def grid_search(model_path = 'fastTextmodel', total_skill_jbs=total_skills_jds):
    fasttext_models = []
    accuracy_models = []
    for idx, parameters in enumerate(hyperparameter_combinations):

        log_parameters = {
            "vector_size": parameters[0],
            "windows_size": parameters[1],
            "min_cout": parameters[2],
            "sg": parameters[3],
            "hs": parameters[4],
            "negative": parameters[5],
            "ns_exponent": parameters[6],
            "seed": parameters[7],
            "batch_words": parameters[8],
            "min_n": parameters[9],
            "max_n": parameters[10],
            "bucket": parameters[11],
            "shrink_windows": parameters[12],
            "alpha": parameters[13],
            "workers": parameters[14],
            "epochs": parameters[15]
        }

        print(f"Training model with parameters: {log_parameters}")
        model = train_fasttext(total_skill_jbs, parameters[0], parameters[1], parameters[2],
                    parameters[3], parameters[4], parameters[5],
                    parameters[6], parameters[7], parameters[8],
                    parameters[9], parameters[10], parameters[11],
                    parameters[12], parameters[13], parameters[14], parameters[15])
        fasttext_models.append(model)

        # valid model
        prediction = model_prediction(model, valid_data)
        
        accuracy = valid_skill(prediction, valid_data)
        accuracy_models.append(accuracy)

        print(f'Accuracy: {accuracy}')

        # save model
        if not os.path.exists(f'{model_path}/{idx}'):
            os.makedirs(f'{model_path}/{idx}')

        model.save(f'{model_path}/{idx}/fasttext_model.model')

        # export model
        vocab = model.wv.key_to_index
        
        with open(f'{model_path}/{idx}/vocab.tsv', "w", encoding='utf-8') as f:
            # f.write("word\n")
            for word in vocab:
                f.write(f"{word}\n")

        f.close()

        with open(f'{model_path}/{idx}/vectors.tsv', "w", encoding='utf-8') as f:
            for word in vocab:
                vector = "\t".join(str(val) for val in model.wv[word])

                f.write(f"{vector}\n")
        f.close()

        with open(f'{model_path}/{idx}/parameters.txt', 'w', encoding='utf-8') as f:
            f.write(str(parameters))
        f.close()

        with open(f'{model_path}/{idx}/logmodel.txt', 'w', encoding='utf-8') as f:
            f.write(str(log_parameters) + '\n')
            f.write(str(accuracy))
        f.close()

        with open(f'{model_path}/{model_path}_log.txt', 'w', encoding='utf-8') as f:
            f.write(str(log_parameters) + '\n')
            f.write(str(accuracy))
        f.close()
        

    # find best model
    # best_model = fasttext_models[np.argmax(accuracy_models)]
    # best_accuracy = max(accuracy_models)

    # # save best model
    # if not os.path.exists(f'{model_path}/best_model'):
    #     os.makedirs(f'{model_path}/best_model')

    # best_model.save(f'{model_path}/best_model/fasttext_model.model')
    # # save best parameter
    # with open(f'{model_path}/best_model/best_parameter.txt', 'w') as f:
    #     f.write(f'Best parameter: {hyperparameter_combinations[np.argmax(accuracy_models)]}\n')
    #     f.write(f'Best accuracy: {best_accuracy}')

    return fasttext_models, accuracy_models

In [9]:
models = grid_search('fastTextmodel_hs1_group_1', total_skills_jds)

Training model with parameters: {'vector_size': 100, 'windows_size': 5, 'min_cout': 12, 'sg': 1, 'hs': 1, 'negative': 0, 'ns_exponent': 0, 'seed': 1, 'batch_words': 100, 'min_n': 2, 'max_n': 5, 'bucket': 2000000, 'shrink_windows': False, 'alpha': 0.025, 'workers': 2, 'epochs': 200}
Epoch 1/200 - Accuaracy: 0.057129276154543156
Epoch 2/200 - Accuaracy: 0.09771096686026648
Epoch 3/200 - Accuaracy: 0.12312651327223303
Epoch 4/200 - Accuaracy: 0.14037224640156862
Epoch 5/200 - Accuaracy: 0.15232988220614668
Epoch 6/200 - Accuaracy: 0.16249015908854592
Epoch 7/200 - Accuaracy: 0.17106103593232425
Epoch 8/200 - Accuaracy: 0.17651252952273436
Epoch 9/200 - Accuaracy: 0.18096879131326035
Epoch 10/200 - Accuaracy: 0.18520224001426003
Epoch 11/200 - Accuaracy: 0.18869297841683874
Epoch 12/200 - Accuaracy: 0.19145586072696483
Epoch 13/200 - Accuaracy: 0.19386224209384886
Epoch 14/200 - Accuaracy: 0.1965657075801013
Epoch 15/200 - Accuaracy: 0.1984224833261538
Epoch 16/200 - Accuaracy: 0.200710031

In [34]:
# model = fasttext_models[0]

# prediction = model_prediction(model, valid_data.keys())
    
# accuracy = valid_skill(prediction, valid_data, total_valid_skill)
# accuracy_models.append(accuracy)

# print(f'Accuracy: {accuracy}')

# # save model
# if not os.path.exists(f'{model_path}/{idx}'):
#     os.makedirs(f'{model_path}/{idx}')

# model.save(f'{model_path}/{idx}/fasttext_model.model')

# # export model
# vocab = model.wv.key_to_index
    
# with open(f'{model_path}/{idx}/vocab.txt', "w", encoding='utf-8') as f:
#     f.write("word")
#     for word in vocab:
#         f.write(f"{word}\n")

# f.close()

# with open(f'{model_path}/{idx}/vectors.txt', "w", encoding='utf-8') as f:
#     for word in vocab:
#         vector = "\t".join(str(val) for val in model.wv[word])

#         f.write(f"{vector}\n")
# f.close()

# with open(f'{model_path}/{idx}/parameters.txt', 'w', encoding='utf-8') as f:
#     f.write(str(parameters))
# f.close()
    

Accuracy: 0.20854021847070506


# Test model

In [6]:
vector = model.wv['html']  # get numpy vector of a word
sims = model.wv.most_similar('hive', topn=10)  # get other similar words
sims

[('sap', 0.686120867729187),
 ('plant construction', 0.680181622505188),
 ('genomics', 0.6622642278671265),
 ('snowflake', 0.6429843306541443),
 ('scrum', 0.6354222297668457),
 ('obstetrics', 0.6297516822814941),
 ('mathematics', 0.6162903308868408),
 ('business intelligence', 0.5987536311149597),
 ('splunk', 0.5981040596961975),
 ('olap', 0.5967950820922852)]

# Save model

In [16]:
save_folder = 'fastText_newdata/1/'

In [17]:
word_vectors = model.wv

vocab = list(word_vectors.index_to_key)

tsv_file = save_folder + "word_vectors_fasttext.tsv"

with open(tsv_file, "w", encoding="utf-8") as file:

    # Write word vectors
    for word in vocab:
        vector = "\t".join(str(val) for val in word_vectors[word])

        file.write(f"{vector}\n")

print(f"Word vectors saved to {tsv_file}")

tsv_metadata = save_folder + 'metadata_fasttext.tsv'

with open (tsv_metadata, "w", encoding='utf-8') as file:

    file.write("word")

    for word in vocab:
        file.write(f"{word}\n")

print(f"Metadata saved to {tsv_metadata}")

Word vectors saved to fastText_newdata/1/word_vectors_fasttext.tsv
Metadata saved to fastText_newdata/1/metadata_fasttext.tsv


In [18]:
model_name = save_folder + "fasttext_model"
model.save(model_name)

# End