In [1]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import Word2Vec

# Data

In [2]:
import json

# Read data_skill_tensor.json and keep the 'skills' entry of every job record.
# The handle is closed by the `with` block, so no explicit close() is needed.
with open('data//train/data_skill_tensor.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One entry per job; each entry is presumably a list of skill strings
# (it is lower-cased item by item further down).
data_skill = [job['skills'] for job in data]

In [3]:
# load IT skill JDs
def load_data_skill(filename):
    """Load a JSON object from ``filename`` and return its values as a list.

    Args:
        filename: Path to a UTF-8 JSON file whose top level is an object (dict).

    Returns:
        list: The values of the top-level object, in file order. The keys are
        discarded.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        AttributeError: If the top-level JSON value is not an object.
    """
    # The context manager closes the file; an extra close() is unnecessary.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return list(data.values())

# NOTE(review): both calls name IT_skill_JDs_p1.json (only the directory spelling
# differs: 'data//train' vs 'data/train'), so skill2 appears to be a second copy of
# skill1. Presumably skill2 was meant to read another part file — TODO confirm.
skill1 = load_data_skill('data//train/IT_skill_JDs_p1.json')
skill2 = load_data_skill('data/train/IT_skill_JDs_p1.json')

In [4]:
def nomalize_skill(skill):
    """Return a lower-cased copy of a collection of skill lists.

    The input is left untouched, so re-running a cell that calls this function
    always yields the same result.

    Args:
        skill: Iterable of iterables of strings (one inner iterable per job).

    Returns:
        list[list[str]]: New outer and inner lists with every item lower-cased.

    Raises:
        AttributeError: If an item has no ``lower`` method (e.g. is not a string).
    """
    return [[item.lower() for item in skills] for skills in skill]

# Lower-case every skill in each of the three corpora, in the same order as before.
data_skill, skill1, skill2 = [
    nomalize_skill(corpus) for corpus in (data_skill, skill1, skill2)
]

In [5]:
# Concatenate the three corpora into one new list (the inputs are not modified).
total_skill_jbs = [*data_skill, *skill1, *skill2]

# Data Augmentation

In [6]:
import random
import numpy as np

def shuffle_data(skills, discard=0):
    """Return a randomly reordered copy of ``skills``.

    Args:
        skills: List of items to reorder; it is copied, never modified.
        discard: Accepted for signature compatibility with the other
            augmentation function but not used.

    Returns:
        A shuffled copy of ``skills`` (ordering drawn from the ``random`` module).
    """
    reordered = skills.copy()
    random.shuffle(reordered)
    return reordered

def cutout_data(skills, mask_size=5):
    """Return a copy of ``skills`` with ``mask_size`` randomly chosen items removed.

    Items are removed by position, so values that were not selected are always
    kept, including empty strings already present in the input.

    Args:
        skills: Sequence of items; it is not modified.
        mask_size: Number of items to drop. Values below 0 are treated as 0 and
            values above ``len(skills)`` as ``len(skills)``.

    Returns:
        list: The surviving items in their original order.
    """
    # Clamp so random.sample never sees a negative or oversized sample size.
    mask_size = max(0, min(mask_size, len(skills)))

    dropped_positions = set(random.sample(range(len(skills)), mask_size))

    return [
        item for position, item in enumerate(skills)
        if position not in dropped_positions
    ]


In [7]:
def create_new_data(total_skills, func, num_samples=2100):
    """Apply an augmentation function to a random sample of skill lists.

    Args:
        total_skills: Sequence of skill lists to sample from (without replacement).
        func: Callable invoked as ``func(skill_list, mask_size)``. When its
            ``__name__`` is ``'cutout_data'`` the mask size is a random integer
            between 0 and half the list length; otherwise 0 is passed.
        num_samples: Number of lists to augment, capped at ``len(total_skills)``.

    Returns:
        list: One ``func`` result per sampled skill list, in sampling order.
    """
    sample_count = min(num_samples, len(total_skills))

    augmented = []
    # pick random sample in skills jbs
    for position in random.sample(range(len(total_skills)), sample_count):
        entry = total_skills[position]
        if func.__name__ == 'cutout_data':
            mask = random.randint(0, int(len(entry) / 2))
        else:
            mask = 0
        augmented.append(func(entry, mask))

    return augmented

In [8]:
# First augmentation round: for each IT-skill corpus, one cutout pass followed by
# one shuffle pass. The results are appended to total_skill_jbs in place, so
# re-running this cell grows the corpus again.
skill1_augmentation = [
    *create_new_data(skill1, cutout_data),
    *create_new_data(skill1, shuffle_data),
]
skill2_augmentation = [
    *create_new_data(skill2, cutout_data),
    *create_new_data(skill2, shuffle_data),
]
total_skill_jbs.extend(skill1_augmentation + skill2_augmentation)

In [9]:
# Second augmentation round over the whole corpus: every entry is sampled once for
# cutout and once for shuffle, then the results are appended in place.
total_augmentation = (
    create_new_data(total_skill_jbs, cutout_data, len(total_skill_jbs))
    + create_new_data(total_skill_jbs, shuffle_data, len(total_skill_jbs))
)
total_skill_jbs.extend(total_augmentation)

In [10]:
import json

# Destination of the augmented corpus.
file_path = 'data/jbs_skill_augmentations.json'

# Serialise the whole corpus as JSON indented by four spaces.
with open(file_path, mode='w') as json_file:
    json.dump(total_skill_jbs, fp=json_file, indent=4)

print(f'Saved JSON data to {file_path}')


Saved JSON data to data/jbs_skill_augmentations.json


# Validation Functions

In [11]:
def model_prediction(model, skills, topn=5):
    """Map each skill to the ``topn`` most similar words according to ``model``.

    Args:
        model: Object exposing ``model.wv.most_similar(key, topn=...)`` that
            returns ``(word, score)`` pairs.
        skills: Iterable of query words.
        topn: Number of neighbours requested per query word.

    Returns:
        dict: ``{skill: [neighbour, ...]}``. A skill that is missing from the
        model vocabulary maps to an empty list instead of aborting the run.
    """
    result = {}
    for skill in skills:
        try:
            neighbours = model.wv.most_similar(skill, topn=topn)
        except KeyError:
            # Raised for out-of-vocabulary words, e.g. when the word occurs less
            # often than min_count. Record "no prediction" and keep going.
            result[skill] = []
            continue

        result[skill] = [word for word, _score in neighbours]

    return result

def valid_skill(prediction, valid_data, total_skills=None): # prediction: dict, valid_data: dict
    """Score predictions as the fraction of reference skills that were retrieved.

    Args:
        prediction: ``{skill: [predicted related skill, ...]}``.
        valid_data: ``{skill: [expected related skill, ...]}``.
        total_skills: Denominator of the score. When ``None`` it is the total
            number of expected related skills in ``valid_data``.

    Returns:
        float: ``hits / total_skills``, where a hit is a predicted skill that
        appears among the expected skills for the same key. Returns ``0.0`` when
        the denominator is zero.
    """
    hits = 0
    for skill, predicted in prediction.items():
        # A key with no reference entry simply contributes no hits.
        expected = valid_data.get(skill, ())
        hits += sum(1 for candidate in predicted if candidate in expected)

    # find the number of skill in valid data
    if total_skills is None:
        total_skills = sum(len(expected) for expected in valid_data.values())

    # Avoid ZeroDivisionError on empty validation data.
    if not total_skills:
        return 0.0

    return hits / total_skills


# Model training

In [14]:
# # Define hyperparameters
# vector_size = 100  # Can adjust this to your desired vector size
# window_size = 5    # Context window size
# min_count = 3      # Minimum word frequency
# sg = 1             # Skip-gram model (1 for skip-gram, 0 for CBOW)
# hs = 0                # Using negative sampling (1 for HS, 0 for negative sampling)
# ns_exponent = 0.75    # The exponent used to shape the negative sampling distribution (1.0 samples in proportion to word frequency, 0.0 samples all words equally)
# batch_words = 100     # Number of words for each training batch
# min_n = 2             # Minimum character n-gram length
# max_n = 5             # Maximum character n-gram length
# bucket = 2000000      # Character n-grams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model
# shrink_windows = True # If True, the effective window size is sampled uniformly from [1, window] for each target word; if False, the full window is always used
# alpha = 0.1           # The initial learning rate

# workers = 2           # Number of worker threads to train the model
# epochs = 100          # Number of epochs to train the model

In [28]:
import itertools

# Hyperparameter grid for the Word2Vec grid search. Each list below is one axis of
# the grid; the Cartesian product of all axes is built at the end.
vector_size = [100, 200]        # Dimensionality of the word vectors
window_size = [5, 7]            # Context window size
min_count = [3, 4, 5]           # Minimum word frequency
sg = [1, 0]                     # 1 = skip-gram, 0 = CBOW
hs = [0, 1]                     # 1 = hierarchical softmax, 0 = negative sampling
ns_exponent = [0.75, 1]         # Negative-sampling exponent: 1.0 = proportional to word frequency, 0.0 = all words equally
batch_words = [100, 200]        # Target number of words per training batch
# bucket = [2000000, 3000000]   # Number of hash buckets for character n-grams (FastText only; unused here)
shrink_windows = [True, False]  # True = effective window sampled uniformly from [1, window]; False = always the full window
alpha = [0.1, 0.2]              # The initial learning rate

workers = [2]                   # Number of worker threads to train the model
epochs = [200]                  # Number of epochs to train the model

# Every combination as a tuple. The field order is relied on positionally by the
# training loop, so keep it in sync with train_word2vec's parameter order.
hyperparameter_combinations = list(
    itertools.product(
        vector_size,
        window_size,
        min_count,
        sg,
        hs,
        ns_exponent,
        batch_words,
        shrink_windows,
        alpha,
        workers,
        epochs,
    )
)

# Print the total number of combinations
print(f"Total number of hyperparameter combinations: {len(hyperparameter_combinations)}")

Total number of hyperparameter combinations: 768


In [33]:
# Train a gensim Word2Vec model on the skill corpus
def train_word2vec(skills_jbs,
                   vector_size=100,
                   window_size=5,
                   min_count=3,
                   sg=1,
                   hs=0,
                   ns_exponent=0.75,
                   batch_words=100,
                   shrink_windows=True,
                   alpha=0.025,
                   workers=2,
                   epochs=100):
    """Build the vocabulary from ``skills_jbs`` and train a Word2Vec model on it.

    Args:
        skills_jbs: Corpus as a sized collection of token lists; it is iterated
            for the vocabulary and again for training, and ``len()`` is taken.
        vector_size: Forwarded to ``Word2Vec(vector_size=...)``.
        window_size: Forwarded to ``Word2Vec(window=...)``.
        min_count, sg, hs, ns_exponent, batch_words, shrink_windows, alpha,
        workers, epochs: Forwarded unchanged to the ``Word2Vec`` constructor.

    Returns:
        The trained ``Word2Vec`` model.
    """
    settings = dict(
        vector_size=vector_size,
        window=window_size,
        min_count=min_count,
        sg=sg,
        hs=hs,
        alpha=alpha,
        ns_exponent=ns_exponent,
        batch_words=batch_words,
        shrink_windows=shrink_windows,
        workers=workers,
        epochs=epochs,
    )

    # Create the untrained model, then build its vocabulary from the corpus.
    w2v = Word2Vec(**settings)
    w2v.build_vocab(corpus_iterable=skills_jbs)

    # Train using the corpus statistics gathered by build_vocab.
    w2v.train(
        corpus_iterable=skills_jbs,
        total_examples=len(skills_jbs),
        total_words=w2v.corpus_total_words,
        epochs=w2v.epochs,
    )

    return w2v

## Grid search

In [23]:
# valid data
valid_data_path = 'data/valid/top_200_valid_automation.json'
# The `with` block closes the file; no explicit close() is needed afterwards.
with open(valid_data_path, 'r', encoding='utf-8') as f:
    valid_data = json.load(f)

# total valid skill: number of reference entries summed over every key
total_valid_skill = sum(len(related) for related in valid_data.values())

print(f'Total valid skill: {total_valid_skill}')

Total valid skill: 1007


In [34]:
# Root directory under which every grid-search run and the best model are saved.
model_path = 'word2vecmodel/'

In [35]:
import os

# Grid search: train one model per hyperparameter combination, score it on the
# validation data, and persist the model, its vocabulary, vectors and parameters.
# NOTE(review): fasttext_models keeps every trained model alive for the whole
# search; with many combinations this may need a lot of memory.
fasttext_models = []
accuracy_models = []
for idx, parameters in enumerate(hyperparameter_combinations):
    print(f"Training model with parameters: {parameters}")
    # The tuple fields are in the same order as train_word2vec's parameters
    # after the corpus, so they can be unpacked positionally.
    model = train_word2vec(total_skill_jbs, *parameters)
    fasttext_models.append(model)

    # valid model
    prediction = model_prediction(model, valid_data.keys())

    accuracy = valid_skill(prediction, valid_data, total_valid_skill)
    accuracy_models.append(accuracy)

    print(f'Accuracy: {accuracy}')

    # save model (exist_ok avoids the check-then-create race)
    run_dir = os.path.join(model_path, str(idx))
    os.makedirs(run_dir, exist_ok=True)

    model.save(os.path.join(run_dir, 'fasttext_model.model'))

    # export model: one vocabulary word per line, and the matching vector
    # (tab-separated components) on the same line number of vectors.tsv
    vocab = model.wv.key_to_index

    with open(os.path.join(run_dir, 'vocab.tsv'), "w", encoding='utf-8') as f:
        for word in vocab:
            f.write(f"{word}\n")

    with open(os.path.join(run_dir, 'vectors.tsv'), "w", encoding='utf-8') as f:
        for word in vocab:
            vector = "\t".join(str(val) for val in model.wv[word])
            f.write(f"{vector}\n")

    with open(os.path.join(run_dir, 'parameters.txt'), 'w', encoding='utf-8') as f:
        f.write(str(parameters))


# find best model (first index with the highest score)
best_index = int(np.argmax(accuracy_models))
best_model = fasttext_models[best_index]
best_accuracy = accuracy_models[best_index]

# save best model
best_dir = os.path.join(model_path, 'best_model')
os.makedirs(best_dir, exist_ok=True)

best_model.save(os.path.join(best_dir, 'fasttext_model.model'))
# save best parameter
with open(os.path.join(best_dir, 'best_parameter.txt'), 'w') as f:
    f.write(f'Best parameter: {hyperparameter_combinations[best_index]}\n')
    f.write(f'Best accuracy: {best_accuracy}')
    

Training model with parameters: (100, 5, 3, 1, 0, 0.75, 100, True, 0.1, 2, 200)


Accuracy: 0.17974180734856007
Training model with parameters: (100, 5, 3, 1, 0, 0.75, 100, True, 0.2, 2, 200)
Accuracy: 0.0407149950347567
Training model with parameters: (100, 5, 3, 1, 0, 0.75, 100, False, 0.1, 2, 200)
Accuracy: 0.18669314796425024
Training model with parameters: (100, 5, 3, 1, 0, 0.75, 100, False, 0.2, 2, 200)
Accuracy: 0.010923535253227408
Training model with parameters: (100, 5, 3, 1, 0, 0.75, 200, True, 0.1, 2, 200)
Accuracy: 0.1708043694141013
Training model with parameters: (100, 5, 3, 1, 0, 0.75, 200, True, 0.2, 2, 200)


KeyboardInterrupt: 

# Test model

In [6]:
# NOTE(review): `model` is not defined in this cell; it is whatever model object is
# currently bound in the kernel — confirm which training run this output refers to.
# `vector` is looked up but not used again in this cell.
vector = model.wv['html']  # get numpy vector of a word
sims = model.wv.most_similar('hive', topn=10)  # get other similar words
sims

[('sap', 0.686120867729187),
 ('plant construction', 0.680181622505188),
 ('genomics', 0.6622642278671265),
 ('snowflake', 0.6429843306541443),
 ('scrum', 0.6354222297668457),
 ('obstetrics', 0.6297516822814941),
 ('mathematics', 0.6162903308868408),
 ('business intelligence', 0.5987536311149597),
 ('splunk', 0.5981040596961975),
 ('olap', 0.5967950820922852)]

# Save model

In [16]:
# Output directory for the exported files. The trailing '/' is required because
# the file paths below are built by plain string concatenation.
save_folder = 'fastText_newdata/1/'

In [17]:
# Export the embeddings as two line-aligned TSV files: line i of the vectors file
# holds the tab-separated components of the word on line i of the metadata file.
word_vectors = model.wv

vocab = list(word_vectors.index_to_key)

tsv_file = save_folder + "word_vectors_fasttext.tsv"

with open(tsv_file, "w", encoding="utf-8") as file:

    # Write word vectors
    for word in vocab:
        vector = "\t".join(str(val) for val in word_vectors[word])

        file.write(f"{vector}\n")

print(f"Word vectors saved to {tsv_file}")

tsv_metadata = save_folder + 'metadata_fasttext.tsv'

with open(tsv_metadata, "w", encoding='utf-8') as file:

    # No header row is written. A "word" header without a newline would fuse with
    # the first vocabulary entry, and a header line would break the one-to-one
    # alignment with the vectors file.
    for word in vocab:
        file.write(f"{word}\n")

print(f"Metadata saved to {tsv_metadata}")

Word vectors saved to fastText_newdata/1/word_vectors_fasttext.tsv
Metadata saved to fastText_newdata/1/metadata_fasttext.tsv


In [18]:
# Save the model object itself under save_folder, next to the exported TSV files.
model_name = save_folder + "fasttext_model"
model.save(model_name)

# End