In [1]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import FastText

# Data

In [2]:
import json

# Load the job records from data/data_skill_tensor.json.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open('data/data_skill_tensor.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# No explicit f.close() is needed: leaving the `with` block closes the file.

# Keep only the 'skills' entry of every job record.
data_skill = [job['skills'] for job in data]

In [3]:
# Normalise every skill name to lower case, replacing each job's list in place.
for idx, job_skills in enumerate(data_skill):
    data_skill[idx] = [name.lower() for name in job_skills]

In [4]:
# Flatten the per-job skill lists into a single de-duplicated vocabulary.
# The result is sorted because the iteration order of a set of strings can
# change from one interpreter run to the next (string hashing is randomised),
# which would make anything sampled from `skills` irreproducible.
skills = sorted({skill for job_skills in data_skill for skill in job_skills})

# Number of distinct skills; shown as the cell output.
num_vocab = len(skills)
num_vocab

853

# Model training

In [19]:
# Define hyperparameters
# Every value below except `shrink_windows` is forwarded to the FastText
# constructor in the training cell.
vector_size = 100  # Can adjust this to your desired vector size
window_size = 5    # Context window size
min_count = 1      # Minimum word frequency
sg = 1             # Skip-gram model (1 for skip-gram, 0 for CBOW)
hs = 1             # 1 enables hierarchical softmax (see gensim FastText docs)
ns_exponent = 0.75 # Exponent shaping the negative-sampling distribution
sorted_vocab = 1   # 1 = sort the vocabulary by frequency before indexing
batch_words = 1000 # Target batch size (in words) handed to worker threads
min_n = 3          # Shortest character n-gram
max_n = 6          # Longest character n-gram
bucket = 2000000   # Number of hash buckets for character n-grams
word_ngrams = 1
# NOTE(review): `shrink_windows` is declared here but the training cell never
# passes it to FastText, so it currently has no effect — confirm intent.
shrink_windows = True
alpha = 0.01       # Initial learning rate

workers = 4        # Worker threads used for training
epochs = 100       # Passes over the corpus

In [20]:
# Build the FastText model from the hyperparameters defined in the previous
# cell, then train it on the per-job skill lists (one list = one "sentence").
fasttext_params = dict(
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    sg=sg,
    hs=hs,
    alpha=alpha,
    ns_exponent=ns_exponent,
    sorted_vocab=sorted_vocab,
    batch_words=batch_words,
    min_n=min_n,
    max_n=max_n,
    bucket=bucket,
    word_ngrams=word_ngrams,
    workers=workers,
    epochs=epochs,
)
model = FastText(**fasttext_params)

# The vocabulary has to be built before train() can be called.
model.build_vocab(corpus_iterable=data_skill)

# Left as the last expression so the value returned by train() is displayed.
model.train(
    corpus_iterable=data_skill,
    total_examples=len(data_skill),
    total_words=model.corpus_total_words,
    epochs=model.epochs,
)


(17643523, 28609900)

# Eval

In [9]:
import random

# Draw up to 200 skills to spot-check.
# - A dedicated, seeded Random instance keeps the draw independent of the
#   global random state.
# - Sampling from a sorted copy makes the draw independent of the order in
#   which `skills` happened to be built.
# - The size is capped so a vocabulary smaller than 200 does not raise
#   ValueError.
sample_size = min(200, len(skills))
random_skills = random.Random(42).sample(sorted(skills), sample_size)

def get_topk_sims(skill, model=None, k=5):
    """Return the `k` entries most similar to `skill` according to `model`.

    Args:
        skill: Key handed to ``model.wv.most_similar``.
        model: Object exposing ``wv.most_similar``. When omitted, the
            notebook-level ``model`` is looked up at call time, so the helper
            always follows the most recently trained model rather than the
            one that existed when this cell was executed.
        k: Number of neighbours to request (passed as ``topn``).

    Returns:
        The value returned by ``model.wv.most_similar(skill, topn=k)``.

    Raises:
        KeyError: If no ``model`` argument is given and no global ``model``
            exists yet.
    """
    if model is None:
        # The parameter shadows the global of the same name, so the global
        # has to be fetched explicitly.
        model = globals()["model"]
    return model.wv.most_similar(skill, topn=k)

def find_nearest_skills(skills, k=5):
    """Map each skill to its `k` nearest neighbours.

    Args:
        skills: Iterable of skill names to look up.
        k: Number of neighbours requested per skill. Defaults to 5, which is
            what the function returned before this parameter existed.

    Returns:
        dict: ``{skill: get_topk_sims(skill, k=k)}`` for every skill given;
        an empty dict when `skills` is empty.
    """
    return {skill: get_topk_sims(skill, k=k) for skill in skills}

In [13]:
# Spot check: display the 10 nearest neighbours of 'xml' and their similarity scores.
get_topk_sims('xml', k=10)

[('apache apex', 0.43664947152137756),
 ('psychology', 0.3909670114517212),
 ('oracle apex', 0.2875107526779175),
 ('data structures', 0.2720365822315216),
 ('talend', 0.2658945918083191),
 ('jenkins', 0.2358192503452301),
 ('toolchain', 0.2184581160545349),
 ('sketch', 0.2079901397228241),
 ('springboot', 0.20551736652851105),
 ('psychiatry', 0.19896918535232544)]

In [11]:
# Display the nearest neighbours of every skill in the random sample
# (a dict mapping each skill to a list of (neighbour, similarity) pairs).
find_nearest_skills(random_skills)

{'yaml': [('xml', 0.805616021156311),
  ('html', 0.4495927393436432),
  ('apache apex', 0.3389304578304291),
  ('psychology', 0.2721565067768097),
  ('aws ecs', 0.2661810517311096)],
 'xml': [('apache apex', 0.43664947152137756),
  ('psychology', 0.3909670114517212),
  ('oracle apex', 0.2875107526779175),
  ('data structures', 0.2720365822315216),
  ('talend', 0.2658945918083191)],
 'teradata': [('big data', 0.9364231824874878),
  ('yii', 0.39420580863952637),
  ('bigquery', 0.39032429456710815),
  ('bayesian', 0.3617774546146393),
  ('cloudwatch', 0.35656389594078064)],
 'squish': [('sql', 0.7662743926048279),
  ('bash', 0.6355722546577454),
  ('intensive care medicine', 0.3262883722782135),
  ('software design', 0.3160000741481781),
  ('.net', 0.3122173547744751)],
 'wpf': [('iso 26262', 0.5567151308059692),
  ('kanban', 0.538820743560791),
  ('bdd', 0.5217928290367126),
  ('emr', 0.4871976673603058),
  ('j2ee', 0.4719456136226654)],
 'cfengine': [('medical engineering', 0.5461308956

# Test model

In [6]:
# Look up the embedding of a single word as a numpy vector.
vector = model.wv.get_vector('html')

# Ten nearest neighbours of 'hive'; left as the last expression so they are displayed.
sims = model.wv.most_similar(positive=['hive'], topn=10)
sims

[('sap', 0.686120867729187),
 ('plant construction', 0.680181622505188),
 ('genomics', 0.6622642278671265),
 ('snowflake', 0.6429843306541443),
 ('scrum', 0.6354222297668457),
 ('obstetrics', 0.6297516822814941),
 ('mathematics', 0.6162903308868408),
 ('business intelligence', 0.5987536311149597),
 ('splunk', 0.5981040596961975),
 ('olap', 0.5967950820922852)]

# Save model

In [21]:
word_vectors = model.wv
vocab = list(word_vectors.index_to_key)

# Destination of the raw embedding matrix.
tsv_file = "word_vectors_fasttext.tsv"

with open(tsv_file, "w", encoding="utf-8") as file:
    # One line per vocabulary entry, in `vocab` order. Each line holds only
    # the tab-separated vector components: no word column and no header row.
    for word in vocab:
        vector = "\t".join(map(str, word_vectors[word]))
        file.write(vector + "\n")

print(f"Word vectors saved to {tsv_file}")

# Labels for the exported vectors: one word per line, in `vocab` order, so
# that line i of this file names line i of the vector TSV.
tsv_metadata = 'metadata_fasttext.tsv'

with open(tsv_metadata, "w", encoding='utf-8') as file:
    # No header row is written. The previous `file.write("word")` had no
    # trailing newline, so it was fused with the first vocabulary entry and
    # corrupted the first label. The vector file has no header either, and a
    # single-column metadata file is expected to be header-less.
    for word in vocab:
        file.write(f"{word}\n")

print(f"Metadata saved to {tsv_metadata}")

Word vectors saved to word_vectors_fasttext.tsv
Metadata saved to metadata_fasttext.tsv


In [22]:
import os

# Persist the trained model. The target directory is created first because
# saving to a path whose parent directory does not exist fails.
model_name = "fastText_model/3/fasttext_model"
os.makedirs(os.path.dirname(model_name), exist_ok=True)
model.save(model_name)