# Modelle (Word2Vec, Doc2Vec) sowie Evaluation aller Modelle

In [27]:
# Inhaltsverzeichnis:
'''
1. Datenaufbereitung
2. Methodendeklaration, Hilfsfunktion, unsw.
3. Training (Word2Vec, Doc2Vec)
4. Laden der Training-Ergebnisse von LinkPred, Knowledge Graph
5. Testing aller Modelle (Word2Vec, Doc2Vec, LinkPred, Know. Graph)
6. Weitere Beispielvorführung
'''

'\n1. Datenaufbereitung\n2. Methodendeklaration, Hilfsfunktion, unsw.\n3. Training (Word2Vec, Doc2Vec)\n4. Laden der Training-Ergebnisse von LinkPred, Knowledge Graph\n5. Testing aller Modelle (Word2Vec, Doc2Vec, LinkPred, Know. Graph)\n6. Weitere Beispielvorführung\n'

## *1. Datenaufbereitung*

In [29]:
pip install emgraph

Note: you may need to restart the kernel to use updated packages.


ERROR: Ignored the following versions that require a different python version: 1.0.0rc1 Requires-Python >=3.10,<3.11
ERROR: Could not find a version that satisfies the requirement emgraph (from versions: none)
ERROR: No matching distribution found for emgraph

[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
import pandas as pd
from LinkPred import model
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import ast
import numpy as np
import itertools
import operator
from scipy.spatial.distance import cdist
import heapq
import KG_model as graph
import importlib
import sys


In [32]:
# Global configuration: input files and the column layout shared by all models.
# Forward slashes keep the paths portable and consistent with ground_truth_file;
# the former "data\C..." literals contained a backslash in a non-raw string
# (an invalid escape sequence) and only resolved as a path on Windows.
training_file = "data/Complete_Data_Clustered_Cleaned.csv"
test_file = "data/Complete_Data_Clustered_Cleaned_test.csv"

# Name of the label column and of the column holding the skill list
cluster_column = 'Cluster'
skill_column = 'NewSkills_lowercase'
# Columns kept when loading the CSVs; the order matters because rows are later
# unpacked as (cluster, skills) pairs.
training_columns = ['Cluster', 'NewSkills_lowercase']
test_columns = ['Cluster', 'NewSkills_lowercase']

# Passed to the knowledge-graph model and read again during evaluation
ground_truth_file = 'data/test_data_graph_org_new.csv'

# Forwarded to the knowledge-graph model as `totalClusters`
totalClusters = 20

In [33]:
def helper(column):
    """Convert ``column`` to ``str`` and return its comma-separated parts as a list."""
    text = str(column)
    parts = text.split(",")
    return parts

def open_csv_training():
    """Load the training set and return only the columns the models use.

    Reads ``training_file`` and keeps ``training_columns`` (in that order).
    The columns are selected by name instead of first dropping a hard-coded
    list of unwanted columns, so loading no longer fails with ``KeyError``
    when the CSV lacks one of them (e.g. the ``'Unnamed: 0'`` index column).

    Returns:
        pandas.DataFrame: the rows of the training file restricted to
        ``training_columns``.
    """
    df_data = pd.read_csv(training_file, usecols=training_columns)
    # usecols does not fix the column order, so re-select explicitly
    return df_data[training_columns]

def open_csv_testdata():
    """Load the test set and return only the columns the models use.

    Reads ``test_file`` and keeps ``test_columns`` (in that order).
    The columns are selected by name instead of first dropping a hard-coded
    list of unwanted columns, so loading no longer fails with ``KeyError``
    when the CSV lacks one of them (e.g. the ``'Unnamed: 0'`` index column).

    Returns:
        pandas.DataFrame: the rows of the test file restricted to
        ``test_columns``.
    """
    df_data = pd.read_csv(test_file, usecols=test_columns)
    # usecols does not fix the column order, so re-select explicitly
    return df_data[test_columns]

## *2. Methodendeklaration, Hilfsfunktion, usw.*

In [34]:
### Training uses the Doc2Vec or Word2Vec model which simply embeds words according to how often they occur together (ignoring order afaik)
## Adapt vector size and epochs in training once data set is complete



def trainWord2Vec(data, vector_size=50, epochs=20, min_count=1, workers=4):
    """Train a Word2Vec embedding on the skill lists.

    Args:
        data: iterable of ``(job, skills)`` pairs, where ``skills`` is the
            string form of a Python list of skill names. The job label is
            ignored; this training does NOT include jobs.
        vector_size: dimensionality of the embedding (default 50, the value
            that was previously hard-coded). Different sizes noticeably
            change the resulting similarities.
        epochs: number of training epochs (default 20, as before).
        min_count: minimum frequency for a skill to enter the vocabulary
            (default 1, as before).
        workers: number of worker threads (default 4, as before).

    Returns:
        gensim.models.Word2Vec: the trained model.
    """
    # One "sentence" per record: the parsed list of skills
    sentences = [ast.literal_eval(skills) for _job, skills in data]

    # Named w2v_model so it does not shadow the `model` imported from LinkPred
    w2v_model = Word2Vec(vector_size=vector_size, min_count=min_count, workers=workers, epochs=epochs)
    w2v_model.build_vocab(sentences)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
    return w2v_model

def trainDoc2Vec(data, vector_size=100, window=5, epochs=20, min_count=1, workers=4):
    """Train a Doc2Vec model that tags every skill list with its job label.

    Args:
        data: iterable of ``(job, skills)`` pairs, where ``skills`` is the
            string form of a Python list of skill names and ``job`` becomes
            the single document tag.
        vector_size: dimensionality of the embedding (default 100, the value
            that was previously hard-coded).
        window: context window size (default 5, as before).
        epochs: number of training epochs (default 20, as before).
        min_count: minimum frequency for a skill to enter the vocabulary
            (default 1, as before).
        workers: number of worker threads (default 4, as before).

    Returns:
        gensim.models.Doc2Vec: the trained model.
    """
    # NOTE(review): if the job labels are plain ints, gensim presumably treats
    # them as direct indices into model.dv - confirm this is intended.
    tagged_data = [
        TaggedDocument(words=ast.literal_eval(skills), tags=[job])
        for job, skills in data
    ]

    # Named d2v_model so it does not shadow the `model` imported from LinkPred
    d2v_model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, workers=workers, epochs=epochs)
    d2v_model.build_vocab(tagged_data)
    d2v_model.train(tagged_data, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)
    return d2v_model

# helper method to calculate average embedding vector of a list of string skills, filters out skills unknown to the model
def averageOfSkills(model, input, axis = 0):
    """Average the embedding vectors of a list of skills.

    Skills that are unknown to the model are filtered out.

    Args:
        model: object exposing word vectors as ``model.wv`` (e.g. the
            Word2Vec / Doc2Vec models trained in this notebook).
        input: iterable of skill names (strings).
        axis: axis passed to ``np.average`` (default 0, i.e. element-wise
            mean over the skills).

    Returns:
        numpy.ndarray: the averaged vector, or a zero vector of length
        ``model.wv.vector_size`` when none of the skills is known.
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[skill] for skill in input if keyed_vectors.has_index_for(skill)]
    if not vectors:
        # Take the size from the model's declared dimensionality; looking up
        # wv[0] to measure it fails when the vocabulary is empty.
        return np.zeros(keyed_vectors.vector_size)
    return np.average(vectors, axis = axis)

In [35]:


def calcAvgEmbeddings(model, data):
    """Compute the average skill embedding of every record.

    Args:
        model: trained model, forwarded to ``averageOfSkills``.
        data: sequence of ``(job_title, skills)`` pairs, where ``skills`` is
            the string form of a Python list of skill names.

    Returns:
        tuple: ``(skillAverages, data_averaged)`` - the list of averaged
        vectors, and the same vectors paired with their job title.
    """
    # Separate the labels from the skill strings
    job_titles, skill_strings = zip(*data)

    # One averaged vector per job offering / person
    skillAverages = []
    for skill_string in skill_strings:
        parsed_skills = ast.literal_eval(skill_string)
        skillAverages.append(averageOfSkills(model, parsed_skills))

    # Re-attach the job titles
    data_averaged = list(zip(job_titles, skillAverages))
    return skillAverages, data_averaged

# This gives us a list of job offerings and average embeddings. 
# Could be used as input to a graph neural network or knowledge graph?
# Is used below to classify immediately

In [36]:
# simply suggest smallest distance to user's average ("Option 2") using word2Vec

def prepareOption2(data):
    """Train Word2Vec and build one average embedding per job title ("Option 2").

    Args:
        data: sequence of ``(job_title, skills)`` pairs as accepted by
            ``trainWord2Vec`` and ``calcAvgEmbeddings``.

    Returns:
        tuple: ``(job_averages, keys, values, model)`` - a dict mapping each
        job title to the mean of its records' averaged skill vectors, the
        dict's keys and values as parallel lists, and the trained model.
    """
    model = trainWord2Vec(data)
    _, titled_embeddings = calcAvgEmbeddings(model, data)

    # groupby only merges adjacent items, so order the records by job title first
    titled_embeddings = sorted(titled_embeddings, key=lambda pair: pair[0])

    # Mean embedding of everyone sharing the same job title
    job_averages = {}
    for title, members in itertools.groupby(titled_embeddings, key=operator.itemgetter(0)):
        member_vectors = tuple(vector for _title, vector in members)
        job_averages[title] = np.average(member_vectors, axis = 0)

    # Parallel lists for distance computations
    keys = list(job_averages.keys())
    values = list(job_averages.values())

    return job_averages, keys, values, model

# usage in next cells

In [41]:
# "Option 3" could be using the embedding learned by Doc2Vec and doing the same manual averaging as Option 2 for "learning" the correlation of job to skills


def prepareOption3(data):
    """Train Doc2Vec and build one average embedding per job title ("Option 3").

    The averaging is done by ``calcAvgEmbeddings``, i.e. on the word vectors
    (``model.wv``) learned by the Doc2Vec model.

    Args:
        data: sequence of ``(job_title, skills)`` pairs as accepted by
            ``trainDoc2Vec`` and ``calcAvgEmbeddings``.

    Returns:
        tuple: ``(job_averages, keys, values, model)`` - a dict mapping each
        job title to the mean of its records' averaged skill vectors, the
        dict's keys and values as parallel lists, and the trained model.
    """
    model = trainDoc2Vec(data)
    _, titled_embeddings = calcAvgEmbeddings(model, data)

    # groupby only merges adjacent items, so order the records by job title first
    titled_embeddings = sorted(titled_embeddings, key=lambda pair: pair[0])

    # Mean embedding of everyone sharing the same job title
    job_averages = {}
    for title, members in itertools.groupby(titled_embeddings, key=operator.itemgetter(0)):
        member_vectors = tuple(vector for _title, vector in members)
        job_averages[title] = np.average(member_vectors, axis = 0)

    # Parallel lists for distance computations
    keys = list(job_averages.keys())
    values = list(job_averages.values())

    return job_averages, keys, values, model

## *3. Training (Word2Vec, Doc2Vec)*

In [42]:
# 80% training, 20% testing
'''
Full Dataset length = 5030
TrainingSet length = 4024 (80%)
TestSet length = 1006 (20%)
'''

'\nFull Dataset length = 5030\nTrainingSet length = 4024 (80%)\nTestSet length = 1006 (20%)\n'

In [43]:
# Training with separate files for training and test data (loaded as CSV)

trainingdata = open_csv_training()
testingdata = open_csv_testdata()

# Plain Python lists of [cluster, skills-string] rows, as expected by the train/prepare functions
formatted_data = trainingdata.values.tolist()
formatted_test = testingdata.values.tolist()
# Split the test rows into labels and skill strings.
# NOTE(review): `jobs` is not used in the cells visible here; the labels are re-derived from formatted_test later.
jobs, skills = list(zip(*formatted_test))
# Parse every skill string into an actual list of skills (rebinds `skills`)
skills = [ast.literal_eval(skill) for skill in skills]

# Option 1: Doc2Vec
model_option1 = trainDoc2Vec(formatted_data)

# Option 2: Word2Vec
job_averages_option2, keys_option2, values_option2, model_option2 = prepareOption2(formatted_data)

# Option 3: Doc2Vec Embedding
# (prepareOption3 trains its own Doc2Vec model, independent of model_option1)
job_averages_option3, keys_option3, values_option3, model_option3 = prepareOption3(formatted_data)


## *4. Laden der Training-Ergebnisse von LinkPred, Knowledge Graph*

In [None]:
# `model` is the class/factory imported from the LinkPred module in the import cell.
# NOTE(review): the flags presumably load a stored model and its test data instead of retraining - confirm in LinkPred.
linkpred = model(load_model=True,save_model=False,load_test=True)
# NOTE(review): the argument 3 presumably is the number of top predictions per sample - confirm in LinkPred.
linkpred_out = linkpred.test(3)

In [None]:
# Add the relative directories ./Emgraph2/emgraph and ./Emgraph2 to the module search path.
sys.path.append('./Emgraph2/emgraph')
sys.path.append('./Emgraph2')
# Re-import KG_model (bound to `graph` in the import cell).
# NOTE(review): the reload presumably exists so the module is re-executed with the extended sys.path - confirm.
graph = importlib.reload(graph)

In [None]:
# this internally also prepares the new CSV format required by the graph model
# The model receives the same training/test files as the embedding models plus the ground-truth file; only one epoch is trained.
graph_model = graph.model(epochs=1, train_file=training_file, test_file = test_file, totalClusters= totalClusters, ground_truth_file=ground_truth_file)
graph_model.train()
# The result is later loaded into a DataFrame with the columns "Person" and "Top3 Predictions".
# NOTE(review): the argument 3 presumably is the number of predictions per person - confirm in KG_model.
graph_model_out = graph_model.test(3)

## *5. Testing aller Modelle (Word2Vec, Doc2Vec, LinkPred, Know. Graph)*

In [None]:
def _nearest_job_groups(model, keys, values, skill_list, topn=3):
    """Return the ``topn`` job groups whose average embedding is closest to the skills.

    Args:
        model: trained model used by ``averageOfSkills`` to embed the skills.
        keys: job group labels, parallel to ``values``.
        values: average embedding of every job group.
        skill_list: list of skill names to classify.
        topn: number of groups to return, best match first (default 3).

    Returns:
        list: the labels of the ``topn`` nearest groups.
    """
    query = averageOfSkills(model, skill_list)
    # cdist returns shape (1, len(values)); take the single row so the heap
    # compares plain floats instead of 1-element arrays.
    distances = cdist([query], values)[0]
    return [key for _distance, key in heapq.nsmallest(topn, zip(distances, keys))]


# Option 1: Doc2Vec - infer a document vector and ask for the most similar tags
predictions_option1 = []
for skills_example in skills:
    inferred_vector = model_option1.infer_vector(skills_example)
    similar_jobs = model_option1.dv.most_similar([inferred_vector], topn=3)
    predictions_option1.append([job for job, _similarity in similar_jobs])

# Option 2: Word2Vec - nearest job-group average
predictions_option2 = [
    _nearest_job_groups(model_option2, keys_option2, values_option2, skills_example)
    for skills_example in skills
]

# Option 3: Doc2Vec Embedding - nearest job-group average
predictions_option3 = [
    _nearest_job_groups(model_option3, keys_option3, values_option3, skills_example)
    for skills_example in skills
]

In [None]:
# Top-3 evaluation. In this notebook a sample counts as
#   - true positive (TP) when its ground-truth label is among the predicted groups,
#   - false positive (FP) otherwise.

def _count_hits(predictions, labels, verbose=False):
    """Count how often the label is contained in the corresponding prediction list.

    Args:
        predictions: one list of predicted labels per sample.
        labels: the ground-truth label of every sample, parallel to ``predictions``.
        verbose: when True, print every hit.

    Returns:
        tuple: ``(hits, misses)``.
    """
    hits = 0
    misses = 0
    for predicted_jobs, ground_truth in zip(predictions, labels):
        if ground_truth in predicted_jobs:
            if verbose:
                print("Ground truth " + str(ground_truth) + " in: " + str(predicted_jobs))
            hits += 1
        else:
            misses += 1
    return hits, misses


# job[0] is the label/correct job category
ground_truth_labels = [job[0] for job in formatted_test]

# Options 1-3 share the same counting; option 1 keeps its per-hit log output
# (set verbose=False to silence it).
tp_option1, fp_option1 = _count_hits(predictions_option1, ground_truth_labels, verbose=True)
tp_option2, fp_option2 = _count_hits(predictions_option2, ground_truth_labels)
tp_option3, fp_option3 = _count_hits(predictions_option3, ground_truth_labels)

# Option 4: Knowledge Graph - compared against the "tail" column of the ground-truth file
tp_option4 = 0
fp_option4 = 0
df2 = pd.read_csv(ground_truth_file)["tail"].values
preds = pd.DataFrame(graph_model_out, columns=["Person", "Top3 Predictions"])
preds = preds["Top3 Predictions"]
# Index-based on purpose: a ground-truth file shorter than the predictions raises instead of being silently truncated
for i in range(len(preds)):
    if df2[i] in preds[i]:
        tp_option4 += 1
    else:
        fp_option4 += 1

print("Option 1:")
print("True Positives (TP):", tp_option1)
print("False Positives (FP):", fp_option1)

print("\nOption 2:")
print("True Positives (TP):", tp_option2)
print("False Positives (FP):", fp_option2)

print("\nOption 3:")
print("True Positives (TP):", tp_option3)
print("False Positives (FP):", fp_option3)

print("\nOption 4:")
print("True Positives (TP):", tp_option4)
print("False Positives (FP):", fp_option4)

In [None]:
# Accuracy (ACC): the proportion of correct predictions out of all predictions made by the model.
# Precision or positive predictive value (PPV): the proportion of true positives out of all
# positive predictions made by the model.

def _print_metrics(heading, tp, fp, precision, accuracy):
    """Print the metric summary of one option."""
    print(heading)
    print("True Positives (TP):", tp)
    print("False Positives (FP):", fp)
    print("Precision (PPV):", precision)
    print("Accuracy (ACC):", accuracy)


# Precision (PPV): hits divided by hits plus misses
precision_option1 = tp_option1 / (tp_option1 + fp_option1)
precision_option2 = tp_option2 / (tp_option2 + fp_option2)
precision_option3 = tp_option3 / (tp_option3 + fp_option3)

# Accuracy (ACC): hits divided by the number of test samples
total_predictions = len(ground_truth_labels)
correct_predictions_option1 = tp_option1
correct_predictions_option2 = tp_option2
correct_predictions_option3 = tp_option3
accuracy_option1 = correct_predictions_option1 / total_predictions
accuracy_option2 = correct_predictions_option2 / total_predictions
accuracy_option3 = correct_predictions_option3 / total_predictions

_print_metrics("Option 1:", tp_option1, fp_option1, precision_option1, accuracy_option1)
_print_metrics("\nOption 2:", tp_option2, fp_option2, precision_option2, accuracy_option2)
_print_metrics("\nOption 3:", tp_option3, fp_option3, precision_option3, accuracy_option3)

In [None]:
# Visualisation of the results
import matplotlib.pyplot as plt

# One bar per option, labelled on the x-axis
options = ['Option 1', 'Option 2', 'Option 3']
accuracies = [accuracy_option1, accuracy_option2, accuracy_option3]

fig, ax = plt.subplots()
ax.bar(options, accuracies)
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy Comparison')
ax.set_ylim([0, 1])  # accuracy is a fraction, so fix the y-axis to [0, 1]
plt.show()

## *6. Weitere Beispielvorführung*

In [40]:
# Hand-written example skill set, classified with the three embedding-based options.
# NOTE(review): 'kostruktion' looks like a typo of 'konstruktion'; skills unknown to a model are skipped by averageOfSkills - confirm the intended spelling.
skills_example = ['cad', 'maschinenbau', 'kostruktion', 'fräsen', 'drehen', 'montieren']

# Option 1: Doc2Vec
# Infer a document vector for the skills and look up the three most similar document tags
infer_vector_option1 = model_option1.infer_vector(skills_example)
similar_jobs_option1 = model_option1.dv.most_similar([infer_vector_option1], topn=3)
predicted_jobs_option1 = [job for job, similarity in similar_jobs_option1]
print("Option 1 Prediction: Given the skillset, it should belong to one of the following three groups beginning with the best match:", predicted_jobs_option1)
print("")

# Option 2: Word2Vec
# Distance from the averaged skill vector to every group average; the three smallest distances win
avg_option2 = averageOfSkills(model_option2, skills_example)
distVec_option2 = cdist([avg_option2], values_option2)
topJobs_option2 = [k for dist, k in heapq.nsmallest(3, zip(distVec_option2.transpose(), keys_option2))]
print("Option 2 Prediction: Given the skillset, it should belong to one of the following three groups beginning with the best match:", topJobs_option2)
print("")

# Option 3: Doc2Vec Embedding
# Same nearest-average lookup as option 2, using the word vectors of the Doc2Vec model
avg_option3 = averageOfSkills(model_option3, skills_example)
distVec_option3 = cdist([avg_option3], values_option3)
topJobs_option3 = [k for dist, k in heapq.nsmallest(3, zip(distVec_option3.transpose(), keys_option3))]
print("Option 3 Prediction: Given the skillset, it should belong to one of the following three groups beginning with the best match:", topJobs_option3)
print("")

Option 1 Prediction: Given the skillset, it should belong to one of the following three groups beginning with the best match: [5, 1, 25]

Option 2 Prediction: Given the skillset, it should belong to one of the following three groups beginning with the best match: [10, 31, 7]

Option 3 Prediction: Given the skillset, it should belong to one of the following three groups beginning with the best match: [10, 7, 12]

