In [None]:
import spacy
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

# Load the medium English spaCy pipeline for the embedding-based clustering
# approach sketched in the commented-out lines below.
# NOTE(review): the vector-based code is disabled and `nlp` is rebound to
# "en_core_web_sm" further down this file, so this load appears unused by the
# active code visible here — confirm before removing.
nlp = spacy.load("en_core_web_md")

# Disabled alternative, step 1: embed each skill with its spaCy document vector.
#skill_vectors = [nlp(skill).vector for skill in skills_list]

# Disabled alternative, step 2: Ward-linkage hierarchical clustering on those vectors.
#Z = linkage(skill_vectors, method='ward')

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Vectorize the skills with TF-IDF (default TfidfVectorizer settings).
# `skills_list` is defined outside this section — presumably a list of skill
# strings; TODO confirm.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(skills_list)

# Build the hierarchical-clustering linkage matrix using Ward's method.
# The TF-IDF matrix is converted to a dense array before being handed to
# linkage(); NOTE(review): this can be memory-hungry for a large vocabulary.
Z = linkage(X.toarray(), 'ward')

# Plot the dendrogram, one leaf per skill, with leaf labels rotated 90 degrees
# so that long skill names do not overlap.
plt.figure(figsize=(10, 7))
dendrogram(Z, labels=skills_list, leaf_rotation=90)
plt.title("Skills Hierarchical Clustering")
plt.show()

# From the dendrogram, decide the number of clusters or the level at which to cut the tree
# Then assign cluster names based on common themes in each cluster

from scipy.cluster.hierarchy import fcluster

# Cut the linkage tree Z into flat clusters. With criterion='distance' the
# second argument is a distance threshold, not a cluster count.
# NOTE(review): 1.5 looks hand-picked from the dendrogram above — confirm.
distance_threshold = 1.5
clusters = fcluster(Z, distance_threshold, criterion='distance')
# Bare expression: a notebook only displays it when it is the last statement
# of a cell; anywhere else it has no effect.
clusters
# Disabled alternative: ask for an exact number of clusters instead of a threshold.
#k = 10
#clusters = fcluster(Z, k, criterion='maxclust')

from collections import Counter

# Inputs used by this section:
#   clusters:    flat cluster label for each skill (same order as skills_list)
#   skills_list: the skill strings that were clustered

# Group the skills by cluster label. One empty bucket is created per label
# 1..n_clusters before the skills are distributed into them.
n_clusters = len(set(clusters))
clustered_titles = {label: [] for label in range(1, n_clusters + 1)}
for skill, label in zip(skills_list, clusters):
    clustered_titles[label].append(skill)

# Derive a short name for each cluster from its most frequent words.
cluster_names = {}
for label, members in clustered_titles.items():
    # Lower-case, whitespace-split word frequencies over all skills in the cluster
    word_counts = Counter(" ".join(members).lower().split())
    # Drop a few filler tokens so they cannot end up in the cluster name
    for stop_word in ('and', 'the', '-'):
        word_counts.pop(stop_word, None)
    # The three most common remaining words form the name
    top_words = [word for word, _count in word_counts.most_common(3)]
    cluster_names[label] = ' '.join(top_words)
clustered_titles
# cluster_names now maps each cluster label to a name built from its most common words
########################
# Collect every distinct tagged skill across the dataframe and, along the way,
# report the rows for which no skill was tagged so they can be inspected.
xs = set()
for i, each in enumerate(df_c_cleaned_2["skills_tagged"]):
    if len(each) == 0:
        # `i` is a positional counter from enumerate(), so the row is fetched
        # by position explicitly rather than relying on the index labels.
        row = df_c_cleaned_2.iloc[i:i + 1]
        print(i)
        print(row["title"].unique())
        print(row["description"].unique())
        # Blank separator between reported rows (was the literal text "/n")
        print("\n")
    xs.update(each)

In [None]:
# train a spacy model to identify skills 
# create entities 
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans

# Small English pipeline; its vocab backs the matcher and its tokenizer
# builds the patterns below.
nlp = spacy.load("en_core_web_sm")
# Match on the LOWER token attribute so that letter case is ignored.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Tokenize each skill into a Doc and register all of them as phrase patterns
# under the single rule name "Skills".
patterns = list(map(nlp.make_doc, skills_list))
matcher.add("Skills", patterns)


def get_entities(TRAIN_DATA):
    """Print every entity annotation contained in ``TRAIN_DATA``.

    Args:
        TRAIN_DATA: Iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is a dict that may hold an ``'entities'`` list.

    Returns:
        None. Each entity is written to stdout, one per line.
    """
    for _, annotations in TRAIN_DATA:
        # A missing or None 'entities' entry means "no entities" rather than
        # a TypeError from iterating over None.
        for ent in annotations.get('entities') or ():
            print(ent)

# Training examples in spaCy's (text, {"entities": [(start, end, label), ...]}) form
TRAIN_DATA = []

# Auto-annotate every job description with the phrase matcher
for job_description in job_descriptions:
    # Process the job description to create a spaCy Doc
    doc = nlp(job_description)
    # Find every occurrence of a known skill phrase
    matches = matcher(doc)
    # Wrap each match in a Span carrying the single entity label "SKILLS"
    spans = [Span(doc, start, end, label="SKILLS") for _match_id, start, end in matches]
    # Overlapping spans are not valid entity annotations; filter_spans removes the overlaps
    filtered_spans = filter_spans(spans)
    # Store character offsets plus the label *string*. Storing the Span
    # object itself here would make each skill's surface text act as its own
    # entity label downstream instead of the shared "SKILLS" label.
    entities = [(span.start_char, span.end_char, span.label_) for span in filtered_spans]
    TRAIN_DATA.append((job_description, {"entities": entities}))

#print(TRAIN_DATA)





# Fine-tune the NER component of a pretrained pipeline on TRAIN_DATA.

# When True, nlp.begin_training() is called inside the training context below;
# it is False here, so the pretrained weights are updated as they are.
starting_fresh = False
# Load a pre-existing spaCy model
nlp = spacy.load('en_core_web_sm')  # for example
import random
from spacy.util import minibatch
# Get the Named Entity Recognizer component in the pipeline
ner = nlp.get_pipe('ner')
from spacy.training import Example
from pathlib import Path

# Register every label that occurs in the annotations with the NER component.
# str() coerces the third tuple element so add_label() always receives a string.
for _, annotations in TRAIN_DATA:
    for start,end,label in annotations.get('entities'):
        #print(label)
        ner.add_label(str(label))


# Every pipeline component except 'ner' is disabled while training
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']


# Training loop: 5 passes over the data, in mini-batches of 2 examples.
# NOTE(review): disable_pipes()/begin_training() are the older spaCy names
# (select_pipes()/initialize() in v3) — confirm against the installed version.
# NOTE(review): no optimizer is passed to nlp.update(); presumably spaCy falls
# back to an internally created one — confirm this is intended.
with nlp.disable_pipes(*unaffected_pipes):
    if starting_fresh:
        nlp.begin_training()

    for itn in range(5):
        # Shuffles TRAIN_DATA in place, so its order changes for later code too
        random.shuffle(TRAIN_DATA)
        # Loss accumulator, reset at the start of every pass
        losses = {}

        # Batch up the examples using spaCy's minibatch
        for batch in minibatch(TRAIN_DATA, size=2):
            examples = []
            for text, annotations in batch:
                # Tokenize only (no pipeline components are run) to get the Doc
                doc = nlp.make_doc(text)
                # Pair the Doc with its gold annotations
                example = Example.from_dict(doc, annotations)
                examples.append(example)

            # Update the model
            nlp.update(
                examples,
                drop=0.5,  # Dropout - make it harder to memorize data
                losses=losses
            )
        # Accumulated losses for this pass
        print(losses)

# Saving the trained pipeline is currently disabled: the code below sits in a
# bare string literal and never runs.
# NOTE(review): later code in this file loads a model from this same path, so
# a previously saved model must already exist there — confirm.
'''
from pathlib import Path
output_dir = Path('/Users/nyzy/nitzmali/job_transition_pathway/models/skills_tag_spacy_nlp_model')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
'''


# Split your TRAIN_DATA into train, validate and test sets
def train_test_val_split(data, test_size=0.2, val_size=0.25, random_state=42):
    """Split ``data`` into train, validation and test subsets.

    Both fractions are relative to the full dataset: the test set is split
    off first, then the validation fraction is rescaled so that it still
    represents ``val_size`` of the original data.

    Args:
        data: Sequence of samples to split.
        test_size: Fraction of ``data`` held out for testing; must satisfy
            ``0 < test_size < 1``.
        val_size: Fraction of ``data`` used for validation; must be positive
            and leave room for training data (``test_size + val_size < 1``).
        random_state: Seed passed to both underlying splits.

    Returns:
        Tuple ``(train_data, val_data, test_data)``.

    Raises:
        ValueError: If the fractions are out of range or leave no data for
            the training set.
    """
    # Validate up front: test_size == 1 would otherwise divide by zero below,
    # and other bad fractions would only fail later with a less direct message.
    if not 0 < test_size < 1:
        raise ValueError(f"test_size must be in the open interval (0, 1), got {test_size!r}")
    if val_size <= 0 or test_size + val_size >= 1:
        raise ValueError(
            "val_size must be positive and test_size + val_size must be < 1, "
            f"got test_size={test_size!r}, val_size={val_size!r}"
        )

    # Imported locally so the function does not depend on an import made
    # somewhere else in the notebook.
    from sklearn.model_selection import train_test_split

    # Validation fraction expressed relative to what remains after the test split
    val_size_adjusted = val_size / (1 - test_size)
    # Split off test set from available data
    train_val_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    # Split remaining data into training and validation sets
    train_data, val_data = train_test_split(
        train_val_data, test_size=val_size_adjusted, random_state=random_state
    )
    return train_data, val_data, test_data
# Split TRAIN_DATA into train, validation and test sets (60% / 25% / 15% of
# the data with these fractions). The split is seeded, so one call suffices.
# NOTE(review): the training loop earlier in this file iterates over all of
# TRAIN_DATA, so these validation/test samples may already have been seen by
# the model — confirm whether the split should happen before training.
train_data, val_data, test_data = train_test_val_split(TRAIN_DATA, test_size=0.2, val_size=0.25)


#########################################################################################
# Load the model you want to evaluate
from spacy import displacy
from spacy.scorer import Scorer  # Scorer is used below and needs an explicit import
nlp = spacy.load('/Users/nyzy/nitzmali/job_transition_pathway/models/skills_tag_spacy_nlp_model')  # replace with your model name

# Build one Example per validation sample: the Doc produced by running the
# loaded pipeline on the text, paired with that sample's annotations.
examples = [Example.from_dict(nlp(input_), annots) for input_, annots in val_data]


# Use the Scorer to score the examples
scorer = Scorer(nlp)
scores = scorer.score(examples)


# Entity-level precision / recall / F-score
precision = scores['ents_p']
recall = scores['ents_r']
f_score = scores['ents_f']

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f_score}")

# Bare expression: shows the first Example only when it ends a notebook cell
examples[0]


# Visually inspect the model's predictions on two validation samples
# (positions 3 and 4 of val_data). Assumes 'nlp' is the loaded model.
for text, annots in val_data[3:5]:  # Let's use val_data as an example
    doc = nlp(text)  # Process the text to predict entities

    # The bare string literal below is disabled debugging code (a textual
    # comparison of predicted vs. annotated entities); it never executes.
    '''
    print("Predictions by model:")
    for ent in doc.ents:
        print("Predictions")
        #print(ent.text, ent.start_char, ent.end_char, ent.label_)

    # Now print the correct data for comparison
    print("\nCorrect labels:")
    for start, end, label in annots['entities']:
        print("Actual")
        #print(text[start:end], start, end, label)
    '''
    # Render the predicted entities with displaCy's entity visualizer;
    # jupyter=True is passed so the output is shown in the notebook.
    displacy.render(doc, style="ent", jupyter=True)

    # Adding a separation for readability between different examples
    print("\n" + "-" * 50 + "\n")


