While labelling data, I noticed that there is often overlap between how the CIPD 7 dimensions are expressed in job ads. For example, a sentence like "You will be supported to grow in your career" could indicate "job design and nature of work" (because it is about career progression), or it could indicate "social support and cohesion" (because it suggests a supportive work environment). To investigate this further, I decided to try clustering the sentences in the job ads using a sentence embedding model, and then see whether the clusters corresponded to the CIPD 7 dimensions or whether there were other patterns that we could potentially use.

The approach here is:
- pull out the labelled spans
- extract the entire sentence (as captured with spaCy's `sent` attribute) in which the span occurs
- embed the sentence using a pre-trained sentence embedding model, `all-MiniLM-L6-v2`
- cluster the embeddings using kmeans
- reduce the embeddings to two dimensions using t-SNE
- visualise the embeddings, first with the original labels of the sentences and then with the kmeans clusters

At the moment I'm not really sure how useful this is or what conclusions we could draw. It might be more helpful to use BERTopic and try to get meaningful summaries of the clusters.

It's also just a very tiny sample of the data, which is another reason why it's hard to draw any conclusions from it.

In [None]:
import boto3
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from spacy.tokens import Span
import spacy
import srsly

from dap_job_quality import PROJECT_DIR, BUCKET_NAME, logger
import dap_job_quality.utils.prodigy_data_utils as pdu
from dap_job_quality.getters.data_getters import load_s3_jsonl

# models that we'll use:
# - spaCy's small English pipeline
# - the sentence-transformers model used to embed each sentence
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Widen text columns to 1000 characters so that full sentences are readable in
# dataframe previews. The fully-qualified option name is used because pandas
# only resolves abbreviated names by pattern matching, which can break if a
# similarly named option is added in a later release.
pd.set_option("display.max_colwidth", 1000)

In [None]:
# Local paths for the two labelled Prodigy exports.
# NOTE(review): these are relative paths, so they depend on the working
# directory the notebook is run from - confirm this is intended.
local_file1 = 'outputs/prodigy/labelled_data/20240119_ads_labelled_rosie_downloaded.jsonl'
local_file2 = 'outputs/prodigy/labelled_data/20240123_ads_labelled_rosie_downloaded.jsonl'

# (S3 key, local path) for each export; the value returned by the loader is
# not needed here, so it is discarded.
s3_to_local = [
    ('job_quality/prodigy/labelled_data/20240119_ads_labelled_rosie.jsonl', local_file1),
    ('job_quality/prodigy/labelled_data/20240123_ads_labelled_rosie.jsonl', local_file2),
]
for s3_key, local_path in s3_to_local:
    _ = load_s3_jsonl(BUCKET_NAME, s3_key, local_path)

# Pool the records returned by `read_accepted_lines` for both files into one list
all_records = []
for file in (local_file1, local_file2):
    all_records.extend(pdu.read_accepted_lines(file))
        
# Keep only the first record seen for each job id. Dicts preserve insertion
# order, so the deduplicated list keeps the order of first occurrence.
first_record_by_job_id = {}
for record in all_records:
    first_record_by_job_id.setdefault(record['meta']['job_id'], record)

all_records_deduplicated = list(first_record_by_job_id.values())
seen_job_ids = set(first_record_by_job_id)

In [None]:
# number of records dropped because their job_id had already been seen
len(all_records) - len(all_records_deduplicated)

In [None]:
# Build a mapping keyed by job id whose values are lists of entries; each entry
# is read further down via the keys 'span', 'sent', 'label' and 'text'.
training_data = pdu.get_spans_and_sentences(all_records_deduplicated)

In [None]:
# Flatten the {job_id: [entry, ...]} mapping into one row per labelled span
flat_data = [
    {
        "job_id": job_id,
        "labelled_span": entry["span"],
        "full_sentence": entry["sent"],
        "label": entry["label"],
        "text": entry["text"],
    }
    for job_id, entries in training_data.items()
    for entry in entries
]

labelled_spans_df = pd.DataFrame(flat_data)

labelled_spans_df.head()

In [None]:
# how many labelled spans there are per label
labelled_spans_df['label'].value_counts()

In [None]:
# Drop the 'none' label but deliberately keep the 'benefit' category: it should
# be quite different from the other categories, so it might shed light on the
# extent to which the other categories overlap.
#
# Rows and columns are selected in a single `.loc` call and explicitly copied so
# that `clustering_df` is an independent dataframe. Columns added to it later are
# then written without a SettingWithCopyWarning and never touch `labelled_spans_df`.
clustering_df = labelled_spans_df.loc[
    labelled_spans_df['label'] != 'none',
    ['label', 'full_sentence', 'labelled_span'],
].copy()
clustering_df.head()

In [None]:
# embed every full sentence, in the same row order as `clustering_df`
embeddings = model.encode(clustering_df['full_sentence'].tolist())

In [None]:
# Fixed number of clusters for now; an alternative heuristic that was considered
# is int(len(embeddings) ** 0.5).
num_clusters = 10

# Seed k-means so that cluster assignments are reproducible between runs (the
# t-SNE projection is seeded too), and set `n_init` explicitly because its
# default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
clusters = kmeans.fit_predict(embeddings)

In [None]:
# assign the cluster ids back into the dataframe; `clusters` has one entry per
# embedded sentence, in the same row order as `clustering_df`
clustering_df['cluster'] = clusters

In [None]:
# reduce to 2d for visualisation; the random seed is fixed so that the layout
# is the same each time the notebook is run
tsne = TSNE(n_components=2, random_state=0)
embeddings_2d = tsne.fit_transform(embeddings)

In [None]:
# Scatter the 2d embeddings, one colour per original annotation label
plt.figure(figsize=(10, 8))
for label in clustering_df['label'].unique():
    # boolean mask selecting the rows that carry this label
    has_label = clustering_df['label'] == label
    xs = embeddings_2d[has_label, 0]
    ys = embeddings_2d[has_label, 1]
    plt.scatter(xs, ys, label=label)
plt.legend()
plt.title("Clusters with Original Labels")
plt.show()

In [None]:
# Scatter the 2d embeddings, one colour per k-means cluster
plt.figure(figsize=(10, 8))
for cluster_id in range(num_clusters):
    # boolean mask selecting the points assigned to this cluster
    in_cluster = clusters == cluster_id
    xs = embeddings_2d[in_cluster, 0]
    ys = embeddings_2d[in_cluster, 1]
    plt.scatter(xs, ys, label=f"Cluster {cluster_id}")
plt.legend()
plt.title("Clusters with Cluster Labels")
plt.show()

In [None]:
# how many sentences ended up in each k-means cluster
clustering_df['cluster'].value_counts()