# Harder dimensions of job quality: "job design and nature of work", "social support and cohesion", "voice and representation"

This notebook is in 2 parts.

Part 1 explores data that was labelled manually in Prodigy and pulls out common single words, bigrams and trigrams.

Part 2 runs a very basic keyword search to see how prevalent these n-grams are in a sample of 100,000 job ads from OJO.

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
import pandas as pd
import re
import spacy
from wordcloud import WordCloud

# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced in the cells visible in this notebook;
# presumably it is kept for the "spaCy collocations" next step - TODO confirm.
nlp = spacy.load("en_core_web_sm")

# Project-level constants (paths, bucket name, logger) and data-loading helpers
from dap_job_quality import PROJECT_DIR, BUCKET_NAME, logger
from dap_job_quality.getters.ojo_getters import get_ojo_sample
from dap_job_quality.getters.data_getters import load_s3_jsonl
import dap_job_quality.utils.prodigy_data_utils as pdu

# Show up to 1000 characters per cell so long spans and sentences are not truncated
pd.set_option("max_colwidth", 1000)

In [None]:
# Functions for getting the data
# def load_jsonl(file_path):
#     data = []
#     with open(file_path, "r", encoding="utf-8-sig") as file:
#         for line_number, line in enumerate(file, 1):
#             line = line.strip()  # Remove leading/trailing whitespace
#             if not line:
#                 # Skip empty lines
#                 continue
#             try:
#                 data.append(srsly.json_loads(line))
#             except ValueError as e:  # srsly raises ValueError for JSON errors
#                 print(f"Error parsing JSON on line {line_number}: {e}")
#                 # Optionally, continue to next line or handle error differently
#     return data

# def download_jsonl_from_s3(bucket_name, s3_file_name, local_file):
#     """
#     Download a file from an S3 bucket

#     :param bucket_name: Bucket to download from
#     :param s3_file_name: S3 object name
#     :param local_file: File path to store the downloaded file
#     """
#     # Create an S3 client
#     s3 = boto3.client("s3")

#     output_file = None

#     # Make sure that the directory where you want to store the file exists
#     Path(local_file).parent.mkdir(parents=True, exist_ok=True)

#     try:
#         # Download the file
#         s3.download_file(bucket_name, s3_file_name, local_file)
#         logging.info(
#             f"File {s3_file_name} downloaded from {bucket_name} to {local_file}"
#         )

#         output_file = load_jsonl(local_file)
#     except FileNotFoundError:
#         print(f"The file {s3_file_name} was not found in {bucket_name}")
#     except NoCredentialsError:
#         print("Credentials not available")

#     return output_file
    

# Other helper functions
def tokenize(text, n=2):
    """Turn a text into n-grams of lower-cased, alphabetic, non-stopword tokens.

    Args:
        text: Raw string to tokenise.
        n: Size of the n-grams to build (default 2, i.e. bigrams).

    Returns:
        List of n-tuples of tokens, in order of appearance.
    """
    raw_tokens = nltk.word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = []
    for raw_token in raw_tokens:
        # Non-alphabetic tokens (punctuation, numbers, mixed) are dropped first
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        # The stopword check runs on the lower-cased form
        if lowered not in english_stopwords:
            kept_tokens.append(lowered)

    return list(ngrams(kept_tokens, n))

def most_common_ngrams(df, n=2, label_col='label', text_col='labelled_span', n_most_common=10):
    """Count n-grams per label and keep the most frequent ones.

    Args:
        df: DataFrame holding one labelled text per row.
        n: Size of the n-grams passed on to ``tokenize``.
        label_col: Column containing the category of each row.
        text_col: Column containing the text to tokenise.
        n_most_common: How many n-grams to keep per category.

    Returns:
        Dict mapping each distinct value of ``label_col`` to a list of
        ``(ngram, count)`` pairs, most frequent first.
    """
    top_ngrams_by_label = {}
    for label in df[label_col].unique():
        label_texts = df.loc[df[label_col] == label, text_col]
        counts = Counter()
        for text in label_texts:
            counts.update(tokenize(text, n))
        top_ngrams_by_label[label] = counts.most_common(n_most_common)
    return top_ngrams_by_label

def find_phrase_and_sentence(text, phrases):
    """Check whether any phrase occurs in a text and return the sentence around the first hit.

    Phrases are tried in the order given and matching is case-insensitive.
    The "sentence" is the run of characters around the phrase that contains no
    full stop, plus the closing full stop when there is one. It can therefore
    span line breaks and keeps any leading whitespace.

    Args:
        text: Text to search. Non-string values (e.g. NaN for a missing
            description) are treated as containing no phrase.
        phrases: Iterable of phrases to look for. They are compared against the
            lower-cased text, so they must be lower-case to match.

    Returns:
        Tuple ``(found, sentence)``: ``(True, <sentence>)`` for the first phrase
        that is present, otherwise ``(False, "")``.
    """
    if not isinstance(text, str):
        return False, ""

    # Lower-case once rather than once per phrase
    lowered_text = text.lower()
    for phrase in phrases:
        if phrase in lowered_text:  # Check if the phrase is in the text
            # The closing full stop is optional so that a phrase in a final,
            # unterminated sentence is still reported instead of being missed.
            sentence = re.search(r'([^.]*?' + re.escape(phrase) + r'[^.]*\.?)', text, re.IGNORECASE)
            if sentence:
                return True, sentence.group()
    return False, ""

## Part 1: labelled data

Load a sample of ~60 job ads labelled for "voice and representation", "social support and cohesion" and "job design and nature of work":

In [None]:
# Local destinations for the two labelled exports
local_file1 = PROJECT_DIR / 'dap_job_quality/pipeline/prodigy/labelled_data/20240119_ads_labelled_rosie_downloaded.jsonl'
local_file2 = PROJECT_DIR / 'dap_job_quality/pipeline/prodigy/labelled_data/20240123_ads_labelled_rosie_downloaded.jsonl'

# (S3 key, local path) pairs handed to load_s3_jsonl; its return value is discarded
s3_to_local = [
    ('job_quality/prodigy/labelled_data/20240119_ads_labelled_rosie.jsonl', local_file1),
    ('job_quality/prodigy/labelled_data/20240123_ads_labelled_rosie.jsonl', local_file2),
]
for s3_key, local_path in s3_to_local:
    _ = load_s3_jsonl(BUCKET_NAME, s3_key, local_path)

In [None]:
# Gather the records returned by pdu.read_accepted_lines for both files, in file order
all_records = [
    record
    for file in [local_file1, local_file2]
    for record in pdu.read_accepted_lines(file)
]

# Keep only the first record seen for each job_id
seen_job_ids = set()
all_records_deduplicated = []
for item in all_records:
    job_id = item['meta']['job_id']
    if job_id in seen_job_ids:
        continue
    seen_job_ids.add(job_id)
    all_records_deduplicated.append(item)

training_data = pdu.get_spans_and_sentences(all_records_deduplicated)

In [None]:
# One row per labelled span: training_data maps each job_id to a list of span entries
flat_data = [
    {
        "job_id": job_id,
        "labelled_span": entry["span"],
        "full_sentence": entry["sent"],
        "label": entry["label"],
        "text": entry["text"],
    }
    for job_id, entries in training_data.items()
    for entry in entries
]

labelled_spans_df = pd.DataFrame(flat_data)

labelled_spans_df.head()

In [None]:
# Number of labelled spans per label
spans_per_label = labelled_spans_df['label'].value_counts()
spans_per_label

In [None]:
# Number of distinct job ads represented in the labelled spans
# (dropna=False so that a missing job_id is counted once, as len(unique()) would)
labelled_spans_df['job_id'].nunique(dropna=False)

Check the distribution of labels in the dataset:

In [None]:
# Bar plot for the distribution of labels ('none' and 'benefit' spans are left out)
hidden_labels = ['none', 'benefit']
shown_rows = ~labelled_spans_df['label'].isin(hidden_labels)
plotted_label_counts = labelled_spans_df.loc[shown_rows, 'label'].value_counts()

plt.figure(figsize=(10, 6))
plotted_label_counts.plot(kind='bar', color='skyblue')
plt.title('Distribution of Labels')
plt.xlabel('Labels')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

Inspect the sentences that occur under each label:

In [None]:
# Spans labelled `other_benefits`
is_other_benefits = labelled_spans_df['label'].eq('other_benefits')
other_benefits = labelled_spans_df.loc[is_other_benefits]
other_benefits.loc[:, ['labelled_span']]

In [None]:
# Spans labelled `6_voice_representation`
is_voice_representation = labelled_spans_df['label'].eq('6_voice_representation')
voice_representation = labelled_spans_df.loc[is_voice_representation]
voice_representation.loc[:, ['labelled_span']]
# Observation: a regex for "equal opportunities" could have been used instead

In [None]:
# Spans labelled `5_social_support_cohesion`
is_social_support = labelled_spans_df['label'].eq('5_social_support_cohesion')
social_support = labelled_spans_df.loc[is_social_support]
social_support.loc[:, ['labelled_span']]

In [None]:
# Wordcloud for each label
# The benefit-related labels and 'none' are left out of the figure
label_categories = labelled_spans_df['label'].unique()
label_categories = label_categories[(label_categories != 'benefit') & (label_categories != 'other_benefits') & (label_categories != 'none')]

# Create a subplot for each wordcloud in a 2x2 configuration
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# Flatten the axes array for easy indexing
axes = axes.flatten()

# zip() stops at the shorter input, so at most len(axes) labels are drawn;
# any labels beyond the four panels are not shown.
for ax, label in zip(axes, label_categories):
    text = ' '.join(labelled_spans_df[labelled_spans_df['label'] == label]['labelled_span'].tolist())
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          min_font_size=10).generate(text)

    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(label)

# Hide the panels that received no wordcloud. Slicing by the number of labels
# (rather than reusing the loop index) also works when no labels are left,
# where the loop variable would never have been defined.
for unused_ax in axes[len(label_categories):]:
    unused_ax.axis("off")

plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# Make sure the output folder exists so savefig does not fail on a fresh checkout
# (assumes PROJECT_DIR is a pathlib.Path, as its use with `/` suggests)
wordcloud_path = PROJECT_DIR / 'outputs/figures/wordclouds.png'
wordcloud_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(wordcloud_path)
plt.show()

In [None]:
# Most common unigrams, bigrams and trigrams per label
common_words, common_bigrams, common_trigrams = (
    most_common_ngrams(labelled_spans_df, ngram_size) for ngram_size in (1, 2, 3)
)

In [None]:
# Print the top n-grams for every label except 'benefit'
for category, top_words in common_words.items():
    if category == 'benefit':
        continue
    print(f"Category: {category}")
    print("Most common words:", top_words)
    print("Most common bigrams:", common_bigrams[category])
    print("Most common trigrams:", common_trigrams[category])
    print("\n")


## Part 2: relate insights from labelled data to OJO sample

Load in a sample of 100,000 OJO ads and perform basic keyword searches.

In [None]:
# Load the OJO sample via the project getter (dap_job_quality.getters.ojo_getters.get_ojo_sample)
unlabelled_data = get_ojo_sample()
# Preview the first rows
unlabelled_data.head()

### Keyword search

We build a dict of search terms in which each key is a CIPD dimension or subdimension and each value is the list of exact phrases to search for. The phrases are pruned versions of the bigrams/trigrams found above.

In [None]:
# Search phrases per dimension, all written in lower case
phrases = {
    "job design and nature of work": [
        'career progression',
        'career prospects',
        'career development',
        'progression opportunities',
        'ongoing training',
        "opportunities for progression",
        'learning and development',
    ],
    "social support and cohesion": [
        "relaxed environment",
        "friendly working environment",
        "ongoing support",
        "great culture",
    ],
    # reward/recognition seems a little trickier, so I decided to see how many ads contained these phrases specifically
    "reward": [
        "making a difference",
        "make a difference",
        "sense of purpose",
    ],
}

In [None]:
# Ads whose description contains any of the "reward" phrases.
# - re.escape: phrases are matched literally, not interpreted as regex syntax
# - case=False: match regardless of capitalisation, consistent with find_phrase_and_sentence
# - na=False: ads with a missing description count as non-matches instead of breaking the mask
purpose_pattern = '|'.join(re.escape(phrase) for phrase in phrases["reward"])
purpose_results = unlabelled_data[unlabelled_data['description'].str.contains(purpose_pattern, case=False, na=False)]
purpose_results.head()

In [None]:
# Ads whose description contains any of the "job design and nature of work" phrases.
# - re.escape: phrases are matched literally, not interpreted as regex syntax
# - case=False: match regardless of capitalisation, consistent with find_phrase_and_sentence
# - na=False: ads with a missing description count as non-matches instead of breaking the mask
jdnw_pattern = '|'.join(re.escape(phrase) for phrase in phrases["job design and nature of work"])
search_results = unlabelled_data[unlabelled_data['description'].str.contains(jdnw_pattern, case=False, na=False)]
search_results.head()

In [None]:
# Ads whose description contains any of the "social support and cohesion" phrases.
# - re.escape: phrases are matched literally, not interpreted as regex syntax
# - case=False: match regardless of capitalisation, consistent with find_phrase_and_sentence
# - na=False: ads with a missing description count as non-matches instead of breaking the mask
social_pattern = '|'.join(re.escape(phrase) for phrase in phrases["social support and cohesion"])
social_results = unlabelled_data[unlabelled_data['description'].str.contains(social_pattern, case=False, na=False)]
len(social_results)

In [None]:
# Share of ads in the sample that matched the "job design and nature of work" search
matched_share = len(search_results) / len(unlabelled_data)
print(f"{matched_share} of the job descriptions contain at least one of the career progression phrases")

In [None]:
# Add two columns per phrase group: (a) job design and nature of work; (b) purpose-related phrases.
# For each group, the "*_phrase" column holds the True/False flag returned by
# find_phrase_and_sentence and the "*_sentence" column holds the matched sentence.
column_groups = (
    (["jdnw_phrase", "jdnw_sentence"], "job design and nature of work"),
    (["purpose_phrase", "purpose_sentence"], "reward"),
)
for new_columns, dimension in column_groups:
    unlabelled_data[new_columns] = unlabelled_data.apply(
        # Bind the phrase list as a default argument so each pass uses its own group
        lambda row, terms=phrases[dimension]: find_phrase_and_sentence(row["description"], terms),
        axis=1,
        result_type='expand',
    )

In [None]:
# Count distinct descriptions that mention the phrase, ignoring capitalisation
target_phrase = 'learning and development'
unique_descriptions = unlabelled_data['description'].unique()
n_mentions = sum(target_phrase in desc.lower() for desc in unique_descriptions)
print(f"Number of exact mentions of 'learning and development': {n_mentions}")

## spaCy collocations

... a possible next step