# Harder dimensions of job quality: "job design and nature of work", "social support and cohesion", "voice and representation"

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import nltk
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.util import ngrams
import pandas as pd
from spacy.tokens import Span
import spacy
import srsly
from wordcloud import WordCloud

nlp = spacy.load("en_core_web_sm")

from dap_job_quality import PROJECT_DIR

pd.set_option("max_colwidth", 1000)

In [None]:
# Define helper functions
def tokenize(text, n=2):
    """Turn a piece of text into a list of word n-grams.

    The text is split with ``nltk.word_tokenize``; tokens that are not purely
    alphabetic are dropped, the rest are lower-cased, and English stopwords
    are removed before the n-grams are built.

    Args:
        text: The string to tokenize.
        n: Size of the n-grams to build (2 = bigrams).

    Returns:
        A list of n-grams built from the surviving tokens, in order of
        appearance.
    """
    raw_tokens = nltk.word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for raw in raw_tokens:
        # Skip punctuation, numbers and anything else that is not a plain word
        if not raw.isalpha():
            continue
        lowered = raw.lower()
        # Stopwords are matched after lower-casing
        if lowered in english_stopwords:
            continue
        kept.append(lowered)

    return list(ngrams(kept, n))

def most_common_ngrams(df, n=2, label_col='label', text_col='labelled_span', n_most_common=10):
    """Count the most frequent n-grams for every category in a dataframe.

    Args:
        df: DataFrame holding one text per row together with its category.
        n: Size of the n-grams passed on to ``tokenize``.
        label_col: Name of the column containing the category of each row.
        text_col: Name of the column containing the text to tokenize.
        n_most_common: How many of the top n-grams to keep per category.

    Returns:
        A dict mapping each distinct value of ``label_col`` to a list of
        ``(ngram, count)`` pairs, most frequent first.
    """
    top_by_category = {}
    for category in df[label_col].unique():
        # All texts that carry this category
        texts = df.loc[df[label_col] == category, text_col]
        counts = Counter()
        for text in texts:
            counts.update(tokenize(text, n))
        top_by_category[category] = counts.most_common(n_most_common)
    return top_by_category

Point to a sample of ~60 job ads labelled for "voice and representation", "social support and cohesion" and "job design and nature of work" (the file is read in the cells below):

In [None]:
# Location of the labelled job ads (a JSONL file), built relative to PROJECT_DIR
file = PROJECT_DIR / 'dap_job_quality/pipeline/prodigy/labelled_data/20240119_ads_labelled_rosie.jsonl'

Inspect the data as a dataframe first of all, to see what fields it contains:

In [None]:
# Read every record of the JSONL file into a list
data = list(srsly.read_jsonl(file))

# Tabulate the records so that each field becomes a column, then preview the first rows
df = pd.DataFrame(data)
df.head()

Read the data in and find the labelled spans:

In [None]:
# Keep only the records whose annotation was accepted
records = [line for line in srsly.read_jsonl(file) if line["answer"] == "accept"]

# Maps job_id -> list of {"sent", "label", "text"} dicts, one per labelled span.
# NOTE(review): a job_id that occurs in more than one accepted record keeps only
# the spans of the last record - confirm job ids are unique in this file.
training_data = {}

for record in records:
    # convert each text to a spacy document so token offsets can be mapped back to text
    doc = nlp(record['text'])
    spans_parsed = []
    # An accepted record may carry no "spans" key (nothing was highlighted);
    # treat that as "no labelled spans" instead of failing with a KeyError.
    for span in record.get("spans", []):
        # "token_end" is used as an inclusive index here, hence the + 1 for
        # spaCy's exclusive end.
        # NOTE(review): this assumes `nlp` tokenizes the text the same way as
        # the tokenizer that produced the token offsets - confirm.
        labelled_text = Span(
            doc,
            span["token_start"],
            span["token_end"] + 1,
            span["label"],
        ).text
        spans_parsed.append({
            "sent": labelled_text,
            "label": span["label"],
            "text": record['text'],
        })
    training_data[record["meta"]["job_id"]] = spans_parsed

Create a dataframe containing the parsed, labelled spans:

In [None]:
# Flatten the {job_id: [span, ...]} mapping into one row per labelled span
flat_data = [
    {
        "job_id": job_id,
        "labelled_span": entry["sent"],
        "label": entry["label"],
        "text": entry["text"],
    }
    for job_id, entries in training_data.items()
    for entry in entries
]

labelled_spans_df = pd.DataFrame(flat_data)

# Preview the first rows
labelled_spans_df.head()

In [None]:
# Save the labelled spans as 'test_data.csv' (relative path, so it lands in the
# current working directory); the DataFrame index is written too, as no `index` argument is passed
labelled_spans_df.to_csv('test_data.csv')

Check the distribution of labels in the dataset:

In [None]:
# Bar chart of how many labelled spans fall under each label
label_counts = labelled_spans_df['label'].value_counts()

plt.figure(figsize=(10, 6))
ax = label_counts.plot.bar(color='skyblue')
ax.set_title('Distribution of Labels')
ax.set_xlabel('Labels')
ax.set_ylabel('Frequency')
# Tilt the label names so they stay readable
plt.xticks(rotation=45)
plt.show()

Inspect the sentences that occur under each label:

In [None]:
# Show the spans that were labelled `other_benefits`
is_other_benefits = labelled_spans_df['label'] == 'other_benefits'
other_benefits = labelled_spans_df[is_other_benefits]
other_benefits[['labelled_span']]

In [None]:
# Show the spans that were labelled `6_voice_representation`
is_voice_representation = labelled_spans_df['label'] == '6_voice_representation'
voice_representation = labelled_spans_df[is_voice_representation]
voice_representation[['labelled_span']]
# could have just regexed "equal opportunities"

In [None]:
# Show the spans that were labelled `5_social_support_cohesion`
is_social_support = labelled_spans_df['label'] == '5_social_support_cohesion'
social_support = labelled_spans_df[is_social_support]
social_support[['labelled_span']]

In [None]:
# Wordcloud for each label, leaving out the 'benefit' and 'other_benefits' labels
label_categories = labelled_spans_df['label'].unique()
label_categories = label_categories[(label_categories != 'benefit') & (label_categories != 'other_benefits')]

# Create one subplot per remaining label instead of assuming there are exactly three.
# `squeeze=False` keeps `axes` two-dimensional even when there is a single label,
# and the width scales with the number of panels (15 for three labels).
n_panels = max(len(label_categories), 1)
fig, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(5 * n_panels, 10), squeeze=False)
fig.suptitle('Wordclouds for Each Label', fontsize=16)

for ax, label in zip(axes[0], label_categories):
    # Join every span of this label into one string for the wordcloud
    text = ' '.join(labelled_spans_df[labelled_spans_df['label'] == label]['labelled_span'].tolist())
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          min_font_size=10).generate(text)

    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(label)

# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# Top unigrams, bigrams and trigrams per label (n = 1, 2, 3 in that order)
common_words, common_bigrams, common_trigrams = [
    most_common_ngrams(labelled_spans_df, size) for size in (1, 2, 3)
]

In [None]:
# Print the top words, bigrams and trigrams of every label except 'benefit'
for category in common_words:
    if category == 'benefit':
        continue
    print(f"Category: {category}")
    for heading, table in (
        ("Most common words:", common_words),
        ("Most common bigrams:", common_bigrams),
        ("Most common trigrams:", common_trigrams),
    ):
        print(heading, table[category])
    print("\n")
