In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
import pandas as pd
import re
import spacy
import srsly
from wordcloud import WordCloud

# Load spaCy's small English pipeline once, at import time, so later cells can reuse it.
# NOTE(review): `nlp` is not referenced by any of the cells visible here — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

from dap_job_quality import PROJECT_DIR, BUCKET_NAME
from dap_job_quality.getters.data_getters import load_s3_jsonl
import dap_job_quality.utils.prodigy_data_utils as pdu

# Show up to 1000 characters per cell so long job-advert text is not truncated when displayed.
# The fully qualified option name is used on purpose: pandas resolves abbreviated names by
# pattern matching, which stops working if another option containing "max_colwidth" is added.
pd.set_option("display.max_colwidth", 1000)

In [None]:
#Other helper functions
def tokenize(text, n=2):
    """Turn a piece of text into a list of word n-grams.

    The text is word-tokenized with NLTK; only purely alphabetic tokens are
    kept (lower-cased), English stop words are discarded, and the surviving
    words are grouped into consecutive n-grams.

    Args:
        text: The string to tokenize.
        n: Size of each n-gram (2 gives bigrams).

    Returns:
        A list of n-grams built from the filtered, lower-cased words.
    """
    raw_tokens = nltk.word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    # Drop punctuation/numbers first, then stop words, so n-grams only span content words.
    kept_words = [
        lowered
        for lowered in (token.lower() for token in raw_tokens if token.isalpha())
        if lowered not in english_stop_words
    ]
    return list(ngrams(kept_words, n))

def most_common_ngrams(df, n=2, label_col='label', text_col='labelled_span', n_most_common=10):
    """Count the most frequent n-grams for every category in a dataframe.

    Args:
        df: DataFrame holding one text per row together with its category.
        n: Size of the n-grams passed on to ``tokenize``.
        label_col: Name of the column holding the category of each row.
        text_col: Name of the column holding the text to tokenize.
        n_most_common: How many of the top n-grams to keep per category.

    Returns:
        A dict mapping each unique value of ``label_col`` to a list of
        ``(ngram, count)`` pairs, most frequent first.
    """
    top_ngrams_by_category = {}
    for category in df[label_col].unique():
        category_texts = df.loc[df[label_col] == category, text_col]
        counts = Counter()
        for text in category_texts:
            counts.update(tokenize(text, n))
        top_ngrams_by_category[category] = counts.most_common(n_most_common)
    return top_ngrams_by_category

def find_phrase_and_sentence(text, phrases):
    """Find the first phrase that occurs in a text and return its sentence.

    Matching is case-insensitive on both sides: neither the phrase nor the
    text needs to be lower-cased by the caller. A "sentence" is the run of
    characters around the phrase that contains no full stop, ending at the
    next full stop (included) or at the end of the text if there is none.

    Args:
        text: The text to search. Non-string values (e.g. ``None`` or NaN
            coming from a dataframe column) are treated as containing nothing.
        phrases: Iterable of phrases to look for, checked in order.

    Returns:
        ``(True, sentence)`` for the first phrase found, where ``sentence`` is
        the matched slice of ``text`` (leading whitespace after the previous
        full stop is kept); ``(False, "")`` when no phrase occurs.
    """
    if not isinstance(text, str):
        return False, ""

    lowered_text = text.lower()
    for phrase in phrases:
        # Cheap substring pre-check before running the regex; both sides are
        # lower-cased so it agrees with the case-insensitive regex below.
        if phrase.lower() in lowered_text:
            # `(?:\.|$)` lets the sentence end at the end of the text, so a final
            # sentence without a closing full stop is still returned.
            sentence = re.search(
                r'([^.]*?' + re.escape(phrase) + r'[^.]*(?:\.|$))',
                text,
                re.IGNORECASE,
            )
            if sentence:
                return True, sentence.group()
    return False, ""

In [None]:
# Location of the labelled benefits evaluation set: the S3 key and the local path it is paired with.
s3_key1 = 'job_quality/prodigy/labelled_data/benefits_model_eval.jsonl'
local_file1 = PROJECT_DIR / 'dap_job_quality/pipeline/prodigy/labelled_data/benefits_model_eval.jsonl'

# The return value is not needed here; the file is read back from `local_file1` in a later cell.
# NOTE(review): presumably load_s3_jsonl writes the S3 object to `local_file1` — confirm against the getter.
_ = load_s3_jsonl(BUCKET_NAME, s3_key1, local_file1)

In [None]:
# Read every labelled record from each local JSONL file into one flat list.
all_records = []
for file in [local_file1]:
    all_records.extend(srsly.read_jsonl(file))

# Keep only the first record seen for each job id; later duplicates are dropped.
seen_job_ids = set()
all_records_deduplicated = []
for item in all_records:
    job_id = item['meta']['job_id']
    if job_id in seen_job_ids:
        continue
    seen_job_ids.add(job_id)
    all_records_deduplicated.append(item)

In [None]:
# Build a DataFrame with one row per deduplicated record and preview the first rows.
# NOTE: this rebinds `all_records`, which held the raw list of records until this cell runs.
all_records = pd.DataFrame(all_records_deduplicated)
all_records.head()

In [None]:
# Show every record whose 'answer' value is anything other than 'accept'.
all_records[all_records['answer']!='accept']

In [None]:
# Extract spans and sentences from the records that were not accepted. The filtered DataFrame
# rows are converted back to a list of dicts before being passed to get_spans_and_sentences.
# NOTE(review): `!= 'accept'` also captures any answer other than an explicit rejection,
# if such values occur in the data — confirm that is intended.
rejected_spans = pdu.get_spans_and_sentences(all_records[all_records['answer']!='accept'].to_dict(orient='records'))
rejected_spans

In [None]:
# Extract spans and sentences from the accepted records only. Previously every deduplicated
# record was passed in, so records shown under `rejected_spans` also ended up in
# `accepted_spans` and in everything derived from it (label counts, wordclouds).
# The filter runs on the original record dicts rather than on the DataFrame so the records
# reach get_spans_and_sentences unmodified.
accepted_records = [
    record for record in all_records_deduplicated
    if record.get('answer') == 'accept'
]
accepted_spans = pdu.get_spans_and_sentences(accepted_records)
accepted_spans

In [None]:
# Flatten the {job_id: [entry, ...]} mapping into one row per labelled entry,
# carrying the job id alongside the span, its sentence, its label and the text.
flat_data = [
    {
        "job_id": job_id,
        "labelled_span": entry["span"],
        "full_sentence": entry["sent"],
        "label": entry["label"],
        "text": entry["text"]
    }
    for job_id, entries in accepted_spans.items()
    for entry in entries
]

labelled_spans_df = pd.DataFrame(flat_data)

labelled_spans_df.head()

In [None]:
# Number of rows (one per labelled entry) for each label.
labelled_spans_df['label'].value_counts()

In [None]:
# Number of distinct job ids that contribute at least one row to labelled_spans_df.
len(labelled_spans_df['job_id'].unique())

In [None]:
# Precision = TP / (TP + FP).
# NOTE(review): the counts are hard-coded, presumably tallied by hand from the labelled
# evaluation data above — re-check them if the data changes.
true_positives = 31
false_positives = 2
precision = true_positives / (true_positives + false_positives)
precision

In [None]:
# Recall = TP / (TP + FN).
# NOTE(review): the counts are hard-coded, presumably tallied by hand from the labelled
# evaluation data above — re-check them if the data changes.
true_positives = 31
false_negatives = 70
recall = true_positives / (true_positives + false_negatives)
recall

In [None]:
# F1 is the harmonic mean of the precision and recall computed in the cells above.
harmonic_numerator = 2 * (precision * recall)
harmonic_denominator = precision + recall
f1 = harmonic_numerator / harmonic_denominator
f1

The wordclouds below probably don't tell us very much, because the labelled sample was so small.

In [None]:
# Wordcloud for each label: one panel per label, built from all of that label's labelled spans.
label_categories = labelled_spans_df['label'].unique()
# The 'none' label is excluded from the plots.
label_categories = label_categories[(label_categories != 'none')]

# Create a subplot for each wordcloud in a 1x2 configuration
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 10))

# Flatten the axes array for easy indexing
axes = axes.flatten()

# zip() stops at the shorter input, so at most len(axes) labels are drawn and
# no index can run past the available panels.
for ax, label in zip(axes, label_categories):
    text = ' '.join(labelled_spans_df[labelled_spans_df['label'] == label]['labelled_span'].tolist())
    wordcloud = WordCloud(width=800, height=800,
                          background_color ='white',
                          min_font_size = 10).generate(text)

    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(label)

# Hide every panel that did not receive a wordcloud. Slicing by the number of labels
# (rather than reusing the loop index) also works when there are no labels at all,
# where the loop variable would never have been bound.
for unused_ax in axes[len(label_categories):]:
    unused_ax.axis("off")

plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# Create the output folder first so saving does not fail on a fresh checkout.
# NOTE(review): assumes PROJECT_DIR is a pathlib.Path (it is already combined with `/`) — confirm.
figure_path = PROJECT_DIR / 'outputs/figures/wordclouds_benefits.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path)
plt.show()