## IMPORTS

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import re
import string
import ast
import mmh3
import matplotlib.pyplot as plt
import openai
import sys
import yaml
import threading
import networkx as nx
import warnings

from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from tqdm import tqdm
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import davies_bouldin_score
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from networkx.algorithms import community
from sklearn.metrics import davies_bouldin_score, rand_score, normalized_mutual_info_score
from sklearn.metrics import normalized_mutual_info_score
from openai import OpenAI

## Utils

In [2]:
def load_data(kind="processed"):
  """
  Load the data from the data folder.
  args:
    kind: "raw" or "processed"
  """
  if kind == "raw":
    df = pd.read_csv('data/raw/jobs.csv', sep=';')
  elif kind == "processed":
    df = pd.read_csv('data/processed/cleaned_jobs.csv', sep=';')
  elif kind == "ground_truth":
    df = pd.read_csv('clusters/ground_truth_gpt.csv')
  elif kind == "skills":
    df = pd.read_csv('extracted_skills/skills_extracted_gpt3_v2.csv')
  return df


def is_english(text):
  try:
    return detect(text) == 'en'
  except:
    return False


def apply_kmeans(tfidf_matrix, k=5):
  kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
  return kmeans.fit_predict(tfidf_matrix.toarray())


def words2sentence(word_list):
  return " ".join(word_list)


def apply_tftidf(data):
  vectorizer = TfidfVectorizer()
  return vectorizer.fit_transform(data)


def visualize_cluster(data,
                      cluster,
                      reduce_dim=True,
                      savefig=False,
                      filename="cluster.png",
                      name="Cluster method"):
  """
  Visualize the clusters
  Data: 2d numpy array of the individual data points that were used for clustering
  cluster: 1d numpy array of the cluster labels
  reduced_dim: Boolean, if True, perform pca to 2 dimensions
  """

  if reduce_dim:
    pca = PCA(n_components=2)
    data = pca.fit_transform(data)

  plt.figure(figsize=(10, 6))
  plt.scatter(data[:, 0], data[:, 1], c=cluster,
              cmap='tab20', edgecolor='black', alpha=0.7, s=100)
  plt.title(name, fontsize=16, fontweight='bold')
  plt.xlabel("PCA 1", fontsize=14)
  plt.ylabel("PCA 2", fontsize=14)
  plt.grid(True, linestyle='--', alpha=0.5)
  plt.tight_layout()
  if savefig:
    plt.savefig(f"figures/{filename}")
  plt.show()


def visualize_ground_truth(gt, savefig=False, filename="ground_truth.png"):
  plt.figure(figsize=(10, 6))
  plt.bar(gt["category"].value_counts().index,
          gt["category"].value_counts().values, color='dodgerblue')

  plt.xticks(rotation=75)
  plt.title("Ground truth distribution", fontsize=16, fontweight='bold')
  plt.xlabel("Category", fontsize=14)
  plt.ylabel("Count", fontsize=14)
  plt.grid(True, linestyle='--', alpha=0.5)
  plt.tight_layout()
  if savefig:
    plt.savefig(f"figures/{filename}")
  plt.show()


def skill_cleanup(data):

  # skills is a list of strings, connect them into one string

  data["skills_string"] = data["skills"].apply(lambda x: ' '.join(x))

  print(data.head())
  return data


## Logger

In [3]:
def working_on(message):
  print(":wrench: [bold green]WORKING ON[/bold green]: " + message)


def warning(message):
  print(":tomato: [bold red]WARNING[/bold red]: " + message)


def info(message):
  print(":information_source: [bold yellow]INFO[/bold yellow]: " + message)


def success(message):
  print(":white_check_mark: [bold green]SUCCESS[/bold green]: " + message)


def winner(message):
    print(":trophy: [bold yellow]WINNER[/bold yellow]: " + message)

## SCRAPING

In [4]:
class LinkedinScraper:
    def __init__(self, location, keywords=None, amount=50):
        self.location = location
        self.keywords = keywords
        self.amount = amount
        self.job_ids = []
        self.jobs = []

        if amount > 1000:
            print(
                "⚠️ WARNING: LinkedIn only allows you to scrape 1000 jobs per search. ⚠️"
            )
            print("⚠️ WARNING: The amount will be set to 1000. ⚠️")
            self.amount = 1000
        if keywords == None:
            self.all_jobs_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?location={self.location}"
            self.all_jobs_url += "&start={}"
        else:
            self.all_jobs_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={self.keywords}&location={self.location}"
            self.all_jobs_url += "&start={}"

        self.job_url = "https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{}"

    def save_to_csv(self, filename="jobs.csv"):
        print("📝 Saving jobs to CSV file...")
        if os.path.isfile(filename):
            existing_ids = set(pd.read_csv(filename, sep=";")["id"])
        else:
            existing_ids = set()

        # Filter out jobs that are already saved in the CSV

        unique_jobs = [job for job in self.jobs if int(job["id"]) not in existing_ids]

        if unique_jobs:
            df = pd.DataFrame(unique_jobs)
            df.to_csv(
                filename,
                mode="a",
                sep=";",
                header=not os.path.isfile(filename),
                index=False,
            )

    def _get_job_ids(self):
        for i in tqdm(
            range(0, self.amount, 25),
            desc="💼 Scraping job IDs 🔍",
            ascii=True,
            colour="#0077B5",
        ):
            res = requests.get(self.all_jobs_url.format(i))
            soup = BeautifulSoup(res.text, "html.parser")
            alljobs_on_this_page = soup.find_all("li")
            for x in range(0, len(alljobs_on_this_page)):
                try:
                    jobid = (
                        alljobs_on_this_page[x]
                        .find("div", {"class": "base-card"})
                        .get("data-entity-urn")
                        .split(":")[3]
                    )
                    self.job_ids.append(jobid)
                except:
                    print("❌ One Job ID could not be retrieved ❌")
                    pass

    def scrape(self):
        # First scrape the job ids
        self._get_job_ids()

        # Then scrape the job details
        for j in tqdm(
            range(0, len(self.job_ids)),
            desc="💼 Scraping job details 🔍",
            ascii=True,
            colour="#0077B5",
        ):
            job = {}  # Create a new job dictionary
            resp = requests.get(self.job_url.format(self.job_ids[j]))
            soup = BeautifulSoup(resp.text, "html.parser")

            job["id"] = self.job_ids[j]
            job["date_scraped"] = pd.Timestamp.now()
            job["keyword_scraped"] = self.keywords
            job["location_scraped"] = self.location
            job["linkedin_num"] = j

            try:
                job["company"] = (
                    soup.find("div", {"class": "top-card-layout__card"})
                    .find("a")
                    .find("img")
                    .get("alt")
                )
            except:
                job["company"] = None

            try:
                job["title"] = (
                    soup.find("div", {"class": "top-card-layout__entity-info"})
                    .find("a")
                    .text.strip()
                )
            except:
                job["title"] = None

            try:
                job["num_applicants"] = (
                    soup.find("div", {"class": "top-card-layout__entity-info"})
                    .find("h4")
                    .find("span", {"class": "num-applicants__caption"})
                    .text.strip()
                )
            except:
                job["num_applicants"] = None

            try:
                job["date_posted"] = (
                    soup.find("div", {"class": "top-card-layout__entity-info"})
                    .find("h4")
                    .find("span", {"class": "posted-time-ago__text"})
                    .text.strip()
                )
            except:
                job["date_posted"] = None

            try:
                ul_element = soup.find(
                    "ul", {"class": "description__job-criteria-list"}
                )

                for li_element in ul_element.find_all("li"):
                    subheader = li_element.find(
                        "h3", {"class": "description__job-criteria-subheader"}
                    ).text.strip()
                    criteria = li_element.find(
                        "span",
                        {
                            "class": "description__job-criteria-text description__job-criteria-text--criteria"
                        },
                    ).text.strip()

                    if "Seniority level" in subheader:
                        job["level"] = criteria
                    elif "Employment type" in subheader:
                        job["employment_type"] = criteria
                    elif "Job function" in subheader:
                        job["function"] = criteria
                    elif "Industries" in subheader:
                        job["industries"] = criteria
            except:
                job["level"] = None
                job["employment_type"] = None
                job["function"] = None
                job["industries"] = None

            try:
                job["description"] = soup.find(
                    "div", {"class": "description__text description__text--rich"}
                ).text.strip()
            except:
                job["description"] = None

            self.jobs.append(job)

            # Checkpoint to save the jobs to the CSV file every 500 jobs
            if (j + 1) % 500 == 0:
                self.save_to_csv()
                self.jobs = []

        if self.jobs:
            self.save_to_csv()
            
scraper = LinkedinScraper(location="Denmark", amount=1000)
scraper.scrape()

💼 Scraping job IDs 🔍:  12%|[38;2;0;119;181m##5                 [0m| 5/40 [00:05<00:37,  1.07s/it][0m

❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍:  22%|[38;2;0;119;181m####5               [0m| 9/40 [00:08<00:29,  1.05it/s][0m

❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍:  38%|[38;2;0;119;181m#######1           [0m| 15/40 [00:13<00:21,  1.14it/s][0m

❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍:  45%|[38;2;0;119;181m########5          [0m| 18/40 [00:16<00:21,  1.04it/s][0m

❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍:  50%|[38;2;0;119;181m#########5         [0m| 20/40 [00:18<00:18,  1.10it/s][0m

❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍:  55%|[38;2;0;119;181m##########4        [0m| 22/40 [00:20<00:17,  1.03it/s][0m

❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍:  57%|[38;2;0;119;181m##########9        [0m| 23/40 [00:21<00:16,  1.05it/s][0m

❌ One Job ID could not be retrieved ❌
❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍:  60%|[38;2;0;119;181m###########4       [0m| 24/40 [00:22<00:16,  1.01s/it][0m

❌ One Job ID could not be retrieved ❌
❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍:  80%|[38;2;0;119;181m###############2   [0m| 32/40 [00:31<00:07,  1.05it/s][0m

❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍:  92%|[38;2;0;119;181m#################5 [0m| 37/40 [00:35<00:02,  1.09it/s][0m

❌ One Job ID could not be retrieved ❌


💼 Scraping job IDs 🔍: 100%|[38;2;0;119;181m###################[0m| 40/40 [00:38<00:00,  1.04it/s][0m
💼 Scraping job details 🔍:  52%|[38;2;0;119;181m######7      [0m| 500/961 [04:37<04:49,  1.59it/s][0m

📝 Saving jobs to CSV file...


💼 Scraping job details 🔍: 100%|[38;2;0;119;181m#############[0m| 961/961 [08:46<00:00,  1.82it/s][0m

📝 Saving jobs to CSV file...





## PREPROCESSING

In [5]:
def remove_words_with_numbers(word_list):
  """
  Takes a string representation of a list of words as input,
  removes any special characters from the words, and then removes any words that contain numbers.

  Args:
    word_list_str: A string representation of a list of words.

  Returns:
    The function `remove_words_with_numbers` returns a list of words without any special characters or
  numbers.
  """
  word_list_without_special = [
      re.sub(r"[^a-zA-Z0-9\s]", "", word) for word in word_list
  ]
  word_list_without_numbers = [
      word for word in word_list_without_special if not re.search(r"\d", word)
  ]
  return word_list_without_numbers


def convert_date_posted(date_str, date_scraped):
  try:
    days_ago = int(date_str.split(' ')[0])
    actual_date = pd.to_datetime(date_scraped) - pd.Timedelta(days=days_ago)
    return actual_date
  except:
    return date_scraped  # If the format is not "x days ago", use the scraped date


def split_combined_words(text):
  """
  Since during the scraping, some words are combined, e.g. "requirementsYou're" or "offerings.If" we need to split them
  Splits words at:
  1. Punctuation marks followed by capital letters.
  2. Lowercase letters followed by uppercase letters.
  """
  # 1. split
  text = re.sub(r'([!?,.;:])([A-Z])', r'\1 \2', text)

  # 2. split
  text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

  return text


def text_preprocessing(text):
  """
  Preprocesses text by:
    - Splitting combined words
    - Tokenizing
    - Removing stopwords
    - Remove punctuation
    - Lemmatizing
  """

  text = split_combined_words(text)
  text = text.lower()

  # Remove punctuation
  text = re.sub(f'[{string.punctuation}]', '', text)
  # Remove numbers
  text = re.sub(r'\d+', '', text)

  tokens = word_tokenize(text)

  stop_words = set(stopwords.words('english'))

  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

  punctuation = {'!', ',', '.', ';', ':', '?',
                 '(', ')', '[', ']', '-', '+', '"', '*', '—', '•', '’', '‘', '“', '”', '``'}
  tokens = [w for w in tokens if w not in punctuation]

  # Remove last 3 words since they are always the same (scraped buttons from the website)
  tokens = tokens[:-3]

  return tokens


def main():
  """
  Main function of the preprocessing module.
  Loads the raw data and does the following:
  - Checks for english language
  - Removes rows with missing descriptions
  - Inferes the date posted
  - Preprocesses the description
  - Saves the preprocessed data to data/processed/cleaned_jobs.csv
  """

  working_on("Loading data")
  df = load_data(kind="raw")

  # Remove duplicates
  df.drop_duplicates(subset=['id'], inplace=True)
  df.drop_duplicates(subset=['description'], inplace=True)
  # Filter out jobs with missing descriptions
  df = df[df['description'].notna()]

  working_on("Filtering out non-english descriptions ...")
  for index, row in df.iterrows():
    if not is_english(row['description'][:100]):
      df.drop(index, inplace=True)

  working_on("Infering dates ...")
  df['date_posted'] = df.apply(lambda x: convert_date_posted(
      x['date_posted'], x['date_scraped']), axis=1)

  # Lower case all text
  df['title'] = df['title'].str.lower()
  df['function'] = df['function'].str.lower()
  df['industries'] = df['industries'].str.lower()
  df['industries'] = df['industries'].str.replace('\n', ' ')

  # Removing outliers (where industries is whole description of offer)
  df["industries_length"] = df["industries"].str.split()
  df["industries_length"] = df["industries_length"].str.len()
  df = df[df["industries_length"] < 15]
  df.drop(columns=["industries_length"], inplace=True)

  df["industries"] = df["industries"].str.replace(" and ", ",")
  df["function"] = df["function"].str.replace(" and ", ",")
  df["industries"] = df["industries"].str.replace("/", ",")
  df["function"] = df["function"].str.replace("/", ",")

  df["industries"] = df["industries"].str.replace(r",,|, ,", ",")
  df["function"] = df["function"].str.replace(r",,|, ,", ",")

  tqdm.pandas(desc="🐼 Preprocessing description", ascii=True, colour="#0077B5")

  df['description'] = df['description'].progress_apply(text_preprocessing)

  # Remove rows with empty descriptions or descriptions containing less than 3 words
  df = df[df['description'].map(len) > 3]

  # Remove special characters and numbers from the tokenized list
  df['description'] = df['description'].apply(
      lambda x: remove_words_with_numbers(x)
  )

  df = df.reset_index(drop=True)

  working_on("Saving preprocessed data ...")
  df.to_csv('data/processed/cleaned_jobs.csv', index=False, sep=';')

## LOAD CLEAN DATA

In [6]:
df = load_data(kind="processed")
working_on("Loading data")
data = load_data(kind="processed")
data["description"] = data["description"].apply(ast.literal_eval)
success("Data loaded")

:wrench: [bold green]WORKING ON[/bold green]: Loading data
:white_check_mark: [bold green]SUCCESS[/bold green]: Data loaded


## TF IDF CLUSTERING

In [7]:
def TFIDF_cluster(data, save_clusters=True):
    """
    data: pandas dataframe (cleaned jobs)
    save_clusters: Boolean, if True, save the clusters to a csv file in a format "id, cluster"
    """

    data["description"] = data["description"].apply(words2sentence)
    tfidf_matrix = apply_tftidf(data["description"])

    data["cluster"] = apply_kmeans(tfidf_matrix, k=20)

    if save_clusters:
        data[["id", "cluster"]].to_csv("clusters/tfidf_clusters.csv", index=False)

    dbs = round(davies_bouldin_score(tfidf_matrix.toarray(), data["cluster"]), 3)

    success("David Bouldin score: " + str(dbs))

    return data[["id", "cluster"]], tfidf_matrix.toarray()


def TFIDF_industries_and_functions_cluster(data, save_clusters=False):
    """
    data: pandas dataframe (cleaned jobs)
    save_clusters: Boolean, if True, save the clusters to a csv file in a format "id, cluster"
    """

    data["industries"] = data['function'] + ', ' + data['industries']
    data['industries'] = data['industries'].str.replace(',,', ',', regex=False)

    tfidf_matrix = apply_tftidf(data["industries"])

    data["cluster"] = apply_kmeans(tfidf_matrix, k=20)

    if save_clusters:
        data[["id", "cluster"]].to_csv("clusters/tfidf_industries_and_functions_clusters.csv", index=False)

    dbs = round(davies_bouldin_score(tfidf_matrix.toarray(), data["cluster"]), 3)

    success("David Bouldin score: " + str(dbs))

    return data[["id", "cluster"]], tfidf_matrix.toarray()

In [8]:
clusters_text = TFIDF_cluster(data, save_clusters=False)
clusters_industries = TFIDF_industries_and_functions_cluster(data, save_clusters=False)
print(clusters_text, clusters_industries)

:white_check_mark: [bold green]SUCCESS[/bold green]: David Bouldin score: 5.13
:white_check_mark: [bold green]SUCCESS[/bold green]: David Bouldin score: 2.24
(              id  cluster
0     3701420292        6
1     3712625082        6
2     3664534049        6
3     3576920179        6
4     3590303623        6
...          ...      ...
1860  3666672537        7
1861  3686843498        7
1862  3748928225        6
1863  3585733023        6
1864  3748579693        8

[1865 rows x 2 columns], array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])) (              id  cluster
0     3701420292        8
1     3712625082        6
2     3664534049        6
3     3576920179        6
4     3590303623        8
...          ...      ...
1860  3666672537       17
1861  3686843498       17
1862  3748928225        2


## WORD2VEC CLUSTERING

In [9]:
def words_to_sentence(word_list):
    return " ".join(word_list)

def remove_words_with_numbers(word_list_str):
    """
    Takes a string representation of a list of words as input,
    removes any special characters from the words, and then removes any words that contain numbers.

    Args:
      word_list_str: A string representation of a list of words.

    Returns:
      The function `remove_words_with_numbers` returns a list of words without any special characters or
    numbers.
    """
    word_list = ast.literal_eval(word_list_str)
    word_list_without_special = [
        re.sub(r"[^a-zA-Z0-9\s]", "", word) for word in word_list
    ]
    word_list_without_numbers = [
        word for word in word_list_without_special if not re.search(r"\d", word)
    ]
    return word_list_without_numbers

df = load_data(kind="processed")

# Apply the function to the 'words' column of the DataFrame
# df["description"] = df["description"].apply(
#     lambda x: remove_words_with_numbers(x)
# )

##########################################

##### PRETRAINED MODEL WORD2VEC ######
# model = KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin', binary=True)
# def description_to_vector(description, model):
#     valid_words = [word for word in description if word in model.key_to_index]
#     if valid_words:
#         return np.mean([model[word] for word in valid_words], axis=0)
#     else:
#         return np.zeros(model.vector_size)
# df['vector'] = df['description'].apply(lambda desc: description_to_vector(desc, model))
###################################

######### Word2Vec #########
model = Word2Vec(sentences=df['description'], vector_size=100, window=5, min_count=1, workers=4)
def description_to_vector(description):
    # Filter out words not in the model's vocabulary
    valid_words = [word for word in description if word in model.wv.key_to_index]
    if valid_words:
        # Average the vectors of the words in the description
        return np.mean(model.wv[valid_words], axis=0)
    else:
        # If no valid words, return a zero vector
        return np.zeros(model.vector_size)

df['vector'] = df['description'].apply(description_to_vector)
##################################
# Convert list of vectors to a 2D array for clustering
vectors = np.array(df['vector'].tolist())

# Apply KMeans clustering
kmeans = KMeans(n_clusters=20) 
df['cluster'] = kmeans.fit_predict(vectors)

df_id_and_cluster = df[["id", "cluster"]].sort_values(
    by="cluster", ascending=True
)
df_id_and_cluster.to_csv("clusters/word2wev_clustering_id.csv", index=False)

  super()._check_params_vs_input(X, default_n_init=10)


## FEATURE CLUSTERING (ONE HOT ENCODED FUNCTIONS AND INDUSTRIES)

## DOC2VEC CLUSTERING

In [10]:
class Doc2VecWrapper:
    def __init__(self):
        self.tagged_documents = None
        self.model = None
        self.epochs = None
        self.original_data_mapping = None

    def init(self, vector_size, alpha, min_alpha, min_count, epochs):
        """
        Initializes the doc2vec model.

        vector_size: Dimensionality of the feature vectors.
        alpha: The initial learning rate.
        min_alpha: Learning rate will linearly drop to min_alpha as training progresses.
        min_count: Ignores all words with total frequency lower than this.
        epochs: Number of iterations (epochs) over the corpus.
        """
        self.model = Doc2Vec(vector_size=vector_size,
                             alpha=alpha,
                             min_alpha=min_alpha,
                             min_count=min_count)
        self.epochs = epochs

    def fit(self, tokenized_texts: list[list[str]]):
        """
        Fits the doc2vec model on the data.

        tokenized_texts: List of lists of tokens.
        """
        self._tag_data(tokenized_texts)
        self.model.build_vocab(self.tagged_documents)

        self.original_data_mapping = {
            f"DOC_{str(i)}": text for i, text in enumerate(tokenized_texts)}

    def _tag_data(self, tokenized_texts: list[list[str]]):
        """
        Tags the data for the doc2vec model.

        tokenized_texts: List of lists of tokens.
        """
        self.tagged_documents = [TaggedDocument(
            words=_d, tags=[f"DOC_{str(i)}"]) for i, _d in enumerate(tokenized_texts)]

    def train(self):
        """
        Trains a doc2vec model on the data.
        """
        for epoch in tqdm(range(self.epochs), desc='Training doc2vec', ascii=True, colour="#0077B5"):
            self.model.train(self.tagged_documents,
                             total_examples=self.model.corpus_count, epochs=1)
            # decrease the learning rate
            self.model.alpha -= 0.002
            # fix the learning rate, no decay
            self.model.min_alpha = self.model.alpha

    def infer(self, tokenized_text: list[str]):
        """
        Infers a vector for a given tokenized text.

        tokenized_text: List of tokens.

        returns: Vector representation of the text.
        """
        return self.model.infer_vector(tokenized_text)

    def most_similar(self, doc_tag, topn=10):
        """
        Finds the most similar documents to a given document.

        doc_tag: Tag of the document.
        topn: Number of similar documents to return.

        returns: List of tuples (tag, similarity).
        """
        return self.model.dv.most_similar(doc_tag, topn=topn)

    def most_similar_original_format(self, doc_tag, topn=10):
        """
        Finds the most similar documents to a given document.

        doc_tag: Tag of the document.
        topn: Number of similar documents to return.

        returns: List of tuples (tag, similarity).
        """
        return [(self.original_data_mapping[doc_tag], similarity) for doc_tag, similarity in self.most_similar(doc_tag, topn)]

jobs = load_data(kind="processed")
jobs_descriptions = jobs['description'].tolist()

doc2vec = Doc2VecWrapper()
doc2vec.init(vector_size=50, alpha=0.025,
             min_alpha=0.00025, min_count=1, epochs=100)
doc2vec.fit(jobs_descriptions)
doc2vec.train()

print(doc2vec.infer(["you", "are", "a", "very", "good", "programmer"]))

# Original format query
print(doc2vec.original_data_mapping["DOC_50"])

similar_docs = doc2vec.most_similar_original_format("DOC_50")

for doc in similar_docs:
    print(doc)
    print(50 * "-")

Training doc2vec: 100%|[38;2;0;119;181m#######################[0m| 100/100 [01:52<00:00,  1.12s/it][0m

[-3.2849133e-03 -2.6715684e-03  2.1414864e-03  6.0340287e-03
  9.8793395e-03 -6.2247431e-03  7.2544194e-03  2.9419660e-05
  6.4916764e-03  5.3687263e-03 -1.8554962e-03 -4.9450602e-03
 -5.1674973e-03 -4.8928233e-03 -3.4063600e-03  9.3516108e-05
  2.1961343e-03 -2.6351481e-03 -3.8856696e-03 -3.8499737e-03
 -4.4055749e-03 -7.7464636e-03 -5.6949425e-03  3.2602048e-03
 -7.1620359e-03  7.1301744e-03  4.6692286e-03 -5.0445641e-03
  1.0659301e-03 -8.3084460e-03  6.8165065e-04  2.3031044e-03
 -2.6062273e-03 -9.0058362e-03  4.2783534e-03 -3.9463867e-03
 -7.0883157e-03 -7.4489675e-03  8.5577607e-04  8.2220305e-03
 -4.1868580e-03  9.0158870e-03 -4.6184212e-03 -2.8841526e-03
  3.0309998e-03 -9.0387370e-03 -4.4420152e-03  9.2533948e-03
 -9.1791330e-03  5.9312833e-03]
['crediwire', 'startup', 'group', 'young', 'passionate', 'individual', 'ambition', 'eager', 'shape', 'future', 'youre', 'passionate', 'design', 'development', 'dream', 'joining', 'next', 'fintech', 'generation', 'read', 'onthe', 'role',




## SIMILARITY

In [11]:
def minhashes(shingles, seeds):
  hashs = []
  for seed in range(seeds):
    mini = float('inf')
    for shi in shingles:
      # hashes a list of strings
      hash = 0
      for e in shi:
        hash = hash ^ mmh3.hash(e, seed)
      # find the minimum value
      if mini > hash:
        mini = hash
    hashs.append(mini)
  return list(hashs)

# get every signature in data


def signatures(df, k, seeds):
  hash_dic = {}
  df = df.apply(
        lambda x: ast.literal_eval(x)
        )
  for i in range(len(df)):
    # make a description into k-shingles
    shi = []
    for ch in range(len(df[i])-k+1):
      shi.append(df[i][ch:ch+k])

    hash_dic[i] = minhashes(list(shi), seeds)
  return hash_dic


def convert_matrix(N, scores):
  similarity_matrix = np.zeros((N, N))
  for i in range(N):
    for j in range(N):
      if i == j:
        similarity_matrix[i][j] = 1.0
      elif i > j:
        similarity_matrix[i][j] = scores[(j, i)]
      else:
        similarity_matrix[i][j] = scores[(i, j)]
  return similarity_matrix


def find_sim(data, q, seed):
  """
  Finds the similarity between any two job's description for a given dataset using the shingle, minihash 
  and jaccord similarity.

  Args:
    data: The "data" parameter is the dataset that you want to cluster. It should be a 2D array-like
  object, such as a numpy array or a pandas DataFrame, where each row represents a data point and each
  column represents a feature of that data point.
    q: The q parameter represents the number of shingles ( k = 2 or 3 for small documents such as emails)
    seed: The seed parameter represents how mand seeds to use for doing the minihashes

  Returns:
    A dictionary where the keys are pairs of indices, and the values are scores representing the similarity 
    between job descriptions at those indices
  """
  sign = signatures(data, q, seed)

  score_list = {}
  keys = list(sign.keys())
  for k in tqdm(range(len(keys)-1), desc='Calculating jaccard similarity'):
    for j in range(k+1, len(keys)):
      # calculate jaccard simiarity and store the score
      score = len(np.intersect1d(
          sign[keys[k]], sign[keys[j]]))/len(np.union1d(sign[keys[k]], sign[keys[j]]))
      score_list[(keys[k], keys[j])] = score
  return score_list


def louvain_cluster(N, scores):
  """
  Determines the best partition of a graph for a given similarity score value
  using the Louvain Community Detection Algorithm, (Not using Girvan Newman
  because it's too time comsuming) and find its Davies-Bouldin index value.

  Args:
    N: length of data
    scores: A dictionary where the keys are pairs of indices, and the values are scores representing the similarity 
    between job descriptions at those indices

  Returns:
    the cluster label for each data points and the corresponding Davies-Bouldin index.
  """
  # Create a graph
  G = nx.Graph()

  # Add nodes (text points)
  G.add_nodes_from(range(N))

  # Add edges based on similarity scores (you can adjust the threshold)
  for idx, idy in scores:
    G.add_edge(idx, idy, weight=scores[(idx, idy)]*100)

  # Use Louvain community detection algorithm to detect communities
  communities = community.louvain_communities(G)

  # Retrieve the cluster assignments
  clusters = {}
  for label, nodes in enumerate(communities):
    for idx in nodes:
      clusters[idx] = label

  # Sort the cluster based on id order and calculate the dbi
  sorted_dict = dict(sorted(clusters.items()))
  dbi = davies_bouldin_score(convert_matrix(
    N, scores), list(sorted_dict.values()))
  return sorted_dict, dbi 


def kmean_cluster(N, scores):
  """
  Determines the clusters for a given similarity matrix.

  Args:
    N: length of data
    scores: A dictionary where the keys are pairs of indices, and the values are scores representing the similarity 
    between job descriptions at those indices
    k_max: The parameter `k_max` represents the maximum number of clusters to consider. In the given
    code, it is set to 30, which means the function will iterate over values of `k` from 2 to 30
    (inclusive) to find the best value of `k` based on the. Defaults to 30
    ground_truth: An array of cluster label generated by feature clustering 

  Returns:
    the cluster label for each data points and the corresponding Davies-Bouldin index .
  """

  warnings.filterwarnings("ignore")
  similarity_matrix = convert_matrix(N, scores)
  
  # Kmeans clustering
  kmeans = KMeans(n_clusters=19)
  # Convert similarity to distance
  labels = kmeans.fit_predict(similarity_matrix)

  dbi = davies_bouldin_score(similarity_matrix, labels)

  # Retrieve the cluster assignments
  clusters = {}
  for label, idx in zip(labels, range(N)):
    clusters[idx] = label

  return clusters, dbi

In [12]:
# Load the data
df = load_data(kind="processed")

# Number of jobs
N = len(df)
# Give q & seeds for hash to find similarity for each job's descriptions
# q = number of singles ( k = 2 or 3 for small documents such as emails)
q = 2
seeds = 100
working_on("Finding similarity ...")
scores = find_sim(df['description'], q, seeds)

# Plot the network based on similarity and find community based on graph
# To evaluate the functionality of the cluster, calculate the DBI (The minimum
# score is zero, with lower values indicating better clustering)
# and measure rand index between feature label ground truth and prediction
# (similarity score between 0.0 and 1.0, inclusive, 1.0 stands for a perfect match)

:wrench: [bold green]WORKING ON[/bold green]: Finding similarity ...


Calculating jaccard similarity: 100%|███████| 1864/1864 [02:45<00:00, 11.28it/s]


## SIMILARITY CLUSTERING COMMUNITY DISCOVERY

In [13]:
working_on("Clustering based on community discovery...")
cluster_graph, dbi_graph = louvain_cluster(N, scores)
df['cluster_graph'] = cluster_graph

:wrench: [bold green]WORKING ON[/bold green]: Clustering based on community discovery...


## SIMILARITY CLUSTERING KMEANS

In [14]:
working_on("Clustering based on kmean...")
cluster_kmean, dbi_kmean = kmean_cluster(N, scores)
df['cluster_kmean'] = cluster_kmean

:wrench: [bold green]WORKING ON[/bold green]: Clustering based on kmean...


# Save clusters

In [15]:
working_on("Saving clusters ...")
graph_clusters = df[["id", "cluster_graph"]]
graph_clusters = graph_clusters.rename(columns={"cluster_graph": "cluster"})

kmean_clusters = df[["id", "cluster_kmean"]]
kmean_clusters = kmean_clusters.rename(columns={"cluster_kmean": "cluster"})

graph_clusters.to_csv("clusters/sim_community_discovery_clusters.csv", index=False)
kmean_clusters.to_csv("clusters/sim_kmeans_clusters.csv", index=False)

:wrench: [bold green]WORKING ON[/bold green]: Saving clusters ...


## EVALUATION

In [16]:
def load_clustering_methods(paths):
    """Loads multiple clusterings from specified file paths."""
    clustering_method = {}
    for name, path in paths.items():
        clustering_method[name] = pd.read_csv(path)
    return clustering_method


def compare_clusters_nmi(clusters):
    """Compares multiple sets of clusters using Normalized Mutual Information (NMI)."""
    nmi_matrix = pd.DataFrame(index=clusters.keys(), columns=clusters.keys())
    for name1, data1 in clusters.items():
        for name2, data2 in clusters.items():
            if name1 != name2:
                merged_data = pd.merge(data1, data2, on='id', suffixes=('_1', '_2'))
                nmi_score = normalized_mutual_info_score(
                    merged_data['cluster_1'], merged_data['cluster_2'])
                nmi_matrix.loc[name1, name2] = nmi_score
            else:
                nmi_matrix.loc[name1, name2] = 1.0  # Same clustering method comparison
    return nmi_matrix


def compare_clusters_rand_index(clusters):
    """Compares multiple sets of clusters using Rand Index."""
    rand_matrix = pd.DataFrame(index=clusters.keys(), columns=clusters.keys())
    for name1, data1 in clusters.items():
        for name2, data2 in clusters.items():
            if name1 != name2:
                merged_data = pd.merge(data1, data2, on='id', suffixes=('_1', '_2'))
                rand = rand_score(
                    merged_data["cluster_1"], merged_data["cluster_2"])
                rand_matrix.loc[name1, name2] = rand
            else:
                # Same clustering method comparison
                rand_matrix.loc[name1, name2] = 1.0
    return rand_matrix


def evaluation():
    paths = {
        # 'ground_truth': 'clusters/ground_truth.csv',
        'ground_truth_gpt': 'clusters/ground_truth_gpt.csv',
        'word2vec': 'clusters/word2vec_clusters.csv',
        'tfidf_text': 'clusters/tf_idf_clusters.csv',
        'tfidf_industries': "clusters/tfidf_industries_and_functions_clusters.csv",
        'industries_functions': 'clusters/ind_fun_onehot_clusters.csv',
        'similarity_community_disc': 'clusters/sim_community_discovery_clusters.csv',
        'similarity_kmeans': 'clusters/sim_kmeans_clusters.csv',
        'doc2vec_gmm': 'clusters/doc2vec_gmm_clusters.csv',
        'doc2vec_kmeans': 'clusters/doc2vec_kmeans_clusters.csv',
    }

    # Load the datasets
    working_on("Comparing clusters ...")
    cluster_methods = load_clustering_methods(paths)

    # Compare the clusters and get NMI matrix
    nmi_matrix = compare_clusters_nmi(cluster_methods)
    rand_index_matrix = compare_clusters_rand_index(cluster_methods)

    success("Normalized Mutual Information matrix:")
    # Dataframe to string
    print(nmi_matrix.to_string(index=False))

    success("Rand Index matrix:")
    print(rand_index_matrix.to_string(index=False))

    ground_truth_nmi = nmi_matrix['ground_truth_gpt'].drop('ground_truth_gpt')
    ground_truth_rand = rand_index_matrix['ground_truth_gpt'].drop(
        'ground_truth_gpt')

    # Select the best clustering method based on NMI and Rand Index
    #best_nmi = ground_truth_nmi.idxmax()
    #best_rand = ground_truth_rand.idxmax()
    best_nmi_index = ground_truth_nmi.astype(float).idxmax()
    best_rand_index = ground_truth_rand.astype(float).idxmax()
    
    # Get the corresponding values
    best_nmi = ground_truth_nmi[best_nmi_index]
    best_rand = ground_truth_rand[best_rand_index]
    
    winner(f"Best clustering method based on NMI: {best_nmi}")
    #print(f"NMI SCORE: {round(ground_truth_nmi[best_nmi],3)}")
    print(f"NMI SCORE: {round(best_nmi, 3)}")

    winner(f"Best clustering method based on Rand Index: {best_rand}")
    #print(f"RAND SCORE: {round(ground_truth_rand[best_rand],3)}")
    print(f"RAND SCORE: {round(best_rand, 3)}")

    # Plot the NMI and Rand Index

In [17]:
evaluation()

:wrench: [bold green]WORKING ON[/bold green]: Comparing clusters ...
:white_check_mark: [bold green]SUCCESS[/bold green]: Normalized Mutual Information matrix:
ground_truth_gpt  word2vec tfidf_text tfidf_industries industries_functions similarity_community_disc similarity_kmeans doc2vec_gmm doc2vec_kmeans
             1.0  0.082135   0.255119         0.262505             0.258349                  0.111291          0.134077    0.040393        0.03664
        0.082135       1.0   0.161284         0.074703             0.059478                  0.113221          0.127688    0.050201         0.0563
        0.255119  0.161284        1.0         0.231081              0.21041                  0.276787          0.334771    0.059916       0.059618
        0.262505  0.074703   0.231081              1.0             0.440846                  0.131934          0.179344    0.041914       0.040662
        0.258349  0.059478    0.21041         0.440846                  1.0                  0.113928    

## GROUND TRUTH INFERENCE

In [18]:
def transform_string(s):
    return s[1:-1].replace("'", "").replace(", ", " ")

def api_call_thread(offer, client, result_container):
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": f"You are a professional job recruiter. Your task is to categorize a job description with keywords into one and only one of the specified 20 categories: {industries}. You are not allowed to use any other categories."},
                {"role": "user", "content": "Classify into one of the given indsutries. Job description: '''would like part ryanair group amazing cabin crew family k crew customer oriented love delivering great service want fast track career opportunity would delighted hear experience required bag enthusiasm team spirit europe largest airline carrying k guest daily flight looking next generation cabin crew join u brand new copenhagen base flying board ryanair group aircraft amazing perk including discounted staff travel destination across ryanair network fixed roster pattern free training industry leading pay journey becoming qualified cabin crew member start week training course learn fundamental skill require part day day role delivering top class safety customer service experience guest course required study exam taking place regular interval training culminates supernumerary flight followed cabin crew wing member ryanair group cabin crew family immersed culture day one career opportunity endless including becoming number base supervisor european base manager regional manager aspire becoming director inflight life cabin crew fun rewarding however demanding position safety number priority required operate early late shift report duty early morning early roster return home midnight afternoon roster morning person think twice applying requirement bag enthusiasm customer serviceoriented background ie previous experience working bar restaurant shop etc applicant must demonstrate legal entitlement work unrestricted basis across euyou must cm cm height must able swim meter unaided help hardworking flexible outgoing friendly personality adaptable happy work shift roster enjoy dealing public ability provide excellent customer service attitude comfortable speaking writing english ease passion travelling meeting new people benefit free cabin crew training course adventure experience lifetime within cabin crew network explore new culture city colleague day flexible day day staff roster unlimited highly discounted staff travel rate sale bonus free uniform year security working financially stable airline daily per diem provided whilst training direct employment contract highly competitive salary package click link start new exciting career sky'''. Keywords: '''management,manufacturing, technology, information,internet'''"},
                {"role": "assistant", "content": "Hospitality & Tourism"},
                {"role": "user", "content": f"Classify into one of the given indsutries. Job description: '''{offer['description']}'''. Keywords: '''{offer['keywords']}'''"},
            ]
        )
        result_container["response"] = response
    except Exception as e:
        result_container["error"] = str(e)

def restart_script():
    print("Restarting script...")
    os.execv(sys.executable, ['python'] + sys.argv)

openai.organization = "ORG_KEY"
openai.api_key = os.getenv("OPENAI_API_KEY")

df = pd.read_csv("data/processed/cleaned_jobs.csv", delimiter=';')

df['description'] = df['description'].apply(transform_string)
df['keywords'] = df['function'] + ', ' + df['industries']
job_descriptions = df[['id', 'keywords', 'description']]

industries = "Software & IT, Healthcare & Medicine, Education & Training, Engineering & Manufacturing, Finance & Accounting, Sales & Marketing, Creative Arts & Design, Hospitality & Tourism, Construction & Real Estate, Legal & Compliance, Science & Research, Human Resources & Recruitment, Transportation & Logistics, Agriculture & Environmental, Retail & Consumer Goods, Media & Communications, Government & Public Sector, Non-Profit & Social Services, Energy & Utilities, Arts & Entertainment"

client = openai.Client()

ground_truth = {}
yaml_file = 'ground_truth.yaml'
if os.path.exists(yaml_file):
    with open(yaml_file, 'r') as file:
        ground_truth = yaml.safe_load(file) or {}

for index, offer in job_descriptions.iterrows():
    if offer['id'] in ground_truth:
        continue

    result_container = {}
    thread = threading.Thread(target=api_call_thread, args=(offer, client, result_container))
    thread.start()
    thread.join(timeout=10)

    if thread.is_alive() or "error" in result_container:
        restart_script()
    
    response = result_container.get("response")
    if response:
        skills = response.choices[0].message.content
        ground_truth[offer['id']] = skills
        with open(yaml_file, 'w') as file:
            yaml.dump(ground_truth, file, default_flow_style=False)
        print(f"Saved ground truth for offer ID: {offer['id']}")

ground_truth_df = pd.DataFrame.from_dict(ground_truth, orient='index', columns=['category'])
ground_truth_df.index.name = 'id'
ground_truth_df.reset_index(inplace=True)

mapping_rules = {
    'Software & IT': 'Software & IT',
    'Creative Arts & Design': 'Creative Arts & Design',
    'Engineering & Manufacturing': 'Engineering & Manufacturing',
    'Manufacturing': 'Engineering & Manufacturing',
    'Human Resources & Recruitment': 'Human Resources & Recruitment',
    'Energy & Utilities': 'Energy & Utilities',
    'Sales & Marketing': 'Sales & Marketing',
    'Consumer Goods': 'Retail & Consumer Goods',
    'Transportation & Logistics': 'Transportation & Logistics',
    'Finance & Accounting': 'Finance & Accounting',
    'Information Technology & Services': 'Software & IT',
    'IT & Software': 'Software & IT',
    'Non-Profit & Social Services': 'Non-Profit & Social Services',
    'Media & Communications': 'Media & Communications',
    'Technology': 'Software & IT',
    'Hospitality & Tourism': 'Hospitality & Tourism',
    'Retail & Consumer Goods': 'Retail & Consumer Goods',
    'Technology & Information': 'Software & IT',
    'Legal & Compliance': 'Legal & Compliance',
    'Healthcare & Medicine': 'Healthcare & Medicine',
    'Science & Research': 'Science & Research',
    'Information Technology': 'Software & IT',
    'Education & Training': 'Education & Training',
    'Business & Entrepreneurship': 'Finance & Accounting',
    'Logistics & Supply Chain': 'Transportation & Logistics',
    'Construction & Real Estate': 'Construction & Real Estate',
    'Arts & Entertainment': 'Arts & Entertainment',
    'Agriculture & Environmental': 'Agriculture & Environmental',
    'Staffing & Recruiting': 'Human Resources & Recruitment',
    'Maritime & Transportation': 'Transportation & Logistics',
    'Technology & IT': 'Software & IT',
    'Public Relations & Communications': 'Media & Communications',
    'Customer Service': 'Human Resources & Recruitment',
    'Information Technology (IT)': 'Software & IT',
    'Manufacturing & Engineering': 'Engineering & Manufacturing',
    'Renewable energy': 'Energy & Utilities',
    'Government & Public Sector': 'Government & Public Sector',
    'Customer Success': 'Sales & Marketing',
    'Insurance & Risk Management': 'Finance & Accounting',
    'Human Resources': 'Human Resources & Recruitment',
    'Marketing & Advertising': 'Sales & Marketing',
    'Pharmaceutical & Healthcare': 'Healthcare & Medicine',
    'Retail': 'Retail & Consumer Goods',
    'Environmental & Sustainability': 'Agriculture & Environmental',
    'Real Estate & Construction': 'Construction & Real Estate',
    'Aerospace & Defense': 'Engineering & Manufacturing',
    'Public Relations': 'Media & Communications',
    'Event Planning & Management': 'Hospitality & Tourism',
    'Sports & Recreation': 'Arts & Entertainment',
    'Medical equipment manufacturing': 'Healthcare & Medicine',
    'Renewable Energy': 'Energy & Utilities',
    'Technology & Internet': 'Software & IT',
    'Technology & Information Technology': 'Software & IT',
    'Administration & Office Support': 'Human Resources & Recruitment',
    'Information & Technology': 'Software & IT',
    'Administration': 'Human Resources & Recruitment',
    'Technology & Telecommunications': 'Software & IT',
    'Insurance': 'Finance & Accounting',
    'Insurance & Financial Services': 'Finance & Accounting',
    'Logistics & Supply Chain Management': 'Transportation & Logistics',
    'Market Research': 'Sales & Marketing'
}

ground_truth_df['category'] = ground_truth_df['category'].map(mapping_rules)

ground_truth_df['category'] = pd.Categorical(ground_truth_df['category'])
ground_truth_df['cluster'] = ground_truth_df['category'].cat.codes
df_id_and_cluster = ground_truth_df[["id", "category", "cluster"]].sort_values(
    by="cluster", ascending=True
)

df_id_and_cluster.to_csv("./csv_files/ground_truth_gpt.csv", index=False)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

## SKILL EXTRACTION

In [19]:
# Set your OpenAI API key from the environment
API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=API_KEY)


def extract_skills_gpt3(job_description):

    # system = f"You are an expert in job analysis. Your task is to identify skills required for a job based on its description, selecting only from the following predefined skills list: {', '.join(skill_list)}. You are not allowed to use any other skills than those mentioned in the skills list. Do not infer or add skills not mentioned in the description and provide the skills in a simple, comma-separated format."
    # system = f"You are an expert in job analysis. Your task is to identify skills required for a job based on its description. You are only to identify soft skills. Do not infer or add skills not mentioned in the description and provide the skills in a simple, comma-separated format."
    system = f"You are an expert in job analysis. Your task is to extract at most 10 skills required for a job based on its description. Do not infer or add skills not mentioned in the description. You are required to present me the skills in a raw list format: [skill1, skill2, ... skill10]."

    # prompt = f"Extract and list the skills required for Present the skills in a simple, comma-separated list. No explanations or additional text. Job Description: '{job_description_str}' Skills:"
    prompt = f"Identify at most 10 skills required for this job based on the description. Present them to me in a raw list format [skill1, skill2, ..., skill10]. Description: '{job_description}'"

    response = client.chat.completions.create(model="gpt-3.5-turbo",
                                              messages=[
                                                  {"role": "system",
                                                   "content": system},
                                                  {"role": "user", "content": prompt},
                                              ])

    skills_response = response.choices[0].message.content
    return skills_response


def skill_extraction(save_skills=False):

    df_clean = load_data("processed")
    df_raw = load_data("raw")

    df_clean = df_clean[['id', 'description']]
    df_raw = df_raw[['id', 'description']]

    # Obtain the original unprocessed job descriptions from the jobs that appear in the clean dataset
    merged = pd.merge(df_clean, df_raw, on='id', how="left",
                      suffixes=('_clean', '_raw'))

    # Drop duplicates based on id
    merged = merged.drop_duplicates(subset=['id'])

    extracted_skills = {"id": [], "skills": [], "description_raw": []}

    N = len(merged)
    count = 0

    for _, row in merged.iterrows():
        job_description = row['description_raw']
        job_description = job_description.replace("\n", " ")
        pattern = r'(?<=[a-z])(?=[A-Z])'
        job_description = re.sub(pattern, ' ', job_description)
        # Remove the last 56 trash characters
        job_description = job_description[:-56]

        skills = extract_skills_gpt3(job_description)
        _id = row['id']

        extracted_skills["id"].append(_id)
        extracted_skills["skills"].append(skills)
        extracted_skills["description_raw"].append(job_description)

        count += 1

        # Print progress in place
        print(f"\r💬 Skills for {_id} extracted! Progress: {count}/{N}", end="")

    extracted_skills_df = pd.DataFrame(extracted_skills)
    success("Skills extracted")
    if save_skills:
        name = "skills_extracted_gpt3_v2.csv"
        extracted_skills_df.to_csv(
            f"extracted_skills/{name}", index=False)
        success(f"Skills saved to extracted_skills/{name}")


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [20]:
skill_extraction(save_skills=True)

NameError: name 'skill_extraction' is not defined