In [7]:
# Environment setup. Nothing here is version-pinned, so a fresh run may resolve
# different package versions than the ones shown in the recorded output.
%pip install flair
%pip install transformers
# Remove the already-installed torch stack (the recorded output shows cu124
# builds of torchvision/torchaudio being uninstalled) and reinstall it with the
# PyTorch CPU wheel index added as an extra index.
# NOTE(review): this must stay after the flair/transformers installs and in
# this order; a later cell uses --index-url instead of --extra-index-url —
# confirm which form is intended.
%pip uninstall torch torchvision torchaudio -y
%pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
%pip install spacy
%pip install nltk
%pip install pandas
%pip install numpy
# Download the spaCy pipeline that the extraction cell loads with
# spacy.load("en_core_web_trf").
!python -m spacy download en_core_web_trf


Found existing installation: torch 2.6.0
Uninstalling torch-2.6.0:
  Successfully uninstalled torch-2.6.0
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.21.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Downloading https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl (178.7 MB)


In [None]:
# Environment setup (CPU-only torch build).
# `%pip` is used instead of `!pip` so packages are installed into the
# environment of the running kernel; `!pip` runs whichever `pip` is first on
# the shell PATH, which is not guaranteed to be the kernel's interpreter.
%pip install numpy
%pip install pandas
# Take torch/torchvision/torchaudio from the PyTorch CPU wheel index.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
%pip install --upgrade transformers keras
%pip install flair
%pip install spacy
%pip install nltk
# Download the spaCy pipeline loaded later via spacy.load("en_core_web_trf"),
# then check that installed spaCy pipelines are compatible with the installed
# spaCy version.
!python -m spacy download en_core_web_trf
!python -m spacy validate

In [1]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

import pandas as pd
import spacy
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
from flair.models import TARSClassifier
from flair.data import Sentence
import torch

# spaCy: transformer-based English pipeline, used for DATE/CARDINAL entities.
nlp_spacy = spacy.load("en_core_web_trf")

# Hugging Face extractive question-answering pipeline.
# device=-1 keeps inference on the CPU.
qa_model = pipeline(
    task="question-answering",
    model="deepset/bert-base-cased-squad2",
    device=-1,
)

# Flair TARS classifier. A task named "experience" is registered and selected;
# it is single-label and its only class is "experience_requirement".
tars_model = TARSClassifier.load("tars-base")
tars_model.add_and_switch_to_new_task(
    "experience",
    ["experience_requirement"],
    "single_label",
)

# Token-classification (NER) model, kept on the CPU.
# NOTE: despite the `roberta_` prefix on these names, the checkpoint loaded
# here is "dslim/bert-base-NER".
roberta_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
roberta_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER").to("cpu")

# Experience Extraction Functions
def extract_spacy_experience(text):
    """Collect spaCy DATE/CARDINAL entities from sentences mentioning "experience".

    Args:
        text: The job description. Anything that is not a ``str`` (for example
            a missing spreadsheet cell) is rejected.

    Returns:
        The matching entity texts joined with ``", "`` in document order, or
        ``None`` when ``text`` is not a string or nothing matched.
    """
    if not isinstance(text, str):
        return None

    wanted_labels = ("DATE", "CARDINAL")
    # An entity counts only if the sentence containing it mentions the word
    # "experience" (case-insensitive).
    mentions = [
        entity.text
        for entity in nlp_spacy(text).ents
        if entity.label_ in wanted_labels and "experience" in entity.sent.text.lower()
    ]
    if not mentions:
        return None
    return ", ".join(mentions)

def extract_bert_qa(text, min_score=0.1):
    """Ask the QA pipeline how much experience the job description requires.

    Args:
        text: The job description used as the QA context. Non-string values
            and empty/whitespace-only strings are rejected up front.
        min_score: Answers whose pipeline score is not strictly greater than
            this threshold are discarded. Defaults to 0.1.

    Returns:
        The answer span predicted by ``qa_model``, or ``None`` when the input
        is unusable or the answer's score does not exceed ``min_score``.
    """
    # The question-answering pipeline rejects an empty context with an
    # exception, which would abort the whole batch run on one blank cell.
    # Treat empty or whitespace-only text like a missing value instead.
    if not isinstance(text, str) or not text.strip():
        return None
    result = qa_model(
        question="What is the total experience required for the job (years)?",
        context=text
    )
    return result["answer"] if result["score"] > min_score else None

def extract_tars_zero_shot(text):
    """Flag a text as an experience requirement using the TARS classifier.

    Args:
        text: The job description. Non-string values are rejected.

    Returns:
        The text of the Flair ``Sentence`` built from ``text`` when the
        classifier assigns the label ``"experience_requirement"`` with a score
        above 0.5; otherwise ``None`` (also for non-string input).
    """
    if not isinstance(text, str):
        return None

    candidate = Sentence(text)
    tars_model.predict(candidate)  # attaches predicted labels to `candidate`

    is_requirement = any(
        tag.value == "experience_requirement" and tag.score > 0.5
        for tag in candidate.labels
    )
    return candidate.text if is_requirement else None

def _merge_wordpieces(tokens, label_ids):
    """Rebuild whole words from WordPiece tokens, keeping words tagged as entities.

    A word is kept when its first piece has a non-zero label id; label id 0 is
    treated as the "not an entity" class. Continuation pieces (prefixed with
    "##") are glued onto the word currently being built, but only if that word
    was kept. Otherwise they are dropped, so the tail of a non-entity word can
    never surface as an entity of its own.

    Args:
        tokens: Token strings in sequence order.
        label_ids: Predicted integer label id for each token, same length.

    Returns:
        List of reconstructed entity words, in order of appearance.
    """
    entities = []
    current = ""
    for token, label_id in zip(tokens, label_ids):
        if token.startswith("##"):
            if current:
                current += token[2:]
            continue
        # A new word starts here: flush the previous one, if it was kept.
        if current:
            entities.append(current)
        current = token if label_id != 0 else ""
    if current:
        entities.append(current)
    return entities


def extract_roberta_ner(text):
    """Return every word the token-classification model tags as an entity.

    Uses the module-level ``roberta_tokenizer`` / ``roberta_model``. The input
    is truncated to 512 tokens. No filtering for experience-related entities
    is applied: any word whose first piece gets a non-zero label id is kept.

    Args:
        text: The job description. Non-string values are rejected.

    Returns:
        The entity words joined with ``", "``, or ``None`` when ``text`` is not
        a string or no entity was found.
    """
    if not isinstance(text, str):
        return None
    inputs = roberta_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    label_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()
    tokens = roberta_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    entities = _merge_wordpieces(tokens, label_ids)
    return ", ".join(entities) if entities else None

# Load Job Descriptions
# The workbook is expected to have one job description per row in a column
# named "Job Descriptions"; a missing column raises KeyError below.
df = pd.read_excel("Job Descriptions.xlsx")
results = []

# Only one column is needed, so iterate it directly instead of using
# iterrows() (which builds a Series per row and yields an index that was never
# used). Cells that are not strings, such as blanks read as NaN, are passed
# through unchanged; every extractor returns None for non-string input.
for jd in df["Job Descriptions"]:
    results.append({
        "Original JD": jd,
        "spaCy NER": extract_spacy_experience(jd),
        "BERT QA": extract_bert_qa(jd),
        "TARS Zero-Shot": extract_tars_zero_shot(jd),
        "RoBERTa NER": extract_roberta_ner(jd),
    })

# One output row per input row: the original text plus each extractor's result.
output_df = pd.DataFrame(results)
output_df.to_excel("output.xlsx", index=False)

print("Extraction complete — results saved to output.xlsx")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


2025-04-15 10:37:15,745 https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base-v8.pt not found in cache, downloading to /tmp/tmpr3a6xuo0


100%|██████████| 418M/418M [00:42<00:00, 10.2MB/s]

2025-04-15 10:37:59,171 copying /tmp/tmpr3a6xuo0 to cache at /root/.flair/models/tars-base-v8.pt





2025-04-15 10:38:00,856 removing temp file /tmp/tmpr3a6xuo0


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

2025-04-15 10:38:12,964 TARS initialized without a task. You need to call .add_and_switch_to_new_task() before training this model


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Extraction complete — results saved to output.xlsx
