In [4]:
from transformers import pipeline

In [5]:
import transformers
import torch

torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

model_id = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [28]:
import pandas as pd
df = pd.read_csv("/kaggle/input/extracted-sections/extracted_resume_sections.csv")
print(df.shape)
print(df.columns)

(5168, 4)
Index(['Resume ID', 'Job Title', 'Section', 'Content'], dtype='object')


In [2]:
import tqdm
from tqdm import tqdm

In [30]:
df['experience_vagueness_index'] = 0.0
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Experience Sections"):
    if row['Section'] == 'experience':
        paragraph = row['Content']
        
        prompt = (
            "Suppose you are a hiring agent and a person has submitted you a CV, irrespective of the role, "
            "we can judge by the content of the CV about the capability of the person. "
            "So, for this, I want you to score between 0 to 5, how likely are you to hire this person irrespective of the role? "
            "I also want you to think from a CV perspective whether the sentences given in the paragraph are vague or not, "
            "and also consider that for scoring, penalize such vagueness heavily, give only the score as output as 'Score : ', and noting else :\n"
            f"Paragraph: {paragraph}\nScore:"
        )

        outputs = pipeline(prompt, max_new_tokens=3, do_sample=False)
        generated_text = outputs[0]["generated_text"]
        
        score = generated_text.split("Score:")[-1].strip()
        df.at[index, 'experience_vague'] = float(score)
#         print((row['Resume ID'], score))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Experience Sections:   0%|          | 7/5168 [00:10<2:01:43,  1.42s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Experience Sections:   0%|          | 13/5168 [00:15<1:31:44,  1.07s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Experience Sections:   0%|          | 19/5168 [00:25<1:54:12,  1.33s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Experience Sections:   0%|          | 25/5168 [00:32<1:46:43,  1.25s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Experience Sections:   1%|          | 30/5168 [00:38<1:48:19,  1.26s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Experience Sections:   1%|          | 37/5168 [00:48<1:50:28,  1

In [32]:
df.to_csv('experience_vagueness.csv', index=False)

In [17]:
import pandas as pd
df_lor = pd.read_csv("/kaggle/input/small-lor/even_smaller_actual_lor.csv")
df_lor.shape

(380, 7)

In [18]:
df_lor.columns

Index(['id', 'title', 'text', 'role_raw', 'role', 'lor_data', 'lor_from'], dtype='object')

In [19]:
from tqdm import tqdm
import re

def process_row(index, row):
    paragraph = row['lor_data']
    
    # Single prompt with evaluation criteria included
    prompt = (
        "You are a hiring agent evaluating a CV. Your task is to score some content between 0 to 5 both inclusive based on trust. "
        "Consider the following:\n\n"
        "- Vagueness: Heavily penalize vague or unclear statements.\n"
        "- Content Quality: Judge the overall content based on how trustable it sounds.\n\n"
        "Provide only the score as output in the format 'Score: ', and nothing else.\n\n"
        f"Content: {paragraph}\nScore:"
    )
    
    outputs = pipeline(prompt, max_new_tokens=3, do_sample=False)
    generated_text = outputs[0]["generated_text"]
    
    # Extract the score from the generated text
    score = re.findall(r"\d+\.?\d*", generated_text.split("Score:")[-1].strip())
    if score:
        return index, float(score[0])
    else:
        return index, 0.0

# Iterate over the DataFrame without parallelism
for index, row in tqdm(df_lor.iterrows(), total=df_lor.shape[0]):
    index, score = process_row(index, row)
    df_lor.at[index, 'lor_sentiment_score'] = score

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 2/380 [00:07<22:30,  3.57s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 3/380 [00:12<28:19,  4.51s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 4/380 [00:18<30:12,  4.82s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|▏         | 5/380 [00:22<29:03,  4.65s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 6/380 [00:26<28:20,  4.55s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 7/380 [00:31<27:41,  4.45s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 8/380 [00:42<40:42,  6.56s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏        

In [24]:
df_lor.rename(columns={'lor_sentiment_score': 'lor_trust_score'}, inplace=True)
df_lor.columns

Index(['id', 'title', 'text', 'role_raw', 'role', 'lor_data', 'lor_from',
       'lor_trust_score'],
      dtype='object')

In [25]:
df_lor.to_csv('added_lor_trust_score.csv', index=False)