In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import torch
from typing import List, Tuple
import PyPDF2
from tqdm import tqdm
from datasets import Dataset
from bs4 import BeautifulSoup
import re
import spacy

In [2]:
def load_and_preprocess_data(csv_path: str, pdf_folder_path: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    print(f"Loaded CSV with {len(df)} rows")
    
    def clean_html(html_text):
        if pd.isna(html_text):
            return ""
        soup = BeautifulSoup(html_text, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        return re.sub(r'\s+', ' ', text).strip()
    
    def read_pdf(file_path):
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                return ' '.join(page.extract_text() or '' for page in pdf_reader.pages).strip()
        except Exception as e:
            print(f"Error reading PDF {file_path}: {str(e)}")
            return ""
    
    nlp = spacy.load("en_core_web_sm")
    
    def process_text(text):
        if pd.isna(text) or text == "":
            return ""
        doc = nlp(text[:1000000])  
        
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        skills = [token.text for token in doc if token.pos_ == "NOUN" and token.is_alpha]
        
        return f"Entities: {entities}\nSkills: {skills}\nOriginal: {text[:1000]}"

    print("Cleaning HTML content...")
    df['cleaned_resume_html'] = df['Resume_html'].apply(clean_html)

    print("Reading PDF files...")
    tqdm.pandas()
    df['pdf_content'] = df.apply(lambda row: read_pdf(os.path.join(pdf_folder_path, row['Category'], f"{row['ID']}.pdf")), axis=1)

    print("Processing resume content...")
    df['processed_resume_html'] = df['cleaned_resume_html'].apply(process_text)
    df['processed_pdf_content'] = df['pdf_content'].apply(process_text)

    df['final_processed_resume'] = df.apply(lambda row: 
        f"HTML Content:\n{row['processed_resume_html']}\n\nPDF Content:\n{row['processed_pdf_content']}", axis=1)

    print("Data preprocessing completed")
    return df

In [3]:
def prepare_data_for_model(df: pd.DataFrame, tokenizer: AutoTokenizer, max_length: int = 512) -> Dataset:
    texts = [f"[CATEGORY]{row['Category']}[RESUME]{row['final_processed_resume']}[END]" for _, row in df.iterrows()]
    
    unique_categories = df['Category'].unique()
    category_to_id = {category: idx for idx, category in enumerate(unique_categories)}
    labels = [category_to_id[category] for category in df['Category']]
    
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    
    encodings['labels'] = torch.tensor(labels)
    return Dataset.from_dict(encodings)

In [4]:
# Load and preprocess the data
csv_path = 'D:\jarvis-calling-hiring-contest\Resume\Resume.csv'
pdf_folder_path = 'D:\jarvis-calling-hiring-contest\data\data'
df = load_and_preprocess_data(csv_path, pdf_folder_path)

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

Loaded CSV with 2484 rows
Cleaning HTML content...
Reading PDF files...
Processing resume content...
Data preprocessing completed


In [5]:
# Initialize tokenizer and model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Add special tokens
special_tokens_dict = {'additional_special_tokens': ['[CATEGORY]', '[RESUME]', '[END]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Get unique categories and create a mapping
unique_categories = df['Category'].unique()
category_to_id = {category: idx for idx, category in enumerate(unique_categories)}
num_labels = len(category_to_id)



In [6]:
# Initialize the model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    load_in_8bit=True,
    device_map="auto"
)

# Set padding token ID
model.config.pad_token_id = tokenizer.pad_token_id

# Resize token embeddings
model.resize_token_embeddings(len(tokenizer))

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.11

In [7]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]
)

In [8]:
# Apply LoRA to the model
model = get_peft_model(model, peft_config)


In [9]:
# Prepare datasets
train_dataset = prepare_data_for_model(train_df, tokenizer)
val_dataset = prepare_data_for_model(val_df, tokenizer)


In [14]:
# Set up training arguments
output_dir = 'D:\jarvis-calling-hiring-contest\model'
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=1e-4,
    fp16=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
)


In [15]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [16]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [19]:
# Train the model
trainer.train()


  0%|          | 0/280 [00:00<?, ?it/s]

{'loss': 3.9041, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 3.764, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.29}
{'loss': 3.7157, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 3.9128, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 3.9061, 'learning_rate': 1e-05, 'epoch': 0.71}
{'loss': 3.7967, 'learning_rate': 1.2e-05, 'epoch': 0.86}
{'loss': 3.8565, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.0}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 3.9586000442504883, 'eval_accuracy': 0.028112449799196786, 'eval_precision': 0.024881984076657503, 'eval_recall': 0.028112449799196786, 'eval_f1': 0.025655090329542614, 'eval_runtime': 23.8896, 'eval_samples_per_second': 10.423, 'eval_steps_per_second': 1.339, 'epoch': 1.0}




{'loss': 3.6572, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.14}
{'loss': 3.667, 'learning_rate': 1.8e-05, 'epoch': 1.29}
{'loss': 3.6336, 'learning_rate': 2e-05, 'epoch': 1.43}
{'loss': 3.4715, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.57}
{'loss': 3.5103, 'learning_rate': 2.4e-05, 'epoch': 1.71}
{'loss': 3.4565, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.86}
{'loss': 3.3718, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.0}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 3.6096770763397217, 'eval_accuracy': 0.03614457831325301, 'eval_precision': 0.049324749980313415, 'eval_recall': 0.03614457831325301, 'eval_f1': 0.036434083656566474, 'eval_runtime': 23.9212, 'eval_samples_per_second': 10.409, 'eval_steps_per_second': 1.338, 'epoch': 2.0}




{'loss': 3.3645, 'learning_rate': 3e-05, 'epoch': 2.14}
{'loss': 3.1565, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.29}
{'loss': 3.0123, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.43}
{'loss': 2.8555, 'learning_rate': 3.6e-05, 'epoch': 2.57}
{'loss': 2.5164, 'learning_rate': 3.7800000000000004e-05, 'epoch': 2.71}
{'loss': 1.6242, 'learning_rate': 3.9800000000000005e-05, 'epoch': 2.86}
{'loss': 0.6437, 'learning_rate': 4.18e-05, 'epoch': 3.0}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 5.55534553527832, 'eval_accuracy': 0.04417670682730924, 'eval_precision': 0.04417670682730924, 'eval_recall': 0.04417670682730924, 'eval_f1': 0.04417670682730924, 'eval_runtime': 24.13, 'eval_samples_per_second': 10.319, 'eval_steps_per_second': 1.326, 'epoch': 3.0}




{'loss': 0.1421, 'learning_rate': 4.38e-05, 'epoch': 3.14}
{'loss': 0.0296, 'learning_rate': 4.5600000000000004e-05, 'epoch': 3.29}
{'loss': 0.0039, 'learning_rate': 4.76e-05, 'epoch': 3.43}
{'loss': 0.0013, 'learning_rate': 4.96e-05, 'epoch': 3.57}
{'loss': 0.0006, 'learning_rate': 5.16e-05, 'epoch': 3.71}
{'loss': 0.0004, 'learning_rate': 5.360000000000001e-05, 'epoch': 3.86}
{'loss': 0.0003, 'learning_rate': 5.560000000000001e-05, 'epoch': 4.0}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 12.213642120361328, 'eval_accuracy': 0.04417670682730924, 'eval_precision': 0.04417670682730924, 'eval_recall': 0.04417670682730924, 'eval_f1': 0.04417670682730924, 'eval_runtime': 23.8793, 'eval_samples_per_second': 10.427, 'eval_steps_per_second': 1.34, 'epoch': 4.0}




{'train_runtime': 4080.8188, 'train_samples_per_second': 2.191, 'train_steps_per_second': 0.069, 'train_loss': 2.4633959026383567, 'epoch': 4.0}


TrainOutput(global_step=280, training_loss=2.4633959026383567, metrics={'train_runtime': 4080.8188, 'train_samples_per_second': 2.191, 'train_steps_per_second': 0.069, 'train_loss': 2.4633959026383567, 'epoch': 4.0})

In [20]:
# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)




  0%|          | 0/32 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 3.6096770763397217, 'eval_accuracy': 0.03614457831325301, 'eval_precision': 0.049324749980313415, 'eval_recall': 0.03614457831325301, 'eval_f1': 0.036434083656566474, 'eval_runtime': 23.9401, 'eval_samples_per_second': 10.401, 'eval_steps_per_second': 1.337, 'epoch': 4.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("Fine-tuning completed and Model saved.")



Fine-tuning completed and Model saved.
