In [1]:
from PyPDF2 import PdfReader
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer, BartForConditionalGeneration, TFT5ForConditionalGeneration, DataCollatorWithPadding, AutoModelForSeq2SeqLM
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm





In [1]:
import torch

# Check if CUDA is available
cuda_available = torch.cuda.is_available()

print(f"CUDA Available: {cuda_available}")

# If CUDA is available, check the number of GPUs
if cuda_available:
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs: {num_gpus}")
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")


CUDA Available: False


In [4]:
# Preprocess

def clean_data(extracted_text):
    extracted_text = ''.join(extracted_text.replace('\n', ' '))
    # extracted_text = [text.lower() for text in extracted_text]

    return extracted_text


In [5]:
PRETRAINED_MODEL = 'facebook/bart-large-cnn'
# PRETRAINED_MODEL = 'burberg92/resume_summary'


model = BartForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
finetune_dataset = load_dataset("burberg92/resume_summary")

finetune_dataset = finetune_dataset['train'].train_test_split(test_size=0.2, seed=42)

finetune_dataset

Downloading readme: 100%|██████████| 241/241 [00:00<00:00, 975B/s]
Downloading data: 100%|██████████| 36.8k/36.8k [00:01<00:00, 31.3kB/s]
Generating train split: 100%|██████████| 100/100 [00:00<00:00, 2432.43 examples/s]


DatasetDict({
    train: Dataset({
        features: ['resume', 'ex_summary'],
        num_rows: 80
    })
    test: Dataset({
        features: ['resume', 'ex_summary'],
        num_rows: 20
    })
})

In [7]:
def preprocess_function(examples):
    inputs = tokenizer(examples['resume'], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['ex_summary'], max_length=150, truncation=True, padding="max_length")

    inputs["labels"] = labels["input_ids"]
    return inputs

train_dataset = finetune_dataset['train'].map(preprocess_function, batched=True)
val_dataset = finetune_dataset['test'].map(preprocess_function, batched=True)


Map: 100%|██████████| 80/80 [00:00<00:00, 1440.36 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 886.67 examples/s]


In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
train_dataset

Dataset({
    features: ['resume', 'ex_summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 80
})

In [10]:
import torch
from rouge import Rouge

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    print(predictions, labels)

    rouge = Rouge()

    scores = rouge.get_scores(predictions, labels)

    return {"scores": scores}


In [11]:
from transformers import TrainingArguments

epochs = 5
batch_size = 4

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

: 

In [12]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

 10%|█         | 10/100 [03:32<30:41, 20.46s/it]

{'loss': 8.0872, 'grad_norm': 25.911968231201172, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.5}


 20%|██        | 20/100 [08:20<36:14, 27.19s/it]

{'loss': 7.3558, 'grad_norm': 13.724514961242676, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.0}


In [None]:
# Read PDF

PDF_PATH = '/Anthonio Obert - Software Developer - CV (1).pdf'

reader = PdfReader(PDF_PATH)
n_pages = len(reader.pages)

extracted_text = ''

for i in range(n_pages):
    page = reader.pages[i]
    extracted_text += page.extract_text()

extracted_text = clean_data(extracted_text)
extracted_text

"Anthonio Obert Software Developer+62 81273724892 laisobert2@gmail.com  Jakarta, Indonesia SUMMARY A passionate college student with a keen interest in software development that is able to learn quickly and delve deeply into  new subjects. Currently working as a Database Administrator for Bina Nusantara's Software Laboratory, where I manage and  maintain student scores across multiple campuses with honesty and integrity. Capable of working under pressure and  meeting deadlines.  EXPERIENCE 02/2024 - Present Database Administrator  Bina Nusantara University  Manage and maintain student scores for laboratory subjects across six campuses: Kemanggisan, Alam Sutera, Bekasi,  Bandung, Malang, and Semarang.  Maintain web application to support internal and external activities.  Create and maintain SQL query for internal and external requests.  Provide student's scores data to identify and improve laboratory processes.  Schedule important dates for laboratory activities.  Post student's scores

In [None]:
import os
os.environ["TORCH_USE_CUDA_DSA"] = "1"


In [None]:
len(extracted_text)

1976

In [None]:
device = torch.device("cuda:0")
model.to(device)


inputs = tokenizer([extracted_text], truncation=True, return_tensors='pt', max_length=512).to(device)

summary_ids = model.generate(inputs['input_ids'], num_beams=4, early_stopping=True, min_length=100, max_length=120)
summary = ([tokenizer.decode(id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for id in summary_ids])

summary

["Results-oriented Software Developer with expertise in C, SQL, R, Python, Java, HTML, CSS, and JS. Capable of working under pressure and meeting deadlines. Holds a Bachelor's Degree in Computer Science from Bina Nusantara University and is a Web Development Finalist with proficiency in Excel and other software tools. Phone number is 81273724892 and email address is laisobert2@gmail.com in Jakarta, Indonesia. If you are concerned about a security breach, call the National Suicide Prevention Lifeline at 1-800-273"]