<a href="https://colab.research.google.com/github/olonok69/LLM_Notebooks/blob/main/custom_classifier/cv_non_cv_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install loguru==0.7.2  evaluate==0.4.1 wandb bitsandbytes accelerate -q

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
from datasets import Dataset


def load_dataset(data_path) -> Dataset:
    """ Load dataset. """
    stratify_column_name = "label2"
    dataset_ecommerce_pandas = pd.read_csv(data_path, header=None, names=['label', 'text'])
    dataset_ecommerce_pandas['label2']= dataset_ecommerce_pandas['label'].values
    dataset_ecommerce_pandas['label'] = dataset_ecommerce_pandas['label'].astype(str)
    dataset_ecommerce_pandas['label2'] = dataset_ecommerce_pandas['label2'].astype(str)
    dataset_ecommerce_pandas['text'] = dataset_ecommerce_pandas['text'].astype(str)
    dataset = Dataset.from_pandas(dataset_ecommerce_pandas)
    dataset = dataset.shuffle(seed=42)
    dataset = dataset.class_encode_column(stratify_column_name).train_test_split(test_size=0.3)
    return dataset

In [4]:
path= "/content/drive/MyDrive/data/documents_cv.csv"

In [5]:
PROJECT = "FlanT5-Custom"
MODEL_NAME = 'google/flan-t5-base'
DATASET = "CVS-Premcloud"

In [6]:
df = pd.read_csv(path)
df.head()

Unnamed: 0,Title,Document,Class
0,"Resume - Adkins, Justin.txt",justin wa adkins thrives cultivating business ...,cv
1,"Resume - Ahner,Denise.txt",denise ahner redclover pa executive years hand...,cv
2,"Resume - Ali, Salman.txt",evaluation document created ali role field sof...,cv
3,"Resume - Allahiq, Ryann.txt",london ryann contractor development skill codi...,cv
4,"Resume - Allani, Praveen.txt",allani cognitive search elastic solr fess solu...,cv


In [7]:
dfcv =df[df.Class=="cv"]
dfnoncv = df[df.Class=="non-cv"]

In [8]:
dfnoncv_sample = dfnoncv.sample(len(dfcv))

In [9]:
df_final = pd.concat([dfcv, dfnoncv_sample], ignore_index=True)
len(df_final)

484

In [10]:
data = df_final[["Class", "Document"]]
data

Unnamed: 0,Class,Document
0,cv,justin wa adkins thrives cultivating business ...
1,cv,denise ahner redclover pa executive years hand...
2,cv,evaluation document created ali role field sof...
3,cv,london ryann contractor development skill codi...
4,cv,allani cognitive search elastic solr fess solu...
...,...,...
479,non-cv,evaluation document created laurella record un...
480,non-cv,evaluation document created first last 38 clov...
481,non-cv,evaluation document created nelli record helyn...
482,non-cv,evaluation document created 09933 hermina geol...


In [11]:
data_path = "/content/drive/MyDrive/data/documents_final_cv.csv"
data.to_csv(data_path, index= False, header=None)

In [12]:
data.isnull().sum()

Class       0
Document    0
dtype: int64

In [13]:
len(data.Class.unique())

2

In [14]:
import evaluate
import nltk
import numpy as np
from typing import List, Tuple
from nltk.tokenize import sent_tokenize
from datasets import Dataset, concatenate_datasets
from huggingface_hub import HfFolder
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

MODEL_ID = "google/flan-t5-base"

In [37]:
dataset = load_dataset(data_path)


Casting to class labels:   0%|          | 0/484 [00:00<?, ? examples/s]

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'label2'],
        num_rows: 338
    })
    test: Dataset({
        features: ['label', 'text', 'label2'],
        num_rows: 146
    })
})

In [17]:
len(dataset['test'].to_pandas()['label'].unique())

2

In [18]:
dataset['test'].to_pandas()['label'].value_counts()

label
cv        76
non-cv    70
Name: count, dtype: int64

In [19]:
dataset['train'].to_pandas()['label'].value_counts()

label
non-cv    172
cv        166
Name: count, dtype: int64

In [20]:
MODEL_ID = "google/flan-t5-base"
# Load tokenizer of FLAN-t5
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [21]:
# Metric
metric = evaluate.load("f1")

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["text"], truncation=True), batched=True, remove_columns=['text', 'label', 'label2']
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

Max source length: 512


In [22]:
# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["label"], truncation=True), batched=True, remove_columns=['text', 'label','label2']
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

Max target length: 5


In [38]:
REPOSITORY_ID = f"{MODEL_ID.split('/')[1]}-text-classification"
REPOSITORY_ID

'flan-t5-base-text-classification'

In [32]:
# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=REPOSITORY_ID,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,     # Overflows with fp16
    learning_rate=3e-4,
    num_train_epochs=5,
    logging_dir=f"{REPOSITORY_ID}/logs",    # logging & evaluation strategies
    logging_strategy="epoch",
    evaluation_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    report_to="wandb",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=REPOSITORY_ID,
    hub_token="",
)

def preprocess_function(sample: Dataset, padding: str = "max_length") -> dict:
    """ Preprocess the dataset. """

    # add prefix to the input for t5
    inputs = [item for item in sample["text"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["label"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def postprocess_text(preds: List[str], labels: List[str]) -> Tuple[List[str], List[str]]:
    """ helper function to postprocess text"""
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, average='macro')
    result = {k: round(v * 100, 10) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [25]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text', 'label','label2'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/338 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [31]:
MODEL_ID

'google/flan-t5-base'

In [26]:
# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)

In [27]:
nltk.download("punkt")

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [34]:
tokenized_dataset["test"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 146
})

In [35]:
# TRAIN
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
43,0.2592
86,0.0071
129,0.0
172,0.0
215,0.0


TrainOutput(global_step=215, training_loss=0.05328479867785909, metrics={'train_runtime': 377.2378, 'train_samples_per_second': 4.48, 'train_steps_per_second': 0.57, 'total_flos': 1157239925637120.0, 'train_loss': 0.05328479867785909, 'epoch': 5.0})

In [39]:
trainer.model.save_pretrained(REPOSITORY_ID)

In [42]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

In [44]:
tokenizer.save_pretrained(REPOSITORY_ID)
trainer.create_model_card()
trainer.push_to_hub(token= HF_TOKEN)

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/olonok/flan-t5-base-text-classification/commit/3f29f5f4ab44273701600ac9d80250f8dea525fd', commit_message='End of training', commit_description='', oid='3f29f5f4ab44273701600ac9d80250f8dea525fd', pr_url=None, pr_revision=None, pr_num=None)

In [45]:
import torch
from tqdm.auto import tqdm

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import classification_report

#dataset = load_dataset(data_path)

# Load model and tokenizer from the hub
tokenizer = AutoTokenizer.from_pretrained(f"{REPOSITORY_ID}")
model = AutoModelForSeq2SeqLM.from_pretrained(f"{REPOSITORY_ID}")
model.to('cuda') if torch.cuda.is_available() else model.to('cpu')

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [46]:
def classify(text_to_classify: str) -> str:
    """Classify a text using the model."""
    inputs = tokenizer.encode_plus(text_to_classify, padding='max_length', max_length=512, return_tensors='pt')
    inputs = inputs.to('cuda') if torch.cuda.is_available() else inputs.to('cpu')
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction


def evaluate_model() -> None:
    """Evaluate the model on the test dataset."""
    predictions_list, labels_list = [], []

    samples_number = len(dataset['test'])
    progress_bar = tqdm(range(samples_number))

    for i in range(samples_number):
        text = dataset['test']['text'][i]
        predictions_list.append(classify(text))
        labels_list.append(str(dataset['test']['label'][i]))

        progress_bar.update(1)

    report = classification_report(labels_list, predictions_list, zero_division=0)
    print(report)

In [47]:
evaluate_model()

  0%|          | 0/146 [00:00<?, ?it/s]

              precision    recall  f1-score   support

          cv       1.00      1.00      1.00        70
      non-cv       1.00      1.00      1.00        76

    accuracy                           1.00       146
   macro avg       1.00      1.00      1.00       146
weighted avg       1.00      1.00      1.00       146



In [48]:
text_to_classify ="""A 15-year-old boy has been arrested on suspicion of murdering Harry Pitman, who was fatally stabbed on New Year's Eve.

Harry, 16, from Haringey, was attacked in Primrose Hill, north London, at about 23:40 GMT on Sunday.

The boy was arrested - along with an 18-year-old man on suspicion of affray - on Tuesday night, the Met said.

Specialist detectives have found there was no indication the attack was racially motivated, the force added.

Vigil held for boy killed in New Year's Eve stabbing
Teen killed in New Year's Eve stabbing named
Det Ch Insp Geoff Grogan, who is leading the investigation, said despite having made the arrests, he was "still very keen to hear from anyone who has footage or information".

On Sunday, a 16-year-old boy was arrested at the scene on suspicion of murder and later released on bail pending further inquiries.
"""

In [51]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [53]:
tokens = text_to_classify.split(" ")
tokens_pun_lower = [i.lower() for i in tokens if i.isalnum()]
tokens_stop = [i for i in tokens_pun_lower if i not in stop_words]
text_to_class = " ".join(tokens_stop)

In [59]:
classify(text_to_class)

'non-cv'

In [55]:
text_to_class1= """Justin T. Adkins
Castle Rock, WA 98611
Cell: 425-761-4004     Justin.T.Adkins@gmail.com
MISSION:
Justin Adkins thrives in cultivating business workflows and creating strategic data
solutions with key stakeholders that are simple and effective that delights customers by
identifying key inflection points inside a business and focusing on innovation and swift
support.  Bringing 8 years of coding experience and managing teams, his mission is to
make the world a better place by supporting those who make and build.
EDUCATION:
University of Central Missouri, Warrensburg, MO
B.S. Actuarial Science and Mathematics, May 2013
EXPERIENCE:
ETL Consultant/Project Manager (May 2021 - Current)
Tilson – Construction Accounting Consulting and Telecom Infrastructure, Portland, Maine
●
Vista ERP related project summaries:
o
ETL data conversion from MS Dynamics Solomon to Vista AP, AR, JB, JC, PR,
EM in under 3 months with limited Solomon resources
o
Fractional CIO role managing multiple systems bringing cohesion for team
members and stakeholders
Team Lead Senior Analyst (November 2016 - April 2021)
Silvertrek Systems – Construction Accounting Consulting, Battle Ground, Wa
●
Vista ERP related project summaries:
o
Sold, project managed, and code Vista ETL in GL, AR, AP, JC, JB, CM, HQ, PR,
EM, modules and reconcile AR, AP, JC beginning balances using SQL
o
Project managed and implemented MS Materials module with automatic
quoting ticketing, and AP/AR batch creation between multiple companies,
and streamlined payroll
o
Developed XML Certified Payroll Report for efficient Washington L&I labor
reporting
o
Automated Equipment GPS import and debit/credit allocation procedure for
standby equipment
o
Designed and built forecast software for Project Managers with download
and upload capabilities directly into Viewpoint database for billion-dollar GC
●
Team Lead EOS Level 10 weekly meetings and mentored 6 analysts/developers
●
Problem-solve various business challenges through:
o
Built user stories from the field/office understanding business needs and
processes, listening, and implementing project scopes
o
Architect, direct, and develop advanced reports using Crystal, SSRS, and
other BI tools
o
Identified and implement software automation supporting time-saving
workflows
●
Engage multiple levels of audiences – driving Vista software engagement and
success in the office and field
●
Solid project management increased customer engagement, satisfaction, and
Silvertrek profitability by 400% in revenue since 2016
Business Analyst (February 2016 – August 2016)
ECI Consulting – Syteline ERP Software Development, Vancouver, Wa
●
Upgraded SyteLine ERP software focusing on multiple department interaction with
125 users building reports and implementing business workflows in ASP.NET, T-SQL,
and SSRS
●
Assisted in the design of department ERP homepages with custom-built Business
Intelligence (BI) reports driving business decisions using T-SQL
●
Formulated and executed a successful upgrade of SyteLine ERP 8 to 9 within 3
months of employment
Data Analyst II (August 2014 – February 2016)
DTI Global – Legal e-Discovery, Portland, OR
●
Collaborated with the development team to integrate third-party and proprietary
software using the .NET framework using version control software
●
Developed stored procedures, dynamic SQL, indexes, and CTE’s with nested windows
functions to monitor and support large import and export data load operations using
custom generated XML with data transformations
●
Worked closely with the Client Services team to understand client requests, provide
technical interpretation, and document sustainable confluence workflows
Data Production Programmer (June 2014 – August 2014)
MCH Strategic Data - Leading Compiler of Institutional Data, Sweet Springs, MO
●
Prepared and generated custom reports utilizing SQL Access and DataLever data
management software for analysis-based processing
●
Processed orders requiring programming services such as File Cleaning, CASS, NCOA,
Merge/Purge, Match & Append, and customized programming
Math Tutor (June 2011-September 2011)
Discover an Educational Workshop, Warrensburg, MO
Server (November 2005-May 2014)
Buca Di Beppo, Kansas City, MO
TECHNICAL SKILLS:
●
T-SQL programming in SQL Server for 8 years including 2008, 2012, 2016,
●
SSRS
●
Crystal Reports
●
Vista by Viewpoint Construction Software
●
Keystyle
●
Database Administration
●
VPN Management
●
Active Directory
●
Project Management Tools (Trello, Lastpass, Remote Desktop Manager, Dropbox, MS
Project)

"""

In [56]:
tokens = text_to_class1.split(" ")
tokens_pun_lower = [i.lower() for i in tokens if i.isalnum()]
tokens_stop = [i for i in tokens_pun_lower if i not in stop_words]
text_to_class1 = " ".join(tokens_stop)

In [57]:
text_to_class1

'justin wa adkins thrives cultivating business workflows creating strategic key stakeholders simple effective delights customers key inflection points inside business focusing innovation bringing 8 years coding experience managing mission world better place supporting make central actuarial science may manager 2021 construction accounting consulting telecom erp related project data conversion ms dynamics solomon vista 3 months limited solomon cio role managing multiple systems bringing cohesion lead senior analyst 2016 april systems construction accounting battle erp related project project code vista etl modules reconcile jc beginning balances using managed implemented ms materials module batch creation multiple streamlined xml certified payroll report efficient washington equipment gps import allocation procedure built forecast software project managers upload capabilities directly viewpoint database lead eos level 10 weekly meetings mentored 6 various business challenges user storie

In [58]:
classify(text_to_class1)

'cv'