<a href="https://colab.research.google.com/github/ollyekhan/SocialMediaMiningFinalProject/blob/main/Group_10_BART_Fine_Tuning_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install dependencies
#!pip install -U datasets
#!pip install -U accelerate

In [None]:
# connect to drive
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
import pandas as pd

# Path to the fine-tuning CSV, relative to the notebook's working directory.
# Remember to set it to the correct path in your drive when running from there.
file_path = 'Dataset - fine_tuning_dataset_1.csv'
# Load the whole CSV into a DataFrame.
df = pd.read_csv(file_path)

# Display the first few rows as the cell output to sanity-check the load.
df.head()

Unnamed: 0,Number,context,answer
0,16,"An empty treadmill isn’t compelling, but once ...","Software Quality Engineering, Testing, Agile M..."
1,47,Company Description\n\nJobs for Humanity is pa...,"Software development, Troubleshooting, C, C++,..."
2,54,Ledgent Technology is looking for a Sr. Softwa...,"Linux, Operating Systems, Kernel, Device Firmw..."
3,65,PHP Software Engineer\n\nDo you enjoy being pa...,"Backend engineering, Fullstack engineering, In..."
4,86,FlexTrade Systems is a financial technology So...,"Web development, Mobile development, Angular, ..."


In [None]:
from sklearn.model_selection import train_test_split

# (optional) collapse every run of whitespace in the context column into a single space - remember to set the correct column name
#df.context = df.context.replace(r'\s+', ' ', regex=True)

# Hold out 15% of the rows as a temporary set (for validation and testing); the rest is the training set.
# random_state is fixed so the split is the same on every run.
train_df, temp_df = train_test_split(df, test_size=0.15, random_state=42)
# Split the held-out rows in half: one half for validation, the other for testing.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [None]:
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Hugging Face Dataset; the conversion is
# the same for all three, so it is applied in one pass and unpacked in order.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

In [None]:
# Bundle the train, validation and test sets into one DatasetDict keyed by split name.
dataset_dict = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

# Show the dict (features and row counts per split) as the cell output.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['Number', 'context', 'answer', '__index_level_0__'],
        num_rows: 510
    })
    validation: Dataset({
        features: ['Number', 'context', 'answer', '__index_level_0__'],
        num_rows: 45
    })
    test: Dataset({
        features: ['Number', 'context', 'answer', '__index_level_0__'],
        num_rows: 45
    })
})

In [None]:
# function to print a few shuffled examples from the dataset
# remember to set the correct column names
def show_samples(dataset, num_samples=10, seed=42, split="train"):
    """Print shuffled context/answer pairs from one split of ``dataset``.

    Args:
        dataset: Mapping from split name to a dataset object that supports
            ``len()``, ``shuffle(seed=...)`` and ``select(indices)``.
        num_samples: Maximum number of examples to print. If the split holds
            fewer rows, every row is printed.
        seed: Seed forwarded to ``shuffle``.
        split: Name of the split to sample from. Defaults to ``"train"``,
            which is the split this function always used before the
            parameter existed.
    """
    examples = dataset[split]
    # Never request more rows than the split holds: selecting indices past
    # the end would fail on a small split.
    count = min(num_samples, len(examples))
    sample = examples.shuffle(seed=seed).select(range(count))
    for example in sample:
        print(f"\n'>> Context: {example['context']}'")
        print(f"'>> Answer: {example['answer']}'")

# Run the function with its defaults to eyeball a few context/answer pairs from the dataset.
show_samples(dataset_dict)


'>> Context: Here’s the opportunity to make a difference within the global transportation planning and design community working with the technical team to design and implement CAD-based software for the engineering and architectural design communities.

You’ll thrive in an agile software development environment with plenty of learning opportunities, working with users to develop innovative new tools to solve engineering problems. There is also an element of 3D visualisation within the role as they prepare to incorporate this into their products.

They are looking for you to have a combination of knowledge and experience with desktop applications and web applications- this skill will benefit upcoming projects as well as attention to detail ensuring quality over quantity.

SALARY: Up to around £50,000.

LOCATION: Alcester, Warwickshire- hybrid working.

PACKAGE: Private Healthcare, Pension, 25 days Annual Leave plus Bank Holidays.

Here’s a flavour of some of your responsibi

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# set the correct model you'll be fine-tuning
model_name = 'facebook/bart-large'
# get the tokenizer and the pretrained seq2seq model for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# check the token length of the text field - you can do this for both fields
texts = dataset_dict['train']['context']

# Measure the lengths WITHOUT truncation. With truncation enabled every
# over-long text is clipped to the model limit before it is counted, so the
# reported maximum can never exceed that limit and the check tells us nothing.
# (The tokenizer may warn about sequences longer than the model limit here;
# that is expected, since nothing is fed to the model in this cell.)
token_lengths = [len(tokenizer.encode(text, truncation=False)) for text in texts]
max_token_length = max(token_lengths)
model_max_length = tokenizer.model_max_length
num_too_long = sum(length > model_max_length for length in token_lengths)

print(f"The longest text is {max_token_length} tokens long.")
print(f"{num_too_long} of {len(token_lengths)} texts exceed the model limit of "
      f"{model_max_length} tokens and will be truncated during preprocessing.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

The longest text is 1024 tokens long.


In [None]:
# preprocessing function - sequence to sequence model
def get_feature(batch):
    """Tokenise a batch of examples for sequence-to-sequence training.

    The 'context' column is tokenised as the model input and the 'answer'
    column as the target; both are truncated to at most 1024 tokens.

    Returns:
        A dict with exactly the 'input_ids', 'attention_mask' and 'labels'
        entries produced by the tokenizer.
    """
    tokenized = tokenizer(
        batch['context'],
        text_target=batch['answer'],
        truncation=True,
        max_length=1024,
    )
    # Keep just the three fields listed above, in the same order as before.
    return {
        field: tokenized[field]
        for field in ('input_ids', 'attention_mask', 'labels')
    }

dataset_pt = dataset_dict.map(get_feature, batched=True)
dataset_pt

Map:   0%|          | 0/510 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Number', 'context', 'answer', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 510
    })
    validation: Dataset({
        features: ['Number', 'context', 'answer', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 45
    })
    test: Dataset({
        features: ['Number', 'context', 'answer', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 45
    })
})

In [None]:
# Ask the tokenised dataset to return only these three columns, formatted as torch tensors.
columns = ['input_ids', 'labels', 'attention_mask']
dataset_pt.set_format(type='torch', columns=columns)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Seq2seq batch collator built from the tokenizer and the model; it is handed to the Trainer when training is set up.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from transformers import TrainingArguments, Trainer

# we're using the Trainer API which abstracts away a lot of complexity
training_args = TrainingArguments(
    output_dir = 'job_posting_skills_extraction', # rename to what you want it to be called
    num_train_epochs=6, # your choice
    # Warm up over the first 10% of the optimizer steps. A fixed 500-step
    # warmup was longer than the whole run (this configuration performs only
    # 48 optimizer steps), so the learning rate never reached its target value.
    warmup_ratio = 0.1,
    per_device_train_batch_size=4, # keep a small batch size when working with a small GPU
    per_device_eval_batch_size=4,
    weight_decay = 0.01, # helps prevent overfitting
    logging_steps = 10,
    # Evaluate at the end of every epoch. The previous step-based schedule
    # (every 50 steps) was never reached within the 48 steps of the run, so
    # no validation loss was ever reported.
    evaluation_strategy = 'epoch',
    # Effectively disables intermediate checkpoints; the final model is saved explicitly after training.
    save_steps=1_000_000,
    gradient_accumulation_steps=16 # running this on a small GPU
)

# Train on the tokenised train split and validate on the tokenised validation split.
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer, data_collator=data_collator,
                  train_dataset = dataset_pt['train'], eval_dataset = dataset_pt['validation'])

# The returned TrainOutput is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=48, training_loss=2.8924402793248496, metrics={'train_runtime': 555.5329, 'train_samples_per_second': 5.508, 'train_steps_per_second': 0.086, 'total_flos': 6259922902892544.0, 'train_loss': 2.8924402793248496, 'epoch': 6.0})

In [None]:
# save the fine-tuned model to this directory; the inference pipeline later loads it from the same path
trainer.save_model('job_posting_skills_extraction') # set the name you want

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [None]:
def prepare_text(text, max_length=1020):
    """Truncate ``text`` to at most ``max_length`` tokens and return it as plain text.

    The text is tokenised with truncation and decoded back to a string so
    that it can be passed to a text pipeline without exceeding the model's
    input limit.

    Args:
        text: The raw input string.
        max_length: Maximum number of tokens kept by the tokenizer. The
            default of 1020 stays below the 1024-token limit of BART.

    Returns:
        The (possibly truncated) text, without tokenizer special tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
    # Strip the special tokens the tokenizer added. Without this the returned
    # string carries literal begin/end-of-sequence markers, which the
    # downstream pipeline tokenises again on top of the ones it adds itself.
    return tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)

In [None]:
from transformers import pipeline

# test the model using Hugging Face's pipeline, loading it from the directory it was saved to
pipe = pipeline('summarization', model='job_posting_skills_extraction')

# try the test-set item at index 1 (i.e. the second item) to see how it does
test_text = prepare_text(dataset_dict['test'][1]['context'])
#test_text = dataset_dict['test'][1]['context']
answer = dataset_dict['test'][1]['answer']
#print("the text: ", test_text)
out = pipe(test_text)[0]['summary_text']
print("generated keywords: ", out)
# split the comma-separated model output into a list of trimmed keywords
gen_kw_list = [item.strip() for item in out.split(',')]
print(gen_kw_list)
print()
print("orginal keywords : ",answer)
# same split for the reference answer, so the two lists can be compared by eye
ans_kw_list = [item.strip() for item in answer.split(',')]
print(ans_kw_list)

generated keywords:  Software Engineering, Agile, DevSecOps, Software Development Lifecycle, Software Design, Testing, Maintenance, Unit Testing, Verification Test Plans, Deployment, Network Troubleshooting
['Software Engineering', 'Agile', 'DevSecOps', 'Software Development Lifecycle', 'Software Design', 'Testing', 'Maintenance', 'Unit Testing', 'Verification Test Plans', 'Deployment', 'Network Troubleshooting']

orginal keywords :  Java, Software Development, Software Engineering, Battlefield Command and Control, Geospatial Data Management, Exploitation, Visualization, Analysis, AWS, Docker, Kubernetes, Agile Methodologies, Design, Documentation, Ticketing Systems, Version Control, Formal Software Engineering Principles
['Java', 'Software Development', 'Software Engineering', 'Battlefield Command and Control', 'Geospatial Data Management', 'Exploitation', 'Visualization', 'Analysis', 'AWS', 'Docker', 'Kubernetes', 'Agile Methodologies', 'Design', 'Documentation', 'Ticketing Systems',

In [None]:
import numpy as np
# Per-example hit rates (in percent), in test-set order.
scoreList = []
meanList = []
sDeviationList = []
varianceList = []


def _split_keywords(text):
    """Split a comma-separated keyword string into trimmed, non-empty keywords."""
    return [item.strip() for item in text.split(',') if item.strip()]


def _hit_rate(generated, reference):
    """Return the percentage of unique reference keywords present in ``generated``.

    Matching is exact (case-sensitive). Both sides are compared as sets, so a
    keyword repeated in the reference is counted once in the numerator and
    once in the denominator. Returns 0.0 when the reference has no keywords.
    """
    reference_set = set(reference)
    if not reference_set:
        return 0.0
    hits = reference_set.intersection(generated)
    return round(len(hits) / len(reference_set) * 100, 2)


for example in dataset_dict['test']:
    test_text = prepare_text(example['context'])
    answer = example['answer']
    out = pipe(test_text)[0]['summary_text']

    print("generated keywords: ", out)
    gen_kw_list = _split_keywords(out)
    print(gen_kw_list)
    print()

    print("orginal keywords : ",answer)
    ans_kw_list = _split_keywords(answer)
    print(ans_kw_list)
    print()

    HitRate = _hit_rate(gen_kw_list, ans_kw_list)
    scoreList.append(HitRate)
    # Running list of all scores so far, printed after every example.
    print(scoreList)
    print()
    print()


generated keywords:  Software Engineering, Python, Java, Scala, C++, AWS, GCE, Azure, Data Integration, Data Processing, Apache NiFi, Spark
['Software Engineering', 'Python', 'Java', 'Scala', 'C++', 'AWS', 'GCE', 'Azure', 'Data Integration', 'Data Processing', 'Apache NiFi', 'Spark']

orginal keywords :  Kubernetes, Octopus Deploy, ADO Pipelines, .NET Core, C#, Java, MS SQL, Oracle, MongoDB, Elastic, Cosmos, HTTP, REST, GIT, GitHub, TFS, Kafka, ESB
['Kubernetes', 'Octopus Deploy', 'ADO Pipelines', '.NET Core', 'C#', 'Java', 'MS SQL', 'Oracle', 'MongoDB', 'Elastic', 'Cosmos', 'HTTP', 'REST', 'GIT', 'GitHub', 'TFS', 'Kafka', 'ESB']

[5.56]


generated keywords:  Software Engineering, Agile, DevSecOps, Software Development Lifecycle, Software Design, Testing, Maintenance, Unit Testing, Verification Test Plans, Deployment, Network Troubleshooting
['Software Engineering', 'Agile', 'DevSecOps', 'Software Development Lifecycle', 'Software Design', 'Testing', 'Maintenance', 'Unit Testing', 'Ve

Your max_length is set to 128, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


generated keywords:  Crossover is the world's #1 source of full-time remote jobs. We're recruiting this role for our client, GFI Software. The position is immediately available and requires entering into an independent contractor agreement with Crossover. The compensation level for this role is $50 USD/hour, which equates to $100,000 USD/year assuming 40 hours per week and 50 weeks per year.Software Engineering, C++, Network Communications, Security, AI, Large Language Models (LLMs), Prompt Engineering, Chain ofThought (CoT)
["Crossover is the world's #1 source of full-time remote jobs. We're recruiting this role for our client", 'GFI Software. The position is immediately available and requires entering into an independent contractor agreement with Crossover. The compensation level for this role is $50 USD/hour', 'which equates to $100', '000 USD/year assuming 40 hours per week and 50 weeks per year.Software Engineering', 'C++', 'Network Communications', 'Security', 'AI', 'Large Langua

Your max_length is set to 128, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


generated keywords:  C#, HTML, CSS, JavaScript, TypeScript, SQL, SQL Server, Azure, AWS, Docker, Kubernetes, Agile, Continuous Integration, Continuous Deployment
['C#', 'HTML', 'CSS', 'JavaScript', 'TypeScript', 'SQL', 'SQL Server', 'Azure', 'AWS', 'Docker', 'Kubernetes', 'Agile', 'Continuous Integration', 'Continuous Deployment']

orginal keywords :  .NET, JavaScript, CSS, HTML, SQL, XML, JSON, APIs, Agile, .NET Core, .NET Framework, ASP.NET, TDD, ObjectOriented Design, Web Development, CloudNative Technologies
['.NET', 'JavaScript', 'CSS', 'HTML', 'SQL', 'XML', 'JSON', 'APIs', 'Agile', '.NET Core', '.NET Framework', 'ASP.NET', 'TDD', 'ObjectOriented Design', 'Web Development', 'CloudNative Technologies']

[5.56, 5.88, 0.0, 0.0, 15.0, 0.0, 3.12, 0.0, 7.69, 4.88, 0.0, 0.0, 15.79, 0.0, 0.0, 9.09, 0.0, 33.33, 0.0, 31.25]


generated keywords:  Software Engineering, Software Development, Software Architecture, Software Design, Software Testing, Software Quality Assurance
['Software Engine

In [None]:
def split_list(lst, chunk_size):
    """Cut ``lst`` into consecutive slices of ``chunk_size`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``. Returns the slices as a list, in order.
    """
    chunks = []
    for start in range(0, len(lst), chunk_size):
        stop = start + chunk_size
        chunks.append(lst[start:stop])
    return chunks

# Number of consecutive test scores summarised together in one chunk.
CHUNK_SIZE = 9

result = split_list(scoreList, CHUNK_SIZE)

# Per-chunk statistics, collected in chunk order. Looping over every chunk
# replaces five copy-pasted stanzas that indexed result[0]..result[4]
# directly and therefore only worked when there were exactly five chunks.
chunk_means = []
chunk_stds = []
chunk_vars = []

for chunk in result:
    chunk_mean = np.mean(chunk)
    chunk_std = np.std(chunk)
    chunk_var = np.var(chunk)

    print(chunk)
    print(chunk_mean)
    print(chunk_std)
    print(chunk_var)
    print()

    chunk_means.append(chunk_mean)
    chunk_stds.append(chunk_std)
    chunk_vars.append(chunk_var)

# Average each statistic across the chunks (a mean of the per-chunk values,
# not a statistic recomputed over the pooled scores).
meanMean = np.mean(chunk_means)
meanStd = np.mean(chunk_stds)
meanVar = np.mean(chunk_vars)

print(meanMean)
print(meanStd)
print(meanVar)



[5.56, 5.88, 0.0, 0.0, 15.0, 0.0, 3.12, 0.0, 7.69]
4.138888888888889
4.774758736067648
22.79832098765432

[4.88, 0.0, 0.0, 15.79, 0.0, 0.0, 9.09, 0.0, 33.33]
7.01
10.668721677043704
113.82162222222223

[0.0, 31.25, 0.0, 0.0, 6.25, 33.33, 30.0, 16.67, 0.0]
13.055555555555555
14.025274892224719
196.7083358024691

[12.5, 0.0, 0.0, 0.0, 0.0, 5.26, 12.5, 0.0, 9.09]
4.372222222222222
5.278349442724134
27.860972839506175

[8.33, 0.0, 8.0, 12.5, 2.78, 0.0, 0.0, 4.55, 0.0]
4.017777777777777
4.38161506950165
19.19855061728395

6.51888888888889
7.8257439635123704
76.07756049382716
