In [6]:
import pandas as pd 
df=pd.read_csv("data job posts.csv")


In [7]:
df = df.drop(['AnnouncementCode', 'Term', 'Eligibility', 'Audience', 'StartDate', 'Duration','RequiredQual', 'Salary',
       'ApplicationP', 'OpeningDate', 'Deadline', 'Notes', 'AboutC', 'Attach', 'Year', 'Month', 'IT'], axis=1)

In [8]:
import pandas as pd
import re

# Remove irrelevant information
# Clean the job title
df['Title'] = df['Title'].apply(lambda x: str(x))
df['Title'] = df['Title'].apply(lambda x: re.sub('[^a-z\s]', '', x))
df['Title'] = df['Title'].str.strip()


# Clean the JobRequirment
df['JobRequirment'] = df['JobRequirment'].apply(lambda x: str(x))
df['JobRequirment'] = df['JobRequirment'].apply(lambda x: re.sub('[^a-z\s]', '', x))
df['JobRequirment'] = df['JobRequirment'].str.strip()

# Extract the company name
df['Company'] = df['Company'].apply(lambda x: str(x).split(',')[0] if pd.notna(x) else '')
df['Company'] = df['Company'].str.strip()

# Extract the location
df['Location'] = df['Location'].apply(lambda x: x.split(',')[0] if pd.notna(x) else '')
df['Location'] = df['Location'].str.strip()

# Extract the required JobDescription
df['JobDescription'] = df['JobDescription'].apply(lambda x: str(x))
df['JobDescription'] = df['JobDescription'].apply(lambda x: re.findall('(?:requirement|qualification)[s]?[:]?[\\n\\s]*(.*?)(?:responsibilit|how to apply)', x))
df['JobDescription'] = df['JobDescription'].apply(lambda x: [item.strip() for item in x])
df['JobDescription'] = df['JobDescription'].apply(lambda x: ' '.join(x))




In [9]:
#model take a lot of time to run to we will take only 2000 rows
df = df.iloc[:2000]

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df))

# Prepare the training data
train_texts = list(df['JobDescription'])
train_labels = list(range(len(df)))
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

class JobPostingDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = JobPostingDataset(train_encodings, train_labels)

# Fine-tune the pre-trained BERT model on the training data
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Save the fine-tuned model
model.save_pretrained('/finetune/model')
tokenizer.save_pretrained('/finetune/tokenizer')

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:13<00:00, 17.6kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 13.7kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 551kB/s]
Downloading pytorch_model.bin: 100%|██████████| 440M/440M [41:00<00:00, 179kB/s]    
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.

{'loss': 7.6521, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.08}


  5%|▌         | 20/375 [01:30<13:08,  2.22s/it]

{'loss': 7.6636, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.16}


  8%|▊         | 30/375 [01:50<11:31,  2.00s/it]

{'loss': 7.6271, 'learning_rate': 3e-06, 'epoch': 0.24}


 11%|█         | 40/375 [02:10<10:49,  1.94s/it]

{'loss': 7.6932, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.32}


 13%|█▎        | 50/375 [02:28<10:06,  1.87s/it]

{'loss': 7.6793, 'learning_rate': 5e-06, 'epoch': 0.4}


 16%|█▌        | 60/375 [02:48<09:51,  1.88s/it]

{'loss': 7.6761, 'learning_rate': 6e-06, 'epoch': 0.48}


 19%|█▊        | 70/375 [03:06<09:06,  1.79s/it]

{'loss': 7.6591, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.56}


 21%|██▏       | 80/375 [03:26<10:08,  2.06s/it]

{'loss': 7.6422, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.64}


 24%|██▍       | 90/375 [03:45<09:12,  1.94s/it]

{'loss': 7.6521, 'learning_rate': 9e-06, 'epoch': 0.72}


 27%|██▋       | 100/375 [04:04<08:01,  1.75s/it]

{'loss': 7.6631, 'learning_rate': 1e-05, 'epoch': 0.8}


 29%|██▉       | 110/375 [04:26<10:05,  2.29s/it]

{'loss': 7.6702, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.88}


 32%|███▏      | 120/375 [04:46<07:37,  1.80s/it]

{'loss': 7.658, 'learning_rate': 1.2e-05, 'epoch': 0.96}


 35%|███▍      | 130/375 [05:05<08:01,  1.97s/it]

{'loss': 7.6372, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.04}


 37%|███▋      | 140/375 [05:23<06:40,  1.70s/it]

{'loss': 7.6006, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.12}


 40%|████      | 150/375 [05:40<06:22,  1.70s/it]

{'loss': 7.628, 'learning_rate': 1.5e-05, 'epoch': 1.2}


 43%|████▎     | 160/375 [05:57<05:46,  1.61s/it]

{'loss': 7.6585, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.28}


 45%|████▌     | 170/375 [06:16<06:25,  1.88s/it]

{'loss': 7.6356, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.36}


 48%|████▊     | 180/375 [06:36<06:06,  1.88s/it]

{'loss': 7.6341, 'learning_rate': 1.8e-05, 'epoch': 1.44}


 51%|█████     | 190/375 [06:55<05:58,  1.94s/it]

{'loss': 7.6065, 'learning_rate': 1.9e-05, 'epoch': 1.52}


 53%|█████▎    | 200/375 [07:15<06:09,  2.11s/it]

{'loss': 7.5965, 'learning_rate': 2e-05, 'epoch': 1.6}


 56%|█████▌    | 210/375 [07:35<05:11,  1.89s/it]

{'loss': 7.6474, 'learning_rate': 2.1e-05, 'epoch': 1.68}


 59%|█████▊    | 220/375 [07:58<05:46,  2.23s/it]

{'loss': 7.6333, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.76}


 61%|██████▏   | 230/375 [08:17<04:27,  1.85s/it]

{'loss': 7.6149, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.84}


 64%|██████▍   | 240/375 [08:35<04:06,  1.82s/it]

{'loss': 7.6243, 'learning_rate': 2.4e-05, 'epoch': 1.92}


 67%|██████▋   | 250/375 [08:53<03:37,  1.74s/it]

{'loss': 7.6393, 'learning_rate': 2.5e-05, 'epoch': 2.0}


 69%|██████▉   | 260/375 [09:11<03:36,  1.89s/it]

{'loss': 7.6344, 'learning_rate': 2.6000000000000002e-05, 'epoch': 2.08}


 72%|███████▏  | 270/375 [09:31<03:20,  1.91s/it]

{'loss': 7.6306, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.16}


 75%|███████▍  | 280/375 [09:50<03:06,  1.96s/it]

{'loss': 7.6215, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.24}


 77%|███████▋  | 290/375 [10:09<02:39,  1.88s/it]

{'loss': 7.6137, 'learning_rate': 2.9e-05, 'epoch': 2.32}


 80%|████████  | 300/375 [10:26<02:02,  1.63s/it]

{'loss': 7.6105, 'learning_rate': 3e-05, 'epoch': 2.4}


 83%|████████▎ | 310/375 [10:44<01:51,  1.72s/it]

{'loss': 7.6677, 'learning_rate': 3.1e-05, 'epoch': 2.48}


 85%|████████▌ | 320/375 [11:01<01:31,  1.67s/it]

{'loss': 7.6455, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.56}


 88%|████████▊ | 330/375 [11:21<01:18,  1.74s/it]

{'loss': 7.7121, 'learning_rate': 3.3e-05, 'epoch': 2.64}


 91%|█████████ | 340/375 [11:38<01:00,  1.73s/it]

{'loss': 7.6546, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.72}


 93%|█████████▎| 350/375 [11:56<00:42,  1.69s/it]

{'loss': 7.662, 'learning_rate': 3.5e-05, 'epoch': 2.8}


 96%|█████████▌| 360/375 [12:14<00:26,  1.80s/it]

{'loss': 7.6479, 'learning_rate': 3.6e-05, 'epoch': 2.88}


 99%|█████████▊| 370/375 [12:31<00:08,  1.63s/it]

{'loss': 7.6134, 'learning_rate': 3.7e-05, 'epoch': 2.96}


100%|██████████| 375/375 [12:40<00:00,  1.83s/it]

{'train_runtime': 768.5411, 'train_samples_per_second': 7.807, 'train_steps_per_second': 0.488, 'train_loss': 7.6430610656738285, 'epoch': 3.0}


100%|██████████| 375/375 [12:40<00:00,  2.03s/it]


In [None]:
from transformers import pipeline

# load the fine-tuned model
model = pipeline('text2text-generation', model='/finetune/model', tokenizer='/finetune/tokenizer')

# define a function to perform semantic search
def search_jobs(query, location=None, max_results=10):
    # preprocess the query and location (if provided)
    query = query.lower()
    if location is not None:
        location = location.lower()
    
    # retrieve all job postings that match the location (if provided)
    if location is not None:
        matching_jobs = df[df['Location'].str.lower().str.contains(location)]
    else:
        matching_jobs = df
    
    # perform semantic search to retrieve the most relevant job postings
    results = []
    for i, row in matching_jobs.iterrows():
        job_description = row['JobDescription']
        score = model(query, job_description)[0]['score']
        results.append((i, score))
    
    # sort the results by relevance score and return the top results
    results = sorted(results, key=lambda x: x[1], reverse=True)[:max_results]
    return [matching_jobs.loc[i] for i, _ in results]


: 