In [None]:
# Standard library imports
import csv
import os

# Third-party library imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset
from transformers import (AdamW, BertForSequenceClassification, BertTokenizer, get_linear_schedule_with_warmup, get_scheduler, Trainer)


# Load the tokenizer from the local chinese-bert-wwm checkpoint directory.
tokenizer = BertTokenizer.from_pretrained(
    "D:/Dropbox/Dropbox/vs_cloud/Job_posting_data/chinese-bert-wwm/"
)

# Read the labelled sample. Malformed rows are skipped and bytes that cannot be
# decoded are dropped rather than raising.
df = pd.read_csv(
    "G:/Data/job_posting/processed/finetune/est_sample.csv",
    encoding="utf_8_sig",
    on_bad_lines="skip",
    encoding_errors="ignore",
)

# Remove the '-' separator from the SOC codes; the values stay strings.
df["soc_code"] = df["soc_code"].str.replace("-", "")

# Missing 'true_ind' entries become False, everything else is cast to bool.
# NOTE(review): astype(bool) turns every non-empty string into True, not only
# 'Yes' -- confirm the column holds nothing but 'Yes' and NaN.
df["true_ind"] = df["true_ind"].fillna(False).astype(bool)

# Give every distinct SOC code an integer class id equal to its position in
# sorted order (0, 1, 2, ...). The 'soc_code1' column itself is added later.
unique_soc_codes = sorted(df["soc_code"].unique())
soc_code_dict = dict(zip(unique_soc_codes, range(len(unique_soc_codes))))

In [None]:
### Split the data 60% / 20% / 20% into train / validation / test sets.
# The first call holds out 40% of df; the second call cuts that hold-out in half,
# giving 20% of the original data each for validation and test. Both calls use
# random_state=42 so the partition is reproducible.
train_df_sample, temp_df_sample = train_test_split(df, random_state=42, test_size=0.4)
valid_df_sample, test_df_sample = train_test_split(temp_df_sample, random_state=42, test_size=0.5)

# Export the splits to CSV (train, then test, then validation), without the index.
csv_options = {"index": False, "encoding": "utf_8_sig", "header": True}
for split_frame, csv_path in (
    (train_df_sample, 'F:/Data/job_posting/processed/finetune/train_df_sample.csv'),
    (test_df_sample, 'F:/Data/job_posting/processed/finetune/test_df_sample.csv'),
    (valid_df_sample, 'F:/Data/job_posting/processed/finetune/valid_df_sample.csv'),
):
    split_frame.to_csv(csv_path, **csv_options)

def preprocess_df(df, soc_code_dict):
    """Return a new frame without the stray index column and with integer class ids.

    Args:
        df: DataFrame that contains a 'soc_code' column.
        soc_code_dict: Mapping from 'soc_code' value to integer class id.

    Returns:
        A new DataFrame (the input is left untouched) in which 'Unnamed: 0' has
        been removed when present and 'soc_code1' holds the class id looked up
        in ``soc_code_dict``. Codes missing from the mapping become NaN.
    """
    # 'Unnamed: 0' is typically a leftover index column from an earlier CSV
    # export; errors='ignore' keeps this working for frames that never had it
    # instead of raising KeyError.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    df['soc_code1'] = df['soc_code'].map(soc_code_dict)
    return df

# Run preprocess_df over the three splits (train, test, validation) and rebind
# each name to its processed copy.
train_df_sample, test_df_sample, valid_df_sample = (
    preprocess_df(split_frame, soc_code_dict)
    for split_frame in (train_df_sample, test_df_sample, valid_df_sample)
)

# Function to extract titles, texts, and labels from a DataFrame
def extract_data(df):
    """Pull the model inputs out of a preprocessed split.

    Returns a 3-tuple ``(titles, texts, labels)`` of plain Python lists: the
    '工作名称' and '工作描述' columns cast to ``str``, and the 'soc_code1'
    column unchanged.
    """
    as_text = {
        column: df[column].astype(str).tolist()
        for column in ('工作名称', '工作描述')
    }
    return as_text['工作名称'], as_text['工作描述'], df['soc_code1'].tolist()

# Pull titles, descriptions and labels out of each split (train, test, validation).
(
    (train_titles, train_texts, train_labels),
    (test_titles, test_texts, test_labels),
    (valid_titles, valid_texts, valid_labels),
) = map(extract_data, (train_df_sample, test_df_sample, valid_df_sample))

# If you have more "credible" or reliable labels in your dataset, you can leverage this information to improve the performance of your model by assigning different weights to the loss function during training. This way, the model will put more emphasis on learning from the credible samples.
def assign_weight(true_ind):
    """Return the loss weight for one sample: 1.0 if ``true_ind`` is truthy, else 0.5."""
    return 1.0 if true_ind else 0.5
    
# Add a per-sample 'weight' column to every split: rows whose 'true_ind' is True
# (credible label) get 1.0, all others 0.5.
for split_frame in (train_df_sample, test_df_sample, valid_df_sample):
    split_frame['weight'] = split_frame['true_ind'].apply(assign_weight)

# Plain Python lists of the weights, one list per split.
train_weights, valid_weights, test_weights = (
    split_frame['weight'].tolist()
    for split_frame in (train_df_sample, valid_df_sample, test_df_sample)
)

# Create the JobPostingDataset class
class JobPostingDataset(Dataset):
    """Dataset yielding one tokenized job posting with its label and sample weight.

    Each item is built from a title and a description. The title is repeated
    ``repeat_title`` times in front of the description so that it carries more
    weight in the (possibly truncated) input sequence.

    Args:
        titles: Sequence of title strings.
        descriptions: Sequence of description strings, aligned with ``titles``.
        labels: Sequence of integer class ids, aligned with ``titles``.
        weights: Sequence of per-sample loss weights, aligned with ``titles``.
        tokenizer: Object exposing ``encode_plus(text, truncation=..., padding=...,
            max_length=..., return_tensors=...)`` that returns a mapping with
            'input_ids' and 'attention_mask'.
        max_length: Length every encoded sequence is padded or truncated to.
        repeat_title: How many times the title is prepended (default 2).

    Raises:
        ValueError: If the four input sequences do not all have the same length.
    """

    def __init__(self, titles, descriptions, labels, weights, tokenizer, max_length, repeat_title=2):
        # __len__ is driven by titles alone, so a shorter or longer sibling
        # sequence would otherwise be truncated silently or fail mid-epoch.
        sizes = (len(titles), len(descriptions), len(labels), len(weights))
        if len(set(sizes)) != 1:
            raise ValueError(
                "titles, descriptions, labels and weights must have the same length, "
                f"got {sizes}"
            )
        self.titles = titles
        self.descriptions = descriptions
        self.labels = labels
        self.weights = weights
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.repeat_title = repeat_title

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label, weight)`` tensors for item ``idx``."""
        label = self.labels[idx]
        weight = self.weights[idx]

        # Title (repeated, each followed by a space) and then the description.
        text = (self.titles[idx] + " ") * self.repeat_title + self.descriptions[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) drops the leading dimension of the returned tensors so the
        # DataLoader can stack items into a batch itself.
        return (
            encoding["input_ids"].squeeze(0),
            encoding["attention_mask"].squeeze(0),
            torch.tensor(label, dtype=torch.long),
            torch.tensor(weight, dtype=torch.float),
        )

# Sequence length and batch size shared by all three splits.
max_length = 512
batch_size = 20

# Wrap each split's lists in a JobPostingDataset.
train_dataset = JobPostingDataset(
    titles=train_titles,
    descriptions=train_texts,
    labels=train_labels,
    weights=train_weights,
    tokenizer=tokenizer,
    max_length=max_length,
)
valid_dataset = JobPostingDataset(
    titles=valid_titles,
    descriptions=valid_texts,
    labels=valid_labels,
    weights=valid_weights,
    tokenizer=tokenizer,
    max_length=max_length,
)
test_dataset = JobPostingDataset(
    titles=test_titles,
    descriptions=test_texts,
    labels=test_labels,
    weights=test_weights,
    tokenizer=tokenizer,
    max_length=max_length,
)

# Batch the datasets; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)
### Define an evaluate() function to compute the validation loss
def evaluate(model, valid_loader, device, loss_fn):
    """Return the mean sample-weighted loss of ``model`` over ``valid_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.
        valid_loader: Iterable of batches indexed as
            ``(input_ids, attention_mask, labels, weights)``.
        device: Device the batch tensors are moved to.
        loss_fn: Loss returning one value per sample (e.g. built with
            ``reduction='none'``) so it can be multiplied by the sample weights.

    Returns:
        The average over batches of ``mean(loss_fn(logits, labels) * weights)``.

    Raises:
        ValueError: If ``valid_loader`` yields no batches.
    """
    # Remember the current mode: this function is called between training
    # epochs, and leaving the model in eval mode would silently disable dropout
    # for every epoch that follows.
    was_training = model.training
    model.eval()
    total_loss = 0.0
    num_batches = 0
    try:
        with torch.no_grad():
            for batch in valid_loader:
                inputs = batch[0].to(device)
                masks = batch[1].to(device)
                labels = batch[2].to(device)
                weights = batch[3].to(device)

                logits = model(inputs, attention_mask=masks).logits
                # Per-sample loss scaled by each sample's weight, then averaged.
                weighted_batch_loss = loss_fn(logits, labels) * weights
                total_loss += torch.mean(weighted_batch_loss).item()
                num_batches += 1
    finally:
        model.train(was_training)

    if num_batches == 0:
        raise ValueError("valid_loader yielded no batches; cannot compute a validation loss")
    return total_loss / num_batches

In [None]:
# Define the model, the optimizer, and the learning rate scheduler.

# The classification head needs one output per class id. Class ids are assigned
# by soc_code_dict over the full dataset, so size the head from that mapping:
# counting only the classes that landed in the training split would make the
# head too small (and some label ids out of range) whenever a class is absent
# from the training split.
num_labels = len(soc_code_dict)
model = BertForSequenceClassification.from_pretrained("G:/Other computers/我的计算机/cloud_share/Job_posting_data/chinese-bert-wwm/", num_labels=num_labels)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# num_epochs is an upper bound on the number of passes over the training data.
# The schedule decays the learning rate linearly (no warmup) over that many
# optimizer steps.
num_epochs = 300
num_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
# Compute "balanced" class weights from the training labels.
# CrossEntropyLoss looks a weight up by class id, so entry i of class_weights
# must belong to class i. unique() returns classes in order of first appearance,
# hence the sort before handing them to compute_class_weight.
unique_labels = np.sort(train_df_sample['soc_code1'].unique())
balanced_weights = compute_class_weight('balanced', classes=unique_labels, y=train_labels)

# Scatter the weights into a vector with one slot per model output; a class with
# no training samples keeps the neutral weight 1.0.
class_weights = torch.ones(num_labels, dtype=torch.float)
class_weights[torch.as_tensor(unique_labels, dtype=torch.long)] = torch.tensor(balanced_weights, dtype=torch.float)

In [None]:
# Early stopping: training ends once the validation loss has failed to improve
# for `early_stopping_patience` consecutive epochs.
early_stopping_patience = 3

# Number of consecutive epochs without a new best validation loss; reset to 0
# whenever the validation loss improves.
num_epochs_without_improvement = 0

# Lowest validation loss seen so far.
best_valid_loss = float('inf')

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Average training loss of every completed epoch
train_losses = []

# Utilize multiple GPUs with DataParallel
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

# Class-weighted cross entropy. reduction='none' keeps one loss value per sample
# so that it can be multiplied by the per-sample credibility weight below.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights.to(device), reduction='none')

for epoch in range(num_epochs):
    # evaluate() calls model.eval() at the end of each epoch, so training mode
    # (dropout etc.) has to be switched back on here, not just once before the loop.
    model.train()

    epoch_train_loss = 0
    num_batches = 0
    for batch in train_loader:
        inputs = batch[0].to(device)
        masks = batch[1].to(device)
        labels = batch[2].to(device)
        weights = batch[3].to(device)  # per-sample weights are the 4th element of the batch

        optimizer.zero_grad()

        logits = model(inputs, attention_mask=masks).logits

        # Compute the loss for each sample
        batch_loss = loss_fn(logits, labels)

        # Multiply the loss by the corresponding weight
        weighted_batch_loss = batch_loss * weights

        # Average the weighted losses
        loss = torch.mean(weighted_batch_loss)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Add the current batch loss to the epoch_train_loss
        epoch_train_loss += loss.item()
        num_batches += 1

    # Calculate average loss for the current epoch and append it to the train_losses list
    epoch_train_loss /= num_batches
    train_losses.append(epoch_train_loss)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_train_loss:.4f}")

    # Evaluate the model on the validation set
    valid_loss = evaluate(model, valid_loader, device, loss_fn)
    print(f"Validation Loss: {valid_loss:.4f}")

    # Save the best model based on the validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # DataParallel wraps the real model in .module; save the underlying model
        if isinstance(model, torch.nn.DataParallel):
            model.module.save_pretrained("F:/Data/job_posting/processed/finetune/best_model")
        else:
            model.save_pretrained("F:/Data/job_posting/processed/finetune/best_model")
        num_epochs_without_improvement = 0
    else:
        num_epochs_without_improvement += 1

    # Check the stopping condition and break the loop if needed
    if num_epochs_without_improvement >= early_stopping_patience:
        print("Early stopping due to no improvement in validation loss.")
        break