# BERT Fine-Tuning for Domain-Specific Text Classification

In [None]:

# Install the required libraries
!pip install transformers torch datasets


## Step 1: Load the Dataset

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load the labelled corpus; the CSV must provide 'Text' and 'Category' columns.
df = pd.read_csv('domain_specific_text_classification_dataset.csv')

# Hold out 20% of the rows for evaluation. The seed is fixed so that
# re-running the notebook reproduces the same split (and therefore
# comparable evaluation numbers).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'].tolist(),
    df['Category'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Re-wrap each split with the column names the rest of the notebook reads:
# 'text' for the raw string and 'label' for its category.
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
test_df = pd.DataFrame({'text': test_texts, 'label': test_labels})

# Convert to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


## Step 2: Tokenize the Data

In [None]:

from transformers import BertTokenizer

# Load the pre-trained tokenizer that belongs to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``;
            its 'text' entry holds the raw strings to encode.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.
    """
    # Padding is deliberately not applied here. The Trainer is given the
    # tokenizer and pads each batch to its own longest sequence, so padding
    # every row to the model maximum up front would only add wasted compute
    # and memory.
    return tokenizer(examples['text'], truncation=True)


# Tokenize both splits, one batch of rows at a time
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


## Step 3: Convert Labels to Integers

In [None]:

# Give every distinct category a consecutive integer id, numbered in the
# order in which the categories first appear in the 'Category' column.
categories = df['Category'].unique()
label_dict = {category: index for index, category in enumerate(categories)}


def encode_label(example):
    """Return the example's 'label' replaced by its integer id from label_dict."""
    return {'label': label_dict[example['label']]}


# Apply the encoding to both splits, one example at a time
train_dataset = train_dataset.map(encode_label)
test_dataset = test_dataset.map(encode_label)


## Step 4: Load Pre-trained BERT for Classification

In [None]:

from transformers import BertForSequenceClassification

num_labels = len(label_dict)

# Keep the id <-> category-name mapping in the model config so that saved
# checkpoints and downstream predictions report category names rather than
# generic LABEL_<n> placeholders. Names go through str() so the config
# remains JSON-serializable whatever type the category column has.
id2label = {index: str(name) for name, index in label_dict.items()}
label2id = {name: index for index, name in id2label.items()}

# Load pre-trained BERT with a freshly initialised classification head
# sized for num_labels classes.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


## Step 5: Fine-Tune the Model

In [None]:

import inspect

import numpy as np
from transformers import Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: The (predictions, label_ids) pair the Trainer hands to
            its metrics callback; predictions are expected to be the
            per-class logits of the sequence-classification model.

    Returns:
        A dict with a single 'accuracy' entry in the range [0, 1].
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return {"accuracy": float((predicted_ids == labels).mean())}


# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old name is no
# longer accepted there. Pick whichever name the installed version supports.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

# Likewise, Trainer's `tokenizer` argument was renamed to `processing_class`.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

# Initialize Trainer. Passing the tokenizer lets the Trainer pad batches and
# save the tokenizer alongside model checkpoints.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # report accuracy in addition to loss
    **{_tokenizer_key: tokenizer},
)

# Train the model
trainer.train()


## Step 6: Evaluate the Model

In [None]:

# Evaluate the model: run the trainer's evaluation pass (the trainer was
# built with test_dataset as its eval_dataset) and keep the returned
# metrics in `results` so they can be inspected after printing.
results = trainer.evaluate()
print(results)
