In [15]:
# pip install scikit-learn

[0mDefaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.5.1

In [7]:
# pip install torch
# pip install transformers

[0mDefaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.6.80-

In [90]:
import os
# Send Hugging Face downloads to a project-local cache folder.
# NOTE(review): presumably the library reads HF_HOME when it is first imported,
# so this cell should stay ahead of the cell that imports `transformers` — confirm.
os.environ.update({'HF_HOME': 'my_cache'})

In [91]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup

In [92]:
# Load the '|||'-delimited training file into a DataFrame.
# Each well-formed line has the layout:  label ||| dialogue ||| response
DATA_PATH = 'train_dailydialog.txt'
MAX_LINES = 400000  # cap on lines READ from the file (not on rows kept)
FIELD_SEPARATOR = '|||'

data = []
skipped_lines = 0  # lines that did not split into exactly three fields
# Explicit encoding so the result does not depend on the platform's locale default.
with open(DATA_PATH, 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        if i >= MAX_LINES:
            break
        parts = line.strip().split(FIELD_SEPARATOR)
        if len(parts) == 3:
            # Order is [label, dialogue, response]; trim padding around each field.
            data.append([part.strip() for part in parts])
        else:
            skipped_lines += 1

# Convert to a DataFrame
df = pd.DataFrame(data, columns=['Label', 'Dialogue', 'Response'])
pd.options.display.max_colwidth = 400  # Adjust this value as needed

# Surface dropped lines instead of discarding them silently.
if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) (expected 3 fields separated by '{FIELD_SEPARATOR}')")

# Check the first few rows
print(df.head())

         Label  \
0  adversarial   
1  adversarial   
2  adversarial   
3     original   
4     original   

                                                                                                                                                                                                                                                                                                                                        Dialogue  \
0  What made you think that she wasn't very confident ? Did you notice the way that she avoided making eye contact with us while she talked ? She was a bit nervous , I guess . What else ? When she first walked into the room to greet us , she didn't shake our hands or introduce herself at all . I thought that was a bit unprofessional .   
1                                                                                                                                                                                                                  

In [93]:
# Step 1: Map string labels to integers
label_map = {
    'original': 0,
    'adversarial': 1,
    'random': 2
}

# Encode the 'Label' column. Values that are already valid class ids pass
# through unchanged, so re-running this cell does not turn the column into NaN.
valid_ids = set(label_map.values())
encoded_labels = df['Label'].map(lambda value: label_map.get(value, value))

# Fail with a readable message if the file contains a label we do not know,
# instead of the opaque NaN-to-int conversion error a plain .map() would cause.
unexpected = encoded_labels[~encoded_labels.isin(valid_ids)]
if not unexpected.empty:
    raise ValueError(
        f"Unexpected label value(s) {sorted(unexpected.astype(str).unique())}; "
        f"expected one of {sorted(label_map)}"
    )

# Ensure the Label column is in integer format
df['Label'] = encoded_labels.astype(int)

# Check the DataFrame to confirm the label mapping
print(df[['Label', 'Dialogue', 'Response']].head())

   Label  \
0      1   
1      1   
2      1   
3      0   
4      0   

                                                                                                                                                                                                                                                                                                                                        Dialogue  \
0  What made you think that she wasn't very confident ? Did you notice the way that she avoided making eye contact with us while she talked ? She was a bit nervous , I guess . What else ? When she first walked into the room to greet us , she didn't shake our hands or introduce herself at all . I thought that was a bit unprofessional .   
1                                                                                                                                                                                                                  Yes . She goes three times a week . 

In [94]:
from transformers import RobertaTokenizer, AutoTokenizer

# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint,
# caching downloaded files under 'my_cache'.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    cache_dir='my_cache',
)

def tokenize_function(row, max_length=512):
    """Tokenize one dialogue/response pair into fixed-length model inputs.

    The dialogue and the response are passed to the tokenizer as a text pair
    rather than being glued into one string around a literal "[SEP]". With a
    single string, truncation removes tokens from the right-hand end, so a long
    dialogue would cut off part or all of the response. Pair encoding lets the
    tokenizer place its own separator and, with ``truncation=True``, trim the
    longer of the two segments instead.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with 'Dialogue' and
            'Response' entries.
        max_length: Length every encoded example is padded/truncated to.

    Returns:
        dict with 1-D tensors 'input_ids' and 'attention_mask'.
    """
    encoded = tokenizer(
        row['Dialogue'],
        row['Response'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt"
    )
    # The tokenizer returns a batch of one; drop only that leading batch axis.
    return {
        'input_ids': encoded['input_ids'].squeeze(0),
        'attention_mask': encoded['attention_mask'].squeeze(0)
    }

# Tokenize every row; each element of the result is the dict returned by tokenize_function
encoded_rows = df.apply(tokenize_function, axis=1)

# Store each encoded field as a plain Python list in its own column
for field in ('input_ids', 'attention_mask'):
    df[field] = encoded_rows.map(lambda enc, key=field: enc[key].tolist())

# Report the label distribution and range (expected ids: 0, 1, 2 for 3-class classification)
label_column = df['Label']
print("Label distribution:")
print(label_column.value_counts())
print(f"Label range: {label_column.min()} to {label_column.max()}")

Label distribution:
Label
2    133488
0    133362
1    133150
Name: count, dtype: int64
Label range: 0 to 2


In [95]:
# Hold out a stratified validation set so both splits keep the label balance.
# train_test_split is scikit-learn's sklearn.model_selection.train_test_split.
VAL_FRACTION = 0.2   # share of rows held out for validation
SPLIT_SEED = 42      # fixed seed so the split is reproducible

train_df, val_df = train_test_split(
    df,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
    stratify=df['Label']  # Ensure balanced splits
)

# Reset indices after splitting so positional and label-based indexing agree
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Sanity check: the two splits together must account for every row exactly once
assert len(train_df) + len(val_df) == len(df), "train/validation split lost or duplicated rows"

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print("Training label distribution:")
print(train_df['Label'].value_counts())
print("Validation label distribution:")
print(val_df['Label'].value_counts())

class DialogueDataset(Dataset):
    """Dataset view over a DataFrame with 'input_ids', 'attention_mask' and 'Label' columns.

    Indexing returns a dict of ``torch.long`` tensors keyed 'input_ids',
    'attention_mask' and 'labels'.
    """

    def __init__(self, dataframe):
        # Keep a reference to the frame; rows are converted to tensors on access.
        self.data = dataframe

    def __len__(self):
        return self.data.shape[0]

    def _long_tensor(self, column, idx):
        """Return the value at position `idx` of `column` as a torch.long tensor."""
        return torch.tensor(self.data[column].iloc[idx], dtype=torch.long)

    def __getitem__(self, idx):
        # Note the key rename: the 'Label' column is exposed as 'labels'.
        return {
            'input_ids': self._long_tensor('input_ids', idx),
            'attention_mask': self._long_tensor('attention_mask', idx),
            'labels': self._long_tensor('Label', idx),
        }

# Wrap each split in a DialogueDataset
train_dataset, val_dataset = DialogueDataset(train_df), DialogueDataset(val_df)

# Batches of 8. Training batches are shuffled and a trailing partial batch is
# dropped; validation keeps its order and every sample.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=False,
    drop_last=False,
)


Training set size: 320000
Validation set size: 80000
Training label distribution:
Label
2    106790
0    106690
1    106520
Name: count, dtype: int64
Validation label distribution:
Label
2    26698
0    26672
1    26630
Name: count, dtype: int64


In [96]:
from transformers import AutoModelForSequenceClassification

# Load 'distilbert-base-uncased' with a sequence-classification head sized for
# the three label ids (0, 1, 2); downloaded files are cached under 'my_cache'.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    cache_dir='my_cache',
    num_labels=3,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [97]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's parameters onto the chosen device
model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [98]:
def validate(model, val_loader, device):
    """Evaluate `model` on `val_loader` and print a short report.

    The model is switched to eval mode and run without gradient tracking.
    The reported loss is averaged per sample (each batch loss is weighted by
    its batch size), so a short final batch does not skew the result.

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returning an object with ``.loss``
            and ``.logits`` (logits shaped ``(batch, num_classes)``).
        val_loader: Iterable of dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        (avg_val_loss, accuracy) as Python floats.

    Raises:
        ValueError: If `val_loader` yields no samples.
    """
    model.eval()
    weighted_loss_sum = 0.0  # sum over batches of (batch loss * batch size)
    correct = 0
    total = 0
    num_classes = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            batch_size = labels.size(0)
            # Assumes outputs.loss is the mean over the batch (the usual default
            # for classification heads) — confirm if a custom loss is plugged in.
            weighted_loss_sum += outputs.loss.item() * batch_size

            # Get predictions
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += batch_size
            num_classes = max(num_classes, outputs.logits.size(1))

            # Store for detailed analysis
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    if total == 0:
        raise ValueError("val_loader yielded no samples; cannot compute validation metrics")

    avg_val_loss = weighted_loss_sum / total
    accuracy = correct / total

    # Print detailed validation info; minlength keeps one slot per class even
    # when the highest class id never occurs.
    print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")
    print(f"Predictions distribution: {np.bincount(all_preds, minlength=num_classes)}")
    print(f"True labels distribution: {np.bincount(all_labels, minlength=num_classes)}")

    return avg_val_loss, accuracy

In [99]:
def train(model, train_loader, val_loader, optimizer, device, num_epochs=3,
          patience=None, checkpoint_path='best_model.pth'):
    """Fine-tune `model`, validating after every epoch and checkpointing the best epoch.

    The learning rate follows a linear schedule with warmup over the first 10%
    of all optimizer steps. After each epoch the model is evaluated with
    `validate`; whenever validation accuracy improves, the model's state dict
    is written to `checkpoint_path`.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        train_loader: Sized iterable of dict batches used for training.
        val_loader: Iterable of dict batches passed to `validate`.
        optimizer: Optimizer already bound to the model's parameters.
        device: Device the batch tensors are moved to.
        num_epochs: Maximum number of passes over `train_loader`.
        patience: If set, stop once validation accuracy has failed to improve
            for this many consecutive epochs. ``None`` (default) always runs
            all `num_epochs`.
        checkpoint_path: File the best state dict is saved to.

    Returns:
        None.
    """
    # Learning rate scheduler
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),  # 10% warmup
        num_training_steps=total_steps
    )

    best_val_accuracy = 0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0
        correct_train = 0
        total_train = 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")

        for batch_idx, batch in enumerate(progress_bar):
            # Move data to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Update weights, then advance the schedule (it is stepped once per batch)
            optimizer.step()
            scheduler.step()

            # Track training metrics
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct_train += (preds == labels).sum().item()
            total_train += labels.size(0)

            # Update progress bar
            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'lr': f'{scheduler.get_last_lr()[0]:.2e}'
            })

        # Calculate training metrics
        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = correct_train / total_train

        print(f"\nEpoch {epoch + 1}/{num_epochs}:")
        print(f"Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

        # Validation phase (validate() prints its own report)
        _, val_accuracy = validate(model, val_loader, device)

        # Save best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            epochs_without_improvement = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            epochs_without_improvement += 1

        print("-" * 50)

        # Optional early stopping on stalled validation accuracy
        if patience is not None and epochs_without_improvement >= patience:
            print(f"Early stopping: no validation accuracy improvement for {patience} epoch(s)")
            break

# AdamW over all model parameters: learning rate 2e-5, weight decay 0.01
optimizer = AdamW(
    params=model.parameters(),
    lr=2e-5,
    weight_decay=0.01,
)

In [None]:
# Run fine-tuning for up to 12 epochs.
# NOTE(review): in the recorded run below, validation loss rises from epoch 3
# onward while training loss keeps falling — consider fewer epochs.
print("Starting training...")
train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=12,
)

Starting training...


Epoch 1/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:27:12<00:00,  7.64it/s, loss=0.2478, lr=1.67e-05]



Epoch 1/12:
Training Loss: 0.6016, Training Accuracy: 0.7342
Validation Loss: 0.4274, Accuracy: 0.8275
Predictions distribution: [27712 24584 27704]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 2/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:27:12<00:00,  7.64it/s, loss=0.2656, lr=1.85e-05]



Epoch 2/12:
Training Loss: 0.4162, Training Accuracy: 0.8394
Validation Loss: 0.4060, Accuracy: 0.8460
Predictions distribution: [26633 28315 25052]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 3/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:27:07<00:00,  7.65it/s, loss=0.0471, lr=1.67e-05]



Epoch 3/12:
Training Loss: 0.3383, Training Accuracy: 0.8905
Validation Loss: 0.4556, Accuracy: 0.8504
Predictions distribution: [29635 25725 24640]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 4/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:27:08<00:00,  7.65it/s, loss=0.3649, lr=1.48e-05]



Epoch 4/12:
Training Loss: 0.2794, Training Accuracy: 0.9243
Validation Loss: 0.5778, Accuracy: 0.8626
Predictions distribution: [26664 26879 26457]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 5/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:27:07<00:00,  7.65it/s, loss=0.0018, lr=1.30e-05]



Epoch 5/12:
Training Loss: 0.2159, Training Accuracy: 0.9481
Validation Loss: 0.7219, Accuracy: 0.8575
Predictions distribution: [28393 23435 28172]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 6/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:27:11<00:00,  7.65it/s, loss=0.0008, lr=1.11e-05]



Epoch 6/12:
Training Loss: 0.1596, Training Accuracy: 0.9643
Validation Loss: 0.8539, Accuracy: 0.8621
Predictions distribution: [28328 25182 26490]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 7/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:27:15<00:00,  7.64it/s, loss=0.0003, lr=9.26e-06]



Epoch 7/12:
Training Loss: 0.1150, Training Accuracy: 0.9760
Validation Loss: 0.8956, Accuracy: 0.8642
Predictions distribution: [26759 26071 27170]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 8/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:27:14<00:00,  7.64it/s, loss=0.0003, lr=7.41e-06]



Epoch 8/12:
Training Loss: 0.0806, Training Accuracy: 0.9839
Validation Loss: 0.9948, Accuracy: 0.8651
Predictions distribution: [28146 25618 26236]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 9/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:27:14<00:00,  7.64it/s, loss=0.0000, lr=5.56e-06]



Epoch 9/12:
Training Loss: 0.0573, Training Accuracy: 0.9887
Validation Loss: 1.0140, Accuracy: 0.8687
Predictions distribution: [27238 26515 26247]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 10/12: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [1:30:36<00:00,  7.36it/s, loss=0.0000, lr=3.70e-06]



Epoch 10/12:
Training Loss: 0.0390, Training Accuracy: 0.9926
Validation Loss: 1.0813, Accuracy: 0.8666
Predictions distribution: [28645 25575 25780]
True labels distribution: [26672 26630 26698]
--------------------------------------------------


Epoch 11/12:  54%|███████████████████████████████████████████████████████████████████████████████████                                                                        | 21435/40000 [46:44<40:29,  7.64it/s, loss=0.0001, lr=2.71e-06]

In [64]:
# Method 1: Save the complete model (RECOMMENDED)
# Writes the model and the tokenizer into one folder via their save_pretrained methods
def save_complete_model(model, tokenizer, save_directory="./saved_model"):
    """
    Persist `model` and `tokenizer` side by side in `save_directory`.

    The directory is created first if it is missing; then each object's
    ``save_pretrained`` is called with it (model first, tokenizer second).
    Prints a confirmation line and returns None.
    """
    os.makedirs(save_directory, exist_ok=True)

    # Same call for both artifacts, in a fixed order
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_directory)

    print(f"Model and tokenizer saved to: {save_directory}")

In [101]:
# Export the in-memory model together with its tokenizer.
# NOTE(review): this writes whatever weights `model` holds right now, not the
# best-validation state dict that train() stores in 'best_model.pth' — confirm
# this is the intended export.
save_complete_model(
    model=model,
    tokenizer=tokenizer,
    save_directory="./my_dialogue_classifier_12_400",
)

Model and tokenizer saved to: ./my_dialogue_classifier_12_400
