In [2]:
import pandas as pd
# The CSV is expected to provide the 'clean_text' (input strings) and
# 'is_depression' (label) columns that the rest of the notebook reads.
dataset_csv_path = '/kaggle/input/text-dataset/dataset.csv'
df = pd.read_csv(dataset_csv_path)

In [7]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm



# Data Preprocessing
def preprocess_text(text: str) -> str:
    """Return ``text`` converted to lowercase.

    This is the hook for any additional per-string normalisation steps;
    at the moment lowercasing is the only transformation performed.

    Args:
        text: The raw string to normalise.

    Returns:
        The lowercased string.
    """
    lowered = text.lower()
    return lowered

import re

# Two-stage cleanup of the text column: lowercase every entry, then strip
# every character that is not an ASCII letter, a digit or whitespace.
lowercased_text = df['clean_text'].apply(preprocess_text)
non_alnum_pattern = r'[^a-zA-Z0-9\s]'
df['clean_text'] = lowercased_text.apply(
    lambda entry: re.sub(non_alnum_pattern, '', entry)
)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
text_column = df['clean_text']
label_column = df['is_depression']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text_column,
    label_column,
    test_size=0.2,
    random_state=42,
)

# Tokenizer and classifier are both loaded from the same pre-trained
# checkpoint so that the vocabulary matches the encoder weights.
pretrained_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(pretrained_checkpoint)

# Tokenize and encode the text data.
# truncation=True clips over-long texts to the tokenizer's maximum length;
# padding=True pads every sequence in the call to the longest one so the
# result can be stacked into a rectangular tensor.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)


def _to_tensor_dataset(encodings, labels):
    """Pack tokenizer output and labels into a ``TensorDataset``.

    Args:
        encodings: Tokenizer output providing 'input_ids' and
            'attention_mask' as equally sized lists.
        labels: pandas Series of class labels aligned with ``encodings``.

    Returns:
        A ``TensorDataset`` yielding ``(input_ids, attention_mask, label)``.
    """
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        # `.values` drops the (shuffled) pandas index. The explicit long dtype
        # guarantees integer class indices even if the label column was read
        # as bool, which the classification loss would otherwise reject.
        torch.tensor(labels.values, dtype=torch.long),
    )


# Create the PyTorch datasets consumed by the DataLoaders.
train_dataset = _to_tensor_dataset(train_encodings, train_labels)
test_dataset = _to_tensor_dataset(test_encodings, test_labels)

# Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# NOTE(review): the held-out test split doubles as the validation set used for
# early stopping and model selection, so the validation metrics printed below
# are an optimistic estimate; consider carving out a separate validation split.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

num_epochs = 50
early_stopping_patience = 5
best_validation_loss = float('inf')
no_improvement_count = 0
# Snapshot (weights + metrics) of the epoch with the lowest validation loss.
# Without it, early stopping would leave `model` holding the weights of the
# last, non-improving epoch rather than the best one.
best_checkpoint = None

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  'labels': batch[2].to(device)}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()

        # Running training accuracy, measured with the pre-update weights.
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        total_correct += (predicted_labels == inputs['labels']).sum().item()
        total_samples += len(inputs['labels'])

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    accuracy = total_correct / total_samples

    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # ---- Validation pass (no gradients, dropout disabled) ----
    model.eval()
    validation_loss = 0
    validation_correct = 0
    validation_samples = 0
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Validation'):
            inputs = {'input_ids': batch[0].to(device),
                      'attention_mask': batch[1].to(device),
                      'labels': batch[2].to(device)}
            outputs = model(**inputs)
            validation_loss += outputs.loss.item()

            predicted_labels = torch.argmax(outputs.logits, dim=1)
            validation_correct += (predicted_labels == inputs['labels']).sum().item()
            validation_samples += len(inputs['labels'])

    avg_validation_loss = validation_loss / len(test_loader)
    validation_accuracy = validation_correct / validation_samples

    print(f'Validation Loss: {avg_validation_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}')

    # ---- Early stopping check ----
    if avg_validation_loss < best_validation_loss:
        best_validation_loss = avg_validation_loss
        no_improvement_count = 0
        best_checkpoint = {
            'epoch': epoch,
            'train_loss': avg_loss,
            'train_accuracy': accuracy,
            'validation_loss': avg_validation_loss,
            'validation_accuracy': validation_accuracy,
            # state_dict() returns references to the live parameters, so each
            # tensor is copied (to CPU, to spare accelerator memory).
            'model_state_dict': {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            },
        }
    else:
        no_improvement_count += 1
        if no_improvement_count >= early_stopping_patience:
            print(f'No improvement for {early_stopping_patience} epochs. Early stopping...')
            break

# Roll the model back to the best epoch and re-point the summary variables at
# that epoch, so anything that later reports or saves them describes the
# weights actually held by `model`. The optimizer state is left as it was
# after the final training step.
if best_checkpoint is not None:
    model.load_state_dict(best_checkpoint['model_state_dict'])
    model.eval()
    epoch = best_checkpoint['epoch']
    avg_loss = best_checkpoint['train_loss']
    accuracy = best_checkpoint['train_accuracy']
    avg_validation_loss = best_checkpoint['validation_loss']
    validation_accuracy = best_checkpoint['validation_accuracy']
    print(f'Restored weights from epoch {epoch + 1} '
          f'(validation loss {avg_validation_loss:.4f}).')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/50: 100%|██████████| 773/773 [02:47<00:00,  4.61it/s]


Epoch 1/50, Average Loss: 0.1136, Accuracy: 0.9571


Validation: 100%|██████████| 194/194 [00:13<00:00, 14.65it/s]


Validation Loss: 0.0933, Validation Accuracy: 0.9625


Epoch 2/50: 100%|██████████| 773/773 [02:47<00:00,  4.61it/s]


Epoch 2/50, Average Loss: 0.0427, Accuracy: 0.9845


Validation: 100%|██████████| 194/194 [00:13<00:00, 14.66it/s]


Validation Loss: 0.0673, Validation Accuracy: 0.9780


Epoch 3/50: 100%|██████████| 773/773 [02:47<00:00,  4.61it/s]


Epoch 3/50, Average Loss: 0.0218, Accuracy: 0.9939


Validation: 100%|██████████| 194/194 [00:13<00:00, 14.65it/s]


Validation Loss: 0.0700, Validation Accuracy: 0.9793


Epoch 4/50: 100%|██████████| 773/773 [02:47<00:00,  4.61it/s]


Epoch 4/50, Average Loss: 0.0111, Accuracy: 0.9971


Validation: 100%|██████████| 194/194 [00:13<00:00, 14.67it/s]


Validation Loss: 0.1118, Validation Accuracy: 0.9793


Epoch 5/50: 100%|██████████| 773/773 [02:47<00:00,  4.61it/s]


Epoch 5/50, Average Loss: 0.0060, Accuracy: 0.9982


Validation: 100%|██████████| 194/194 [00:13<00:00, 14.66it/s]


Validation Loss: 0.0850, Validation Accuracy: 0.9741


Epoch 6/50: 100%|██████████| 773/773 [02:47<00:00,  4.61it/s]


Epoch 6/50, Average Loss: 0.0030, Accuracy: 0.9994


Validation: 100%|██████████| 194/194 [00:13<00:00, 14.66it/s]


Validation Loss: 0.1105, Validation Accuracy: 0.9774


Epoch 7/50: 100%|██████████| 773/773 [02:47<00:00,  4.61it/s]


Epoch 7/50, Average Loss: 0.0119, Accuracy: 0.9974


Validation: 100%|██████████| 194/194 [00:13<00:00, 14.69it/s]

Validation Loss: 0.1431, Validation Accuracy: 0.9729
No improvement for 5 epochs. Early stopping...





In [36]:
def detect_depression(input_text):
    """Classify a single piece of text with the fine-tuned model.

    The text is cleaned the same way as the training data (lowercased, then
    stripped of everything except ASCII letters, digits and whitespace),
    tokenized, and passed through the module-level ``model`` on ``device``.

    Args:
        input_text: The raw string to classify.

    Returns:
        ``1`` if the softmax probability of class 1 is at least 0.5,
        otherwise ``0``.
    """
    # Mirror the training-time cleaning so inference sees the same kind of
    # text the model was fine-tuned on.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', input_text.lower())

    # truncation=True keeps over-long inputs within the model's maximum
    # sequence length instead of failing inside the position embeddings.
    input_encoding = tokenizer(cleaned_text, return_tensors='pt', truncation=True)
    input_encoding = {key: value.to(device) for key, value in input_encoding.items()}

    model.eval()  # make sure dropout is off regardless of earlier training
    with torch.no_grad():  # inference only: skip building the autograd graph
        output = model(**input_encoding)

    # The head emits one logit per class trained with cross-entropy, so the
    # class probabilities come from a softmax across classes. An element-wise
    # sigmoid would score each logit independently and can disagree with the
    # argmax rule used during training and evaluation.
    probability = torch.softmax(output.logits, dim=1)
    probability_positive_class = probability[:, 1].item()  # probability of class 1
    prediction = 1 if probability_positive_class >= 0.5 else 0
    return prediction


# Example usage
# Interactive demo: input() blocks until a line of text is typed, so this cell
# cannot complete in an unattended "Run All". The typed text is classified by
# detect_depression and the resulting class label (0 or 1) is printed.
new_input_text = input("enter")
new_prediction = detect_depression(new_input_text)
print(f"Predicted class: {new_prediction}")

enter  feel very depressed.


Predicted class: 1


In [37]:
# Bundle the model weights, the optimizer state and the current values of the
# training-loop summary variables into one checkpoint dictionary, then write
# it to disk in the working directory.
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'train_loss': avg_loss,
    'train_accuracy': accuracy,
    'validation_loss': avg_validation_loss,
    'validation_accuracy': validation_accuracy,
}
torch.save(checkpoint, 'distilbert_model.pth')