In [15]:
import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import MobileBertTokenizer, MobileBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import joblib

# Step 1 - read the training table. 'train.csv' must provide a 'text' column
# and a 'labels' column holding ", "-separated label names.
df = pd.read_csv('train.csv')
df['labels'] = df['labels'].map(lambda raw_labels: raw_labels.split(", "))

# Step 2 - fit the binarizer and turn every label list into a 0/1 indicator row
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

# Step 3 - report how many rows carry each class (column sums of the indicator matrix)
class_counts = y.sum(axis=0)
print("Class distribution before splitting:")
for position, label in enumerate(mlb.classes_):
    print(f"{label}: {class_counts[position]}")

# Step 4 - carve off 30% of the rows, then halve that hold-out into validation
# and test, giving a 70/15/15 split. Both calls are seeded for reproducibility.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['text'].tolist(),
    y,
    random_state=42,
    test_size=0.3,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    random_state=42,
    test_size=0.5,
)

# 5. Define a custom dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one text per access for multi-label training.

    Args:
        texts: sequence of raw input strings.
        labels: sequence (or 2-D array) of per-text label indicator rows,
            aligned index-for-index with ``texts``.
        tokenizer: callable tokenizer; it is invoked as
            ``tokenizer(text, ...)`` and must return a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            batch dimension of 1.
        max_length: length every example is padded/truncated to.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail early instead of surfacing an IndexError in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of 1-D tensors."""
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments used here.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # Drop the batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Labels are converted to float32 for multi-label binary targets.
            'labels': torch.as_tensor(self.labels[idx], dtype=torch.float32),
        }

# 6. Tokenizer belonging to the checkpoint that is fine-tuned below
tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')

# 7. One TextDataset per split; all three share the same tokenizer
train_dataset, val_dataset, test_dataset = (
    TextDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# 8. MobileBERT with a classification head sized to the number of classes the
#    binarizer learned, configured for multi-label classification
model = MobileBertForSequenceClassification.from_pretrained(
    'google/mobilebert-uncased',
    problem_type="multi_label_classification",
    num_labels=len(mlb.classes_),
)

# 9. Training configuration: 10 epochs, batches of 8, evaluation after every epoch
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
)

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,  # validation split drives the per-epoch evaluation
    train_dataset=train_dataset,
)

# 10. Run fine-tuning
trainer.train()

# 11. Write model and tokenizer into the same directory so they can be
#     reloaded together
for artifact in (model, tokenizer):
    artifact.save_pretrained('my_mobilebert_model')

# Persist the fitted MultiLabelBinarizer as well; it is needed to map output
# columns back to label names
joblib.dump(mlb, 'mlb.joblib')

# 12. Define the inference function
def predict(texts, model, tokenizer_override=None, max_length=128):
    """Return per-label sigmoid probabilities for each input text.

    Args:
        texts: iterable of strings. A single bare string is treated as one
            text rather than being iterated character by character.
        model: classification model. It is moved to the selected device, put
            in eval mode, and called as ``model(**inputs)``; the result must
            expose ``.logits``.
        tokenizer_override: tokenizer to use. When ``None`` (the default) the
            module-level ``tokenizer`` is used, as before. Pass it explicitly
            to avoid depending on whichever tokenizer that global currently
            refers to.
        max_length: maximum token length passed to the tokenizer.

    Returns:
        ``np.ndarray`` with one row of probabilities per input text.
    """
    if isinstance(texts, str):
        texts = [texts]

    active_tokenizer = tokenizer if tokenizer_override is None else tokenizer_override

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    predictions = []
    with torch.no_grad():  # inference only, no gradients needed
        for text in texts:
            inputs = active_tokenizer(
                text,
                truncation=True,
                padding=True,
                return_tensors='pt',
                max_length=max_length,
            )
            # Inputs must live on the same device as the model.
            inputs = {key: value.to(device) for key, value in inputs.items()}

            logits = model(**inputs).logits
            # Independent sigmoid per label; back to CPU for NumPy.
            predictions.append(torch.sigmoid(logits).cpu().numpy().flatten())
    return np.array(predictions)

# 13. Smoke-test inference on a single hand-written prompt.
# NOTE: this rebinds `test_texts`, which up to this point held the texts of the
# held-out test split.
test_texts = ["Can you recommend a hobby that involves creativity and nature?"]
predictions = predict(test_texts, model)

# Only one text was scored, so the flattened prediction array lines up
# one-to-one with mlb.classes_
label_prob_pairs = [
    (class_name, probability)
    for class_name, probability in zip(mlb.classes_, predictions.flatten())
]

# Highest probability first
sorted_label_prob_pairs = sorted(
    label_prob_pairs,
    key=lambda pair: pair[1],
    reverse=True,
)

# Print every label with its probability, four decimals
for ranked_pair in sorted_label_prob_pairs:
    print(f"Label: {ranked_pair[0]}, Probability: {ranked_pair[1]:.4f}")

Class distribution before splitting:
Acting: 2
Adventure: 14
Advocacy: 1
Agility: 1
Analytical skills: 7
Animal care: 1
Animation: 1
Archaeology: 1
Art therapy: 1
Artistic: 7
Artistic design: 1
Artistic expression: 62
Artistic skills: 5
Astronomy: 1
Athletic training: 1
Attention to detail: 2
Baking: 1
Board games: 1
Book clubs: 1
Challenge: 6
Challenges: 1
Cognitive: 3
Cognitive skills: 12
Collaboration: 21
Communication: 17
Community: 45
Community engagement: 1
Community involvement: 1
Community outreach: 1
Community service: 14
Compassion: 10
Competition: 14
Connection with animals: 6
Content creation: 1
Cooking: 6
Cooking classes: 1
Cooperative games: 1
Coordination: 1
Costume design: 1
Crafting: 17
Craftsmanship: 16
Creative: 2
Creative cooking: 1
Creative problem-solving: 1
Creative projects: 1
Creative writing: 6
Creativity: 104
Critical thinking: 6
Culinary adventure: 5
Culinary arts: 11
Culinary challenges: 1
Culinary skills: 2
Culinary techniques: 1
Cultural appreciation: 8
C

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/270 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 7418.646484375, 'eval_runtime': 0.4014, 'eval_samples_per_second': 112.119, 'eval_steps_per_second': 14.949, 'epoch': 1.0}


  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.8920109272003174, 'eval_runtime': 0.4052, 'eval_samples_per_second': 111.069, 'eval_steps_per_second': 14.809, 'epoch': 2.0}


  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.412678599357605, 'eval_runtime': 0.4352, 'eval_samples_per_second': 103.389, 'eval_steps_per_second': 13.785, 'epoch': 3.0}


  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.10461199283599854, 'eval_runtime': 0.4285, 'eval_samples_per_second': 105.009, 'eval_steps_per_second': 14.001, 'epoch': 4.0}


  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.0842367634177208, 'eval_runtime': 0.4371, 'eval_samples_per_second': 102.959, 'eval_steps_per_second': 13.728, 'epoch': 5.0}


  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.0805240198969841, 'eval_runtime': 0.4215, 'eval_samples_per_second': 106.754, 'eval_steps_per_second': 14.234, 'epoch': 6.0}


  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.07935456931591034, 'eval_runtime': 0.4044, 'eval_samples_per_second': 111.289, 'eval_steps_per_second': 14.839, 'epoch': 7.0}


  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.07871467620134354, 'eval_runtime': 0.4508, 'eval_samples_per_second': 99.824, 'eval_steps_per_second': 13.31, 'epoch': 8.0}


  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.07831087708473206, 'eval_runtime': 0.4978, 'eval_samples_per_second': 90.397, 'eval_steps_per_second': 12.053, 'epoch': 9.0}


  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.07821638882160187, 'eval_runtime': 0.5483, 'eval_samples_per_second': 82.069, 'eval_steps_per_second': 10.943, 'epoch': 10.0}
{'train_runtime': 91.3496, 'train_samples_per_second': 22.989, 'train_steps_per_second': 2.956, 'train_loss': 123525.04444444444, 'epoch': 10.0}
Label: Creativity, Probability: 0.2489
Label: Artistic expression, Probability: 0.1651
Label: Collaboration, Probability: 0.1371
Label: Community, Probability: 0.1272
Label: Recreation, Probability: 0.1150
Label: Hands-on skills, Probability: 0.0999
Label: Physical fitness, Probability: 0.0900
Label: Mindfulness, Probability: 0.0687
Label: Outdoor activity, Probability: 0.0681
Label: Storytelling, Probability: 0.0635
Label: Exploration, Probability: 0.0625
Label: Nature appreciation, Probability: 0.0549
Label: Communication, Probability: 0.0540
Label: DIY, Probability: 0.0501
Label: Personal growth, Probability: 0.0500
Label: Teamwork, Probability: 0.0458
Label: Social interaction, Probability: 0.0447
La

In [17]:
# 12. Define the inference function
def predict(texts, model, tokenizer_override=None, max_length=128):
    """Score texts with a multi-label classifier and return probabilities.

    Args:
        texts: iterable of strings. A lone string is wrapped into a
            one-element list so it is scored as a whole text instead of one
            character at a time.
        model: classification model. It is moved to the chosen device,
            switched to eval mode, and invoked as ``model(**inputs)``; its
            output must provide ``.logits``.
        tokenizer_override: tokenizer to use. ``None`` (default) falls back
            to the module-level ``tokenizer``, which is the original
            behaviour.
        max_length: maximum token length handed to the tokenizer.

    Returns:
        ``np.ndarray`` holding one row of per-label probabilities per text.
    """
    if isinstance(texts, str):
        texts = [texts]

    active_tokenizer = tokenizer if tokenizer_override is None else tokenizer_override

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    predictions = []
    with torch.no_grad():  # no gradient bookkeeping during inference
        for text in texts:
            inputs = active_tokenizer(
                text,
                truncation=True,
                padding=True,
                return_tensors='pt',
                max_length=max_length,
            )
            # Keep model and inputs on the same device.
            inputs = {key: value.to(device) for key, value in inputs.items()}

            logits = model(**inputs).logits
            # Sigmoid gives an independent probability per label.
            predictions.append(torch.sigmoid(logits).cpu().numpy().flatten())
    return np.array(predictions)

# 13. Try the classifier on one prompt about agility and speed
test_texts = ["Can you recommend a hobby that helps improving my agility and speed?"]
predictions = predict(test_texts, model)

# A single text was scored, so flattening yields exactly one probability per
# entry of mlb.classes_
label_prob_pairs = [
    (class_name, probability)
    for class_name, probability in zip(mlb.classes_, predictions.flatten())
]

# Order the (label, probability) pairs from most to least likely
sorted_label_prob_pairs = sorted(
    label_prob_pairs,
    key=lambda pair: pair[1],
    reverse=True,
)

# Report each label with its probability rounded to four decimals
for ranked_pair in sorted_label_prob_pairs:
    print(f"Label: {ranked_pair[0]}, Probability: {ranked_pair[1]:.4f}")

Label: Creativity, Probability: 0.3711
Label: Artistic expression, Probability: 0.2213
Label: Community, Probability: 0.2102
Label: DIY, Probability: 0.1128
Label: Recreation, Probability: 0.1028
Label: Hands-on skills, Probability: 0.0811
Label: Collaboration, Probability: 0.0751
Label: Social interaction, Probability: 0.0725
Label: Exploration, Probability: 0.0711
Label: Physical fitness, Probability: 0.0655
Label: Performance, Probability: 0.0625
Label: Communication, Probability: 0.0602
Label: Outdoor activity, Probability: 0.0547
Label: Relaxation, Probability: 0.0503
Label: Competition, Probability: 0.0487
Label: Nature appreciation, Probability: 0.0477
Label: Education, Probability: 0.0473
Label: Mindfulness, Probability: 0.0463
Label: Community service, Probability: 0.0453
Label: Teamwork, Probability: 0.0431
Label: Personal growth, Probability: 0.0404
Label: Crafting, Probability: 0.0392
Label: Culinary arts, Probability: 0.0362
Label: Craftsmanship, Probability: 0.0355
Label:

In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Load dataset
data = pd.read_csv('train.csv')  # Update this with your actual path
texts = data['text'].tolist()

# Each row's labels arrive as one comma-separated string. Strip the whitespace
# around every name (and drop empty fragments) so that "A, B" yields 'A' and
# 'B' rather than 'A' and ' B', which the binarizer would treat as two
# different classes.
labels = [
    [name.strip() for name in label.split(',') if name.strip()]
    for label in data['labels'].tolist()
]

# Multi-label binarization: one 0/1 column per distinct label name
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(labels)

# Split the dataset into train, validation, and test sets (70% train, 15% val,
# 15% test). The splits are seeded so the datasets saved to disk further down
# can be regenerated identically.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.3, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

# Tokenization for train, validation, and test sets. Each split is tokenized
# in one call, so padding=True pads within that split only.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

# Custom Dataset Class
class CustomDataset(Dataset):
    """Dataset over already-tokenized inputs plus their label rows.

    ``encodings`` is a mapping from field name to an indexable batch of
    values; ``labels`` is converted once, at construction time, into a
    float32 tensor. The dataset length is the number of label rows.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick row `idx` out of every encoded field, then attach its labels.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Pair every tokenized split with its label matrix
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Write the dataset objects to disk so a later cell can reload them
for split_dataset, target_path in (
    (train_dataset, 'train_dataset.pt'),
    (val_dataset, 'val_dataset.pt'),
    (test_dataset, 'test_dataset.pt'),
):
    torch.save(split_dataset, target_path)

# Keep the fitted MultiLabelBinarizer too; it maps label columns back to names
import joblib
joblib.dump(mlb, 'mlb.joblib')



['mlb.joblib']

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Read the training table; it must provide 'text' and 'labels' columns.
dataset = pd.read_csv('train.csv')
input_texts = dataset['text'].tolist()

# Labels are stored as one comma-separated string per row. Whitespace around
# each name is stripped and empty fragments are dropped, so "A, B" becomes
# ['A', 'B'] instead of ['A', ' B'] (which would create a spurious ' B' class).
input_labels = [
    [name.strip() for name in label.split(',') if name.strip()]
    for label in dataset['labels'].tolist()
]

# One 0/1 indicator column per distinct label name.
label_binarizer = MultiLabelBinarizer()
encoded_labels = label_binarizer.fit_transform(input_labels)

# 70% train / 15% validation / 15% test. Both splits are seeded so the
# datasets written to disk further down are reproducible across runs.
train_texts, remaining_texts, train_labels, remaining_labels = train_test_split(
    input_texts, encoded_labels, test_size=0.3, random_state=42
)
validation_texts, test_texts, validation_labels, test_labels = train_test_split(
    remaining_texts, remaining_labels, test_size=0.5, random_state=42
)

# Tokenize each split in a single call; padding=True pads within that split.
text_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_data_encodings = text_tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
validation_data_encodings = text_tokenizer(validation_texts, truncation=True, padding=True, return_tensors='pt')
test_data_encodings = text_tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

class TextDataset(Dataset):
    """Index-aligned view over pre-tokenized encodings and their labels.

    The label rows are stored as a single float32 tensor; every item is a
    dict holding the ``index``-th entry of each encoding field plus a
    ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        # One example per label row.
        return len(self.labels)

    def __getitem__(self, index):
        item = dict(
            (field_name, field_values[index])
            for field_name, field_values in self.encodings.items()
        )
        item.update(labels=self.labels[index])
        return item

# Bundle each tokenized split with its labels
training_dataset, validation_dataset, testing_dataset = (
    TextDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_data_encodings, train_labels),
        (validation_data_encodings, validation_labels),
        (test_data_encodings, test_labels),
    )
)

# Store the dataset objects on disk; the training code below reloads them
for split_dataset, target_path in (
    (training_dataset, 'training_dataset.pt'),
    (validation_dataset, 'validation_dataset.pt'),
    (testing_dataset, 'testing_dataset.pt'),
):
    torch.save(split_dataset, target_path)

# The fitted binarizer is saved so predictions can be decoded to label names
import joblib
joblib.dump(label_binarizer, 'label_binarizer.joblib')

import torch
from transformers import MobileBertForSequenceClassification, Trainer, TrainingArguments, MobileBertTokenizer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Tokenizer for the 'google/mobilebert-uncased' checkpoint. In this cell it is
# only written out next to the fine-tuned model; the datasets themselves were
# tokenized earlier with DistilBertTokenizer ('distilbert-base-uncased').
# NOTE(review): confirm both tokenizers produce the same token ids, otherwise
# the saved tokenizer will not match what the model was trained on.
mobilebert_tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')

def calculate_metrics(predictions, threshold=0.5):
    """Compute multi-label accuracy, F1, precision and recall from logits.

    Args:
        predictions: evaluation output exposing ``.predictions`` (the model's
            raw logits, one column per label) and ``.label_ids`` (the 0/1
            ground-truth matrix).
        threshold: probability cutoff in the open interval (0, 1) above which
            a label counts as predicted. Defaults to 0.5.

    Returns:
        Dict with ``"accuracy"``, ``"f1"``, ``"precision"`` and ``"recall"``.
        Precision, recall and F1 are support-weighted averages over labels.
        For multi-label input, scikit-learn's ``accuracy_score`` is subset
        accuracy (every label of a sample must match).

    Raises:
        ValueError: if ``threshold`` is not strictly between 0 and 1.
    """
    if not 0.0 < threshold < 1.0:
        raise ValueError(f"threshold must be in (0, 1), got {threshold}")

    logits = np.asarray(predictions.predictions)
    # The model outputs logits, not probabilities. sigmoid(z) > t is the same
    # as z > log(t / (1 - t)), so compare in logit space; for t = 0.5 the
    # cutoff is 0. Comparing raw logits against 0.5 directly would be an
    # effective probability cutoff of about 0.62.
    logit_cutoff = np.log(threshold) - np.log1p(-threshold)
    binary_preds = (logits > logit_cutoff).astype(int)
    true_labels = predictions.label_ids

    precision, recall, f1_score, _ = precision_recall_fscore_support(
        true_labels, binary_preds, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(true_labels, binary_preds)

    return {
        "accuracy": accuracy,
        "f1": f1_score,
        "precision": precision,
        "recall": recall,
    }

# Reload the datasets written by the preprocessing code above.
# NOTE(review): these files contain whole pickled TextDataset objects, so that
# class must be defined when loading; newer PyTorch releases may additionally
# need torch.load(..., weights_only=False) for such objects - confirm for the
# installed version.
loaded_training_dataset = torch.load('training_dataset.pt')
loaded_validation_dataset = torch.load('validation_dataset.pt')
loaded_testing_dataset = torch.load('testing_dataset.pt')

# The classification head gets one output per label column of the training set.
classification_model = MobileBertForSequenceClassification.from_pretrained(
    'google/mobilebert-uncased',
    num_labels=len(loaded_training_dataset.labels[0]),
    problem_type="multi_label_classification"
)

# 3 epochs, train batches of 16, eval batches of 64, evaluation after each epoch.
training_config = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
)

model_trainer = Trainer(
    model=classification_model,
    args=training_config,
    train_dataset=loaded_training_dataset,
    eval_dataset=loaded_validation_dataset,
    compute_metrics=calculate_metrics,
)

model_trainer.train()

# Metrics on the validation split (the Trainer's eval_dataset).
validation_results = model_trainer.evaluate()
print(validation_results)

validation_predictions = model_trainer.predict(loaded_validation_dataset)
# `.predictions` holds raw logits, so a 0.5 probability cutoff corresponds to
# logit > 0 (sigmoid(0) == 0.5). Comparing the logits with 0.5 would silently
# raise the effective probability cutoff to about 0.62.
binary_predictions = (validation_predictions.predictions > 0).astype(int)
true_validation_labels = validation_predictions.label_ids

print("Predictions:", binary_predictions)
print("True Labels:", true_validation_labels)

# Final metrics on the held-out test split.
test_results = model_trainer.evaluate(loaded_testing_dataset)
print(test_results)

# Store model and tokenizer in the same directory so they reload together.
classification_model.save_pretrained('fine_tuned_model')
mobilebert_tokenizer.save_pretrained('fine_tuned_model')
