In [None]:
import pandas as pd
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Read the training and test tables from CSV files in the working directory
train_data = pd.read_csv('training_set.csv')
test_data = pd.read_csv('test_set.csv')

# Every training column except the 'ID' and 'ind' columns is used as a feature
features = [name for name in train_data.columns if name not in ('ID', 'ind')]


def _row_to_text(row):
    """Render one row's values as strings and join them with single spaces."""
    return ' '.join(row.values.astype(str))


# Build the same space-separated text column on both tables
for frame in (train_data, test_data):
    frame['combined_text'] = frame[features].apply(_row_to_text, axis=1)

# Load the pretrained uncased DistilBERT vocabulary/tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def batch_tokenize(texts, tokenizer, batch_size=100, max_length=64):
    """Tokenize ``texts`` in consecutive chunks of ``batch_size``.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    Padding to a fixed length (rather than to the longest sequence of each
    chunk) guarantees that all chunks have the same width, so their rows can
    later be concatenated and stacked into a single rectangular tensor.

    Args:
        texts: Sliceable sequence (e.g. a list) of input strings.
        tokenizer: Callable tokenizer accepting the keyword arguments
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``.
        batch_size: Number of texts passed to the tokenizer per call.
        max_length: Fixed token length every sequence is padded/truncated to.

    Returns:
        A list with one tokenizer output per chunk, in input order.
    """
    tokenized_batches = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        tokenized = tokenizer(
            batch,
            truncation=True,
            # 'max_length' keeps the width identical across chunks;
            # padding=True would only pad to the longest text in this chunk.
            padding='max_length',
            max_length=max_length,
            return_tensors='tf',
        )
        tokenized_batches.append(tokenized)
    return tokenized_batches

# Run the training texts through the tokenizer chunk by chunk
train_tokenized_batches = batch_tokenize(
    list(train_data['combined_text']),
    tokenizer,
)

# Concatenate the tokenized batches
def concatenate_batches(tokenized_batches):
    """Merge per-batch encodings into one mapping of key -> list of rows.

    Args:
        tokenized_batches: Sequence of mapping-like objects (supporting
            ``keys()`` and ``items()``) whose values are iterables of
            per-sample rows.

    Returns:
        A dict with the keys of the first batch; each value is a list holding
        the rows of every batch in order. An empty input yields an empty dict.

    Raises:
        KeyError: If a later batch contains a key the first batch lacks.
    """
    # Nothing to merge: avoid indexing into an empty sequence.
    if not tokenized_batches:
        return {}

    concatenated_encodings = {key: [] for key in tokenized_batches[0].keys()}
    for batch in tokenized_batches:
        for key, rows in batch.items():
            concatenated_encodings[key].extend(rows)
    return concatenated_encodings

# Merge the per-batch encodings into one mapping of key -> list of per-sample rows
train_encodings = concatenate_batches(train_tokenized_batches)

# train_test_split() cannot split the encodings dict directly: the dict's
# length is its number of keys, not the number of samples, so it does not line
# up with the labels. Split the sample positions instead and use them to pick
# rows from every encoding key and from the labels, keeping them aligned.
sample_indices = list(range(len(train_data)))
train_indices, val_indices = train_test_split(
    sample_indices, test_size=0.2, random_state=42
)

# tf.stack requires all rows under a key to have the same length.
stacked_encodings = {key: tf.stack(rows) for key, rows in train_encodings.items()}
train_texts = {
    key: tf.gather(tensor, train_indices) for key, tensor in stacked_encodings.items()
}
val_texts = {
    key: tf.gather(tensor, val_indices) for key, tensor in stacked_encodings.items()
}

# Select the labels by position so they match the gathered encoding rows
train_labels = train_data['ind'].iloc[train_indices]
val_labels = train_data['ind'].iloc[val_indices]

# Prepare training and validation datasets
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_texts, train_labels.to_numpy()))
    .shuffle(1000)
    .batch(32)
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    (val_texts, val_labels.to_numpy())
).batch(32)

from transformers import TFDistilBertForSequenceClassification, DistilBertConfig

# Pretrained DistilBERT configuration with a two-label classification head
config = DistilBertConfig.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Build and compile the model inside a MirroredStrategy scope
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    # Load pretrained weights using the configuration defined above
    model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        config=config,
    )
    # The loss is computed from raw logits against integer class labels
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=[metric],
    )

# Custom F1 score metric
def f1_metric(y_true, y_pred):
    """Compute the F1 score of raw model outputs against true labels.

    Args:
        y_true: 1-D array-like of true class labels.
        y_pred: Raw logits. Either one column per class (shape
            ``(n_samples, n_classes)``) or a single logit per sample
            (shape ``(n_samples,)`` or ``(n_samples, 1)``).

    Returns:
        The value returned by ``sklearn.metrics.f1_score`` for the derived
        hard predictions.

    Note:
        Calls ``.numpy()`` on the predictions, so it only works on eager
        tensors / concrete values.
    """
    logits = tf.convert_to_tensor(y_pred)
    if logits.shape.rank == 2 and logits.shape[-1] != 1:
        # One logit per class: the predicted label is the index of the largest
        # logit. Applying sigmoid + round here would yield a 2-D indicator
        # matrix that cannot be compared with 1-D labels.
        predicted = tf.argmax(logits, axis=-1)
    else:
        # Single logit per sample: threshold the sigmoid probability at 0.5.
        predicted = tf.round(tf.nn.sigmoid(tf.reshape(logits, [-1])))
    return f1_score(y_true, predicted.numpy())

# Fit for three epochs, evaluating on the validation dataset after each one
model.fit(
    train_dataset,
    epochs=3,
    validation_data=val_dataset,
)

# Tokenize the test texts and batch them the same way as the training data
test_tokenized_batches = batch_tokenize(list(test_data['combined_text']), tokenizer)
test_encodings = concatenate_batches(test_tokenized_batches)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings))).batch(32)

# Predict on the test set; keep only the raw per-class logits
test_predictions = model.predict(test_dataset)['logits']

# The model is configured with num_labels=2, so there is one logit per class
# for each sample. The predicted label is the index of the larger logit.
# (Sigmoid + threshold + flatten would produce two values per sample and no
# longer line up with the ID column.)
test_preds = tf.argmax(test_predictions, axis=-1)

# Create a DataFrame for the output: one predicted label per test ID
output_df = pd.DataFrame({
    'ID': test_data['ID'],
    'ind': test_preds.numpy()
})

# Display the output DataFrame to ensure it's correct
print(output_df.head())

2023-12-16 19:54:39.370093: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

  0%|          | 0/112 [00:00<?, ?it/s]2023-12-16 19:55:29.578489: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9267 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:08:00.0, compute capability: 7.5
  1%|          | 1/112 [00:26<49:37, 26.82s/it]2023-12-16 19:55:29.579909: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 9267 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:09:00.0, compute capability: 7.5
2023-12-16 19:55:29.581107: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 9267 MB memory:  -> device: 2, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:87:00.0, compute capability: 7.5
2023-12-16 19:55:29.582299: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/repl