In [None]:
# Hyper Parameters
import tensorflow as tf
from transformers import AutoTokenizer,TFAutoModelForSequenceClassification,create_optimizer,AutoConfig
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# Training hyperparameters for the first (manual-loop) pipeline below.
LR = 1e-6                 # initial learning rate; run_training forwards it to get_model_parts
MAX_TOKEN_LEN = 256
NUM_EPOCHS = 2            # passed as `epochs` in the run_training call
PATIENCE = 2
BATCH_SIZE = 1
ORIGINAL_WEIGHT = 1.0     # sample weight given to original labeled rows
PSEUDO_WEIGHT = 0.75      # sample weight given to pseudo-labeled rows
WT_DECAY = 0.1            # weight-decay rate; run_training forwards it to get_model_parts
# NOTE(review): the next four constants are not referenced anywhere in the
# visible notebook; presumably they drove the pseudo-label selection - confirm.
CONFIDENCE_THRESH = 0.85
AGREEMENT_THRESH = 0.10
CONFIDENCE_WEIGHT = 0.7
NUM_PSEUDO_PAIRS = 200
STRATEGY = tf.distribute.MirroredStrategy()
SAVE_PATH = './best_models'   # plain literal: the f-prefix had no placeholders
MODEL_NAME = 'roberta-large'
PRED_SAVE_PATH = './pred_proba'
SAVED_PATHS = []
MODEL_NAMES = [
    'albert-base-v2',
    'distilbert-base-uncased',
    'google/mobilebert-uncased'
]

# Reinforcement Data Generation

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Sentence encoder shared by every similarity call below.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')


def get_semantic_similarity(text1, text2):
    """
    Return the cosine similarity between the sentence embeddings of two texts.

    Both texts are encoded in a single `sbert_model.encode` call; the result
    is the single entry of the 1x1 similarity matrix.
    """
    first_vec, second_vec = sbert_model.encode([text1, text2])
    similarity_matrix = cosine_similarity([first_vec], [second_vec])
    return similarity_matrix[0][0]
# Score every test pair (text_2 against text_1) and keep the result as a column.
# NOTE(review): `test_df` is only loaded in a later cell of this notebook, so
# this cell depends on out-of-order execution - confirm the intended order.
total_similarity = [
    get_semantic_similarity(pair['text_2'], pair['text_1'])
    for _, pair in test_df.iterrows()
]
test_df['similarity_score'] = total_similarity

In [None]:
# Order the test pairs from most to least similar.
test_df = test_df.sort_values(by='similarity_score', ascending=False)

# Exclusion list = ids of the 35 most similar pairs + every id in pseudo_train_data.
# NOTE(review): `pseudo_train_data` is not defined in the visible notebook
# (a later cell loads `psuedo_train_df`) - confirm which frame is meant.
ids_similar = test_df.head(35)['id']
ids_psuedo = pseudo_train_data['id']
ids_to_exclude = pd.concat([ids_similar, ids_psuedo]).unique()

In [None]:
deberta_test_proba = pd.read_csv('/kaggle/input/pred-probas-from-different-models-for-stacking/deberta_large_READX200_test_proba.csv')

# "confidence" = absolute gap between the two per-text probabilities.
deberta_test_proba['confidence'] = (deberta_test_proba['deberta_large_READX200_prob1'] - deberta_test_proba['deberta_large_READX200_prob2']).abs()
# Ascending sort: the smallest gaps come first.
# (Fixed: this line previously read from the misspelled `debrta_test_proba`, a NameError.)
deberta_test_proba = deberta_test_proba.sort_values(by='confidence')

# The CSV's unnamed first column is used as the pair id.
deberta_test_proba['id'] = deberta_test_proba['Unnamed: 0']
deberta_test_proba = deberta_test_proba.drop(columns=['Unnamed: 0'])

deberta_test_proba_filtered = deberta_test_proba[~deberta_test_proba['id'].isin(ids_to_exclude)]

In [None]:
# Keep the raw test rows whose id is among the first 200 rows of the filtered
# probability table.
org_test_df = pd.read_csv(TEST_DATA_PATH)
reinforcement_ids = deberta_test_proba_filtered.head(200)['id']
reinforcement_test_data = org_test_df[org_test_df['id'].isin(reinforcement_ids)]

In [None]:
# Persist the selected pairs. No `index=False` is passed, so the DataFrame
# index is written as an extra leading column.
reinforcement_test_data.to_csv('reinforcement_test_data_200_version.csv')

# Loading the Data and Using the Manual Train Loop

In [None]:
# Load the four CSV inputs of the manual training loop.
train_df = pd.read_csv(TRAIN_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)
# Fixed: `PSUEDO_DATA_PATH` is never defined in this notebook; the path
# constant that exists is `PSEUDO_TRAIN_DATA_PATH`.
psuedo_train_df = pd.read_csv(PSEUDO_TRAIN_DATA_PATH)
reinforcement_train_data = pd.read_csv(RT_PATH)

In [None]:
def pack_tokenize_dataset(combined_df,
                          reinforced_df,
                          tokenizer,
                          batch_size,
                          max_len):
    """Build the batched train and validation tf.data.Datasets for the custom loop.

    Args:
        combined_df: frame with 'category', 'text' and 'label' columns. Rows
            whose category is 'original' are split 85/15 into train/validation;
            rows whose category is 'psuedo' (spelling as stored in the data -
            TODO confirm) are added to the training side only.
        reinforced_df: unlabeled rows with a 'text' column; they receive
            label -1 and sample weight 0.0.
        tokenizer: callable tokenizer applied to the list of texts.
        batch_size: batch size used for both datasets.
        max_len: truncation length passed to the tokenizer.

    Returns:
        (train_dataset, val_dataset). Train elements are
        (encodings, label, sample_weight); validation elements are
        (encodings, label).
    """
    original_rows = combined_df[combined_df['category'] == 'original']
    pseudo_rows = combined_df[combined_df['category'] == 'psuedo']

    # Fixed: the keyword is `stratify` (it was misspelled `startegy`, a TypeError).
    original_train_rows, val_df = train_test_split(original_rows,
                                                   test_size=0.15,
                                                   random_state=42,
                                                   stratify=original_rows['label'])

    # `.assign` returns new frames, so neither the split result nor the
    # caller's data is modified (and no SettingWithCopyWarning is raised).
    # Fixed: the pseudo frame was created as `psuedo_df` but read as `pseudo_df`.
    labeled_df = pd.concat([original_train_rows.assign(sample_weight=ORIGINAL_WEIGHT),
                            pseudo_rows.assign(sample_weight=PSEUDO_WEIGHT)],
                           ignore_index=True)

    # Label -1 marks a row as unlabeled for train_step's masks.
    unlabeled_df = reinforced_df.assign(label=-1, sample_weight=0.0)

    # Seeded shuffle so the batch order is reproducible between runs.
    shuffled_df = (pd.concat([labeled_df, unlabeled_df], ignore_index=True)
                   .sample(frac=1, random_state=42)
                   .reset_index(drop=True))

    train_encodings = tokenizer(shuffled_df['text'].tolist(),
                                truncation=True,
                                padding=True,
                                max_length=max_len)
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        shuffled_df['label'].values,
        shuffled_df['sample_weight'].values.astype('float32')
    ))

    val_encodings = tokenizer(val_df['text'].tolist(),
                              truncation=True,
                              padding=True,
                              max_length=max_len)
    # Fixed: labels are included because validation_step unpacks
    # `inputs, labels = batch` and run_training reads `batch[1]`.
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        val_df['label'].values
    ))

    return (train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE),
            val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE))

In [None]:
def get_model_parts(model_name,
                    lr,
                    weight_decay,
                    num_train_steps,
                    num_warmup_steps):
    """Create the 2-label classifier, its optimizer and the matching tokenizer.

    Args:
        model_name: checkpoint name or path given to the `from_pretrained` calls.
        lr: initial learning rate for `create_optimizer`.
        weight_decay: weight-decay rate for `create_optimizer`.
        num_train_steps: total number of training steps for the schedule.
        num_warmup_steps: number of warmup steps for the schedule.

    Returns:
        (model, optimizer, tokenizer)
    """
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    classifier = TFAutoModelForSequenceClassification.from_pretrained(
        model_name, config=model_config
    )

    # create_optimizer returns a pair; only the optimizer (first item) is kept.
    optimizer = create_optimizer(
        init_lr=lr,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        weight_decay_rate=weight_decay,
    )[0]

    text_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return classifier, optimizer, text_tokenizer

In [None]:
@tf.function
def train_step(model,
               batch,
               optimizer,
               supervised_loss_fn,
               confidence_weight):
    """Run one optimisation step combining a supervised and an unsupervised loss.

    Args:
        model: model whose call result exposes `.logits`.
        batch: (inputs, labels, sample_weights); label -1 marks an unlabeled row.
        optimizer: optimizer used to apply the gradients.
        supervised_loss_fn: loss returning one value per example
            (no reduction), called as `loss_fn(labels, logits)`.
        confidence_weight: multiplier applied to the entropy term.

    Returns:
        (total_loss, logits) for the batch.
    """
    inputs, labels, sample_weights = batch

    # The masks depend only on the labels, so they are built outside the tape.
    labeled_mask = tf.not_equal(labels, -1)
    unlabeled_mask = tf.equal(labels, -1)

    with tf.GradientTape() as tape:
        logits = model(inputs, training=True).logits

        labeled_logits = tf.boolean_mask(logits, labeled_mask)
        labeled_labels = tf.boolean_mask(labels, labeled_mask)
        labeled_weights = tf.boolean_mask(sample_weights, labeled_mask)

        # Fixed: use the loss passed in as a parameter; the body previously
        # referenced `supervised_loss_fn_no_reduction`, which is not in scope here.
        per_sample_loss = supervised_loss_fn(labeled_labels, labeled_logits)
        weighted_loss = per_sample_loss * tf.cast(labeled_weights, per_sample_loss.dtype)
        # Weighted mean over the labeled rows. divide_no_nan yields 0 instead of
        # NaN when the batch holds no labeled row (reduce_mean of an empty
        # tensor is NaN, which would poison the weights after one step).
        supervised_loss = tf.math.divide_no_nan(
            tf.reduce_sum(weighted_loss),
            tf.cast(tf.size(weighted_loss), weighted_loss.dtype))

        # Mean prediction entropy over the unlabeled rows (0 when there are none).
        probs_unlabeled = tf.nn.softmax(tf.boolean_mask(logits, unlabeled_mask))
        entropy = -tf.reduce_sum(probs_unlabeled * tf.math.log(probs_unlabeled + 1e-9), axis=-1)
        confidence_loss = tf.math.divide_no_nan(
            tf.reduce_sum(entropy),
            tf.cast(tf.size(entropy), entropy.dtype))

        total_loss = supervised_loss + confidence_weight * tf.cast(confidence_loss, dtype=supervised_loss.dtype)

    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return total_loss, logits
@tf.function
def validation_step(model, batch, loss_fn):
    """Evaluate one (inputs, labels) batch and return (loss, logits)."""
    features, targets = batch
    predictions = model(features, training=False).logits
    return loss_fn(targets, predictions), predictions

In [None]:
def run_training(
    model_name,
    combined_df,
    reinforced_df,
    epochs,
    save_path
):
    """Train a classifier with the manual loop and keep the best checkpoint.

    Args:
        model_name: checkpoint name or path of the model/tokenizer to load.
        combined_df: labeled data, forwarded to pack_tokenize_dataset.
        reinforced_df: unlabeled data, forwarded to pack_tokenize_dataset.
        epochs: number of passes over the training dataset.
        save_path: directory the model is saved to whenever the validation
            loss improves.

    Returns:
        The model as it stands after the last epoch (not reloaded from
        `save_path`).
    """
    # Local import: the import cell at the top of the notebook does not bring
    # accuracy_score into scope.
    from sklearn.metrics import accuracy_score

    # Fixed: the datasets are built first so the schedule length can be read
    # from the real number of batches. The body previously used the undefined
    # names num_train_steps, num_warmup_steps, batch_size, max_len and
    # confidence_weight.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset, val_dataset = pack_tokenize_dataset(combined_df,
                                                      reinforced_df,
                                                      tokenizer,
                                                      BATCH_SIZE,
                                                      MAX_TOKEN_LEN)
    num_train_steps = len(train_dataset) * epochs

    best_val_loss = float('inf')
    # NOTE(review): the model is created under STRATEGY.scope() but the steps
    # below are called directly rather than through STRATEGY.run - confirm this
    # behaves as intended when more than one replica is present.
    with STRATEGY.scope():
        model, optimizer, _ = get_model_parts(
            model_name=model_name,
            lr=LR,
            weight_decay=WT_DECAY,
            num_train_steps=num_train_steps,
            num_warmup_steps=0
        )
        supervised_loss_fn_no_reduction = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        val_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        model.trainable = True
        for batch in tqdm(train_dataset, desc="Training"):
            loss, _ = train_step(model,
                                 batch,
                                 optimizer,
                                 supervised_loss_fn_no_reduction,
                                 CONFIDENCE_WEIGHT)

        val_losses = []
        all_val_preds = []
        all_val_labels = []
        for batch in tqdm(val_dataset, desc="Validating"):
            val_loss, val_logits = validation_step(model,
                                                   batch,
                                                   val_loss_fn)
            val_losses.append(float(val_loss))
            all_val_preds.extend(np.argmax(val_logits, axis=1))
            all_val_labels.extend(batch[1].numpy())

        avg_val_loss = np.mean(val_losses)
        val_accuracy = accuracy_score(all_val_labels, all_val_preds)

        print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        # Checkpoint only on improvement, so `save_path` holds the best epoch.
        if avg_val_loss < best_val_loss:
            print(f"Validation loss improved. Saving model to {save_path}")
            best_val_loss = avg_val_loss
            model.save_pretrained(save_path)
    return model

In [None]:
# NOTE(review): run_training is declared as
# (model_name, combined_df, reinforced_df, epochs, save_path). This call passes
# `optimizer`, `train_dataset` and `val_dataset`, which are not parameters, and
# omits `combined_df` / `reinforced_df`, so it raises TypeError as written.
# No `combined_df` is built anywhere in the visible notebook - confirm the
# intended inputs before running this cell.
trained_model = run_training(
    model_name = MODEL_NAME,
    optimizer = optimizer,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    epochs=NUM_EPOCHS,
    save_path= SAVE_PATH
)

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, create_optimizer, AutoConfig
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
import gc

# Input CSV locations. They are empty placeholders here and must be filled in
# before the read_csv cells can run.
TEST_DATA_PATH = ''
TRAIN_DATA_PATH = ''
PSEUDO_TRAIN_DATA_PATH = ''
RT_PATH = ''

# --- 1. Hyperparameters and Configuration ---
DEBUG = False  # True switches the training cell to tiny samples and a single epoch
LR = 1.5e-5
MAX_LEN = 256  # Using 256 for better memory performance with large models
NUM_EPOCHS = 3
PATIENCE = 2   # epochs without validation-loss improvement before early stopping
BATCH_SIZE_PER_REPLICA = 2 # This is the batch size for each GPU
ORIGINAL_WEIGHT = 1.0
PSEUDO_WEIGHT = 0.75
CONFIDENCE_WEIGHT = 0.5 # Weight for the unsupervised loss
WT_DECAY = 0.1
MODEL_PATH = '/kaggle/input/deberta-large-models/deberta_large_on_READX200'
MODEL_NAME = MODEL_PATH
SAVE_PATH = './final_model'  # plain literal: the f-prefix had no placeholders
STATE = 42     # random seed for the train/validation split and the shuffle
# A second MirroredStrategy; the first config cell already created one as STRATEGY.
strategy = tf.distribute.MirroredStrategy()
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync


In [None]:

def augment_data(df):
    """Turn a frame of (text_1, text_2) pairs into one row per text.

    All text_1 rows come first, followed by all text_2 rows.

    * With a 'labels' (preferred) or 'label' column, the result has columns
      'text' and 'label': a text gets label 1 when the pair's label points at
      it (pair label == 1 for text_1, anything else for text_2), otherwise 0.
    * Without a label column, the result has columns 'id', 'original_pos'
      ('text_1' / 'text_2') and 'text'.
    """
    stacked_texts = pd.concat([df['text_1'], df['text_2']], ignore_index=True)
    label_col = next((name for name in ('labels', 'label') if name in df.columns), None)

    if label_col is None:
        # Unlabeled pairs: remember which pair and which side each text came from.
        n_pairs = len(df)
        return pd.DataFrame({
            'id': pd.concat([df['id'], df['id']], ignore_index=True),
            'original_pos': ['text_1'] * n_pairs + ['text_2'] * n_pairs,
            'text': stacked_texts,
        })

    first_is_positive = df[label_col] == 1
    stacked_labels = np.concatenate([
        np.where(first_is_positive, 1, 0),   # labels for the text_1 half
        np.where(first_is_positive, 0, 1),   # labels for the text_2 half
    ])
    return pd.DataFrame({'text': stacked_texts, 'label': stacked_labels})
        

def _tokenize_text_column(frame, tokenizer):
    """Tokenize frame['text'] (missing values become '') with truncation/padding to MAX_LEN."""
    return tokenizer(frame['text'].fillna('').tolist(),
                     truncation=True,
                     padding=True,
                     max_length=MAX_LEN)


def create_datasets(original_train_df, pseudo_df, reinforcement_df, val_df, tokenizer,batch_size):
    """Creates the final tf.data.Dataset objects for training and validation.

    Args:
        original_train_df: labeled rows ('text', 'label'); weighted ORIGINAL_WEIGHT.
        pseudo_df: pseudo-labeled rows ('text', 'label'); weighted PSEUDO_WEIGHT.
        reinforcement_df: unlabeled rows ('text'); they get label -1 and weight 0.0.
        val_df: validation rows ('text', 'label').
        tokenizer: callable tokenizer applied to the list of texts.
        batch_size: batch size used for both datasets.

    Returns:
        (train_dataset, val_dataset). Train elements are
        (encodings, label, sample_weight); validation elements are
        (encodings, label). Both are batched and prefetched.
    """
    # `.assign` builds new frames, so the caller's DataFrames are no longer
    # modified as a side effect of building the datasets.
    labeled_df = pd.concat([original_train_df.assign(sample_weight=ORIGINAL_WEIGHT),
                            pseudo_df.assign(sample_weight=PSEUDO_WEIGHT)],
                           ignore_index=True)

    # Label -1 marks a row as unlabeled for the training step's masks.
    unlabeled_df = reinforcement_df.assign(label=-1, sample_weight=0.0)

    # Seeded shuffle so labeled and unlabeled rows are mixed reproducibly.
    final_train_df = (pd.concat([labeled_df, unlabeled_df], ignore_index=True)
                      .sample(frac=1, random_state=STATE)
                      .reset_index(drop=True))

    train_encodings = _tokenize_text_column(final_train_df, tokenizer)
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        final_train_df['label'].values,
        final_train_df['sample_weight'].values.astype('float32')
    )).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    val_encodings = _tokenize_text_column(val_df, tokenizer)
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        val_df['label'].values
    )).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return train_dataset, val_dataset

In [None]:
# Raw inputs: labeled pairs, pseudo-labeled pairs and the reinforcement pairs.
df_original = pd.read_csv(TRAIN_DATA_PATH)
df_pseudo = pd.read_csv(PSEUDO_TRAIN_DATA_PATH)
df_reinforcement = pd.read_csv(RT_PATH)

# Hold out 25% of the original pairs for validation, stratified on 'labels'.
original_train_df, original_val_df = train_test_split(
    df_original,
    test_size=0.25,
    random_state=STATE,
    stratify=df_original['labels'],
)

# Un-pair every frame into one row per text.
aug_original_train, aug_val, aug_pseudo, aug_reinforce = (
    augment_data(paired_frame)
    for paired_frame in (original_train_df, original_val_df, df_pseudo, df_reinforcement)
)

In [None]:
if DEBUG:
    print("="*30)
    print("--- RUNNING IN DEBUG MODE ---")
    print("="*30)
    # Shrink every split to a handful of rows so the whole cell runs quickly.
    aug_original_train = aug_original_train.sample(n=8, random_state=42).reset_index(drop=True)
    aug_val = aug_val.sample(n=4, random_state=42).reset_index(drop=True)
    aug_pseudo = aug_pseudo.sample(n=10, random_state=42).reset_index(drop=True)
    aug_reinforce = aug_reinforce.sample(n=10, random_state=42).reset_index(drop=True)
    NUM_EPOCHS = 1
    # Fixed: the tokenizer calls in this pipeline read MAX_LEN, so assigning
    # only MAX_TOKEN_LEN left the sequence length at its full value in debug
    # mode. MAX_TOKEN_LEN is still set so anything reading it sees the same value.
    MAX_LEN = MAX_TOKEN_LEN = 128
else:
    print("="*30)
    print("--- RUNNING IN NORMAL MODE ---")
    print("="*30)


with strategy.scope():
    # The slow tokenizer is requested only for checkpoints whose name contains 'deberta-v3'.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast='deberta-v3' not in MODEL_NAME)

    # Fixed: the training dataset is batched without dropping the remainder,
    # so an epoch has ceil(n / batch) steps. Flooring made the schedule shorter
    # than the real number of optimizer steps.
    num_train_examples = len(aug_original_train) + len(aug_pseudo) + len(aug_reinforce)
    steps_per_epoch = -(-num_train_examples // GLOBAL_BATCH_SIZE)  # ceiling division
    num_train_steps = steps_per_epoch * NUM_EPOCHS

    config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=2)
    model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
    # create_optimizer returns a pair; only the optimizer is kept.
    optimizer, _ = create_optimizer(init_lr=LR, num_warmup_steps=0, num_train_steps=num_train_steps, weight_decay_rate=WT_DECAY)

    # Both losses return one value per example; the steps do their own averaging.
    supervised_loss_fn_no_reduction = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    val_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                reduction=tf.keras.losses.Reduction.NONE)

@tf.function
def train_step(model, optimizer, supervised_loss_fn, confidence_weight, batch):
    """Run one replica-level training step on a (features, labels, sample_weights) batch.

    Rows whose label is -1 are treated as unlabeled: they contribute only the
    entropy term, scaled by `confidence_weight`. All other rows contribute the
    sample-weighted supervised loss.

    Returns:
        (total_loss, logits)
    """
    features, labels, sample_weights = batch
    has_label = tf.not_equal(labels, -1)
    lacks_label = tf.equal(labels, -1)

    with tf.GradientTape() as tape:
        logits = model(features, training=True).logits

        # Supervised part: per-example loss on the labeled rows only.
        # NOTE(review): if a batch holds no labeled row, per_example_loss is
        # empty; how compute_average_loss handles that depends on the installed
        # TensorFlow version - confirm it cannot produce NaN here.
        per_example_loss = supervised_loss_fn(
            tf.boolean_mask(labels, has_label),
            tf.boolean_mask(logits, has_label),
        )
        supervised_loss = tf.nn.compute_average_loss(
            per_example_loss,
            sample_weight=tf.boolean_mask(sample_weights, has_label),
        )

        # Unsupervised part: mean prediction entropy of the unlabeled rows.
        logits_without_label = tf.boolean_mask(logits, lacks_label)
        if tf.shape(logits_without_label)[0] > 0:
            probabilities = tf.nn.softmax(logits_without_label)
            row_entropy = tf.reduce_sum(probabilities * tf.math.log(probabilities + 1e-9), axis=-1)
            entropy_loss = -tf.reduce_mean(row_entropy)
        else:
            entropy_loss = 0.0

        total_loss = supervised_loss + confidence_weight * tf.cast(entropy_loss, dtype=supervised_loss.dtype)

    trainable = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(total_loss, trainable), trainable))
    return total_loss, logits

@tf.function
def validation_step(inputs):
    """Evaluate one replica-level (features, labels) batch.

    Uses the module-level `model` and `val_loss_fn`. The returned loss is the
    sum of this replica's per-example losses divided by GLOBAL_BATCH_SIZE.

    Returns:
        (loss, logits, labels)
    """
    features, labels = inputs
    logits = model(features, training=False).logits
    scaled_loss = tf.nn.compute_average_loss(
        val_loss_fn(labels, logits),
        global_batch_size=GLOBAL_BATCH_SIZE,
    )
    return scaled_loss, logits, labels

train_dataset, val_dataset = create_datasets(aug_original_train,
                                             aug_pseudo,
                                             aug_reinforce,
                                             aug_val,
                                             tokenizer,
                                             batch_size=GLOBAL_BATCH_SIZE)

best_val_loss = float('inf')
patience_counter = 0  # consecutive epochs without validation-loss improvement

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
    running_loss = 0.0
    pbar = tqdm(train_dataset, desc="Training")
    # NOTE(review): train_dataset is handed to strategy.run without going
    # through strategy.experimental_distribute_dataset (the validation loop
    # below does use it). With more than one replica, every replica presumably
    # receives the same full batch - confirm whether that is intended.
    for i,batch in enumerate(pbar):
        per_replica_loss, per_replica_logits = strategy.run(
            train_step,
            args=(model, optimizer, supervised_loss_fn_no_reduction, CONFIDENCE_WEIGHT, batch)
        )
        batch_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)
        mean_batch_loss = batch_loss / strategy.num_replicas_in_sync
        # Incremental mean of the per-batch losses seen so far in this epoch.
        running_loss = (running_loss * i + mean_batch_loss.numpy()) / (i + 1)

        pbar.set_postfix({'training_loss': f'{running_loss:.4f}'})

    val_losses, all_val_preds, all_val_labels = [], [], []
    dist_val_dataset = strategy.experimental_distribute_dataset(val_dataset)
    for per_replica_batch  in tqdm(dist_val_dataset, desc="Validating"):
        per_replica_loss, per_replica_logits, per_replica_labels = strategy.run(
            validation_step, args=(per_replica_batch,)
        )

        gathered_labels = strategy.gather(per_replica_labels, axis=0)
        gathered_logits = strategy.gather(per_replica_logits, axis=0)

        loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)

        val_losses.append(loss.numpy())
        all_val_preds.extend(np.argmax(gathered_logits, axis=1))
        all_val_labels.extend(gathered_labels.numpy())

    # Fixed: validation_step already divides each replica's summed loss by
    # GLOBAL_BATCH_SIZE, so the SUM-reduced value is
    # (sum of example losses in the batch) / GLOBAL_BATCH_SIZE. Dividing by the
    # replica count again under-reported the loss on multi-GPU runs, and a plain
    # mean over batches mis-weighted a short final batch. Undo the fixed scaling
    # and divide by the number of examples actually evaluated.
    avg_val_loss = float(np.sum(val_losses)) * GLOBAL_BATCH_SIZE / max(len(all_val_labels), 1)
    val_accuracy = accuracy_score(all_val_labels, all_val_preds)

    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    if avg_val_loss < best_val_loss:
        print(f"Validation loss improved. Saving model to {SAVE_PATH}")

        best_val_loss = avg_val_loss
        # Save the tokenizer alongside the model so SAVE_PATH loads on its own.
        model.save_pretrained(SAVE_PATH)
        tokenizer.save_pretrained(SAVE_PATH)
        patience_counter = 0
    else:
        patience_counter += 1
        print(f"Patience: {patience_counter}/{PATIENCE}")

    if patience_counter >= PATIENCE:
        print("Early stopping triggered.")
        break
    gc.collect()

print("\n--- Pipeline Complete ---")

In [None]:
def pack_data(test_df_text, tokenizer, mode='pred', batch_size=1):
    """Tokenize a frame's 'text' column into a batched, label-free tf.data.Dataset.

    Args:
        test_df_text: frame with a 'text' column; missing values become ''.
        tokenizer: callable tokenizer applied to the list of texts.
        mode: accepted for call compatibility; the body does not read it.
        batch_size: number of rows per batch. Defaults to 1, the value that
            was previously hard-coded.

    Returns:
        A prefetched dataset whose elements are 1-tuples holding the encodings dict.
    """
    encodings = tokenizer(test_df_text['text'].fillna('').tolist(),
                          truncation=True,
                          padding=True,
                          max_length=MAX_LEN)
    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings),
    )).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [None]:
test_df = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv')
CURR_SAVE_PATHS = ['/kaggle/working/final_model']

# One single-column ('text') frame per side of each test pair.
test_df_text1 = test_df[['text_1']].rename(columns={'text_1': 'text'})
test_df_text2 = test_df[['text_2']].rename(columns={'text_2': 'text'})

# Maps '<model_path>_prob1' / '<model_path>_prob2' to the class-1 probabilities
# of text_1 / text_2.
model_predictions = {}
tf.keras.mixed_precision.set_global_policy('mixed_float16')

for model_path in CURR_SAVE_PATHS:
    print(f'---------------predicting with saved model{model_path}---------------')
    print('---------------Loading Model and Tokenizer---------------')
    with strategy.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Same tokenize -> predict -> softmax sequence for both sides of the pair.
    for side, side_frame in enumerate((test_df_text1, test_df_text2), start=1):
        print(f'---------------Tokenizing and Predicting on Text{side}---------------')
        side_dataset = pack_data(side_frame, tokenizer, mode='pred')
        side_logits = model.predict(side_dataset).logits
        model_predictions[f'{model_path}_prob{side}'] = tf.nn.softmax(side_logits)[:, 1].numpy()

    print('---------------Finished Model Predictions---------------')
    # Free Keras/graph state and Python garbage before the next model is loaded.
    tf.keras.backend.clear_session()
    gc.collect()

In [None]:
# One column per (model, side) probability vector, plus the pair id.
predictions_df = pd.DataFrame(model_predictions)
predictions_df['id'] = test_df['id']

PRED_SAVE_PATH = 'deberta(0.91493)_large_on_RL_system_.csv'
predictions_df.to_csv(PRED_SAVE_PATH, index=False)
print(f"\nIndividual model predictions saved to {PRED_SAVE_PATH}")
print(predictions_df.head())

# Average the class-1 probability over all models, separately for each side.
prob1_cols = [name for name in predictions_df.columns if '_prob1' in name]
prob2_cols = [name for name in predictions_df.columns if '_prob2' in name]
final_avg_probs_text1 = predictions_df[prob1_cols].mean(axis=1)
final_avg_probs_text2 = predictions_df[prob2_cols].mean(axis=1)

# Label 1 when text_1 has the strictly higher averaged probability, otherwise 2.
text1_wins = final_avg_probs_text1 > final_avg_probs_text2
final_labels = np.where(text1_wins, 1, 2)

In [None]:
def make_submission_csv(results,name=None):
    """Build the two-column submission frame and optionally write it to disk.

    Args:
        results: one-dimensional collection of predicted labels.
        name: optional output path; when given, the frame is written there as
            CSV without the index.

    Returns:
        DataFrame with columns 'id' and 'real_text_id'. 'id' comes from the
        index of the frame built from `results`, i.e. 0..n-1 for a plain
        list or array.
    """
    output_df = pd.DataFrame(results).copy()
    output_df.columns = ['real_text_id']
    # Promote the index to a regular column and expose it as 'id'.
    output_df = output_df.reset_index().rename(columns={'index': 'id'})
    if name is not None:
        output_df.to_csv(name, index=False)
    return output_df
# Write the final labels as the submission file. The 'id' column is the
# positional index produced inside make_submission_csv, not test_df['id'].
# NOTE(review): confirm that the test ids are exactly 0..n-1 in file order.
pred_df = make_submission_csv(final_labels,name='Deberta_large_on_RL_system_.csv')

In [None]:
# Disabled shell command kept for reference (zips a saved-models directory):
# !zip -r best_deberta_large_models.zip /kaggle/working/best_models_fine_tuned_fine_tuned
# The bare expression below displays the submission frame as the cell output.
pred_df