# **Data preparation function**

In [None]:
TRAIN_DATA_PATH = '/kaggle/input/fake-or-real-the-imposter-x-train/back_translated_train_dataset.csv' 
import pandas as pd
import numpy as np
import random
# Load the training pairs and replace missing texts with empty strings so
# that every value in the two text columns is a str.
df = pd.read_csv(TRAIN_DATA_PATH)
for text_col in ('text_1', 'text_2'):
    df[text_col] = df[text_col].fillna('')
def shift_randomize_data(df, slide_val, random_state=None):
    """Build a position-debiased training set by re-pairing real and fake texts.

    The real and fake texts are pulled out of ``df`` (in index order), the fake
    texts are rotated by ``slide_val`` positions, and every resulting
    (real, fake) pair is emitted twice: once with the real text first
    (label 0) and once with the real text second (label 1).

    Args:
        df: DataFrame with 'text_1', 'text_2' and 'labels' columns, where
            labels == 1 marks 'text_1' as the real text and labels == 2 marks
            'text_2' as the real text. Rows with any other label are ignored.
        slide_val: Number of positions the fake texts are rotated (towards the
            front) before being paired with the real texts. 0 keeps the
            original pairing.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            final shuffle can be reproduced. ``None`` (default) keeps the
            previous non-deterministic shuffle.

    Returns:
        A shuffled DataFrame with columns 'text_1', 'text_2' and 'label'
        containing twice as many rows as there are usable rows in ``df``.
    """
    first_is_real = df['labels'] == 1
    second_is_real = df['labels'] == 2

    real_texts = pd.concat([
        df.loc[first_is_real, 'text_1'],
        df.loc[second_is_real, 'text_2'],
    ]).sort_index()

    fake_texts = pd.concat([
        df.loc[first_is_real, 'text_2'],
        df.loc[second_is_real, 'text_1'],
    ]).sort_index()

    # Size the label arrays from the texts actually extracted: len(df) would
    # disagree with them whenever a row carries a label other than 1 or 2.
    n = len(real_texts)

    # Negative shift: the fake at position k + slide_val lands on position k.
    slided_fake_texts = np.roll(fake_texts.values, -slide_val)

    real_first_df = pd.DataFrame({
        'text_1': real_texts.values,
        'text_2': slided_fake_texts,
        'label': np.zeros(n, dtype=int),
    })
    real_second_df = pd.DataFrame({
        'text_1': slided_fake_texts,
        'text_2': real_texts.values,
        'label': np.ones(n, dtype=int),
    })
    final_df = pd.concat([real_first_df, real_second_df], ignore_index=True)

    return final_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# **Hyperparameters**

In [None]:
# HYPERPARAMETERS
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification,create_optimizer
# --- Backbone / tokenization ---
MODEL_NAME = 'microsoft/deberta-v3-base'
MAX_TOKEN_LEN = 256  # passed to the tokenizer as max_length

# --- Optimisation ---
EPOCHS = 4
LR = 3e-5
BATCH_SIZE = 4
WT_DECAY = 0.01

# --- Ensemble ---
NUM_MODELS = 5
# Per-model increment of the fake-text shift (model i uses i * SLIDE_PER_RUN).
SLIDE_PER_RUN = int(len(df) * 0.8 / NUM_MODELS)

# --- Runtime ---
STATE = 42  # random_state used for the train/validation split
STRATEGY = tf.distribute.MirroredStrategy()

# Filled in by the training loop, one entry per trained model.
MODEL_PATHS = []
HISTORY_OF_MODELS = []

In [None]:
from sklearn.model_selection import train_test_split
def tokenize_pack_data(X,y):
    """Tokenize text pairs and pack them with their labels into a batched dataset.

    Uses the module-level ``tokenizer``, ``MAX_TOKEN_LEN`` and ``BATCH_SIZE``.

    Args:
        X: DataFrame with 'text_1' and 'text_2' columns; missing values are
           replaced with empty strings before tokenization.
        y: pandas Series of labels, row-aligned with ``X``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches of
        size ``BATCH_SIZE``, with prefetching enabled.
    """
    first_texts = X['text_1'].fillna('').tolist()
    second_texts = X['text_2'].fillna('').tolist()

    # Each row is encoded as one sequence pair (text_1, text_2).
    pair_encodings = tokenizer(
        first_texts,
        second_texts,
        truncation=True,
        padding=True,
        max_length=MAX_TOKEN_LEN,
    )

    features = dict(pair_encodings)
    targets = y.values
    batched = tf.data.Dataset.from_tensor_slices((features, targets)).batch(BATCH_SIZE)
    return batched.prefetch(tf.data.AUTOTUNE)
    
# Tokenizer used by tokenize_pack_data and by the later inference cells.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Hold out 20% of the original pairs for validation.
X, y = df[['text_1', 'text_2']], df['labels']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=STATE)

# Shift the validation labels down by one so they match the 0/1 'label'
# column that shift_randomize_data produces for the training data.
y_val = y_val - 1
val_dataset = tokenize_pack_data(X_val, y_val)

# Training rows (texts + original labels) in index order; this is the pool
# that shift_randomize_data re-pairs for each ensemble member.
train_df = pd.concat([X_train, y_train], axis=1).sort_index()

# **Model Training**

In [None]:
import gc
from transformers import Trainer, TrainingArguments
from tensorflow.keras.callbacks import ModelCheckpoint
# Train NUM_MODELS classifiers, each on a differently shifted re-pairing of
# the real and fake training texts, and save every model to its own directory.
for i in range(NUM_MODELS):
    shift_val = i * SLIDE_PER_RUN
    print(f"--- Training Model {i+1}/{NUM_MODELS} with a shift of {shift_val} ---")
    shifted_train_df = shift_randomize_data(train_df,shift_val)
    train_dataset = tokenize_pack_data(shifted_train_df[['text_1', 'text_2']],
                                       shifted_train_df['label']
                                      )
    with STRATEGY.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME,num_labels=1)
        # train_dataset is already batched, so len() is the steps per epoch.
        num_train_steps = len(train_dataset) * EPOCHS
        optimizer, _ = create_optimizer(init_lr=LR,
                                        num_warmup_steps=int(0.1 * num_train_steps),
                                        num_train_steps=num_train_steps,
                                        # WT_DECAY was defined in the hyperparameter cell
                                        # but never applied; pass it as the second
                                        # pipeline in this notebook does.
                                        weight_decay_rate=WT_DECAY
                                        )
        loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        # The model outputs a raw logit, so the decision boundary is 0.0.
        # The 'accuracy' string shortcut would resolve to binary accuracy with
        # its default 0.5 threshold applied to logits and under-count positives.
        accuracy = tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)
        model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])
    model_path = f"./my_trained_model_{i}"
    MODEL_PATHS.append(model_path)
    history = model.fit(
        train_dataset,
        epochs=EPOCHS,
        validation_data=val_dataset,
    )
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    HISTORY_OF_MODELS.append(history)
    # Release graph state before building the next model.
    tf.keras.backend.clear_session()
    gc.collect()

In [None]:
# Encode the held-out validation pairs as TensorFlow tensors, using the same
# truncation/padding settings as tokenize_pack_data.
val_first_texts = X_val['text_1'].fillna('').tolist()
val_second_texts = X_val['text_2'].fillna('').tolist()
val_encodings = tokenizer(
    val_first_texts,
    val_second_texts,
    truncation=True,
    padding=True,
    max_length=MAX_TOKEN_LEN,
    return_tensors="tf",
)

In [None]:
from sklearn.metrics import accuracy_score
test_df = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv')
test_encodings=tokenizer(
        test_df['text_1'].fillna('').tolist(),
        test_df['text_2'].fillna('').tolist(),
        truncation=True,
        padding=True,
        max_length=MAX_TOKEN_LEN,
        return_tensors="tf"
    )

# Per-model sigmoid probabilities. ``all_logits`` keeps its original name but,
# as before, holds probabilities for the test set; ``all_val_probas`` holds the
# same for the validation set so the ensemble can actually be scored.
all_logits = []
all_val_probas = []
for path in MODEL_PATHS:
    print(f"Loading weights from {path}...")
    model = TFAutoModelForSequenceClassification.from_pretrained(path, num_labels=1)

    val_logits = model.predict(dict(val_encodings)).logits
    all_val_probas.append(tf.nn.sigmoid(val_logits).numpy().flatten())

    logits = model.predict(dict(test_encodings)).logits
    all_logits.append(tf.nn.sigmoid(logits).numpy().flatten())

# Test-set ensemble: average the probabilities, then round to a 0/1 label.
preds_proba = np.mean(all_logits,axis=0)
preds_labels = np.round(preds_proba)

# Validation accuracy must be computed from predictions on the validation set
# (not the test set) and from hard labels (accuracy_score rejects continuous
# probabilities). y_val holds the 0/1 validation labels.
val_preds_proba = np.mean(all_val_probas, axis=0)
val_preds_labels = np.round(val_preds_proba).astype(int)
ensemble_accuracy = accuracy_score(y_val, val_preds_labels)
print(f"\nFinal Ensemble Validation Accuracy: {ensemble_accuracy:.4f}")

In [None]:
# Map the ensemble's 0/1 predictions back to submission ids:
# 0 (text_1 is real) -> 1, anything else (text_2 is real) -> 2.
labels = np.where(preds_labels == 0, 1, 2)
def make_submission_csv(results, output_path='5_BERTs_on_shiffting_data_debiasing_BT.csv'):
    """Write predictions to a submission CSV and return the written frame.

    Args:
        results: One-dimensional collection of predicted ids (anything
            ``pd.DataFrame`` turns into a single column).
        output_path: Destination CSV path. Defaults to the file name this
            pipeline has always written, so existing calls are unchanged.

    Returns:
        DataFrame with an 'id' column (the row position of each prediction)
        and a 'real_text_id' column (the prediction itself).

    Raises:
        ValueError: If ``results`` does not produce exactly one column.
    """
    # Copy so that renaming columns never touches a frame the caller passed in.
    output_df = pd.DataFrame(results).copy()
    output_df.columns = ['real_text_id']
    # Promote the positional index to an explicit 'id' column.
    output_df = output_df.reset_index().rename(columns={'index': 'id'})
    output_df.to_csv(output_path, index=False)
    return output_df
# Write the ensemble submission file; as the cell's last expression, the
# returned DataFrame is also shown as the cell output.
make_submission_csv(labels)

# **Smart Combining**

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
def create_hard_negative_dataset(train_pool_df: pd.DataFrame, top_k: int = 3, random_state=None) -> pd.DataFrame:
    """
    Performs hard negative mining to create a debiased training set.

    For each real text in the training pool, the ``top_k`` fake texts with the
    highest embedding cosine similarity are selected, and every selected
    (real, fake) combination is emitted in both orders.

    Args:
        train_pool_df: The DataFrame containing the training portion of the data.
                       Must have 'text_1', 'text_2', and 'labels' columns, where
                       labels == 1 marks 'text_1' as real and labels == 2 marks
                       'text_2' as real. Missing texts are treated as ''.
        top_k: The number of most similar fakes to pair with each real text.
               Must be at least 1; values larger than the number of fake texts
               are capped at that number.
        random_state: Optional seed for the final shuffle. ``None`` (default)
               keeps the previous non-deterministic shuffle.

    Returns:
        A shuffled DataFrame with 'text_1', 'text_2' and 'label' columns, where
        label 1 means 'text_1' is the real text and label 0 means 'text_2' is.
        Empty (with those columns) when the pool contains no usable rows.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    # top_k == 0 would turn the "[:, -top_k:]" slice below into "[:, 0:]" and
    # silently select *every* fake text, so reject it up front.
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    print("Starting hard negative mining data augmentation...")
    first_is_real = train_pool_df['labels'] == 1
    second_is_real = train_pool_df['labels'] == 2

    # fillna('') keeps NaN (float) entries out of the sentence encoder, in
    # line with how the rest of this notebook treats missing texts.
    real_texts = pd.concat([
        train_pool_df.loc[first_is_real, 'text_1'],
        train_pool_df.loc[second_is_real, 'text_2']
    ]).sort_index().fillna('').tolist()

    fake_texts = pd.concat([
        train_pool_df.loc[first_is_real, 'text_2'],
        train_pool_df.loc[second_is_real, 'text_1']
    ]).sort_index().fillna('').tolist()

    # Nothing to pair: return an empty frame instead of failing while encoding.
    if not real_texts:
        return pd.DataFrame(columns=['text_1', 'text_2', 'label'])

    print("Generating text embeddings...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    real_embeddings = model.encode(real_texts, show_progress_bar=True)
    fake_embeddings = model.encode(fake_texts, show_progress_bar=True)

    print("Calculating similarity and finding hard negatives...")

    # Rows index real texts, columns index fake texts.
    similarity_matrix = cosine_similarity(real_embeddings, fake_embeddings)

    # argsort is ascending, so the last k columns are the k most similar fakes.
    k = min(top_k, len(fake_texts))
    hard_negative_indices = np.argsort(similarity_matrix, axis=1)[:, -k:]

    augmented_pairs = []
    for real_text, fake_row in zip(real_texts, hard_negative_indices):
        for fake_idx in fake_row:
            hard_fake_text = fake_texts[fake_idx]
            # Emit both orderings so position carries no label information.
            augmented_pairs.append({'text_1': real_text, 'text_2': hard_fake_text, 'label': 1})
            augmented_pairs.append({'text_1': hard_fake_text, 'text_2': real_text, 'label': 0})

    final_df = pd.DataFrame(augmented_pairs)
    print(f"Data augmentation complete. Generated {len(final_df)} training samples.")
    return final_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
from sklearn.model_selection import train_test_split
PATH = '/kaggle/input/fake-or-real-the-imposter-x-train/train_dataset.csv'
STATE = 42
df = pd.read_csv(PATH)
# Replace missing texts with '' (as the first pipeline does after loading) so
# no NaN reaches the sentence encoder used for hard-negative mining.
for text_col in ('text_1', 'text_2'):
    df[text_col] = df[text_col].fillna('')

train_df, val_df = train_test_split(df, test_size=0.15, random_state=STATE)

# Binary validation target matching create_hard_negative_dataset's convention:
# 1 when 'text_1' is the real text (original label 1), otherwise 0.
# .assign builds a new frame, avoiding an in-place write into the split
# result (which can trigger pandas' SettingWithCopyWarning).
val_df = val_df.assign(labels=np.where(val_df['labels'] == 1, 1, 0))

augmented_train_df = create_hard_negative_dataset(train_df)

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification,create_optimizer
# --- Backbone / tokenization ---
MODEL_NAME = 'microsoft/deberta-v3-base'
MAX_TOKEN_LEN = 256  # passed to the tokenizer as max_length

# --- Optimisation ---
EPOCHS = 4
LR = 2.5e-5
BATCH_SIZE = 4
WT_DECAY = 0.01

# --- Carried over from the shifted-ensemble configuration ---
NUM_MODELS = 5
SLIDE_PER_RUN = int(len(df) * 0.8 / NUM_MODELS)

# --- Runtime ---
STRATEGY = tf.distribute.MirroredStrategy()

In [None]:
def tokenize_pack_data(X,y):
    """Tokenize text pairs and pack them with their labels into a batched dataset.

    Uses the module-level ``tokenizer``, ``MAX_TOKEN_LEN`` and ``BATCH_SIZE``.

    Args:
        X: DataFrame with 'text_1' and 'text_2' columns; missing values are
           replaced with empty strings before tokenization.
        y: pandas Series of labels, row-aligned with ``X``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches of
        size ``BATCH_SIZE``, with prefetching enabled.
    """
    first_texts = X['text_1'].fillna('').tolist()
    second_texts = X['text_2'].fillna('').tolist()

    # Each row is encoded as one sequence pair (text_1, text_2).
    pair_encodings = tokenizer(
        first_texts,
        second_texts,
        truncation=True,
        padding=True,
        max_length=MAX_TOKEN_LEN,
    )

    features = dict(pair_encodings)
    targets = y.values
    batched = tf.data.Dataset.from_tensor_slices((features, targets)).batch(BATCH_SIZE)
    return batched.prefetch(tf.data.AUTOTUNE)

In [None]:
# Build and compile the single-logit classifier for the hard-negative dataset.
with STRATEGY.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)

    # The LR schedule counts optimizer steps (batches), not samples.
    # tokenize_pack_data batches by BATCH_SIZE and keeps the final partial
    # batch, so steps per epoch is ceil(n_samples / BATCH_SIZE).
    steps_per_epoch = (len(augmented_train_df) + BATCH_SIZE - 1) // BATCH_SIZE
    num_train_steps = steps_per_epoch * EPOCHS
    optimizer, _ = create_optimizer(init_lr=LR,
                                    num_warmup_steps=int(0.1 * num_train_steps),
                                    num_train_steps=num_train_steps,
                                    weight_decay_rate=WT_DECAY
                                    )

    # Train the logit with binary cross-entropy. Inference later applies a
    # sigmoid and a 0.5 cut-off, which is only meaningful for a logistic
    # output; without an explicit loss the model falls back to its built-in
    # loss (a regression loss for a single-output head in the Transformers TF
    # implementation -- confirm against the installed version).
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    # Outputs are raw logits, so the accuracy threshold is 0.0; the 'accuracy'
    # string shortcut would apply the default 0.5 threshold to the logits.
    accuracy = tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)
    model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])

In [None]:
# Slow (non-fast) tokenizer for the backbone; tokenize_pack_data reads it
# from module scope.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

# Training set: mined (real, fake) pairs with their 0/1 'label' column.
train_pairs = augmented_train_df[['text_1','text_2']]
train_targets = augmented_train_df['label']
augmented_train_dataset = tokenize_pack_data(train_pairs, train_targets)

# Validation set: original pairs with the binarised 'labels' column.
val_pairs = val_df[['text_1','text_2']]
val_targets = val_df['labels']
val_dataset = tokenize_pack_data(val_pairs, val_targets)

In [None]:
# Fine-tune on the mined pairs, evaluating on the held-out pairs each epoch.
history = model.fit(augmented_train_dataset, validation_data=val_dataset, epochs=EPOCHS)

In [None]:
# Load the test pairs and encode them as TensorFlow tensors with the same
# truncation/padding settings used for training.
test_df = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv')
test_first_texts = test_df['text_1'].fillna('').tolist()
test_second_texts = test_df['text_2'].fillna('').tolist()
test_encodings = tokenizer(
    test_first_texts,
    test_second_texts,
    truncation=True,
    padding=True,
    max_length=MAX_TOKEN_LEN,
    return_tensors="tf",
)

In [None]:
# Run the model on the encoded test pairs and squash the logits to a flat
# array of probabilities.
test_outputs = model.predict(dict(test_encodings))
test_logits = test_outputs.logits
test_proba = tf.nn.sigmoid(test_logits).numpy().flatten()

In [None]:
# Display the predicted probabilities for a quick visual sanity check.
test_proba

In [None]:
# Probability above 0.5 means 'text_1' is predicted real (id 1); otherwise
# 'text_2' is (id 2).
test_labels = np.where(test_proba > 0.5, 1, 2)

In [None]:
def make_submission_csv(results, output_path='DEBERTa_on_hard_negative_dataset(k=3).csv'):
    """Write predictions to a submission CSV and return the written frame.

    Args:
        results: One-dimensional collection of predicted ids (anything
            ``pd.DataFrame`` turns into a single column).
        output_path: Destination CSV path. Defaults to the file name this
            pipeline has always written, so existing calls are unchanged.

    Returns:
        DataFrame with an 'id' column (the row position of each prediction)
        and a 'real_text_id' column (the prediction itself).

    Raises:
        ValueError: If ``results`` does not produce exactly one column.
    """
    # Copy so that renaming columns never touches a frame the caller passed in.
    output_df = pd.DataFrame(results).copy()
    output_df.columns = ['real_text_id']
    # Promote the positional index to an explicit 'id' column.
    output_df = output_df.reset_index().rename(columns={'index': 'id'})
    output_df.to_csv(output_path, index=False)
    return output_df
# Write the hard-negative model's submission file; as the cell's last
# expression, the returned DataFrame is also shown as the cell output.
make_submission_csv(test_labels)