In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Location of the test CSV on Kaggle. The matching training-path constant is
# kept below only as disabled code.
# TRAIN_DATA_PATH = '/kaggle/input/fake-or-real-the-imposter-x-train/train_data.csv'
TEST_DATA_PATH = '/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv'

In [None]:
# Read the test CSV into a DataFrame. The path is the same string as the
# constant assigned in the first cell; it is simply assigned again here.
# train_df = pd.read_csv(TRAIN_DATA_PATH)
TEST_DATA_PATH = '/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv'
test_df = pd.read_csv(TEST_DATA_PATH)

# **Preparing the Data for our Deep Learning Model...**

In [None]:
import os 
path = '/kaggle/input/fake-or-real-the-impostor-hunt/data/train'

# Build one row per article directory: [directory name, file text, file text].
#
# os.listdir() returns entries in arbitrary order, so both levels are sorted to
# make the result reproducible. The files inside a directory are taken in
# DESCENDING name order because the next cell names the columns
# ['article_number', 'text_2', 'text_1'] by position.
# NOTE(review): this assumes every directory holds exactly two files whose
# names sort as "text 1" < "text 2" (e.g. file_1.txt / file_2.txt) -- confirm.
train_df = []
for articles in sorted(os.listdir(path)):
    article_dir = os.path.join(path, articles)
    data_point = [articles]
    for text in sorted(os.listdir(article_dir), reverse=True):
        # The context manager closes the file; no explicit close() is needed.
        with open(os.path.join(article_dir, text), 'r', encoding='utf-8') as file:
            data_point.append(file.read())
    train_df.append(data_point)
train_df[:1]

In [None]:
# Turn the list of [directory name, text, text] rows into a table and label
# its three positional columns in a single expression.
train_df = pd.DataFrame(train_df).set_axis(
    ['article_number', 'text_2', 'text_1'], axis=1
)

In [None]:
# Numeric id = second '_'-separated piece of the directory name, as an int.
train_df['id'] = train_df['article_number'].str.split('_').str.get(1).astype(int)
# Order the articles by id and renumber the rows 0..n-1.
train_df = train_df.sort_values('id', ignore_index=True)

train_labels_path = '/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv'
train_labels = pd.read_csv(train_labels_path)
print(len(train_labels))

# Show every column and every row when a frame is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Remove the same two row labels from both tables so they stay aligned.
rows_to_drop = [10, 14]
train_labels = train_labels.drop(index=rows_to_drop)
train_df = train_df.drop(index=rows_to_drop)

print(len(train_labels), len(train_df))

# `train_df_total` is the same object as `train_df` (no copy is made); the
# label column is matched to it by row index.
train_df_total = train_df
train_df_total['labels'] = train_labels['real_text_id']

# RUN FROM HERE FOR TRAINING THE TRANSFORMER MODEL 

In [None]:
# Load the prepared training table from CSV. The cells below read its
# 'text_1', 'text_2' and 'labels' columns.
train_dataset_path = '/kaggle/input/fake-or-real-the-imposter-x-train/train_dataset.csv'
train_dataset = pd.read_csv(train_dataset_path)

In [None]:
# Preview the first rows of the loaded training table.
train_dataset.head()

In [None]:
# `df` is a second name for the same DataFrame object (no copy is made), so
# columns added to `df` afterwards also appear on `train_dataset`.
df = train_dataset

In [None]:
from sklearn.model_selection import train_test_split
# Each row of `df` holds a pair (text_1, text_2); `labels` says which one is
# real (1 or 2). Re-express every pair as (real text, fake text).
# NOTE(review): a row whose `labels` value is neither 1 nor 2 is treated as if
# text_1 were real -- confirm the column only ever contains 1 and 2.
df['final_text1'] = df['text_1']
df['final_text2'] = df['text_2']
df['final_label'] = np.where(df['labels'] == 1, 1, 0) # 1 if text1 is real, 0 otherwise

# Rows where text_2 is the real one: swap so 'final_text1' is always real.
swap_indices = df['labels'] == 2
df.loc[swap_indices, 'final_text1'] = df.loc[swap_indices, 'text_2']
df.loc[swap_indices, 'final_text2'] = df.loc[swap_indices, 'text_1']


def _mirror_pairs(pairs, seed):
    """Return both orderings of every (real, fake) pair, in shuffled order.

    label 1 -> the real text is in 'text1'; label 0 -> the real text is in
    'text2'. Emitting both orderings keeps the classes exactly balanced and
    removes any positional bias.
    """
    real_first = pd.DataFrame({
        'text1': pairs['final_text1'],
        'text2': pairs['final_text2'],
        'label': 1,
    })
    fake_first = pd.DataFrame({
        'text1': pairs['final_text2'],
        'text2': pairs['final_text1'],
        'label': 0,
    })
    mirrored = pd.concat([real_first, fake_first], ignore_index=True)
    return mirrored.sample(frac=1, random_state=seed).reset_index(drop=True)


# Split the ORIGINAL pairs first and mirror each side afterwards. Mirroring
# before the split would let a validation example be the swapped copy of a
# training example, which leaks the answer and inflates validation scores.
train_pairs, val_pairs = train_test_split(df, test_size=0.2, random_state=42)
train_df = _mirror_pairs(train_pairs, seed=42)
val_df = _mirror_pairs(val_pairs, seed=42)

# Kept for any later cell that still expects the combined table.
final_df = pd.concat([train_df, val_df], ignore_index=True)

In [None]:
from transformers import BertTokenizer, TFAutoModelForSequenceClassification

In [None]:
# Local import: the import cell above only brings in BertTokenizer.
from transformers import AutoTokenizer

model_name = 'roberta-base'
# AutoTokenizer selects the tokenizer class that belongs to the checkpoint.
# A BERT WordPiece tokenizer cannot read a RoBERTa checkpoint's vocabulary
# files, so the tokenizer must always be resolved from `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def _encode_pairs(frame):
    """Tokenize the ('text1', 'text2') columns of `frame` as sentence pairs.

    Sequences are truncated to 512 tokens and padded to the longest one in
    the batch of rows passed in.
    """
    return tokenizer(
        frame['text1'].tolist(),
        frame['text2'].tolist(),
        truncation=True,
        padding=True,
        max_length=512
    )


# Tokenize the data
train_encodings = _encode_pairs(train_df)
val_encodings = _encode_pairs(val_df)

# Extract labels
train_labels = train_df['label'].values
val_labels = val_df['label'].values

In [None]:
import tensorflow as tf
# Number of examples per batch for both pipelines.
BATCH_SIZE = 8


def _as_tf_dataset(encodings, labels):
    """Pair the tokenizer output (as a plain dict) with its labels, one example per element."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


# Training pipeline: shuffle with a buffer as large as the training set, then batch.
train_dataset = _as_tf_dataset(train_encodings, train_labels).shuffle(len(train_df)).batch(BATCH_SIZE)
# Validation pipeline: batched in its existing order.
val_dataset = _as_tf_dataset(val_encodings, val_labels).batch(BATCH_SIZE)

In [None]:
# Load the pre-trained checkpoint named by `model_name` with a single-logit
# classification head (num_labels=1). The model outputs raw logits; the
# sigmoid is applied inside the loss (from_logits=True).
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)


def accuracy(y_true, y_pred):
    """Per-sample binary accuracy computed directly on logits.

    The string metric 'accuracy' resolves to binary accuracy with a 0.5
    threshold, which is only correct for probabilities. The model emits
    logits, so the decision boundary is logit > 0 (probability > 0.5).
    The function is deliberately named `accuracy` so the History keys stay
    'accuracy' / 'val_accuracy'.
    """
    predicted = tf.cast(tf.reshape(y_pred, [-1]) > 0.0, tf.float32)
    actual = tf.cast(tf.reshape(y_true, [-1]), tf.float32)
    return tf.cast(tf.equal(actual, predicted), tf.float32)


# Compile the model
# A small learning rate is used for fine-tuning.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True) # Use from_logits=True for stability
model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])

# Train the model
NUM_EPOCHS = 5
history = model.fit(
    train_dataset,
    epochs=NUM_EPOCHS,
    validation_data=val_dataset
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

def plot_training_history(history):
    """
    Print the best validation epoch(s) and draw loss/accuracy curves.

    Args:
        history: Object with a ``history`` dict (as returned by ``model.fit``)
            containing the keys 'loss', 'val_loss', 'accuracy' and
            'val_accuracy', one value per epoch.
    """
    hist_df = pd.DataFrame(history.history)
    # Epochs are shown 1-based, while the frame index is 0-based.
    epochs = hist_df.index + 1

    # --- Statistical summary of the validation metrics ---
    val_acc = hist_df['val_accuracy']
    val_loss = hist_df['val_loss']
    best_epoch_acc, best_val_acc = val_acc.idxmax() + 1, val_acc.max()
    best_epoch_loss, best_val_loss = val_loss.idxmin() + 1, val_loss.min()

    print("--- Training Summary ---")
    print(f"Best Validation Accuracy: {best_val_acc:.4f} at epoch {best_epoch_acc}")
    print(f"Lowest Validation Loss: {best_val_loss:.4f} at epoch {best_epoch_loss}")
    print("------------------------\n")

    # --- Training curves: one panel per metric ---
    sns.set_style("whitegrid")
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # (train key, validation key, train legend, validation legend, title, y label)
    panels = (
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Loss Curves', 'Loss'),
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Accuracy Curves', 'Accuracy'),
    )
    for ax, (train_key, val_key, train_legend, val_legend, title, y_label) in zip(axes, panels):
        ax.plot(epochs, hist_df[train_key], label=train_legend, color='blue')
        ax.plot(epochs, hist_df[val_key], label=val_legend, color='orange')
        ax.set_title(title, fontsize=14)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Print the summary and draw the loss/accuracy curves for the run stored in `history`.
plot_training_history(history)

In [None]:
# Folder (relative to the working directory) that receives the export.
save_path = './my_bert_tf_model'

# Both objects expose save_pretrained(); write the model first and the
# tokenizer second, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model and tokenizer saved to {save_path}")

In [None]:
# Shell escape: recursively zip the exported model folder into bert_tf_model.zip.
# NOTE(review): the absolute path only matches save_path ('./my_bert_tf_model')
# when the working directory is /kaggle/working -- confirm.
!zip -r bert_tf_model.zip /kaggle/working/my_bert_tf_model

# RUN FROM HERE FOR ONLY SCORE CHECKING
# Checking with both the machine-learning ensemble model and the transformer model
### Each model's predicted probability is weighted according to its score on test_data


In [None]:
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import logging
# Read the test pairs again at the start of the score-checking section.
TEST_DATA_PATH = '/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv'
test_df = pd.read_csv(TEST_DATA_PATH)

In [None]:
# Suppress the benign tokenizer warning for cleaner output.
# `logging` here is the module imported from `transformers` in the cell above,
# not the standard-library `logging` module.
logging.set_verbosity_error()

def predict_hybrid_ensemble(
    classical_models,
    transformer_model,
    transformer_tokenizer,
    test_df,
    weights,
    max_length=512,
    batch_size=32,
    inverse_classical=False
):
    """
    Predict which text of each pair is real by blending two model families.

    Args:
        classical_models (list): Fitted models exposing ``predict_proba`` on a
            list of raw strings; column 1 of the result is used. Must not be
            empty.
        transformer_model: Model whose ``predict`` result has a ``logits``
            attribute holding one logit per pair.
        transformer_tokenizer: Callable tokenizer applied to the two text lists.
        test_df (pd.DataFrame): Must contain the columns 'text_1' and 'text_2';
            missing values are replaced by empty strings.
        weights (tuple): (classical_weight, transformer_weight), e.g. (0.5, 0.5).
        max_length (int): Max sequence length for the tokenizer.
        batch_size (int): Batch size for transformer inference.
        inverse_classical (bool): When true, the classical score is replaced by
            ``1 - score`` before blending.

    Returns:
        np.ndarray: One entry per row -- 1 where the blended score is above
        0.5, otherwise 2.

    Raises:
        ValueError: If ``classical_models`` is empty.
    """
    classical_models = list(classical_models)
    if not classical_models:
        # Averaging an empty list yields NaN, which would silently turn every
        # prediction into 2.
        raise ValueError("classical_models must contain at least one model.")
    # Unpack early so a malformed `weights` fails before any inference runs.
    weight_classical, weight_transformer = weights

    print("--- Starting Hybrid Ensemble Prediction ---")

    # --- Step 1: Get Predictions from the Classical Ensemble ---
    print("Step 1: Getting predictions from classical models...")
    all_text_1s = test_df['text_1'].fillna('').tolist()
    all_text_2s = test_df['text_2'].fillna('').tolist()

    # Average column 1 of predict_proba across the models, separately per text.
    avg_classical_prob_t1 = np.mean(
        [model.predict_proba(all_text_1s)[:, 1] for model in classical_models], axis=0
    )
    avg_classical_prob_t2 = np.mean(
        [model.predict_proba(all_text_2s)[:, 1] for model in classical_models], axis=0
    )

    # Score for "text 1": its own averaged probability when that is the larger
    # of the two, otherwise the complement of text 2's.
    # NOTE(review): the two branches are not symmetric (e.g. 0.3 vs 0.2 gives
    # 0.3, below the 0.5 cut-off even though text 1 scored higher) -- confirm
    # this is intended.
    classical_final_probs = np.where(
        avg_classical_prob_t1 > avg_classical_prob_t2,
        avg_classical_prob_t1,
        1 - avg_classical_prob_t2,
    )
    if inverse_classical:
        print('Inversing the Classical Models Probabilites.\n')
        classical_final_probs = 1 - classical_final_probs

    # --- Step 2: Get Predictions from the Transformer Model ---
    print("Step 2: Getting predictions from the transformer model...")
    test_encodings = transformer_tokenizer(
        all_text_1s,
        all_text_2s,
        truncation=True,
        padding=True,
        max_length=max_length
    )
    test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(batch_size)

    logits = transformer_model.predict(test_dataset).logits
    # One sigmoid probability per pair, flattened to 1-D to match the classical scores.
    transformer_final_probs = tf.nn.sigmoid(logits).numpy().flatten()

    # --- Step 3: Blend the Probabilities ---
    print("Step 3: Blending model probabilities...")
    blended_probs = (weight_classical * classical_final_probs) + (weight_transformer * transformer_final_probs)

    # --- Step 4: Make Final Predictions ---
    print("Step 4: Generating final labels...")
    final_predictions = np.where(blended_probs > 0.5, 1, 2)

    print("--- Prediction process finished. ---")
    return final_predictions

In [None]:
# Report how many entries are null in each text column of the test set.
for caption, column in (
    ("Missing values in text1:", 'text_1'),
    ("Missing values in text2:", 'text_2'),
):
    print(caption, test_df[column].isnull().sum())

In [None]:
import joblib
import os
from transformers import BertTokenizer, TFAutoModelForSequenceClassification
# Local import: the import lines above only bring in BertTokenizer.
from transformers import AutoTokenizer

MODEL_PATH = '/kaggle/input/tf-bert-model-for-fake-or-real-the-imposter-compt'
transformer_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
# Resolve the tokenizer class from the saved files, the same way the model
# class is resolved, instead of hard-wiring one tokenizer type.
transformer_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

classical_models_path = '/kaggle/input/best-machine-learning-models-for-this-dataset'
# Load every file in the folder as a model, in sorted (reproducible) order.
# The comprehension variable does not leak, so it cannot overwrite the
# notebook-level `model` variable the way a plain `for model in ...` loop does.
# NOTE: joblib.load unpickles arbitrary objects -- only load trusted files.
classical_models = [
    joblib.load(os.path.join(classical_models_path, model_file))
    for model_file in sorted(os.listdir(classical_models_path))
]

# Blend weights proportional to each model family's recorded score.
score_classical = 0.84024
score_transformer = 0.83195
total_score = score_classical + score_transformer
weights = (score_classical / total_score, score_transformer / total_score)
print(f"Using weights: Classical={weights[0]:.2f}, Transformer={weights[1]:.2f}")

In [None]:
# Run the blended predictor on the test pairs. inverse_classical=True replaces
# the classical score by (1 - score) before it is blended.
final_predictions = predict_hybrid_ensemble(
    classical_models=classical_models,
    transformer_model=transformer_model,
    transformer_tokenizer=transformer_tokenizer,
    test_df=test_df,
    weights=weights, 
    inverse_classical=True
)

In [None]:
def make_submission_csv(results):
    """Write `results` to 'sample_submission.csv' and return the table.

    The single data column is named 'real_text_id'; the row position becomes
    the 'id' column. The input object is copied first, so it is not modified.
    """
    submission = (
        pd.DataFrame(results)
        .copy()
        .set_axis(['real_text_id'], axis=1)
        .reset_index()
        .rename(columns={'index': 'id'})
    )
    submission.to_csv('sample_submission.csv', index=False)
    return submission

In [None]:
# Write sample_submission.csv; the returned table is shown as the cell output.
make_submission_csv(final_predictions)

# Pseudo-Labelling the TEST DATA for a Larger Amount of Training Data

In [None]:
# Suppress the benign tokenizer warning for cleaner output.
# This repeats the call made before the first predictor definition.
logging.set_verbosity_error()

def predict_hybrid_ensemble_proba(
    classical_models,
    transformer_model,
    transformer_tokenizer,
    test_df,
    weights,
    max_length=512,
    batch_size=32,
    inverse_classical=False
):
    """
    Return the blended score of a classical ensemble and a transformer model.

    Unlike a label-producing predictor, this function stops before
    thresholding and returns the raw blended values.

    Args:
        classical_models (list): Fitted models exposing ``predict_proba`` on a
            list of raw strings; column 1 of the result is used. Must not be
            empty.
        transformer_model: Model whose ``predict`` result has a ``logits``
            attribute holding one logit per pair.
        transformer_tokenizer: Callable tokenizer applied to the two text lists.
        test_df (pd.DataFrame): Must contain the columns 'text_1' and 'text_2';
            missing values are replaced by empty strings.
        weights (tuple): (classical_weight, transformer_weight), e.g. (0.5, 0.5).
        max_length (int): Max sequence length for the tokenizer.
        batch_size (int): Batch size for transformer inference.
        inverse_classical (bool): When true, the classical score is replaced by
            ``1 - score`` before blending.

    Returns:
        np.ndarray: One blended score per row:
        ``classical_weight * classical + transformer_weight * transformer``.

    Raises:
        ValueError: If ``classical_models`` is empty.
    """
    classical_models = list(classical_models)
    if not classical_models:
        # Averaging an empty list yields NaN, which would silently poison
        # every blended score.
        raise ValueError("classical_models must contain at least one model.")
    # Unpack early so a malformed `weights` fails before any inference runs.
    weight_classical, weight_transformer = weights

    print("--- Starting Hybrid Ensemble Prediction ---")

    # --- Step 1: Get Predictions from the Classical Ensemble ---
    print("Step 1: Getting predictions from classical models...")
    all_text_1s = test_df['text_1'].fillna('').tolist()
    all_text_2s = test_df['text_2'].fillna('').tolist()

    # Average column 1 of predict_proba across the models, separately per text.
    avg_classical_prob_t1 = np.mean(
        [model.predict_proba(all_text_1s)[:, 1] for model in classical_models], axis=0
    )
    avg_classical_prob_t2 = np.mean(
        [model.predict_proba(all_text_2s)[:, 1] for model in classical_models], axis=0
    )

    # Score for "text 1": its own averaged probability when that is the larger
    # of the two, otherwise the complement of text 2's.
    # NOTE(review): the two branches are not symmetric (e.g. 0.3 vs 0.2 gives
    # 0.3 even though text 1 scored higher) -- confirm this is intended.
    classical_final_probs = np.where(
        avg_classical_prob_t1 > avg_classical_prob_t2,
        avg_classical_prob_t1,
        1 - avg_classical_prob_t2,
    )
    if inverse_classical:
        print('Inversing the Classical Models Probabilites.\n')
        classical_final_probs = 1 - classical_final_probs

    # --- Step 2: Get Predictions from the Transformer Model ---
    print("Step 2: Getting predictions from the transformer model...")
    test_encodings = transformer_tokenizer(
        all_text_1s,
        all_text_2s,
        truncation=True,
        padding=True,
        max_length=max_length
    )
    test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(batch_size)

    logits = transformer_model.predict(test_dataset).logits
    # One sigmoid probability per pair, flattened to 1-D to match the classical scores.
    transformer_final_probs = tf.nn.sigmoid(logits).numpy().flatten()

    # --- Step 3: Blend the Probabilities ---
    print("Step 3: Blending model probabilities...")
    blended_probs = (weight_classical * classical_final_probs) + (weight_transformer * transformer_final_probs)

    print("--- Prediction process finished. ---")
    return blended_probs

In [None]:
# Blended scores (not labels) for every test pair, computed with the same
# models, weights and inverse_classical=True setting as the submission above.
pseudo_probabilities = predict_hybrid_ensemble_proba(
    classical_models=classical_models,
    transformer_model=transformer_model,
    transformer_tokenizer=transformer_tokenizer,
    test_df=test_df,
    weights=weights, 
    inverse_classical=True
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of the blended scores held in `pseudo_probabilities`, drawn on an
# explicit Axes object instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(pseudo_probabilities, bins=50, ax=ax)
ax.set_title('Distribution of Prediction Probabilities on Test Set')
ax.set_xlabel('Predicted Probability (Confidence)')
ax.set_ylabel('Count')
plt.show()

In [None]:
confidence_threshold = 0.90 # 90% confidence

# A score counts as confident when it lies strictly above the threshold or
# strictly below its mirror image (1 - threshold).
upper_cut = confidence_threshold
lower_cut = 1 - confidence_threshold
is_confident = (pseudo_probabilities > upper_cut) | (pseudo_probabilities < lower_cut)
high_confidence_indices = np.nonzero(is_confident)[0]

print(f"Samples found with >90% or <10% probability: {len(high_confidence_indices)}")

In [None]:
confidence_threshold = 0.90

# Positions of the test rows whose blended score is above the threshold or
# below (1 - threshold).
above_threshold = pseudo_probabilities > confidence_threshold
below_mirror = pseudo_probabilities < (1 - confidence_threshold)
high_confidence_indices = np.where(above_threshold | below_mirror)[0]

# Copy of the confident rows, together with their scores.
confident_probs = pseudo_probabilities[high_confidence_indices]
pseudo_df = test_df.iloc[high_confidence_indices].copy()

# Pseudo-label: 1 when the score is above 0.5, otherwise 2.
pseudo_df['label'] = np.where(confident_probs > 0.5, 1, 2)

In [None]:
# Preview the first rows of the confident, pseudo-labelled subset.
pseudo_df.head()

In [None]:
# Number of pseudo-labelled rows that were kept.
len(pseudo_df)

In [None]:
# Persist the pseudo-labelled rows. `index=False` is not passed, so the
# DataFrame index is written as the first CSV column.
# NOTE(review): the label column here is named 'label', while the training
# cells read a column named 'labels' -- confirm how this file is consumed.
pseudo_df.to_csv('psuedo_data(from_mix_ml_and_BERT)_1.csv')