# In [None]:
import pandas as pd

# Locations of the two TruthSeeker CSV exports on the mounted Google Drive
features_path = '/content/drive/MyDrive/TruthSeeker/Features_For_Traditional_ML_Techniques.csv'
truth_seeker_path = '/content/drive/MyDrive/TruthSeeker/Truth_Seeker_Model_Dataset.csv'

# Read each CSV into its own DataFrame
features_df = pd.read_csv(features_path)
truth_seeker_df = pd.read_csv(truth_seeker_path)

# Preview the first rows of both frames
features_df.head(), truth_seeker_df.head()

# The first column of each file is relabelled 'ID' so both frames share a key name
features_df = features_df.rename(columns={features_df.columns[0]: 'ID'})
truth_seeker_df = truth_seeker_df.rename(columns={truth_seeker_df.columns[0]: 'ID'})

# 'tweet' and 'statement' exist in both frames; keep only the copy in truth_seeker_df
features_df_dropped = features_df.drop(['tweet', 'statement'], axis=1)

# Join the frames on the pair ('ID', 'BinaryNumTarget') (default inner join)
joined_df = features_df_dropped.merge(truth_seeker_df, on=['ID', 'BinaryNumTarget'])

# Preview the merged frame
joined_df.head()

# Summary statistics of the merged frame
joined_df.describe()

# Column dtypes and non-null counts of the merged frame
joined_df.info()

import matplotlib.pyplot as plt

# Tally how many rows carry each 'BinaryNumTarget' value
value_counts = joined_df['BinaryNumTarget'].value_counts()

# Bar labels and their heights: target 1 is shown as 'True', target 0 as 'Fake';
# a target value that never occurs contributes a count of 0
labels = ['True', 'Fake']
counts = [value_counts.get(target, 0) for target in (1, 0)]

# Draw the bars
plt.figure(figsize=(10, 6))
plt.bar(labels, counts, color=['blue', 'red'])

# Axis labels and title
plt.xlabel('News Type')
plt.ylabel('Count')
plt.title('Distribution of True and Fake News Articles')

# Write each count at the top of its bar
for position, height in enumerate(counts):
    plt.text(position, height, str(height), ha='center')

# Render the figure
plt.show()

# Identify numerical and categorical columns so each group gets its own
# imputation strategy
numerical_cols = joined_df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = joined_df.select_dtypes(include=['object']).columns

# Impute missing values in numerical columns with the column mean.
# The filled Series is assigned back to the frame: calling
# `joined_df[col].fillna(..., inplace=True)` operates on a temporary Series
# (chained assignment), which pandas 2.x warns about and which does not
# update `joined_df` once Copy-on-Write is active.
for col in numerical_cols:
    joined_df[col] = joined_df[col].fillna(joined_df[col].mean())

# Impute missing values in categorical columns with the column mode.
# A column with no non-null values has an empty mode; it is left untouched
# instead of failing on the lookup of the first mode.
for col in categorical_cols:
    modes = joined_df[col].mode()
    if not modes.empty:
        joined_df[col] = joined_df[col].fillna(modes.iloc[0])

# Remove duplicates
joined_df.drop_duplicates(inplace=True)

import spacy

# Number of texts handed to the spaCy pipeline per batch
batch_size = 2_000

# spaCy's small English pipeline; its parsed documents supply the lemmas
nlp = spacy.load("en_core_web_sm")

# Lemmatize a parsed document
def advanced_preprocessing(doc):
    """Return the lemmas of every token in *doc*, joined by single spaces.

    *doc* is any iterable of objects exposing a ``lemma_`` attribute
    (in this script, a parsed spaCy document).
    """
    return ' '.join(token.lemma_ for token in doc)

# Stream every statement through the spaCy pipeline in batches and lemmatize
# each parsed document; results stay in the same order as the DataFrame rows
preprocessed_texts = [
    advanced_preprocessing(parsed)
    for parsed in nlp.pipe(joined_df['statement'].values, batch_size=batch_size)
]

# Store the lemmatized statements as a new column
joined_df['statement_advanced'] = preprocessed_texts

  from transformers import BertTokenizer, BertModel
  import torch
  import numpy as np

  # Initialize the BERT tokenizer and model
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  model = BertModel.from_pretrained('bert-base-uncased')
  model.eval()  # the model is only used for inference; keep dropout disabled

  # Function to get BERT embeddings for a batch of text
  def get_bert_embeddings_for_batch(text_batch):
      """Return one mean-pooled BERT embedding per text in *text_batch*.

      Args:
          text_batch: list of strings; each is truncated to 512 tokens and
              the batch is padded to its longest member.

      Returns:
          A 2-D NumPy array with one row per input text, including when the
          batch holds a single text.

      Uses the module-level ``tokenizer`` and ``model``.
      """
      inputs = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512)

      # Inference only: skip autograd bookkeeping so no graph is kept per batch
      with torch.no_grad():
          outputs = model(**inputs)
      hidden = outputs.last_hidden_state

      # Average over real tokens only. Padding positions are zeroed out and
      # excluded from the divisor, so a text's embedding does not depend on
      # how long the other texts in its batch happen to be.
      mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
      token_counts = mask.sum(dim=1).clamp(min=1)
      embeddings = (hidden * mask).sum(dim=1) / token_counts

      # No squeeze(): a one-text batch must stay 2-D so that callers iterating
      # over rows still receive exactly one vector for it
      return embeddings.cpu().numpy()

  # Number of statements embedded per forward pass
  batch_size = 500

  # Collects the embeddings of all rows, in DataFrame order
  bert_embeddings = []

  # Walk the lemmatized statements in consecutive slices of `batch_size` rows
  statements = joined_df['statement_advanced']
  for start in range(0, len(joined_df), batch_size):
      chunk = statements.iloc[start:start + batch_size].tolist()
      bert_embeddings.extend(get_bert_embeddings_for_batch(chunk))

  # Turn the collected embeddings into a single NumPy array
  bert_embeddings = np.array(bert_embeddings)

  # Store the embeddings as a column: one array element per row
  joined_df['bert_embeddings'] = list(bert_embeddings)

import os
import tensorflow as tf
from tensorflow.keras import layers

# gRPC address of the TPU, read from the COLAB_TPU_ADDR environment variable
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']

# Resolve the TPU cluster, connect to it and initialize the TPU system
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

# Distribution strategy under which the Keras models are built
tpu_strategy = tf.distribute.TPUStrategy(resolver)

# Baseline network with fixed sizes, dropout and L2 regularization,
# built and compiled inside the TPU strategy scope
with tpu_strategy.scope():
    baseline_layers = [
        layers.Input(shape=(768,)),  # BERT embeddings size
        layers.Reshape((1, 768)),  # present each embedding as a length-1 sequence
        layers.Bidirectional(layers.LSTM(50, return_sequences=True)),
        layers.Dropout(0.4),
        layers.Bidirectional(layers.LSTM(25)),
        layers.Dense(30, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.Dense(1, activation='sigmoid'),
    ]
    model = tf.keras.Sequential(baseline_layers)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

import optuna
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Extract features and labels: stack the per-row embedding arrays into one
# 2-D matrix and take the target column as a NumPy array
X = np.stack(joined_df['bert_embeddings'].to_numpy())
y = joined_df['BinaryNumTarget'].values

# Compute class weights
unique_classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=unique_classes, y=y)

# Key each weight by the class label it belongs to. Keying by enumerate()
# position only matches the labels when they happen to be exactly 0..n-1.
# Labels are converted to int because they are used as Keras class indices.
class_weights_dict = {
    int(label): weight for label, weight in zip(unique_classes, class_weights)
}

def objective(trial):
    """Optuna objective: mean 5-fold validation accuracy for one sampled configuration.

    Args:
        trial: Optuna trial used to sample the LSTM width, dense width,
            dropout rate, learning rate and L1/L2 regularization strengths.

    Returns:
        The mean, over the 5 stratified folds, of the accuracy each trained
        model reaches on its validation fold.

    Reads the module-level ``X``, ``y``, ``class_weights_dict`` and
    ``tpu_strategy``.
    """
    # Hyperparameters to be optimized
    lstm_units = trial.suggest_int('lstm_units', 20, 50)
    dense_units = trial.suggest_int('dense_units', 10, 30)
    dropout_rate = trial.suggest_float('dropout_rate', 0.4, 0.7)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    l1_reg = trial.suggest_float('l1_reg', 1e-6, 1e-4, log=True)
    l2_reg = trial.suggest_float('l2_reg', 1e-6, 1e-4, log=True)

    # Initialize variables for k-fold cross-validation
    # NOTE(review): X holds one embedding per row, built from the statement
    # text; if the same statement occurs in several rows, identical vectors
    # can land in both the training and validation folds — confirm, and
    # consider a grouped split if so.
    k = 5
    kf = StratifiedKFold(n_splits=k)
    val_accuracies = []

    for train_index, val_index in kf.split(X, y):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Early stopping
        early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

        # Model architecture with hyperparameters
        with tpu_strategy.scope():
            model = tf.keras.Sequential([
                layers.Input(shape=(768,)),
                layers.Reshape((1, 768)),
                layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg))),
                layers.Dropout(dropout_rate),
                layers.Bidirectional(layers.LSTM(lstm_units//2, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg))),
                layers.Dense(dense_units, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)),
                layers.Dense(1, activation='sigmoid')
            ])

            opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
            model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

        model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_val, y_val), callbacks=[early_stop], class_weight=class_weights_dict)

        # Score the weights the model actually ends up with. The last entry of
        # history['val_accuracy'] belongs to the final epoch, which is not the
        # model state left behind when early stopping rolls back to the best
        # epoch. evaluate() returns [loss, accuracy] for this compile() call.
        _, fold_accuracy = model.evaluate(X_val, y_val, batch_size=128, verbose=0)
        val_accuracies.append(fold_accuracy)

    return np.mean(val_accuracies)

# Initialize Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import layers, regularizers
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import tensorflow as tf

# Extract features and labels for final evaluation
X = np.stack(joined_df['bert_embeddings'].to_numpy())
y = joined_df['BinaryNumTarget'].values

# Extract best parameters from Optuna study
best_params = study.best_params

# Initialize variables for Stratified K-Fold
k = 5
kf = StratifiedKFold(n_splits=k)
val_accuracies = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Model architecture with hyperparameters from Step 4
    with tpu_strategy.scope():
        model = tf.keras.Sequential([
            layers.Input(shape=(768,)),  # BERT embeddings size
            layers.Reshape((1, 768)),
            layers.Bidirectional(layers.LSTM(best_params['lstm_units'], return_sequences=True, kernel_regularizer=regularizers.l1_l2(l1=best_params['l1_reg'], l2=best_params['l2_reg']))),
            layers.Dropout(best_params['dropout_rate']),
            layers.Bidirectional(layers.LSTM(best_params['lstm_units']//2, kernel_regularizer=regularizers.l1_l2(l1=best_params['l1_reg'], l2=best_params['l2_reg']))),
            layers.Dense(best_params['dense_units'], activation='relu', kernel_regularizer=regularizers.l1_l2(l1=best_params['l1_reg'], l2=best_params['l2_reg'])),
            layers.Dense(1, activation='sigmoid')
        ])

        # Using the Adam optimizer with the suggested learning rate
        opt = tf.keras.optimizers.Adam(learning_rate=best_params['learning_rate'])

        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping callback with restore_best_weights
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Train the model. The hyperparameters were tuned with class weighting and
    # the final model is trained with it as well, so the same weights are
    # applied here to evaluate the configuration under the loss it was tuned for.
    history = model.fit(
        X_train, y_train, epochs=10, batch_size=64, callbacks=[early_stop],
        validation_split=0.1, class_weight=class_weights_dict
    )

    # Model predictions on the held-out fold
    y_pred = model.predict(X_test).flatten()
    y_pred_binary = np.round(y_pred)

    # Display classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred_binary))

    # Store the validation accuracy for this fold.
    # NOTE(review): this is the last epoch's accuracy on the 10% slice that
    # validation_split carves out of the training fold, not the accuracy on
    # the held-out fold reported above — confirm that is the intended summary.
    val_accuracies.append(history.history['val_accuracy'][-1])

# Print the mean validation accuracy across all folds
print("Mean Validation Accuracy: ", np.mean(val_accuracies))

from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Score X_test with whatever `model`, `X_test` and `y_test` are currently
# bound to (in file order: those left by the last cross-validation fold)
y_pred = model.predict(X_test).ravel()
y_pred_binary = np.round(y_pred)

# Confusion matrix of the rounded predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_binary)
sns.heatmap(cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.show()

# ROC curve computed from the raw sigmoid scores, with its area in the legend
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label=roc_label)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

# Precision-recall curve computed from the same raw scores
precision, recall, _ = precision_recall_curve(y_test, y_pred)
plt.figure()
plt.plot(recall, precision, color='b', lw=1, label='Precision-Recall curve')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc="upper right")
plt.show()

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf

# Seed NumPy and TensorFlow with the same value for reproducibility
for seed_everything in (np.random.seed, tf.random.set_seed):
    seed_everything(42)

# Hold out 20% of the data as a test set, stratified on the labels; the
# remaining 80% is used for training and validation
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_val, X_test, y_train_val, y_test = holdout_split

# L1/L2 strengths from the best Optuna trial, shared by every regularized layer
reg_strengths = {'l1': best_params['l1_reg'], 'l2': best_params['l2_reg']}

# Build and compile the final model (trained on X_train_val / y_train_val)
# inside the TPU strategy scope
with tpu_strategy.scope():
    final_layers = [
        layers.Input(shape=(768,)),  # BERT embeddings size
        layers.Reshape((1, 768)),
        layers.Bidirectional(layers.LSTM(
            best_params['lstm_units'],
            return_sequences=True,
            kernel_regularizer=regularizers.l1_l2(**reg_strengths),
        )),
        layers.Dropout(best_params['dropout_rate']),
        layers.Bidirectional(layers.LSTM(
            best_params['lstm_units'] // 2,
            kernel_regularizer=regularizers.l1_l2(**reg_strengths),
        )),
        layers.Dense(
            best_params['dense_units'],
            activation='relu',
            kernel_regularizer=regularizers.l1_l2(**reg_strengths),
        ),
        layers.Dense(1, activation='sigmoid'),
    ]
    final_model = tf.keras.Sequential(final_layers)

    # Adam with the tuned learning rate
    opt = tf.keras.optimizers.Adam(learning_rate=best_params['learning_rate'])
    final_model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# Stop when val_loss has not improved for 5 epochs and keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Fit with class weights, holding out 10% of the training data for validation
fit_options = dict(
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
    class_weight=class_weights_dict,
)
history = final_model.fit(X_train_val, y_train_val, **fit_options)

# Step 3: score the held-out test set and round the sigmoid outputs to labels
y_pred = final_model.predict(X_test).ravel()
y_pred_binary = np.round(y_pred)

# Overall accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Test Accuracy: {test_accuracy}')

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred_binary))

# Confusion matrix as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_binary)
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

# Training vs. validation accuracy per epoch
plt.figure()
for metric_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()