# In [None]:  (Jupyter cell marker left over from the notebook export)
import itertools
import os
import re
from collections import Counter
from collections import defaultdict

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from scipy import sparse
from scipy.sparse import lil_matrix, save_npz, load_npz
from scipy.sparse import load_npz
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch_geometric.nn import GCNConv
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

# ---------------------------------------------------------------------------
# Pipeline configuration: input/output locations and the shared tokenizer.
# All locations are absolute Windows paths or bare file names resolved against
# the current working directory.
# ---------------------------------------------------------------------------
directory_path = r"D:\New folder (5)"
embeddings_dir = os.path.join(directory_path, 'embeddings')
gcn_file_path = os.path.join(directory_path, 'node_embeddings_gcn16.npy')
labels_file_path = os.path.join(directory_path, 'label.txt')
source_tweets_file = r"D:\New folder (5)\source_tweets.txt"
# Relative names: these matrices are read from / written to the working directory.
dependency_matrix_path = 'dependency_matrix16.npz'
pmi_matrix_path = 'pmi_matrix16.npz'
# NOTE(review): same literal path as source_tweets_file above.
tweets_file_path = r"D:\New folder (5)\source_tweets.txt"
tree_folder_path = r"D:\New folder (5)\tree"
# NOTE(review): second assignment of embeddings_dir; it overrides the
# os.path.join(...) value computed a few lines earlier.
embeddings_dir = r"D:\New folder (5)\embeddings"
output_dir = r"D:\New folder (5)"

# Tokenizer shared by the PMI, dependency and embedding steps below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def load_and_tokenize(file_path, tokenizer, window=2):
    """Tokenize every line of a text file and count skip-gram co-occurrences.

    Args:
        file_path: Path of a UTF-8 text file with one tweet per line.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        window: Number of neighbours considered on each side of a token.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        A ``(token_ids, skipgram_counts)`` tuple: the flat list of token ids
        over the whole file, and a ``Counter`` keyed by ``(token_id,
        context_id)`` pairs. Pairs never cross line boundaries.
    """
    token_ids = []
    skipgram_counts = Counter()
    # Stream the file line by line rather than materialising it first.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(line.strip()))
            token_ids.extend(ids)
            length = len(ids)
            for i, center in enumerate(ids):
                lo = max(0, i - window)
                hi = min(length, i + window + 1)
                for j in range(lo, hi):
                    if j != i:
                        skipgram_counts[(center, ids[j])] += 1
    return token_ids, skipgram_counts

# Build PMI / smoothed-PMI matrices over the tokenizer vocabulary from the
# skip-gram counts of the tweet corpus.
# NOTE(review): this path differs from source_tweets_file / tweets_file_path
# defined in the configuration above — confirm which corpus is intended.
file_path = r"C:\Users\AI-BIO\Untitled Folder\source_tweets.txt"
token_ids, skipgram_counts = load_and_tokenize(file_path, tokenizer)


vocab_size = tokenizer.vocab_size
total_skipgrams = sum(skipgram_counts.values())
sum_over_words = np.zeros(vocab_size)
# This zero array is overwritten a few lines below before it is ever read.
sum_over_contexts = np.zeros(vocab_size)
# Unigram frequency of every token id in the corpus.
for token_id in token_ids:
    sum_over_words[token_id] += 1
# Context counts raised to the 0.75 power, used for the smoothed variant.
sum_over_contexts = sum_over_words ** 0.75
nca_denom = sum(sum_over_contexts)

pmi_values, spmi_values = [], []
rows, cols = [], []

for (tok1, tok2), count in skipgram_counts.items():
    # NOTE(review): Pw and Pc divide unigram counts by the number of skip-gram
    # pairs rather than by the number of tokens — confirm this is intended.
    Pwc = count / total_skipgrams
    Pw = sum_over_words[tok1] / total_skipgrams
    Pc = sum_over_words[tok2] / total_skipgrams
    Pca = sum_over_contexts[tok2] / nca_denom

    # Every stored count is at least 1, so the `else 0` branch is never taken.
    pmi = np.log2(Pwc / (Pw * Pc)) if Pwc > 0 else 0
    spmi = np.log2(Pwc / (Pw * Pca)) if Pwc > 0 else 0
    # The positive-clipped variants are computed but never stored or used below.
    ppmi = max(pmi, 0)
    sppmi = max(spmi, 0)

    rows.append(tok1)
    cols.append(tok2)
    pmi_values.append(pmi)
    spmi_values.append(spmi)


# Values equal to 0 are kept as explicitly stored entries of the CSR matrices.
pmi_mat = sparse.csr_matrix((pmi_values, (rows, cols)), shape=(vocab_size, vocab_size))
spmi_mat = sparse.csr_matrix((spmi_values, (rows, cols)), shape=(vocab_size, vocab_size))

print("Vocabulary size from BERT tokenizer:", vocab_size)
print("Shape of PMI matrix:", pmi_mat.shape)
print("Shape of SPMI matrix:", spmi_mat.shape)


# Load the pretrained BERT encoder in inference mode.
model_bert = BertModel.from_pretrained('bert-base-uncased')
model_bert.eval()


# Convert the PMI matrix to edge-list form: one (row, col) pair per stored
# entry, with the PMI value as the edge attribute.
pmi_coo = pmi_mat.tocoo()


edge_index = np.vstack((pmi_coo.row, pmi_coo.col))
edge_weight = np.array(pmi_coo.data, dtype=np.float32)


data_pmi = Data(edge_index=torch.tensor(edge_index, dtype=torch.long),
                edge_attr=torch.tensor(edge_weight, dtype=torch.float))
# Redundant re-imports kept from the original notebook cell.
from scipy import sparse
import torch


# Save under pmi_matrix_path: that is the file the training stage loads via
# create_graph_data(pmi_matrix_path, ...). The previous name 'pmi_matrix.npz'
# was never read back, so training saw a stale or missing file.
sparse.save_npz(pmi_matrix_path, pmi_mat)
sparse.save_npz('spmi_matrix.npz', spmi_mat)
torch.save(data_pmi, 'pmi_graph_data.pt')


def preprocess_text(text):
    """Return *text* lower-cased with each whitespace run collapsed to one space."""
    return re.sub(r'\s+', ' ', text.lower())

def create_dependency_matrix(tweets):
    """Count adjacent-word links across whitespace-tokenized tweets.

    Each neighbouring pair ``(left, right)`` is seen once from the left word
    and once from the right word, so it adds 2 to both its own count and the
    running total.

    Args:
        tweets: Iterable of strings.

    Returns:
        ``(dependency_counts, total_dependencies)``: a ``defaultdict(int)``
        keyed by ``(word, next_word)`` and the sum of all its counts.
    """
    pair_counts = defaultdict(int)
    link_total = 0

    for text in tqdm(tweets, desc="Processing tweets"):
        words = text.split()
        for left, right in zip(words, words[1:]):
            pair_counts[(left, right)] += 2
            link_total += 2

    return pair_counts, link_total


def load_data(file_path):
    """Read a UTF-8 text file and return its stripped, preprocessed lines."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()
    return [preprocess_text(raw.strip()) for raw in raw_lines]

class GAT(torch.nn.Module):
    """Two-layer graph attention network.

    The first layer uses two attention heads of width 64 whose outputs are
    concatenated; the second uses a single head producing ``out_features``.
    A class with the same name is defined again later in this file and
    replaces this one from that point on.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.conv1 = GATConv(in_features, 64, heads=2, concat=True)
        self.conv2 = GATConv(64 * 2, out_features, heads=1, concat=False)

    def forward(self, x, edge_index, edge_weight):
        """Run both attention layers with a ReLU in between."""
        # NOTE(review): the layers are built without `edge_dim`; confirm that
        # the third argument actually influences the attention scores.
        hidden = self.conv1(x, edge_index, edge_weight)
        hidden = torch.relu(hidden)
        return self.conv2(hidden, edge_index, edge_weight)


# Load the corpus explicitly: `tweets` was previously used here without ever
# being assigned (NameError when the script runs top to bottom).
tweets = load_data(source_tweets_file)
dependency_counts, total_dependencies = create_dependency_matrix(tweets)


# Turn the pair counts into a vocabulary-sized sparse matrix of relative
# frequencies. Words are whitespace tokens looked up directly in the
# tokenizer vocabulary; entries that land on the same (row, col) cell are
# summed by the CSR constructor.
rows, cols, data = [], [], []
for (tok1, tok2), count in tqdm(dependency_counts.items(), desc="Building sparse matrix"):
    rows.append(tokenizer.convert_tokens_to_ids([tok1])[0])
    cols.append(tokenizer.convert_tokens_to_ids([tok2])[0])
    data.append(count / total_dependencies)

dependency_mat = sparse.csr_matrix((data, (rows, cols)), shape=(tokenizer.vocab_size, tokenizer.vocab_size))


# Edge-list view of the matrix for torch_geometric.
dependency_coo = dependency_mat.tocoo()


edge_index_dep = np.vstack((dependency_coo.row, dependency_coo.col))
edge_weight_dep = np.array(dependency_coo.data, dtype=np.float32)


data_dep = Data(edge_index=torch.tensor(edge_index_dep, dtype=torch.long),
                edge_attr=torch.tensor(edge_weight_dep, dtype=torch.float))


def load_embeddings(tokenizer, model):
    """Encode every vocabulary entry on its own and collect one vector each.

    For each token id the token string is run through *tokenizer* and *model*
    individually, and the hidden state at sequence position 0 is kept.

    Args:
        tokenizer: Tokenizer exposing ``vocab_size``,
            ``convert_ids_to_tokens`` and the call interface.
        model: Encoder whose output has a ``last_hidden_state`` attribute.

    Returns:
        A NumPy array with one row per vocabulary entry.
    """
    first_position_vectors = []
    for vocab_idx in tqdm(range(tokenizer.vocab_size), desc="Loading embeddings"):
        piece = tokenizer.convert_ids_to_tokens([vocab_idx])
        batch = tokenizer(piece, return_tensors='pt')
        with torch.no_grad():
            result = model(**batch)
        vector = result.last_hidden_state[:, 0, :].squeeze()
        first_position_vectors.append(vector.cpu().numpy())
    return np.array(first_position_vectors)

# Compute one BERT vector per vocabulary entry and persist the dependency
# artefacts to the working directory.
from transformers import BertModel
# NOTE(review): model_bert was already loaded earlier in this file; this
# reloads it (and does not call .eval() again).
model_bert = BertModel.from_pretrained('bert-base-uncased')
embeddings = load_embeddings(tokenizer, model_bert)


node_features = torch.tensor(embeddings, dtype=torch.float32)
# Redundant re-imports left over from the notebook cell boundary.
from scipy import sparse
import torch


# Same file name as dependency_matrix_path in the configuration section.
sparse.save_npz('dependency_matrix16.npz', dependency_mat)

torch.save(data_dep, 'dependency_graph_data16.pt')

torch.save(node_features, 'node_features16.pt')


# Build the tweet propagation graph from the per-tweet tree files and save
# its adjacency matrix.
os.makedirs(output_dir, exist_ok=True)

tweets_df = pd.read_csv(tweets_file_path, sep='\t', header=None, names=['tweet_id', 'tweet_text'])

tree_files = [os.path.join(tree_folder_path, file_name) for file_name in os.listdir(tree_folder_path) if file_name.endswith('.txt')]


# parent id (taken from the file name) -> list of child ids (one per line).
graph_data = defaultdict(list)
for file_path in tqdm(tree_files, desc="Processing Tree Files"):
    parent_id = os.path.splitext(os.path.basename(file_path))[0]  # Parent tweet ID from file name
    with open(file_path, 'r') as file:
        for line in file:
            child_id = line.strip()
            # A blank line is not an id; skipping it avoids a spurious '' node.
            if child_id:
                graph_data[parent_id].append(child_id)


# Collect every id that appears as a parent or a child. The ids are sorted
# before truncating: slicing an unordered set kept a different, arbitrary
# subset (and different node indices) on every run, which breaks the
# alignment with the cached embeddings file.
unique_ids = set(graph_data)
for children_ids in graph_data.values():
    unique_ids.update(children_ids)

# NOTE(review): 75990 is an unexplained node cap carried over unchanged.
all_tweet_ids = sorted(unique_ids)[:75990]


tweet_id_to_index = {tweet_id: idx for idx, tweet_id in enumerate(all_tweet_ids)}


num_tweets = len(all_tweet_ids)
adjacency_matrix = lil_matrix((num_tweets, num_tweets), dtype=np.float32)

# Directed parent -> child edges; ids dropped by the cap are skipped.
for parent_id, children_ids in graph_data.items():
    parent_idx = tweet_id_to_index.get(parent_id)
    if parent_idx is not None:
        for child_id in children_ids:
            child_idx = tweet_id_to_index.get(child_id)
            if child_idx is not None:
                adjacency_matrix[parent_idx, child_idx] = 1


adjacency_matrix_csr = adjacency_matrix.tocsr()


save_npz(os.path.join(output_dir, 'adjacency_matrix16.npz'), adjacency_matrix_csr)


# Reload the adjacency matrix that was just written and turn it into the
# edge tensors and node features of a torch_geometric Data object.
adjacency_matrix_csr = load_npz(os.path.join(output_dir, 'adjacency_matrix16.npz'))


rows, cols = adjacency_matrix_csr.nonzero()
edge_index = np.vstack((rows, cols))
# NOTE(review): .data is paired with nonzero() by position; the two only line
# up when the matrix stores no explicit zeros.
edge_weight = adjacency_matrix_csr.data


edge_index = torch.tensor(edge_index, dtype=torch.long)
edge_weight = torch.tensor(edge_weight, dtype=torch.float)


# Node features: read from the cached HDF5 file if present, otherwise create
# a uniformly random (num_tweets, 128) matrix and cache it.
# NOTE(review): the generated features are unseeded random numbers, not
# text-derived embeddings — confirm this placeholder is intended.
embeddings_path = os.path.join(output_dir, 'combined_embeddings_matrix16.h5')
if not os.path.exists(embeddings_path):
    embedding_dim = 128
    embeddings_matrix = np.random.rand(num_tweets, embedding_dim).astype(np.float32)
    with h5py.File(embeddings_path, 'w') as hf:
        hf.create_dataset('embeddings', data=embeddings_matrix)
    print(f"Embeddings matrix generated and saved to {embeddings_path}")
else:
    with h5py.File(embeddings_path, 'r') as hf:
        embeddings_matrix = hf['embeddings'][:]

# A cached file built for a different node count cannot be used.
if embeddings_matrix.shape[0] != num_tweets:
    raise ValueError(f"Number of tweets in embeddings matrix ({embeddings_matrix.shape[0]}) does not match the number of tweets in adjacency matrix ({num_tweets}).")

node_features = torch.tensor(embeddings_matrix, dtype=torch.float32)
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_weight)


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network with a 64-unit hidden layer."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.conv1 = GCNConv(in_features, 64)
        self.conv2 = GCNConv(64, out_features)

    def forward(self, x, edge_index, edge_weight):
        """Apply conv1, a ReLU, then conv2, passing the edge weights to both."""
        hidden = self.conv1(x, edge_index, edge_weight)
        hidden = torch.relu(hidden)
        return self.conv2(hidden, edge_index, edge_weight)


# Run the GCN once over the propagation graph and save the node embeddings.
# NOTE(review): no training step for model_gcn appears in this file, so the
# embeddings below come from freshly initialised weights — confirm intended.
model_gcn = GCN(in_features=node_features.shape[1], out_features=128)


model_gcn.eval()
with torch.no_grad():
    node_embeddings_gcn = model_gcn(data.x, data.edge_index, data.edge_attr)

print("Output embeddings shape for GCN:", node_embeddings_gcn.shape)


# Same file that gcn_file_path points to in the configuration section.
node_embeddings_gcn_np = node_embeddings_gcn.cpu().numpy()
np.save(os.path.join(output_dir, 'node_embeddings_gcn16.npy'), node_embeddings_gcn_np)


class GAT(torch.nn.Module):
    """Two-layer graph attention network that also returns attention weights.

    This definition replaces the earlier class of the same name. The edge
    index and attention coefficients returned by the first layer are fed to
    the second layer in place of the caller's edges.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.conv1 = GATConv(in_features, 64, heads=2, concat=True)
        self.conv2 = GATConv(64 * 2, out_features, heads=1, concat=False)

    def forward(self, x, edge_index, edge_weight):
        """Return ``(node_embeddings, attention)`` from the first layer's attention."""
        hidden, attention_info = self.conv1(
            x, edge_index, edge_weight, return_attention_weights=True
        )
        attn_edge_index, attention = attention_info
        hidden = torch.relu(hidden)
        embeddings = self.conv2(hidden, attn_edge_index, attention)
        return embeddings, attention

# Run the GAT on top of the GCN embeddings and save its outputs.
# NOTE(review): as with the GCN above, no training step for model_gat is
# visible in this file.
model_gat = GAT(in_features=node_embeddings_gcn.shape[1], out_features=128)


model_gat.eval()
with torch.no_grad():
    node_embeddings_gat, att_weights = model_gat(node_embeddings_gcn, data.edge_index, data.edge_attr)

print("Output embeddings shape for GAT:", node_embeddings_gat.shape)


node_embeddings_gat_np = node_embeddings_gat.cpu().numpy()
np.save(os.path.join(output_dir, 'node_embeddings_gat16.npy'), node_embeddings_gat_np)

# Dense product A @ A.T: a square matrix with one row and column per row of
# att_weights.
# NOTE(review): att_weights presumably has one row per edge, which would make
# this product indexed by edge pairs (not nodes) and potentially far too
# large for memory — confirm what "attention graph" is meant to contain.
gat_attention_graph = np.dot(att_weights.detach().cpu().numpy(), att_weights.detach().cpu().numpy().T)

print("\nGAT Attention Graph:")
print(gat_attention_graph)


np.save(os.path.join(output_dir, 'gat_attention_graph16.npy'), gat_attention_graph)


def load_text_file(file_path):
    """Read a UTF-8 text file and return its lines with whitespace stripped.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one stripped string per line, or ``None`` when the file
        cannot be opened or decoded (the reason is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = [line.strip() for line in file]
    # Only I/O and decoding failures are treated as "could not load";
    # anything else is a programming error and should surface.
    except (OSError, UnicodeError) as e:
        print(f"Error loading text file {file_path}: {e}")
        return None
    print(f"Successfully loaded text file: {file_path}")
    return lines


def verify_and_load(file_path):
    """Load a ``.npy`` array, returning ``None`` (after printing why) on failure.

    NOTE(review): ``verify_and_load`` was called here but is not defined
    anywhere in the visible file; this minimal definition mirrors
    ``load_text_file``. Replace it if the original helper exists elsewhere.
    """
    try:
        array = np.load(file_path)
    except (OSError, ValueError) as e:
        print(f"Error loading array file {file_path}: {e}")
        return None
    print(f"Successfully loaded array file: {file_path}")
    return array


embeddings_gcn = verify_and_load(gcn_file_path)


labels = load_text_file(labels_file_path)
tweets = load_text_file(source_tweets_file)


# load_text_file also signals failure with None, so check all three inputs
# before indexing into any of them.
if embeddings_gcn is None or labels is None or tweets is None:
    raise RuntimeError("One or more embeddings could not be loaded. Check file paths and permissions.")

# Keep only the samples whose label (the text before the first ':') is one of
# the two target classes.
valid_labels = {"false", "true"}
label_names = [label.split(':')[0] for label in labels]
filtered_indices = [i for i, name in enumerate(label_names) if name in valid_labels]


# NOTE(review): this assumes row i of the GCN embeddings, line i of the label
# file and line i of the tweet file all describe the same tweet — confirm.
embeddings_gcn = embeddings_gcn[filtered_indices]
tweets = [tweets[i] for i in filtered_indices]


# Binary targets: "false" -> 0, "true" -> 1.
label_map = {"false": 0, "true": 1}
labels = np.array([label_map[label_names[i]] for i in filtered_indices])


def load_bert_embeddings(embeddings_dir, indices):
    """Load ``tweet_<i>_embeddings.npy`` for each index and stack the arrays.

    Args:
        embeddings_dir: Directory holding the per-tweet ``.npy`` files.
        indices: Iterable of tweet indices, in the order to load them.

    Returns:
        A NumPy array whose first axis follows *indices*.
    """
    file_paths = (
        os.path.join(embeddings_dir, f'tweet_{i}_embeddings.npy') for i in indices
    )
    return np.array([np.load(path) for path in file_paths])

# Per-tweet BERT embeddings for the samples that survived label filtering.
embeddings_bert = load_bert_embeddings(embeddings_dir, filtered_indices)


# Truncate both embedding sets and the labels to the shorter of the two, then
# flatten every sample to a single feature vector.
min_samples = min(embeddings_bert.shape[0], embeddings_gcn.shape[0])
labels = labels[:min_samples]
embeddings_bert = embeddings_bert[:min_samples].reshape(min_samples, -1)
embeddings_gcn = embeddings_gcn[:min_samples].reshape(min_samples, -1)


def normalize(tensor):
    """Standardise *tensor* to zero mean and unit standard deviation.

    The mean and standard deviation are taken over all elements. When the
    standard deviation is zero or undefined (constant, single-element or
    empty input) the division is skipped and the mean-centred tensor is
    returned, instead of filling the result with NaN/Inf.

    Args:
        tensor: A floating-point ``torch.Tensor``.

    Returns:
        A tensor of the same shape as *tensor*.
    """
    centered = tensor - tensor.mean()
    std = tensor.std()
    if not torch.isfinite(std) or std == 0:
        return centered
    return centered / std

# Convert the NumPy features to float32 tensors and standardise each set with
# its own global mean/std; labels become a float column of shape (n, 1).
embeddings_bert = normalize(torch.tensor(embeddings_bert, dtype=torch.float32))
embeddings_gcn = normalize(torch.tensor(embeddings_gcn, dtype=torch.float32))
labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)


def check_for_nan_inf(tensor, name):
    """Print a warning line for *name* if *tensor* holds NaN and/or Inf values."""
    has_nan = bool(torch.isnan(tensor).any())
    has_inf = bool(torch.isinf(tensor).any())
    if has_nan:
        print(f"{name} contains NaN values.")
    if has_inf:
        print(f"{name} contains Inf values.")

# Report (print only, no exception) any NaN/Inf left in the model inputs.
check_for_nan_inf(embeddings_bert, "embeddings_bert")
check_for_nan_inf(embeddings_gcn, "embeddings_gcn")
check_for_nan_inf(labels, "labels")


class InconsistencyModule(nn.Module):
    """Self-attention block followed by a position-wise feed-forward network.

    Both sub-layers are wrapped in dropout, a residual connection and a
    LayerNorm applied after the residual sum.

    Args:
        d_model: Feature width of the input and output.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability used in every dropout layer.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.self_attention = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        feed_forward_layers = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        ]
        self.ffn = nn.Sequential(*feed_forward_layers)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text_features):
        """Return features of the same shape as *text_features*."""
        # Attention sub-layer: query, key and value are all the input itself.
        attended, _ = self.self_attention(text_features, text_features, text_features)
        residual = self.norm1(text_features + self.dropout(attended))

        # Feed-forward sub-layer.
        transformed = self.dropout(self.ffn(residual))
        return self.norm2(residual + transformed)

class GATModel(nn.Module):
    """Two-layer graph attention encoder used inside FakeNewsDetectionModel.

    The first layer has two concatenated heads of width 64; the second has a
    single head producing ``out_features``.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.conv1 = GATConv(in_features, 64, heads=2, concat=True)
        self.conv2 = GATConv(64 * 2, out_features, heads=1, concat=False)

    def forward(self, x, edge_index, edge_weight):
        """Apply conv1, a ReLU, then conv2 over the same edges."""
        # NOTE(review): the layers are built without `edge_dim`; confirm that
        # edge_weight actually influences the attention scores.
        hidden = self.conv1(x, edge_index, edge_weight)
        hidden = torch.relu(hidden)
        return self.conv2(hidden, edge_index, edge_weight)

class FakeNewsDetectionModel(nn.Module):
    """Binary classifier that fuses three graph views of the input features.

    The inputs are projected to ``d_model``, encoded by one GAT per graph
    (dependency, PMI, GCN), summed, passed through an InconsistencyModule and
    classified with a sigmoid-terminated MLP.

    Args:
        input_dim: Width of the incoming feature vectors.
        d_model: Shared hidden width.
        n_heads: Attention heads in the InconsistencyModule.
        d_ff: Hidden width of the feed-forward and classifier layers.
        dropout: Dropout probability.
    """

    def __init__(self, input_dim, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.embedding_projection = nn.Linear(input_dim, d_model)
        self.inconsistency_module = InconsistencyModule(d_model, n_heads, d_ff, dropout)
        classifier_layers = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, 1),
            nn.Sigmoid(),
        ]
        self.classifier = nn.Sequential(*classifier_layers)
        self.gat_dep = GATModel(d_model, d_model)
        self.gat_pmi = GATModel(d_model, d_model)
        self.gat_gcn = GATModel(d_model, d_model)

    def forward(self, bert_embeddings, dep_graph, pmi_graph, gcn_graph):
        """Return one probability per input row, shape ``(rows, 1)``.

        Each graph argument must expose ``edge_index`` and ``edge_attr``.
        """
        # NOTE(review): every edge index must address a row of
        # bert_embeddings; the caller passes a train/test subset while the
        # graphs are built for the full sample count — confirm they line up.
        projected = self.embedding_projection(bert_embeddings)

        pmi_view = self.gat_pmi(projected, pmi_graph.edge_index, pmi_graph.edge_attr)
        dep_view = self.gat_dep(projected, dep_graph.edge_index, dep_graph.edge_attr)
        gcn_view = self.gat_gcn(projected, gcn_graph.edge_index, gcn_graph.edge_attr)

        fused = self.inconsistency_module(pmi_view + dep_view + gcn_view)
        return self.classifier(fused)



# Width of one flattened BERT feature vector; the first layer of every model
# built during the search consumes this many inputs.
input_dim = embeddings_bert.shape[1]

# A FakeNewsDetectionModel used to be instantiated here, before input_dim or
# any hyperparameter existed, which raised NameError when the script ran top
# to bottom. Models are created per configuration inside train_and_evaluate.

# Hyperparameter grid explored by the search below. Combinations whose
# d_model is not divisible by n_heads are skipped by the search loop.
d_model_values = [384, 512, 768, 1024]
n_heads_values = [2, 4, 8, 10, 12]
d_ff_values = [1024, 1536, 2048, 3072, 4096]
dropout_values = [0.2, 0.3, 0.4, 0.5, 0.6]
learning_rates = [1e-5, 3e-5, 5e-5, 7e-5]

def create_graph_data(matrix_path, num_nodes):
    """Load a sparse matrix from disk and convert it to a graph ``Data`` object.

    Args:
        matrix_path: Path of a ``.npz`` file written by ``scipy.sparse.save_npz``.
        num_nodes: Edges touching a row or column index >= ``num_nodes`` are
            dropped.

    Returns:
        A ``Data`` object with ``edge_index`` of shape ``(2, E)`` (long) and
        ``edge_attr`` of shape ``(E,)`` (float32), one entry per kept non-zero
        value.
    """
    coo = load_npz(matrix_path).tocoo()

    # Take coordinates and values from the same COO arrays so they stay
    # aligned. Pairing matrix.nonzero() with matrix.data breaks when the
    # matrix stores explicit zeros (for example a PMI of exactly 0):
    # nonzero() skips them while .data keeps them, so the lengths differ.
    keep = (coo.data != 0) & (coo.row < num_nodes) & (coo.col < num_nodes)

    edge_index = torch.tensor(np.vstack((coo.row[keep], coo.col[keep])), dtype=torch.long)
    edge_attr = torch.tensor(coo.data[keep], dtype=torch.float32)

    return Data(edge_index=edge_index, edge_attr=edge_attr)

# Build the three graphs handed to the model and move everything to the
# training device.
num_nodes = embeddings_bert.shape[0]

# NOTE(review): the dependency and PMI matrices built earlier in this file
# are indexed by tokenizer vocabulary id, while num_nodes is the number of
# samples — confirm that treating those indices as sample indices is intended.
dep_graph = create_graph_data(dependency_matrix_path, num_nodes)
pmi_graph = create_graph_data(pmi_matrix_path, num_nodes)


# The "GCN" graph contains only one self-loop per node, each with weight 1.
gcn_edge_index = torch.tensor([[i, i] for i in range(num_nodes)], dtype=torch.long).t()
gcn_edge_attr = torch.ones(num_nodes, dtype=torch.float32)
gcn_graph = Data(edge_index=gcn_edge_index, edge_attr=gcn_edge_attr)


device = torch.device('cpu')
embeddings_bert = embeddings_bert.to(device)
labels = labels.to(device)
dep_graph = dep_graph.to(device)
pmi_graph = pmi_graph.to(device)
gcn_graph = gcn_graph.to(device)


def train_and_evaluate(d_model, n_heads, d_ff, dropout, lr):
    """Train one FakeNewsDetectionModel configuration and collect its metrics.

    The module-level ``embeddings_bert`` / ``labels`` are split 80/20 with a
    fixed seed. The model is trained full-batch with Adam and BCE loss for up
    to 100 epochs, stopping early after 5 epochs without improvement of the
    held-out loss. The weights with the lowest held-out loss are restored
    before the final evaluation.

    NOTE(review): the same held-out split drives early stopping and the
    reported accuracy, so the final numbers are optimistically biased.

    Args:
        d_model: Hidden width of the model.
        n_heads: Number of attention heads (must divide ``d_model``).
        d_ff: Feed-forward / classifier hidden width.
        dropout: Dropout probability.
        lr: Adam learning rate.

    Returns:
        A dict with the hyperparameters, ``final_accuracy``, the
        ``classification_report`` dict, and the per-epoch ``train_losses``,
        ``test_losses``, ``train_accuracies`` and ``test_accuracies`` lists.
    """
    model = FakeNewsDetectionModel(input_dim, d_model, n_heads, d_ff, dropout).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    X_train, X_test, y_train, y_test = train_test_split(embeddings_bert, labels, test_size=0.2, random_state=42)

    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []

    early_stopping_patience = 5
    best_test_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    num_epochs = 100
    for epoch in range(num_epochs):
        # ---- one full-batch optimisation step ----
        model.train()
        optimizer.zero_grad()

        outputs = model(X_train, dep_graph, pmi_graph, gcn_graph)
        train_loss = criterion(outputs, y_train)
        train_losses.append(train_loss.item())

        train_predictions = (outputs > 0.5).float()
        train_accuracy = (train_predictions == y_train).float().mean()
        train_accuracies.append(train_accuracy.item())

        train_loss.backward()
        optimizer.step()

        # ---- held-out evaluation ----
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test, dep_graph, pmi_graph, gcn_graph)
            test_loss = criterion(test_outputs, y_test)
            test_losses.append(test_loss.item())

            test_predictions = (test_outputs > 0.5).float()
            test_accuracy = (test_predictions == y_test).float().mean()
            test_accuracies.append(test_accuracy.item())

        if test_loss.item() < best_test_loss:
            best_test_loss = test_loss.item()
            # state_dict() returns references to the live parameters, so the
            # snapshot must be cloned; otherwise later optimiser steps keep
            # mutating the "best" weights and the restore below is a no-op.
            best_model_state = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                break

    # No snapshot exists if the loss was never finite (e.g. NaN from the
    # first epoch); in that case keep the current weights.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    model.eval()
    with torch.no_grad():
        final_outputs = model(X_test, dep_graph, pmi_graph, gcn_graph)
        final_predictions = (final_outputs > 0.5).float()
        final_accuracy = (final_predictions == y_test).float().mean()

    return {
        'd_model': d_model,
        'n_heads': n_heads,
        'd_ff': d_ff,
        'dropout': dropout,
        'learning_rate': lr,
        'final_accuracy': final_accuracy.item(),
        # labels=[0, 1] keeps the report well-formed (and target_names valid)
        # even when only one class appears in the split or the predictions.
        'classification_report': classification_report(
            y_test.numpy(),
            final_predictions.numpy(),
            labels=[0, 1],
            target_names=['False', 'True'],
            output_dict=True,
        ),
        'train_losses': train_losses,
        'test_losses': test_losses,
        'train_accuracies': train_accuracies,
        'test_accuracies': test_accuracies
    }


# Full Cartesian product of the hyperparameter grid.
search_space = itertools.product(
    d_model_values, n_heads_values, d_ff_values, dropout_values, learning_rates
)
parameter_combinations = list(search_space)


# Train one model per valid combination and collect the metric dicts.
results = []
for params in parameter_combinations:
    d_model, n_heads, d_ff, dropout, lr = params
    # Multi-head attention needs the model width to split evenly across heads.
    heads_divide_width = d_model % n_heads == 0
    if heads_divide_width:
        print(f"Running experiment with d_model={d_model}, n_heads={n_heads}, d_ff={d_ff}, dropout={dropout}, lr={lr}")
        result = train_and_evaluate(d_model, n_heads, d_ff, dropout, lr)
        results.append(result)


# Persist one row per experiment.
results_df = pd.DataFrame(results)
results_df.to_excel('model_experiment_results_with_epochs2.xlsx', index=False)
print("Results saved to model_experiment_results_with_epochs2.xlsx")


def _as_series(values):
    """Return *values* as a plottable sequence of numbers.

    A list stored in a spreadsheet cell comes back as its text form (for
    example ``"[0.69, 0.65]"``); such strings are parsed into floats. Any
    other input is returned unchanged.
    """
    if isinstance(values, str):
        inner = values.strip().strip('[]')
        return [float(token) for token in inner.split(',') if token.strip()]
    return values


def _plot_metric(train_values, test_values, metric):
    """Draw one training-versus-testing curve figure for *metric*."""
    plt.figure(figsize=(10, 6))
    plt.plot(train_values, label=f'Training {metric}')
    plt.plot(test_values, label=f'Testing {metric}')
    plt.title(f'Training and Testing {metric} over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend()
    plt.show()


def plot_results(result):
    """Plot the loss and accuracy curves of one experiment.

    Args:
        result: Mapping (dict or DataFrame row) with the keys
            ``train_losses``, ``test_losses``, ``train_accuracies`` and
            ``test_accuracies``. Each value may be a sequence of numbers or
            the string form of such a list, as produced when the results are
            written to and read back from Excel.
    """
    _plot_metric(
        _as_series(result['train_losses']),
        _as_series(result['test_losses']),
        'Loss',
    )
    _plot_metric(
        _as_series(result['train_accuracies']),
        _as_series(result['test_accuracies']),
        'Accuracy',
    )


# Reload the saved experiment table and plot the curves of its first row.
# NOTE(review): list-valued columns presumably come back from Excel as
# strings rather than lists — confirm plot_results copes with that.
results_df = pd.read_excel('model_experiment_results_with_epochs2.xlsx')


example_result = results_df.iloc[0]
plot_results(example_result)

