- Preprocess
- Get BERT embeddings for sentences and words
- Make all sentences equal length
- Make all articles equal length
- Make a graph taking sentences as rows and words and labels as columns
- Feed the graph into a graph attention model for sentence classification

In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords


import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel


from IPython.display import clear_output # to clear the large outputs

In [None]:
# Initialize NLTK: download the 'punkt' resource.
# NOTE(review): presumably required by nltk.sent_tokenize / nltk.word_tokenize
# used in later cells — confirm the resource name for the installed NLTK version.
nltk.download('punkt')
# Clear the cell output so the download log is not kept in the notebook.
clear_output()

In [None]:
# Load the first 1000 rows of the training CSV, drop every row that has a
# missing value, and renumber the surviving rows 0..n-1.
# `reset_index(drop=True)` discards the old index directly instead of first
# materialising it as an 'index' column and then dropping that column, so a
# genuine CSV column named 'index' would no longer be removed by accident.
# Alternative input kept for reference:
# df = pd.read_csv("../EnglishNews_train.csv", encoding="utf-8", nrows=10)
df = (
    pd.read_csv("./newEnglishNews_train.csv", encoding="utf-8", nrows=1000)
    .dropna()
    .reset_index(drop=True)
)
# Show the last rows as a quick sanity check of the load.
df.tail()

In [None]:
# Display the shape of the DataFrame that remains after dropping incomplete rows.
df.shape

# This is for one article

# Create vocabulary

In [None]:
# Column holding the article texts.
articles = df["Article"]
# articles.head()
# Only the article with index label 0 is processed in the cells below.
article = articles[0]

In [None]:
# Column holding the summary texts.
all_summary = df["Summary"]
# all_summary.head()
# Summary with index label 0, i.e. the one paired with `article`.
summary = all_summary[0]

In [None]:
# Download NLTK's stopword corpus.
nltk.download('stopwords')

# Keep the English stopwords in a set; preprocess() tests membership against it.
stop_words = set(stopwords.words('english'))
# Clear the cell output so the download log is not kept in the notebook.
clear_output()

# Preprocess the text
def preprocess(text):
    """Reduce a piece of text to a space-separated string of content words.

    The text is tokenized with NLTK, lower-cased, and has punctuation, tabs
    and newlines replaced by spaces. Only tokens that are not in
    ``stop_words``, are purely alphabetic, and are longer than two characters
    are kept.

    Args:
        text: Raw text, e.g. a single sentence.

    Returns:
        The surviving tokens joined by single spaces (an empty string when no
        token survives).
    """
    # Tokenize, re-join with single spaces, then lower-case.
    normalized = ' '.join(nltk.word_tokenize(text)).lower()

    # Map every punctuation character, tab and newline to a space.
    filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    to_space = str.maketrans(filters, ' ' * len(filters))
    normalized = normalized.translate(to_space)

    # One pass applying all three filters: stopword, alphabetic-only, and
    # minimum length of three characters.
    kept = [
        token
        for token in normalized.split()
        if token not in stop_words and token.isalpha() and len(token) > 2
    ]
    return ' '.join(kept)

In [None]:
# Split the article into sentences, clean each sentence with preprocess(),
# then split every cleaned sentence into its list of words.
sentences = nltk.sent_tokenize(article)
# Variant that also appends the summary's sentences:
# sentences = nltk.sent_tokenize(article) + nltk.sent_tokenize(summary)
preprocessed_sentences = list(map(preprocess, sentences))
word_tokenized_sentences = list(map(nltk.word_tokenize, preprocessed_sentences))

In [None]:
# Inspect the raw sentences of the article ...
print(sentences)
# ... and the cleaned word list of the first sentence.
print(word_tokenized_sentences[0])
# nltk.word_tokenize(sentences[0])

## Get total frequency and IDF (based on the number of sentences containing each word)

- We can get the TF value for each word in each sentence separately
- Get the IDF value for the complete article at once

In [None]:
from collections import Counter

# Term statistics for the article, treating each sentence as one "document":
#   words_frequency                 - total occurrences of each word in the article
#   total_sentences_containing_word - number of sentences in which each word appears
#   words_idf                       - log(number of sentences / sentences containing the word)
words_frequency = Counter()
total_sentences_containing_word = Counter()

for sentence in preprocessed_sentences:
    # Tokenize the sentence into words
    words = nltk.word_tokenize(sentence)
    words_frequency.update(words)
    # set() so a word repeated inside one sentence counts that sentence only once
    total_sentences_containing_word.update(set(words))

# IDF is based on the sentence count (document frequency), not on the raw term
# count: a word can occur more often than there are sentences, which would
# push log(N / count) below zero, whereas the sentence count never exceeds N.
num_sentences = len(preprocessed_sentences)
words_idf = {
    word: np.log(num_sentences / sentence_count)
    for word, sentence_count in total_sentences_containing_word.items()
}

# Generate Word and Sentence Embeddings

In [None]:
# Define a BERT model and tokenizer (replace with the specific BERT model you are using)
model_name = "bert-base-uncased"  # Example: You can use a different pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)
clear_output()

In [None]:
# Initialize lists to store sentence and word embeddings
sentence_embeddings = []
word_embeddings = []

# Store the tokenized input IDs, attention masks and token type IDs
input_ids = []
attention_masks = []

# Attention should be done as TF-IDF values are calculated for each word in the sentence


# Loop through sentences and tokenize words using NLTK
for sentence in preprocessed_sentences:
    # Tokenize the sentence into words
    words = nltk.word_tokenize(sentence)

    # Convert words to BERT tokens
    tokens = [tokenizer.cls_token] + words + [tokenizer.sep_token]

    # Convert tokens to input IDs
    _input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Create attention masks by tf-idf values (freq of word in sentence / total sentences containing word)
    # attention_mask = [1] * len(_input_ids)
    words_tf = {}
    for word in words:
        if word not in words_tf.keys():
            words_tf[word] = 1/words_frequency[word]
        else:
            words_tf[word] += 1/words_frequency[word]
    
    attention_mask = [0] + [words_tf[word] * words_idf[word] for word in words] + [0]

    # Create an input dictionary in the expected format
    input_dict = {
        'input_ids': tf.constant([_input_ids]),
        'attention_mask': tf.constant([attention_mask]),
    }

    # Get BERT model output
    with tf.device('/GPU:0'):
        output = model(input_dict)

    # Extract sentence and word embeddings
    sentence_embedding = tf.reduce_mean(output.last_hidden_state, axis=1).numpy()  # Sentence embedding
    word_embedding = output.last_hidden_state.numpy()  # Word embeddings

    # Append to lists
    sentence_embeddings.append(sentence_embedding.reshape(768, ))
    word_embeddings.append(word_embedding.reshape(-1, 768))

    # Append to lists Attention masks and input IDs
    input_ids.append(tf.constant([_input_ids]).numpy().reshape(-1))
    attention_masks.append(tf.constant([attention_mask]).numpy().reshape(-1))

In [None]:
sentence_embeddings[0].shape, word_embeddings[0].shape, input_ids[0].shape, attention_masks[0].shape

# BERT Graph Initializers Phase

## Graph Attention Layer

In [None]:
from keras.layers import Input, Dense, Dropout, Activation, Multiply, Concatenate, RepeatVector, Permute, Flatten
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K

def GraphAttentionLayer(inputs, attention_head, activation='relu'):
    """Apply one softmax-weighted attention step to a Keras tensor.

    Args:
        inputs: Keras tensor the attention is applied to.
        attention_head: Width of the intermediate linear projection; also the
            number of times the attention weights are repeated before being
            multiplied with ``inputs``.
        activation: Activation applied to the aggregated output.

    Returns:
        The weighted inputs summed over axis -2, passed through
        ``activation`` and a dropout of 0.5.

    NOTE(review): the ranks expected by RepeatVector / Permute / Multiply are
    not validated here — confirm they line up with the shape of ``inputs``.
    """
    # Linear projection, then a single-unit scoring layer on top of it.
    projected = Dense(attention_head, activation=None)(inputs)
    scores = Dense(1, activation=None)(projected)
    scores = Flatten()(scores)
    weights = Activation('softmax')(scores)

    # Repeat the weights `attention_head` times, swap the two non-batch axes,
    # and scale the inputs element-wise.
    tiled = RepeatVector(attention_head)(weights)
    tiled = Permute([2, 1])(tiled)
    weighted = Multiply()([inputs, tiled])

    # Collapse the second-to-last axis by summation.
    aggregated = K.sum(weighted, axis=-2)

    activated = Activation(activation)(aggregated)
    return Dropout(0.5)(activated)

def build_gat_model(input_dim, hidden_dim, output_dim, num_heads, num_nodes):
    """Assemble the (uncompiled) multi-head attention classifier.

    Args:
        input_dim: Size of the input feature vector.
        hidden_dim: Units of the fully connected layer after the heads.
        output_dim: Number of outputs per node.
        num_heads: Number of GraphAttentionLayer branches; the same value is
            also passed to each branch as its ``attention_head`` argument.
        num_nodes: Number of nodes per graph.

    Returns:
        A Keras ``Model`` mapping an ``(input_dim,)`` input to a softmax
        vector with ``num_nodes * output_dim`` entries.
    """
    node_features = Input(shape=(input_dim,))

    # One attention branch per head, all reading the same input.
    branches = [
        GraphAttentionLayer(node_features, attention_head=num_heads)
        for _ in range(num_heads)
    ]
    merged = Concatenate()(branches)

    # Fully connected block ahead of the prediction layer.
    hidden = Dense(hidden_dim, activation='relu')(merged)
    hidden = Dropout(0.5)(hidden)

    # A single softmax spans the outputs of all nodes. (A separate softmax
    # Dense layer per node was the alternative that is not used here.)
    predictions = Dense(num_nodes * output_dim, activation='softmax')(hidden)

    return Model(inputs=node_features, outputs=predictions)


In [None]:

# Hyper-parameters passed to build_gat_model().
# NOTE(review): max_sentence_len and max_article_len are not defined in the
# visible part of this notebook — they must be set before this cell runs.
input_dim = max_sentence_len # node features
hidden_dim = 64 
output_dim = 1 # Define your output dimension (number of classes)
num_heads = 4  # Number of attention heads
num_nodes = max_article_len # Number of nodes in each graph (number of sentences)

In [None]:
# Build the model
gat_model = build_gat_model(input_dim, hidden_dim, output_dim, num_heads, num_nodes)

# Compile the model
gat_model.compile(optimizer=Adam(lr=0.005), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model using your training data
# gat_model.fit(train_data, [train_labels]*num_nodes, epochs=num_epochs, batch_size=batch_size, validation_split=0.1)

## Evaluate the model

In [None]:
# Make predictions on new graphs using the trained model
# new_graph_predictions = gat_model.predict(new_graph_data)