In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords


import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel


from IPython.display import clear_output # to clear the large outputs

In [None]:
# Initialize NLTK
# Fetch the 'punkt' data package, then wipe this cell's output so the
# download log is not kept in the notebook.
nltk.download('punkt')
clear_output()

In [None]:
# Load the first 1000 rows of the training set and discard rows with any
# missing value. reset_index(drop=True) renumbers the surviving rows 0..n-1
# without first materialising the old index as an 'index' column, so it also
# works when the CSV itself contains a column called 'index'.
# Later cells read the first row with the label 0, which this renumbering guarantees.
df = (
    pd.read_csv("./newEnglishNews_train.csv", encoding="utf-8", nrows=1000)
    .dropna()
    .reset_index(drop=True)
)
df.tail()

In [None]:
# Show (rows, columns) of the loaded frame after the NaN rows were dropped.
df.shape

# This is for one article

# Create vocabulary

In [None]:
# Take the "Article" column and work with its first entry only
# (the rest of this notebook processes a single article).
articles = df["Article"]
# articles.head()
article = articles[0]

In [None]:
# Take the "Summary" column and keep its first entry, i.e. the one on the
# same row as `article`.
all_summary = df["Summary"]
# all_summary.head()
summary = all_summary[0]

In [None]:
# Fetch the NLTK stopword corpus and build a set of English stopwords
# (a set so that the `word not in stop_words` test in preprocess() is cheap),
# then clear the download log from the cell output.
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
clear_output()

# Preprocess the text
def preprocess(text):
    """Normalise a piece of text into a space-separated string of content words.

    Steps, in order:
      1. tokenise with NLTK and re-join the tokens with single spaces;
      2. lowercase;
      3. turn every character listed in ``filters`` (punctuation, tab,
         newline) into a space;
      4. keep only the words that are not in the module-level ``stop_words``
         set, are purely alphabetic, and are longer than two characters.

    Parameters
    ----------
    text : str
        Raw text, e.g. one sentence.

    Returns
    -------
    str
        The surviving words joined by single spaces ('' if none survive).
    """
    lowered = ' '.join(nltk.word_tokenize(text)).lower()

    # Replace the newlines and punctuations with space
    filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    to_space = str.maketrans(dict.fromkeys(filters, ' '))
    candidates = lowered.translate(to_space).split()

    # One pass applies all three word-level filters; note that the length
    # test drops one- AND two-character words.
    kept = [
        word
        for word in candidates
        if word not in stop_words and word.isalpha() and len(word) > 2
    ]
    return ' '.join(kept)

In [None]:
# Article -> raw sentences -> cleaned sentence strings -> lists of word tokens.
sentences = nltk.sent_tokenize(article)
# sentences = nltk.sent_tokenize(article) + nltk.sent_tokenize(summary)
preprocessed_sentences = list(map(preprocess, sentences))
word_tokenized_sentences = list(map(nltk.word_tokenize, preprocessed_sentences))

In [None]:
# Sanity check: the raw sentences of the article, followed by the cleaned
# word tokens of its first sentence.
print(sentences)
print(word_tokenized_sentences[0])
# nltk.word_tokenize(sentences[0])

# Generate Word and Sentence Embeddings

In [None]:
# Define a BERT model and tokenizer (replace with the specific BERT model you are using)
model_name = "bert-base-uncased"  # Example: You can use a different pretrained model
# Load the matching tokenizer and the TensorFlow model for `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)
# Drop the loading output so it is not kept in the notebook.
clear_output()

In [None]:
# Initialize lists to store sentence and word embeddings
sentence_embeddings = []
word_embeddings = []

# Encode every cleaned sentence with BERT
for sentence in preprocessed_sentences:
    # Let the BERT tokenizer split the text itself. Feeding whole NLTK words
    # to convert_tokens_to_ids maps every word that is not a vocabulary entry
    # to [UNK]; the tokenizer instead breaks such words into WordPiece
    # sub-tokens, adds [CLS]/[SEP], builds the attention mask and truncates
    # inputs that exceed the model's maximum length.
    encoded = tokenizer(sentence, return_tensors='tf', truncation=True)

    # Get BERT model output
    with tf.device('/GPU:0'):
        output = model(encoded)

    # last_hidden_state holds one vector per input token (including the
    # special tokens) behind a leading batch axis of size 1.
    token_states = output.last_hidden_state

    # Sentence embedding: mean over the token axis.
    sentence_embedding = tf.reduce_mean(token_states, axis=1).numpy()
    # Token-level embeddings: one row per WordPiece token, not per NLTK word.
    word_embedding = token_states.numpy()

    # Append to lists
    sentence_embeddings.append(sentence_embedding)
    word_embeddings.append(word_embedding)

## Generate sentence embeddings using CNN and BiLSTM

In [None]:
def get_max_length(sentences):
    """Return the length of the longest element of ``sentences``.

    Each element only needs to support ``len()``. An empty collection
    yields 0.
    """
    return max((len(item) for item in sentences), default=0)

In [None]:
def get_word_embeddings(word, vectors=None, dim=300):
    """Look up the embedding vector of ``word``.

    Parameters
    ----------
    word : str
        The token to look up.
    vectors : mapping, optional
        Anything indexable by word that raises ``KeyError`` for unknown
        words. Defaults to ``model.wv`` of the module-level ``model``.
    dim : int, optional
        Length of the all-zero vector returned for unknown words (300 by
        default, as before).

    Returns
    -------
    numpy.ndarray
        ``vectors[word]``, or ``np.zeros(dim)`` when the word is unknown.

    Notes
    -----
    Only ``KeyError`` is treated as "unknown word". The previous bare
    ``except`` also swallowed every other failure, so a ``model`` without a
    ``.wv`` attribute silently produced zeros for *every* word.
    """
    if vectors is None:
        # NOTE(review): the `model` created in this notebook is a Hugging Face
        # TFAutoModel; `.wv` is the word-vector attribute of a gensim
        # Word2Vec model. Confirm which model is meant to back this lookup;
        # with the wrong one this line now raises instead of returning zeros.
        vectors = model.wv
    try:
        return vectors[word]
    except KeyError:
        return np.zeros(dim)

In [None]:
# Equal length padding for all sentences
# Work on the word lists, not on the raw sentence strings: a string cannot be
# extended with a list of '<PAD>' tokens (TypeError) and its len() counts
# characters rather than words.
max_sentence_length = get_max_length(word_tokenized_sentences)

# From here on `sentences` holds one equally long list of words per sentence,
# which is what the embedding cells below iterate over. Building it from
# word_tokenized_sentences keeps this cell safe to re-run.
sentences = [
    words + ['<PAD>'] * (max_sentence_length - len(words))
    for words in word_tokenized_sentences
]

In [None]:
# from keras.layers import Input, Conv1D, MaxPooling1D
# sentence_input = Input(shape=(max_sentence_length, embedding_dim)) # Total Words in a sentence * Embedding dimension
# cnn_layer = Conv1D(filters=32, kernel_size=3, activation='relu')(sentence_input)
# cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)


# from keras.layers import Bidirectional, LSTM
# lstm_input = Input(shape=(len(sentences), max_sentence_length, 32)) # Total Sentences * Total Words in a sentence * Embedding dimension
# lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(lstm_input)

# from keras.layers import GlobalMaxPooling1D
# pooled_features = GlobalMaxPooling1D()(lstm_layer)

In [None]:
from keras.layers import Input, Conv1D, Bidirectional, LSTM, Dense, RepeatVector
from keras.models import Model

# Length of one word vector. It was used below without ever being assigned;
# 300 matches the zero vector get_word_embeddings returns for unknown words.
embedding_dim = 300

# Encoder
# Input: one sentence as (words per sentence, embedding dimension).
input_article = Input(shape=(max_sentence_length, embedding_dim)) # Total Words in a sentence * Embedding dimension
# Local features: 32 filters over windows of 3 words; padding='same' keeps
# the sequence length unchanged.
cnn_features = Conv1D(filters=32, kernel_size=3, activation='relu', padding='same')(input_article)
# Contextual features: a bidirectional LSTM returning one output per word.
lstm_features = Bidirectional(LSTM(64, return_sequences=True))(cnn_features)

# Decoder
decoded = LSTM(64, return_sequences=True)(lstm_features)
# NOTE(review): sigmoid limits every reconstructed value to (0, 1). Confirm
# the input vectors lie in that range; otherwise the mean-squared-error
# reconstruction cannot reach the targets and a linear output is needed.
decoded = Dense(embedding_dim, activation='sigmoid')(decoded)

# Autoencoder Model
autoencoder = Model(input_article, decoded)

In [None]:
# Build the training array: one vector (via get_word_embeddings) per word of
# every entry in `sentences`.
# NOTE(review): this only becomes a regular 3-D array if every sentence has
# the same number of words and every vector the same length - confirm that
# the padding step ran on word lists before this cell.
X_train = np.array([np.array([get_word_embeddings(word) for word in sentence]) for sentence in sentences])
X_train.shape

In [None]:
# Compile and train the autoencoder with articles as both input and target
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(X_train, X_train, epochs=30)

In [None]:
# Feature Extraction Model
# Second model over the same input tensor and layers as the autoencoder,
# exposing the Conv1D output and the BiLSTM output as its two results.
encoder = Model(input_article, [cnn_features, lstm_features])

In [None]:
# Run the encoder on the first training sample; the reshape adds the batch
# axis of size 1 in front of the (words, embedding) matrix.
local_features, contextual_features = encoder.predict(X_train[0].reshape(1, max_sentence_length, embedding_dim))
local_features.shape, contextual_features.shape

In [None]:
# Drop the batch axis from both encoder outputs (32 = Conv1D filters,
# 128 = 2 x 64 units of the bidirectional LSTM), then place them side by side
# to check the width of the combined per-word feature matrix.
local_features = local_features.reshape(max_sentence_length, 32)
contextual_features = contextual_features.reshape(max_sentence_length, 128)
np.hstack([local_features, contextual_features]).shape

In [None]:
# Generate embedding for each sentence in the article
def generate_sentence_embedding(sentence): # sentence is a list of words
    """Compute encoder features for one padded sentence.

    Parameters
    ----------
    sentence : list of str
        The words of the sentence; it must contain ``max_sentence_length``
        entries (padding included) for the reshape below to succeed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_sentence_length, 160)``: for every word, the 32
        Conv1D features followed by the 128 BiLSTM features.
    """
    # The list of per-word vectors must become an ndarray before it can be
    # reshaped; calling .reshape on the plain list raised AttributeError.
    word_vectors = np.array([get_word_embeddings(word) for word in sentence])
    # Add the batch axis of size 1 that the encoder expects.
    batch = word_vectors.reshape(1, max_sentence_length, embedding_dim)

    local_features, contextual_features = encoder.predict(batch)
    # Drop the batch axis again.
    local_features = np.reshape(local_features, (max_sentence_length, 32))
    contextual_features = np.reshape(contextual_features, (max_sentence_length, 128))

    sentence_embedding = np.concatenate((local_features, contextual_features), axis=1)
    return sentence_embedding

In [None]:
# Compute the features of every sentence and keep all of them; previously
# each iteration overwrote the result, so only the last sentence survived.
article_sentence_embeddings = []
for sentence in sentences:
    # `sentence_embedding` stays bound to the most recent result, as before.
    sentence_embedding = generate_sentence_embedding(sentence)
    article_sentence_embeddings.append(sentence_embedding)
    # Discard the per-call progress output.
    clear_output()

In [None]:
# Shape of the feature matrix of the last sentence processed by the loop.
sentence_embedding.shape

## Create a file of embedding values as features for each sentence

In [None]:
# allData.to_csv('./features/embeddings_using_word2vec.csv', index=False)