In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets

# Load the labelled frame of every category and index the frames by category name.
# Only the columns used by the rest of the notebook are kept (as an independent copy).
keys = ['business','entertainment','politics','sport','tech']
wanted_columns = ['Text', 'Summary', 'text_clean', "summary_clean", "labels", "labels_idx_list"]

dataset = {
    category: pd.read_pickle("../input/bbcwithlabel/train_df_label_"+category+".pickle" )[wanted_columns].copy()
    for category in keys
}

dataset["business"].head()

In [None]:
# Load the single dataframe that holds all articles, not divided by category.
all_docs_path = "../input/train-df-not-shuffle/train_df_not_shuffle.pickle"
df_all_doc = pd.read_pickle(all_docs_path)
df_all_doc.tail()

In [None]:
# Collect the raw text of every article as an array; the vocabulary is built from it below.
X_all_doc = np.array(df_all_doc["Text"])
len(X_all_doc)

In [None]:
import tensorflow as tf
import keras

# text_vectorizer is a layer that turns each input string into a vector of integer token ids.
# output_sequence_length=500 fixes the length of every output vector to 500 elements.
text_vectorizer = keras.layers.TextVectorization(
    max_tokens=34500,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
    output_sequence_length=500,
)

# Fit the layer on the text of all articles so that it learns the dataset's vocabulary.
text_vectorizer.adapt(X_all_doc, batch_size=2225)

# get_vocabulary() returns the learned tokens as a sequence (it is sliced below);
# show the first ten entries and the total vocabulary size.
vocab = text_vectorizer.get_vocabulary()
print("Vocab : {}".format(vocab[:10]))
print("Vocab Size : {}".format(text_vectorizer.vocabulary_size()))


In [None]:
keys = ['business','entertainment','politics','sport','tech']

# Vectorize the "text_clean" entry of every document in every category.
# Each document becomes one integer array (one vectorized row per element of "text_clean"),
# stored in a new "text_embedding" column of that category's dataframe.
for key in keys:
    df_category = dataset[key]
    df_category["text_embedding"] = [
        np.array(text_vectorizer(document_sentences))
        for document_sentences in df_category["text_clean"]
    ]
        
#dataset["sport"].head()

In [None]:
from sklearn.model_selection import train_test_split
# Split every category into a train set and a test set with a 9:1 ratio and store
# the two parts in a dictionary keyed by category.
# A fixed random_state makes the split reproducible, so the test set saved below
# stays comparable across runs of the notebook.
SPLIT_SEED = 42

train_test_sets = {}
for category in ["sport", "business", "entertainment", "tech", "politics"]:
    train_part, test_part = train_test_split(
        dataset[category], test_size=0.1, random_state=SPLIT_SEED
    )
    train_test_sets[category] = {"train": train_part, "test": test_part}

In [None]:
import pickle
# Write the train/test splits to disk so they can be reused for later comparisons.
test_data_file = 'test_data.pickle'
with open(test_data_file, mode='wb') as split_file:
    pickle.dump(train_test_sets, split_file)

In [None]:
import gc
# Run a full garbage-collection pass. As the last expression of the cell, the value
# returned by gc.collect() is displayed as the cell output.
gc.collect()

In [None]:
!pip install rouge-score
import numpy as np
import pandas as pd


import re
import string
import csv
import os
from keras.models import Sequential
import torch
from tensorflow.keras import optimizers, utils
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding,Dropout, Concatenate, TimeDistributed, Bidirectional, GRU, BatchNormalization, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils.vis_utils import plot_model
from rouge_score import rouge_scorer
from tensorflow.keras import backend as K

In [None]:
from rouge_score import rouge_scorer

#Calculate mean precision, recall and F1 of ROUGE for evaluation.
def calc_rouge_scores(pred_summaries, gold_summaries, 
                                 keys=['rouge1', 'rougeL'], use_stemmer=True):
    """Average ROUGE precision/recall/F1 over paired predicted and gold summaries.

    Args:
        pred_summaries: sequence of predicted summary strings.
        gold_summaries: sequence of reference summary strings, aligned with
            ``pred_summaries`` position by position.
        keys: ROUGE types to compute (passed to ``RougeScorer``). Not modified.
        use_stemmer: forwarded to ``RougeScorer``.

    Returns:
        dict mapping every ROUGE type to
        ``{'recall': ..., 'precision': ..., 'f1': ...}``, each the mean over all pairs.

    Raises:
        ValueError: if the two sequences do not have the same length.
    """
    if len(pred_summaries) != len(gold_summaries):
        raise ValueError(
            "pred_summaries and gold_summaries must have the same length, got {} and {}".format(
                len(pred_summaries), len(gold_summaries)))

    rouge_types = list(keys)
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=use_stemmer)

    # RougeScorer.score expects (target, prediction). Passing the prediction first
    # would swap precision and recall (F1 is symmetric and would hide the mistake).
    scores = [scorer.score(gold, pred)
              for pred, gold in zip(pred_summaries, gold_summaries)]

    dict_scores = {}
    for key in rouge_types:
        # Mean of each measure over every scored pair.
        dict_scores[key] = {
            'recall': np.mean([score[key].recall for score in scores]),
            'precision': np.mean([score[key].precision for score in scores]),
            'f1': np.mean([score[key].fmeasure for score in scores]),
        }

    return dict_scores

In [None]:
# Standardizing every text_embedding to a matrix with shape
# (max_number_sentence, sentence_length), 246 x 500 by default.
# We use padding of 0 to do this.
def padding_sentence(X, Y, max_number_sentence=246, sentence_length=500):
    """Zero-pad every document in ``X`` (and its labels in ``Y``) to a fixed row count.

    ``X`` and ``Y`` are modified in place and also returned.

    Args:
        X: indexable collection of 2-D arrays, one per document, each with
            ``sentence_length`` columns.
        Y: indexable collection of 1-D label sequences aligned with ``X``.
        max_number_sentence: number of rows every document is padded up to.
        sentence_length: number of columns of the zero rows that are appended.

    Returns:
        The tuple ``(X, Y)``. Documents that already have at least
        ``max_number_sentence`` rows are left untouched (they are not truncated).
    """
    for i in range(len(X)):
        missing = max_number_sentence - len(X[i])
        if missing <= 0:
            continue
        # Append all missing rows/labels in one call instead of one per iteration,
        # which would copy the whole array every time.
        X[i] = np.append(X[i], np.zeros((missing, sentence_length)), axis=0)
        Y[i] = np.append(Y[i], [0] * missing, axis=0)

    return X, Y

In [None]:
#Dividing the dataset of every task into batches.
def get_batch(tasks_key, batch_size, number_of_shot = None):
    """Build padded training batches and a padded test set for every task.

    Data is read from the module-level ``train_test_sets`` dictionary and padded
    with ``padding_sentence``.

    Args:
        tasks_key: iterable of category names to process.
        batch_size: number of training documents per batch (the last batch of a
            task may be smaller).
        number_of_shot: if truthy, only the first ``number_of_shot`` training
            documents are used (few-shot scenario); otherwise the full train set.

    Returns:
        ``(batch_sets, test_sets)``: ``batch_sets[key]`` is a list of
        ``{"X_train", "y_train"}`` dicts, one per batch, and ``test_sets[key]`` is
        a ``{"X_test", "y_test"}`` dict.
    """
    batch_sets, test_sets = {}, {}

    for key in tasks_key:
        task_split = train_test_sets[key]

        X_train = np.array(task_split["train"]["text_embedding"])
        y_train = np.array(task_split["train"]["labels"])
        if number_of_shot:
            # Few-shot scenario: keep only the first number_of_shot documents.
            X_train = X_train[:number_of_shot]
            y_train = y_train[:number_of_shot]

        X_test = np.array(task_split["test"]["text_embedding"])
        y_test = np.array(task_split["test"]["labels"])

        # Standardize the shape of every text_embedding (and its labels).
        X_train, y_train = padding_sentence(X_train, y_train)
        X_test, y_test = padding_sentence(X_test, y_test)

        # Number of batches, rounding up so a trailing partial batch is kept.
        num_batches = (len(X_train) + batch_size - 1) // batch_size

        # Slicing past the end of the array simply yields the shorter last batch.
        batch_sets[key] = [
            {"X_train": X_train[start:start + batch_size],
             "y_train": y_train[start:start + batch_size]}
            for start in (batch_size * step for step in range(num_batches))
        ]
        test_sets[key] = {"X_test": X_test, "y_test": y_test}

    return batch_sets, test_sets

In [None]:
from tensorflow.keras import layers, models, losses
import tensorflow as tf
import numpy as np


class MAML:
    """Meta-learning trainer built around one shared Keras model (``meta_model``).

    ``train_on_batch`` adapts the model separately on each support task, then
    evaluates every adapted weight set on the query task and uses the summed
    query loss to update the starting weights.
    """
    def __init__(self):
        # Build the model once; train_on_batch snapshots and restores its weights.
        self.meta_model = self.get_maml_model()

    def get_maml_model(self):
        """Build the sentence-scoring network.

        Each sample has shape (246, 500). A bidirectional LSTM that returns the
        full sequence feeds a single sigmoid unit per step, and the final
        Reshape drops the trailing unit axis, so each sample yields 246 values
        between 0 and 1.
        """
        # define model
        model = Sequential()
        model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(246, 500)))
        model.add(Dense(1,activation='sigmoid'))
        model.add(tf.keras.layers.Reshape((-1,), input_shape=(246,1)))
        return model

    
    #Training step of each batch.
    def train_on_batch(self, support_train_data, query_train_data, inner_optimizer, outer_optimizer=None):
        """Run one meta-training step.

        Args:
            support_train_data: dict indexed by the support task names listed in
                ``support_task_key``; each value holds "X_train" and "y_train".
            query_train_data: dict holding a "business" entry with "X_train"
                and "y_train".
            inner_optimizer: optimizer used for the per-task adaptation step.
            outer_optimizer: optimizer used for the update of the starting
                weights; when falsy, the starting weights are left unchanged.

        Returns:
            ``tf.reduce_mean`` over the query losses collected for all adapted
            weight sets.
        """

        batch_acc = []  # never read or written again in this method
        batch_loss = []
        task_weights = []

        # Task names are fixed here; the dicts passed in must contain these keys.
        support_task_key = ["tech","politics","entertainment","sport"]
        query_task_keys = ["business"]
        
        #Get the current model weights so that the model can be reset to them at the
        #beginning of each inner loop.
        meta_weights = self.meta_model.get_weights()

                
        #Inner loops.
        #Loop through all support tasks; each one starts from meta_weights and takes
        #one gradient step on its own batch.
        for key in support_task_key:            
            #Reset to the starting weights.
            self.meta_model.set_weights(meta_weights)
            
            X = np.array([np.array(val) for val in support_train_data[key]["X_train"]])
            y = np.array([np.array(val) for val in support_train_data[key]["y_train"]])
            with tf.GradientTape() as tape:
                pred = self.meta_model(X)
                loss = losses.binary_crossentropy(y, pred)
                
            # Calculate the gradients for the variables
            gradients = tape.gradient(loss, self.meta_model.trainable_variables)
            # Apply the gradients with the inner optimizer.
            # NOTE(review): the same inner_optimizer instance is used for every
            # support task and every call; confirm that sharing its state is intended.
            inner_optimizer.apply_gradients(zip(gradients, self.meta_model.trainable_variables))
           
            #Save the adapted weights of this support task.
            task_weights.append(self.meta_model.get_weights())

    
        #Calculate the loss of each adapted weight set on the query training batch.
        #All forward passes are recorded by one tape.
        with tf.GradientTape() as tape:
            for i in range(len(support_task_key)):
            
                query_task_key = query_task_keys[0]
                
                #Load the adapted weights saved for support task i.
                self.meta_model.set_weights(task_weights[i])
                
                X = np.array([np.array(val) for val in query_train_data[query_task_key]["X_train"]])
                y = np.array([np.array(val) for val in query_train_data[query_task_key]["y_train"]])
  
                pred = self.meta_model(X)
                loss = losses.binary_crossentropy(y, pred)
                
                batch_loss.append(loss)
                
            #The summed loss drives the outer update.
            #The mean loss is only returned for visualizing.
            sum_loss = tf.reduce_sum(batch_loss)
            mean_loss = tf.reduce_mean(batch_loss)

        #Restore the starting weights before the outer update is applied to them.
        self.meta_model.set_weights(meta_weights)

        #Backpropagation of outer loop.
        # NOTE(review): the adapted weights were loaded with set_weights, so the tape
        # does not record the inner update itself; this looks like a first-order
        # approximation of the meta-gradient -- confirm that this is intended.
        if outer_optimizer:
            grads = tape.gradient(sum_loss, self.meta_model.trainable_variables)
            outer_optimizer.apply_gradients(zip(grads, self.meta_model.trainable_variables))
                      
        return mean_loss
        
        
    

In [None]:
import math

# Training-data parameters, followed by the division of the data into batches.

# Categories used as support tasks and the category used as query task.
support_tasks_key = ["sport", "entertainment", "tech", "politics"]
query_tasks_key = ["business"]

# Few-shot setting of the query task: number_of_shot training documents are
# distributed over number_of_query_batch batches (rounded up per batch).
number_of_shot = 12
number_of_query_batch = 12
query_batch_size = math.ceil(number_of_shot / number_of_query_batch)

# Support tasks are trained on their full train set in batches of this size.
support_batch_size = 30

support_batch_sets, support_test_sets = get_batch(support_tasks_key, support_batch_size)
query_batch_sets, query_test_sets = get_batch(
    query_tasks_key,
    query_batch_size,
    number_of_shot,
)

In [None]:
#Evaluation while training.
def val_on_batch(model):
    """Score ``model`` on the "business" test set with ROUGE.

    The test embeddings come from the module-level ``query_test_sets`` and the
    source sentences / gold summaries from ``train_test_sets``.

    Args:
        model: object whose ``predict(X, verbose=0)`` returns, for every test
            document, one probability per sentence position.

    Returns:
        The dictionary produced by ``calc_rouge_scores`` for 'rouge1' and 'rougeL'.
    """
    key_query_task = "business"
    min_sentences = 5

    #Get text_embedding of test set.
    X_test = np.array([np.array(val) for val in query_test_sets[key_query_task]["X_test"]])

    #Every prediction is a vector of values between 0 and 1; each value is the
    #probability that the sentence at the same position is picked for the summary.
    y_preds = model.predict(X_test, verbose=0)

    #Keep every position whose probability is at least 0.5. If fewer than
    #min_sentences positions qualify, keep the min_sentences most probable ones.
    idx_list = []
    for doc_probs in y_preds:
        idx = [i for i, prob in enumerate(doc_probs) if prob >= 0.5]
        if len(idx) < min_sentences:
            # argsort is ascending, so the highest probabilities are at the end.
            # The slice must be applied to the argsort result, not to its input.
            idx = np.argsort(doc_probs)[-min_sentences:]
        idx_list.append(sorted(idx))

    val_sets = train_test_sets[key_query_task]["test"]

    #Retrieve the picked sentences from the source texts, in document order.
    #Picked positions beyond the number of sentences of a document are ignored.
    df_text_clean = val_sets["text_clean"]
    pred_summaries = []
    for doc, idx_doc in enumerate(idx_list):
        text_clean = np.array(df_text_clean.iloc[doc])
        picked = set(idx_doc)
        pred_summaries.append(
            " ".join(text_clean[i] for i in range(len(text_clean)) if i in picked))

    #Get golden summary.
    gold_summaries = list(val_sets["Summary"])

    #calculate rouge score
    return calc_rouge_scores(pred_summaries, gold_summaries,
                             keys=['rouge1', 'rougeL'], use_stemmer=True)


In [None]:
epochs = 50


maml = MAML()

inner_optimizer = optimizers.Adam(learning_rate=0.001)
outer_optimizer = optimizers.Adam(learning_rate=0.001)

query_key = "business"

#Every training step needs one batch of every task, and the datasets of the
#categories have different sizes. The number of steps per epoch is therefore the
#smallest batch count over all support and query tasks (capped at 1000).
training_steps = min(
    [1000]
    + [len(batches) for batches in support_batch_sets.values()]
    + [len(batches) for batches in query_batch_sets.values()]
)

valuating_steps = len(query_test_sets[query_key]["X_test"])

loss_plot = []
f1_score_plot = []
precision_plot = []
recall_plot = []


#Loop by number of epochs
for epoch in range(epochs):
    train_meta_loss = []

    #Use a fresh progress bar for every epoch so that no state of the previous
    #epoch is carried over. 'loss' is declared stateful because the value passed
    #below is already the running mean and must be shown as-is, not averaged again.
    train_progbar = utils.Progbar(training_steps, stateful_metrics=['loss'])

    #In each epoch, step i sends batch i of every task to the training step function.
    for i in range(training_steps):
        support_train_data = {task: batches[i] for task, batches in support_batch_sets.items()}
        query_train_data = {task: batches[i] for task, batches in query_batch_sets.items()}

        batch_train_loss = maml.train_on_batch(support_train_data,
                                               query_train_data,
                                               inner_optimizer,
                                               outer_optimizer=outer_optimizer)

        train_meta_loss.append(batch_train_loss)
        train_progbar.update(i+1, [('loss', np.mean(train_meta_loss))])

    #Store numbers for plotting.
    loss_plot.append(np.mean(train_meta_loss))

    scores = val_on_batch(maml.meta_model)
    f1_score_plot.append(scores["rouge1"]["f1"])
    precision_plot.append(scores["rouge1"]["precision"])
    recall_plot.append(scores["rouge1"]["recall"])

    print("\n")
    print(scores)
    print("\n")


#Save trained model
maml.meta_model.save("./model.h5")


In [None]:

from matplotlib import pyplot as plt

#Plotting the training loss and the validation ROUGE-1 F1 per epoch.
#Neither curve is an accuracy, so the title, axis label and legend name what is shown.
fig, ax = plt.subplots()
ax.set_title('training loss and validation ROUGE-1 F1')
ax.set_ylabel('value')
ax.set_xlabel('epoch')

epochs_plot = list(range(epochs))

ax.plot(epochs_plot, loss_plot, color="red", label="loss")
ax.plot(epochs_plot, f1_score_plot, color="blue", label="validation ROUGE-1 F1")

ax.legend(loc='upper left')

plt.show()


In [None]:

from matplotlib import pyplot as plt

#Plotting the validation recall and precision (ROUGE-1) per epoch.
epochs_plot = list(range(epochs))

fig, ax = plt.subplots()
ax.set_title('rouge score')
ax.set_ylabel('score')
ax.set_xlabel('epoch')

for curve, curve_color, curve_label in (
    (recall_plot, "red", "recall"),
    (precision_plot, "blue", "precision"),
):
    ax.plot(epochs_plot, curve, color=curve_color, label=curve_label)

ax.legend(loc='upper left')

plt.show()


In [None]:
#This cell repeats the steps of "val_on_batch" on the trained model.
#It calculates the scores and saves the (model summary, golden summary) pairs
#together with the evaluation information.
output_file = 'result.pickle'

key_query_task="business"

X_test = np.array([np.array(val) for val in query_test_sets[key_query_task]["X_test"]])

y_preds = maml.meta_model.predict(X_test, verbose=0)

#Keep every position whose probability is at least 0.5 (same threshold as
#val_on_batch). If fewer than 5 positions qualify, keep the 5 most probable ones.
idx_list = []
for doc_probs in y_preds:
    idx = [i for i, prob in enumerate(doc_probs) if prob >= 0.5]
    if len(idx) < 5:
        # argsort is ascending: slice the argsort result to get the 5 highest
        # probabilities (slicing its input would only rank the last 5 positions).
        idx = np.argsort(doc_probs)[-5:]
    # Plain ints keep the pickled result independent of numpy scalar types.
    idx_list.append(sorted(int(i) for i in idx))

val_sets = train_test_sets[key_query_task]["test"]

#retrieve summary pairs
df_text_clean = val_sets["text_clean"]
pred_summaries = []
for doc, idx_doc in enumerate(idx_list):
    text_clean = np.array(df_text_clean.iloc[doc])
    picked = set(idx_doc)
    pred_summary = " ".join(text_clean[i] for i in range(len(text_clean)) if i in picked)
    pred_summaries.append(pred_summary)

df_gold = val_sets["Summary"]
gold_summaries = list(df_gold)

summaries_comp = tuple(zip(pred_summaries, gold_summaries))

#calculate rouge score
scores = calc_rouge_scores(pred_summaries, gold_summaries, 
                                  keys=['rouge1', 'rougeL'], use_stemmer=True)

#Model.summary() prints and returns None, so its text is captured through print_fn
#to be stored; it is printed afterwards to keep the cell output.
summary_lines = []
maml.meta_model.summary(print_fn=summary_lines.append)
model_summary_text = "\n".join(summary_lines)
print(model_summary_text)

#'sent_index_number' holds the picked sentence indexes of every test document
#(one list per document, aligned with 'summaries_comp').
results_dict ={'summaries_comp': summaries_comp,
               'sent_index_number': idx_list, 'Rouge': scores, 'mod_summary': model_summary_text}

with open(output_file, 'wb') as handle:                                     
    pickle.dump(results_dict, handle)


In [None]:
# Reload the saved evaluation; show the ROUGE scores, then the first predicted
# summary next to its golden summary.
result = pd.read_pickle("./result.pickle")
print(result["Rouge"])
print("\nPrediction\n")
first_pair = result["summaries_comp"][0]
print(first_pair[0])
print("\nReal summary\n")
print(result["summaries_comp"][0][1])