In [None]:
!pip install bert-extractive-summarizer
!pip install transformers
!pip install spacy
!pip install pysummarization
!pip install rouge-score

In [None]:
import numpy as np
import pandas as pd
import spacy
import string
from sklearn import datasets
from summarizer import Summarizer,TransformerSummarizer
from tensorflow import keras
from rouge_score import rouge_scorer

In [None]:
# Load the pickled evaluation data and keep the "business" test split.
# NOTE(review): unpickling can execute arbitrary code - only load a file you trust.
# Columns used (as described by the dataset author):
#   Text           - source text (string)
#   text_clean     - the text split into sentences (list of strings)
#   text_embedding - the numerically vectorized sentences of the text
#   Summary        - golden (reference) summary (string)
train_test_sets = pd.read_pickle("../input/test-data-ver22/test_data_ver_22.pickle")
X_val_text, X_val_clean_sent, X_val_sent_embedded, gold_summaries = (
    list(train_test_sets["business"]["test"][column])
    for column in ("Text", "text_clean", "text_embedding", "Summary")
)


In [None]:
# Calculate mean ROUGE precision, recall and F1 over a set of summaries.
def calc_rouge_scores(pred_summaries, gold_summaries, 
                                 keys=['rouge1', 'rougeL'], use_stemmer=True):
    """Average ROUGE precision/recall/F1 of predictions against references.

    Args:
        pred_summaries: sequence of generated summaries (strings).
        gold_summaries: sequence of reference summaries (strings), aligned
            index-by-index with ``pred_summaries``.
        keys: ROUGE variants to compute, passed to ``RougeScorer``.
        use_stemmer: forwarded to ``RougeScorer``.

    Returns:
        ``{key: {'recall': ..., 'precision': ..., 'f1': ...}}`` where every
        value is the ``np.mean`` over all summary pairs.

    Raises:
        ValueError: if the two sequences do not have the same length, since
            the pairs could then no longer be aligned.
    """
    if len(pred_summaries) != len(gold_summaries):
        raise ValueError(
            "pred_summaries and gold_summaries must have the same length "
            "(got %d and %d)" % (len(pred_summaries), len(gold_summaries))
        )

    scorer = rouge_scorer.RougeScorer(keys, use_stemmer=use_stemmer)

    # RougeScorer.score expects (target, prediction). Passing the prediction
    # first would swap the reported precision and recall.
    scores = [scorer.score(gold, pred)
              for pred, gold in zip(pred_summaries, gold_summaries)]

    dict_scores = {}
    for key in keys:
        per_pair = [pair_scores[key] for pair_scores in scores]
        dict_scores[key] = {
            'recall': np.mean([s.recall for s in per_pair]),
            'precision': np.mean([s.precision for s in per_pair]),
            'f1': np.mean([s.fmeasure for s in per_pair]),
        }

    return dict_scores

# Using our model

In [None]:
# Standardize every text_embedding to a matrix of shape (246, 500) by default.
def padding_sentence(X, max_number_sentence=246, embedding_dim=500):
    """Pad or truncate each document to exactly ``max_number_sentence`` rows.

    Documents with fewer rows get all-zero rows appended; documents with more
    rows keep only their first ``max_number_sentence`` rows.

    Args:
        X: mutable sequence of per-document 2-D arrays (or nested lists), one
            row per sentence, each row of width ``embedding_dim``.
        max_number_sentence: number of rows every document ends up with.
        embedding_dim: width of the zero rows used for padding.

    Returns:
        The same ``X`` object. Its items are replaced in place, as before, and
        every item is now a NumPy array, so ``.shape`` and ``.reshape`` work
        even for documents that needed no padding.

    Raises:
        ValueError: raised by NumPy when a non-empty document needs padding
            but its row width differs from ``embedding_dim``.
    """
    for i in range(len(X)):
        doc = np.asarray(X[i])
        if doc.size == 0:
            # An empty document has no rows to take a width from.
            doc = doc.reshape(0, embedding_dim)

        missing = max_number_sentence - len(doc)
        if missing > 0:
            # Append all missing rows in one go instead of one copy per row.
            doc = np.concatenate([doc, np.zeros((missing, embedding_dim))], axis=0)
        elif missing < 0:
            doc = doc[:max_number_sentence]

        X[i] = doc

    return X

In [None]:
# Pad/truncate every document's sentence-embedding matrix to a fixed number of rows.
# Note: padding_sentence assigns into the list it receives, so
# X_val_sent_embedded itself is modified by this call.
X_val_sent_embedded_padding = padding_sentence(X_val_sent_embedded)
# Show the shape of the first padded document as a quick check.
X_val_sent_embedded_padding[0].shape

In [None]:
# Load the saved Keras model from its .h5 file. The loaded model is the one
# passed to summary_by_maml_model during evaluation.
model_file_name = "../input/maml-lstm-ver22/maml_lstm_model_ver_22.h5"
lstm_model = keras.models.load_model(model_file_name)


In [None]:
# Generate our model's extractive summary for one document.
def summary_by_maml_model(maml_model, doc, sents_in_doc, threshold=0.5, min_sentences=5):
    """Build an extractive summary from per-sentence model probabilities.

    A sentence is selected when its predicted probability exceeds
    ``threshold``. If fewer than ``min_sentences`` sentences qualify, the
    ``min_sentences`` most probable sentences are used instead. Selected
    sentences are emitted in document order.

    Args:
        maml_model: object with ``predict(doc, verbose=0)``; the first item of
            its output is used as the per-sentence probabilities.
        doc: model input for a single document, forwarded to ``predict``.
        sents_in_doc: the document's sentences (strings), in order.
        threshold: probability a sentence must exceed to be selected.
        min_sentences: minimum number of sentences wanted in the summary
            (fewer are returned only when the document itself is shorter).

    Returns:
        The selected sentences joined with single spaces.
    """
    # Flatten so that both (n,) and (n, 1) model outputs are handled.
    probs = np.asarray(maml_model.predict(doc, verbose = 0)[0]).reshape(-1)

    # Only positions that map to a real sentence may be selected. Positions
    # past the end of the document must not count towards min_sentences.
    n_real = min(len(probs), len(sents_in_doc))
    probs = probs[:n_real]

    selected = np.flatnonzero(probs > threshold)
    if len(selected) < min_sentences:
        # Indices of the highest probabilities: sort everything, then take the
        # tail. Slicing before sorting would only ever yield indices 0..4.
        selected = np.argsort(probs, kind="stable")[-min_sentences:]

    return " ".join(sents_in_doc[idx] for idx in np.sort(selected))

In [None]:
# Evaluate the summaries produced by our model.
def val_maml_model():
    """Summarize every padded test document with ``lstm_model`` and score it.

    Reads the globals ``X_val_sent_embedded_padding``, ``X_val_clean_sent``,
    ``lstm_model`` and ``gold_summaries``; prints each document index as
    progress output.

    Returns:
        The result of ``calc_rouge_scores`` for 'rouge1' and 'rougeL' with
        stemming enabled.
    """
    generated = []

    for doc_idx, padded_doc in enumerate(X_val_sent_embedded_padding):
        print(doc_idx)
        # Add a leading batch axis of size 1 for the model.
        n_rows, n_cols = padded_doc.shape[0], padded_doc.shape[1]
        batch = padded_doc.reshape(1, n_rows, n_cols)
        generated.append(
            summary_by_maml_model(lstm_model, batch, X_val_clean_sent[doc_idx])
        )

    return calc_rouge_scores(generated, gold_summaries,
                             keys=['rouge1', 'rougeL'], use_stemmer=True)


In [None]:
# Run the evaluation of our model over the test documents and print the
# resulting ROUGE score dictionary.
score_maml_model = val_maml_model()
print(score_maml_model)


# Using BERT

In [None]:
# Generate a BERT summary.
def summary_by_bert(bert_model, doc, num_sentences=5):
    """Summarize ``doc`` with ``bert_model``.

    Args:
        bert_model: callable invoked as ``bert_model(doc, num_sentences=...)``.
        doc: source text to summarize.
        num_sentences: number of sentences requested from the model
            (defaults to 5, the value that used to be hard-coded).

    Returns:
        The model output joined with the empty string. A single string is
        returned unchanged; a sequence of strings is concatenated with no
        separator.
    """
    return ''.join(bert_model(doc, num_sentences = num_sentences))
    

In [None]:
# Evaluate the BERT summaries.
def val_bert():
    """Summarize every test text with a default ``Summarizer`` and score it.

    Reads the globals ``X_val_text`` and ``gold_summaries``; prints each
    document index as progress output.

    Returns:
        The result of ``calc_rouge_scores`` for 'rouge1' and 'rougeL' with
        stemming enabled.
    """
    summarizer = Summarizer()
    generated = []

    for doc_idx, source_text in enumerate(X_val_text):
        print(doc_idx)
        generated.append(summary_by_bert(summarizer, source_text))

    return calc_rouge_scores(generated, gold_summaries,
                             keys=['rouge1', 'rougeL'], use_stemmer=True)


In [None]:
# Run the BERT evaluation over the test texts and print the resulting ROUGE
# score dictionary.
score_bert = val_bert()
print(score_bert)

# Using GPT2

In [None]:
# Generate a GPT-2 summary.
def summary_by_GPT2(GPT2_model, doc, num_sentences=5):
    """Summarize ``doc`` with ``GPT2_model``.

    Args:
        GPT2_model: callable invoked as ``GPT2_model(doc, num_sentences=...)``.
        doc: source text to summarize.
        num_sentences: number of sentences requested from the model
            (defaults to 5, the value that used to be hard-coded).

    Returns:
        The model output joined with the empty string. A single string is
        returned unchanged; a sequence of strings is concatenated with no
        separator.
    """
    return ''.join(GPT2_model(doc, num_sentences = num_sentences))

In [None]:
# Evaluate the GPT-2 summaries.
def val_GPT2():
    """Summarize every test text with a GPT-2 ``TransformerSummarizer`` and score it.

    The summarizer is created with ``transformer_type="GPT2"`` and
    ``transformer_model_key="gpt2-medium"``. Reads the globals ``X_val_text``
    and ``gold_summaries``; prints each document index as progress output.

    Returns:
        The result of ``calc_rouge_scores`` for 'rouge1' and 'rougeL' with
        stemming enabled.
    """
    summarizer = TransformerSummarizer(transformer_type="GPT2",
                                       transformer_model_key="gpt2-medium")
    generated = []

    for doc_idx, source_text in enumerate(X_val_text):
        print(doc_idx)
        generated.append(summary_by_GPT2(summarizer, source_text))

    return calc_rouge_scores(generated, gold_summaries,
                             keys=['rouge1', 'rougeL'], use_stemmer=True)


In [None]:
# Run the GPT-2 evaluation over the test texts and print the resulting ROUGE
# score dictionary.
score_GPT2 = val_GPT2()
print(score_GPT2)

# Using XLNET

In [None]:
# Generate an XLNet summary.
def summary_by_XLNET(XLNET_model, doc, num_sentences=5):
    """Summarize ``doc`` with ``XLNET_model``.

    Args:
        XLNET_model: callable invoked as ``XLNET_model(doc, num_sentences=...)``.
        doc: source text to summarize.
        num_sentences: number of sentences requested from the model
            (defaults to 5, the value that used to be hard-coded).

    Returns:
        The model output joined with the empty string. A single string is
        returned unchanged; a sequence of strings is concatenated with no
        separator.
    """
    return ''.join(XLNET_model(doc, num_sentences = num_sentences))

In [None]:
# Evaluate the XLNet summaries.
def val_XLNET():
    """Summarize every test text with an XLNet ``TransformerSummarizer`` and score it.

    The summarizer is created with ``transformer_type="XLNet"`` and
    ``transformer_model_key="xlnet-base-cased"``. Reads the globals
    ``X_val_text`` and ``gold_summaries``; prints each document index as
    progress output.

    Returns:
        The result of ``calc_rouge_scores`` for 'rouge1' and 'rougeL' with
        stemming enabled.
    """
    summarizer = TransformerSummarizer(transformer_type="XLNet",
                                       transformer_model_key="xlnet-base-cased")
    generated = []

    for doc_idx, source_text in enumerate(X_val_text):
        print(doc_idx)
        generated.append(summary_by_XLNET(summarizer, source_text))

    return calc_rouge_scores(generated, gold_summaries,
                             keys=['rouge1', 'rougeL'], use_stemmer=True)


In [None]:
# Run the XLNet evaluation over the test texts and print the resulting ROUGE
# score dictionary.
score_XLNET = val_XLNET()
print(score_XLNET)