In [None]:
!pip install sentence-transformers
import pickle
import pandas as pd
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer
from datetime import datetime as dt


### Split a text into cleaned sentences and embed each of them
def text_to_sent_list(text,
                      nlp=None,
                      embedder=None,
                      min_len=2):
    """Split ``text`` into cleaned sentences and compute their embeddings.

    Parameters
    ----------
    text : str
        Raw text to segment.
    nlp : callable, optional
        spaCy pipeline. Calling it on ``text`` must return a document whose
        ``.sents`` yields spans exposing ``.text``. When omitted,
        ``spacy.load("en_core_web_lg")`` is loaded on first use and cached,
        instead of being loaded when the function is defined.
    embedder : object, optional
        Sentence encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        When omitted, ``SentenceTransformer('distilbert-base-nli-mean-tokens')``
        is loaded on first use and cached.
    min_len : int
        Kept so existing callers keep working; it is currently NOT applied
        (no length-based filtering is performed).

    Returns
    -------
    tuple
        ``(sents_clean, sents_embedding)``: the list of kept sentence strings
        and the ``numpy`` array built from the embedder output for them.
    """
    # Resolve the default models lazily so that defining this function is cheap
    # and callers that pass their own models never trigger a second load.
    if nlp is None:
        nlp = getattr(text_to_sent_list, "_default_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_lg")
            text_to_sent_list._default_nlp = nlp
    if embedder is None:
        embedder = getattr(text_to_sent_list, "_default_embedder", None)
        if embedder is None:
            embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')
            text_to_sent_list._default_embedder = embedder

    sents_clean = []
    for sentence in nlp(text).sents:
        # Spans are not strings: take .text first, then replace newlines.
        cleaned = sentence.text.replace('\n', ' ')
        # Drop empty and whitespace-only sentences.
        if cleaned.strip():
            sents_clean.append(cleaned)

    # Ask for numpy output directly; wrapping a tensor with np.array fails
    # when the tensor lives on a GPU.
    sents_embedding = np.asarray(embedder.encode(sents_clean, convert_to_numpy=True))

    return sents_clean, sents_embedding

In [None]:
# Read each category CSV, drop its 'ID' column and reset the row index.
data_business = (
    pd.read_csv('../input/bbcnewstocsv/businessDataset.csv')
    .drop(columns=['ID'])
    .reset_index(drop=True)
)
data_entertainment = (
    pd.read_csv('../input/bbcnewstocsv/entertainmentDataset.csv')
    .drop(columns=['ID'])
    .reset_index(drop=True)
)
data_politics = (
    pd.read_csv('../input/bbcnewstocsv/politicsDataset.csv')
    .drop(columns=['ID'])
    .reset_index(drop=True)
)
data_tech = (
    pd.read_csv('../input/bbcnewstocsv/techDataset.csv')
    .drop(columns=['ID'])
    .reset_index(drop=True)
)
# The sport file is the only one read with an explicit encoding.
data_sport = (
    pd.read_csv('../input/bbcnewstocsv/sportDataset.csv', encoding='unicode_escape')
    .drop(columns=['ID'])
    .reset_index(drop=True)
)

# Category name -> dataframe, used by the processing loop in the next cell.
datasets = dict(
    business=data_business,
    entertainment=data_entertainment,
    politics=data_politics,
    sport=data_sport,
    tech=data_tech,
)

# Preview one category as a sanity check.
datasets["business"].head()


In [None]:
# Load the spaCy pipeline and the sentence embedder once: they do not depend on
# the category, so reloading them on every iteration only wastes time and memory.
nlp = spacy.load("en_core_web_lg")
embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')

for key, dataset in datasets.items():

    output_file = 'train_df_'+key+'.pickle'

    # Clean sentence list and sentence embeddings for each article's text.
    # Series.apply forwards the extra keyword arguments to text_to_sent_list,
    # which returns a (sentences, embeddings) tuple per row.
    s_interim_tuple = dataset['Text'].apply(
        text_to_sent_list, nlp=nlp, embedder=embedder, min_len=2)

    dataset['text_clean'] = s_interim_tuple.apply(lambda pair: pair[0])
    dataset['text_embedding'] = s_interim_tuple.apply(lambda pair: pair[1])

    # Same extraction for each article's summary.
    s_interim_tuple = dataset['Summary'].apply(
        text_to_sent_list, nlp=nlp, embedder=embedder, min_len=0)

    dataset['summary_clean'] = s_interim_tuple.apply(lambda pair: pair[0])
    dataset['summary_embedding'] = s_interim_tuple.apply(lambda pair: pair[1])

    # Persist the enriched dataframe (the frames in `datasets` are modified in place).
    with open(output_file, 'wb') as handle:
        pickle.dump(dataset, handle)


In [None]:

import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from datetime import datetime as dt

### Helper Functions


# Match every summary sentence to its closest document sentence.
def find_sim_single_summary(summary_sentence_embed, doc_emedding):
    """Return, for each summary sentence, the index of the most similar document sentence.

    Both arguments are passed to ``cosine_similarity`` as 2-D embedding
    matrices (document first, summary second), so the similarity matrix has
    one row per document sentence and one column per summary sentence.

    Returns
    -------
    numpy.ndarray
        One index per summary sentence, pointing at the document sentence
        with the highest cosine similarity.
    """
    similarity = cosine_similarity(doc_emedding, summary_sentence_embed)
    # Column-wise argmax: best document row for each summary column.
    return np.argmax(similarity, axis=0)

#Adding attribute "label" for data.
def label_sent_in_summary(s_text, s_summary):
    """Label the document sentences that best match each summary sentence.

    Documents and summaries are paired by position, so the result does not
    depend on the index labels of the inputs (a Series with a non 0..n-1
    index is handled correctly).

    Parameters
    ----------
    s_text : sequence (e.g. pandas Series) of numpy.ndarray
        One embedding matrix per document; ``shape[0]`` is used as the
        number of sentences in that document.
    s_summary : sequence (e.g. pandas Series) of numpy.ndarray
        One embedding matrix per summary, in the same order as ``s_text``.

    Returns
    -------
    tuple
        ``(idx_list, labels)``. ``idx_list[j]`` is the sorted array of
        document sentence indices selected for document ``j`` (one entry per
        summary sentence, so duplicates are possible). ``labels[j]`` is a
        float array with 1 at those indices and 0 elsewhere.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of entries.
    """
    # Materialise both inputs positionally; s_text[j] on a Series would be a
    # label lookup and break (or mis-pair rows) for a non-default index.
    text_embeddings = list(s_text)
    summary_embeddings = list(s_summary)
    if len(text_embeddings) != len(summary_embeddings):
        raise ValueError(
            "s_text and s_summary must have the same length, got "
            f"{len(text_embeddings)} and {len(summary_embeddings)}")

    #find index of summary-picked sentences for every (document, summary) pair
    idx_list = [np.sort(find_sim_single_summary(summary_embed, doc_embed))
                for doc_embed, summary_embed in zip(text_embeddings, summary_embeddings)]

    #start from all-zero labels and set the summary-picked sentences to 1
    labels = []
    for doc_embed, picked in zip(text_embeddings, idx_list):
        doc_labels = np.zeros(doc_embed.shape[0])
        doc_labels[picked] = 1
        labels.append(doc_labels)

    return idx_list, labels


In [None]:
# Load the pickled dataframe for the "tech" category and preview its first rows.
# NOTE(review): the file name matches what the preprocessing loop writes
# ('train_df_<key>.pickle'); presumably it was uploaded as a separate input
# dataset. Confirm.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# load files from a trusted source.
df = pd.read_pickle('../input/bbcpreprocess/train_df_tech.pickle' )
df.head()

In [None]:

# Categories whose pickled dataframes receive sentence-level labels.
datasets_key=["sport","politics","business","tech","entertainment"]
for key in datasets_key:
    # Read the pickled dataframe for this category.
    df = pd.read_pickle(f'../input/bbcpreprocess/train_df_{key}.pickle')
    output_file = f'train_df_label_{key}.pickle'

    # Selected sentence indices and 0/1 target labels for every document.
    idx_list, labels = label_sent_in_summary(df['text_embedding'], df['summary_embedding'])

    # Attach both results as new columns.
    df['labels'] = labels
    df['labels_idx_list'] = idx_list

    # Write the labelled dataframe to the working directory.
    with open(output_file, 'wb') as out_handle:
        pickle.dump(df, out_handle)