# Main notebook to process indexed datasets into summaries.

## Generate sentence pairs

In [7]:
import itertools
import json
import nltk
import pandas as pd

In [2]:
# Base directory under which the output files of the later cells are written.
save_dir = input("enter path to base directory to save outputs to:\n")

enter path to base directory to save outputs to:
 ../data/test3


In [3]:
# Ask for the indexed sentence dataset (a CSV file) and load it into a DataFrame.
input_path = input("enter path to indexed dataset:\n")
# open() falls back to the platform's locale encoding when none is given, so
# UTF-8 is requested explicitly to make the file load the same way everywhere.
with open(input_path, encoding='utf-8') as f:
    sents_df = pd.read_csv(f, delimiter=',')
sents_df.head()

enter path to indexed dataset:
 ../data/test3/MP3Player.csv


Unnamed: 0,index,sentence
0,0,each battery lasts about 10-11 hours.
1,1,"the zen micro is a sleek, stylish device that ..."
2,2,"battery life is exceptional, lasting longer th..."
3,3,there are 2 things that need fixing first is t...
4,4,i imagine if i left my player untouched (no ba...


In [4]:
# Every unordered pair of row labels is one sentence pair to compare.
indices = [*sents_df.index]
indices_list = [pair for pair in itertools.combinations(indices, 2)]

n_sentences, n_pairs = len(indices), len(indices_list)
print('no. of sentences: ', n_sentences)
print('no. of sentence pairs: ', n_pairs, '\n')

no. of sentences:  20
no. of sentence pairs:  190 



In [5]:
# Put each index pair next to its two sentences so the pairs can be saved
# into a csv for manual labelling.
data = [
    [left, right, sents_df.iloc[left]['sentence'], sents_df.iloc[right]['sentence']]
    for left, right in indices_list
]

pair_columns = ['first_index', 'second_index', 'first', 'second']
df = pd.DataFrame(data, columns=pair_columns)
display(df)

Unnamed: 0,first_index,second_index,first,second
0,0,1,each battery lasts about 10-11 hours.,"the zen micro is a sleek, stylish device that ..."
1,0,2,each battery lasts about 10-11 hours.,"battery life is exceptional, lasting longer th..."
2,0,3,each battery lasts about 10-11 hours.,there are 2 things that need fixing first is t...
3,0,4,each battery lasts about 10-11 hours.,i imagine if i left my player untouched (no ba...
4,0,5,each battery lasts about 10-11 hours.,"the battery, as others have said, also seems w..."
...,...,...,...,...
185,16,18,the software is simple and straight forward.,"it's easy to use, and makes transfering your c..."
186,16,19,the software is simple and straight forward.,the software that comes along with it is not v...
187,17,18,"the software is very easy to use, and the tran...","it's easy to use, and makes transfering your c..."
188,17,19,"the software is very easy to use, and the tran...",the software that comes along with it is not v...


In [7]:
# Writing the paired dataset is opt-in: any answer other than 'y'/'Y' skips it.
is_save = input('save indexed paired dataset? (y/n):\n')
if is_save.lower() != 'y':
    print('not saving indexed paired dataset.')
else:
    df.to_csv(save_dir + '/' + 'paired_sentences.csv')

save indexed paired dataset? (y/n):
 y


## Perform inferences of distances between sentences in a sentence pair

In [8]:
import torch
from fairseq.data.data_utils import collate_tokens
from math import exp
import numpy as np

class RobertaMNLI:
    """Wrapper around the ``roberta.large.mnli`` model loaded through torch.hub.

    ``predict_one`` classifies a single ordered sentence pair; ``output_map``
    translates the returned class index into a label.
    """
    # todo: create similar wrapper classes for other NLI engines.
    # todo: create a wrapper superclass and subclass from there.

    def __init__(self, rel_path):
        """
        :param rel_path: relative path to pytorch hub directory.
        """
        # Class index -> label.
        self.output_map = {
            0: 'contradiction',
            1: 'neutral',
            2: 'entailment'
        }

        torch.hub.set_dir(rel_path)
        self.roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')  # works
        # Move the model to the GPU only when one is present; calling .cuda()
        # unconditionally raises on machines without CUDA.
        if torch.cuda.is_available():
            self.roberta.cuda()
        self.roberta.eval()

    def predict_one(self, S1, S2, return_probs=False):
        """
        Run the model on the ordered pair (S1, S2).

        :param S1: first sentence of the pair.
        :param S2: second sentence of the pair.
        :param return_probs: when truthy, return the per-class probabilities
            instead of the predicted class.
        :return: a list of floats (one per class, in class-index order) when
            ``return_probs`` is truthy, otherwise the index of the top class
            (0 is contradiction, 1 is neutral, 2 is entailment).
        """
        batch = collate_tokens(
            [self.roberta.encode(S1, S2)], pad_idx=1
        )
        # Inference only: no gradients are needed.
        with torch.no_grad():
            logprobs = self.roberta.predict('mnli', batch)

        if return_probs:
            # The model output is treated as log-probabilities, so exp() recovers
            # the probabilities of the single item in the batch.
            return [exp(logprob.item()) for logprob in logprobs[0]]

        return logprobs.argmax(dim=1).tolist()[0]

In [9]:
# Build the NLI wrapper; the argument is the directory torch.hub is pointed at.
predictor = RobertaMNLI(rel_path="../../roberta/hub")

Using cache found in ../../roberta/hub\pytorch_fairseq_master


In [10]:
# Score every sentence pair in both directions; each call returns three
# probabilities, so one row holds six values.
results = []
for s1, s2 in zip(df['first'], df['second']):
    CNE1 = predictor.predict_one(s1, s2, return_probs=True)
    CNE2 = predictor.predict_one(s2, s1, return_probs=True)
    results.append(CNE1 + CNE2)

score_columns = ['C1', 'N1', 'E1', 'C2', 'N2', 'E2']
scores_df = pd.DataFrame(results, columns=score_columns)

In [11]:
def gen_dist2(row):
    '''
    Turn the two directional (Contradiction, Neutral, Entailment) probability
    triples of a sentence pair into a single distance.
    Assumes that C+E+N=1.

    The direction whose largest probability is strictly greater is used; on a
    tie the second direction is used.

    :param row: mapping with the keys C1, N1, E1, C2, N2, E2.
    :return: C when C > 0.5, 0.5 when C equals E, otherwise N/(N+E)*0.5.
    '''
    first = (row['C1'], row['E1'], row['N1'])
    second = (row['C2'], row['E2'], row['N2'])
    C, E, N = first if max(first) > max(second) else second

    if C > 0.5:
        return C
    if C == E:
        return 0.5
    return N/(N+E)*0.5

In [12]:
# Compute one distance per row, attach it as the 'dist' column, put the
# sentence-pair columns in front and save the combined table.
dist_ser = scores_df.apply(gen_dist2, axis=1)
dist_ser.name = 'dist'
scores_df = df.join(scores_df.join(dist_ser))
scores_df.to_csv(save_dir + '/' + 'nliScores.csv')

In [13]:
# Preview the first rows of the combined pair/score table.
scores_df.head()

Unnamed: 0,first_index,second_index,first,second,C1,N1,E1,C2,N2,E2,dist
0,0,1,each battery lasts about 10-11 hours.,"the zen micro is a sleek, stylish device that ...",0.120493,0.868415,0.011092,0.00128,0.997724,0.000997,0.499501
1,0,2,each battery lasts about 10-11 hours.,"battery life is exceptional, lasting longer th...",0.425281,0.561476,0.013243,0.014313,0.532966,0.452722,0.488479
2,0,3,each battery lasts about 10-11 hours.,there are 2 things that need fixing first is t...,0.102561,0.894541,0.002898,0.260592,0.65443,0.084977,0.498385
3,0,4,each battery lasts about 10-11 hours.,i imagine if i left my player untouched (no ba...,0.0062,0.993181,0.000618,0.046462,0.941302,0.012236,0.499689
4,0,5,each battery lasts about 10-11 hours.,"the battery, as others have said, also seems w...",0.001769,0.997358,0.000874,0.002019,0.997144,0.000837,0.499562


## Clustering based on precomputed pairwise distances.

In [14]:
import numpy as np
import pandas as pd

import scipy
from scipy.cluster.hierarchy import dendrogram,linkage
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sb

import sklearn
# from sklearn import datasets
from sklearn.cluster import AgglomerativeClustering
import sklearn.metrics as sm
from sklearn.preprocessing import scale

# Configure the output
np.set_printoptions(precision=4,suppress=True)
%matplotlib inline
rcParams["figure.figsize"] =20,10
sb.set_style("whitegrid")

In [15]:
# convert this original dataframe to a 'square' distance matrix.
# we utilise a pivot table to do this

# first, add the redundant pairs (two of the same sentences) to the data.
# we need this for the pivot function to generate the correct indices and columns.
first = np.unique(df['first_index'].values)
second = np.unique(df['second_index'].values)
combined_arr = np.concatenate((first, second))
sentences = np.unique(combined_arr)
# One row per sentence: (index, index, 0), i.e. a sentence paired with itself at distance 0.
add_data = np.column_stack([sentences, sentences, np.zeros(len(sentences))])
# The `np.int` alias was removed in NumPy 1.24; it was only an alias of the builtin int.
add_data = add_data.astype(int)

print(add_data)

[[ 0  0  0]
 [ 1  1  0]
 [ 2  2  0]
 [ 3  3  0]
 [ 4  4  0]
 [ 5  5  0]
 [ 6  6  0]
 [ 7  7  0]
 [ 8  8  0]
 [ 9  9  0]
 [10 10  0]
 [11 11  0]
 [12 12  0]
 [13 13  0]
 [14 14  0]
 [15 15  0]
 [16 16  0]
 [17 17  0]
 [18 18  0]
 [19 19  0]]


In [None]:
# Add the zero-distance self pairs so that every sentence appears on both axes
# of the pivot table.
add_df = pd.DataFrame(data=add_data, columns=['first_index','second_index','dist'])
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# Dropping repeated (first_index, second_index) pairs keeps this cell re-runnable:
# a second run would otherwise add the self pairs again and pivot would reject
# the duplicated entries.
scores_df = (
    pd.concat([scores_df, add_df], ignore_index=True)
    .drop_duplicates(subset=['first_index', 'second_index'])
    .reset_index(drop=True)
)
# pivot takes keyword arguments only from pandas 2.0 onwards.
df_piv = scores_df.pivot(index="first_index", columns="second_index", values="dist").fillna(0)
piv_arr = df_piv.values
# Only one triangle is populated (each pair is stored once), so adding the
# transpose yields the full symmetric matrix.
dist_mat = piv_arr + np.transpose(piv_arr)

# generate pairwise matrix for saving.
intersentence = pd.DataFrame(data=dist_mat, index=df_piv.index, columns=df_piv.columns)

path = save_dir + '/' + 'pairwiseMatrix.csv'
intersentence.to_csv(path)

In [8]:
# load data files (this step can be skipped if the previous few cells have been run to produce the required data)

# load intersentence pairwise distance matrix.
pairwise_path = "../data/test3/pairwiseMatrix.csv"
df_piv = pd.read_csv(pairwise_path)
print("loaded data from disk.")
display(df_piv.head())
# the saved row labels come back as an ordinary column; remove it so that only
# the distance columns remain.
df_piv = df_piv.drop('first_index', axis=1)
display(df_piv.head())
dist_mat = df_piv.values
display(dist_mat)

loaded data from disk.


Unnamed: 0,first_index,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,0.0,0.499501,0.488479,0.498385,0.499689,0.499562,0.499525,0.499429,0.499442,...,0.49956,0.499464,0.49964,0.498906,0.499599,0.499474,0.463023,0.484418,0.499378,0.499539
1,1,0.499501,0.0,0.499551,0.923219,0.499546,0.499673,0.947805,0.498945,0.49906,...,0.499046,0.498462,0.999136,0.993903,0.989903,0.499319,0.499453,0.499354,0.499051,0.499663
2,2,0.488479,0.499551,0.0,0.974216,0.499409,0.998521,0.999468,0.499324,0.499127,...,0.499579,0.499539,0.499692,0.909942,0.499188,0.499507,0.49787,0.499176,0.499423,0.49967
3,3,0.498385,0.923219,0.974216,0.0,0.499496,0.499616,0.743863,0.499104,0.499021,...,0.498632,0.499142,0.499336,0.491827,0.499333,0.498906,0.980102,0.980462,0.498987,0.49955
4,4,0.499689,0.499546,0.499409,0.499496,0.0,0.499607,0.499481,0.499487,0.499585,...,0.49946,0.499494,0.499651,0.498446,0.494485,0.49959,0.499556,0.499649,0.499591,0.499543


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.499501,0.488479,0.498385,0.499689,0.499562,0.499525,0.499429,0.499442,0.484586,0.49956,0.499464,0.49964,0.498906,0.499599,0.499474,0.463023,0.484418,0.499378,0.499539
1,0.499501,0.0,0.499551,0.923219,0.499546,0.499673,0.947805,0.498945,0.49906,0.496263,0.499046,0.498462,0.999136,0.993903,0.989903,0.499319,0.499453,0.499354,0.499051,0.499663
2,0.488479,0.499551,0.0,0.974216,0.499409,0.998521,0.999468,0.499324,0.499127,0.495346,0.499579,0.499539,0.499692,0.909942,0.499188,0.499507,0.49787,0.499176,0.499423,0.49967
3,0.498385,0.923219,0.974216,0.0,0.499496,0.499616,0.743863,0.499104,0.499021,0.94833,0.498632,0.499142,0.499336,0.491827,0.499333,0.498906,0.980102,0.980462,0.498987,0.49955
4,0.499689,0.499546,0.499409,0.499496,0.0,0.499607,0.499481,0.499487,0.499585,0.497995,0.49946,0.499494,0.499651,0.498446,0.494485,0.49959,0.499556,0.499649,0.499591,0.499543


array([[0.        , 0.49950104, 0.48847896, 0.4983853 , 0.49968899,
        0.49956223, 0.49952469, 0.49942948, 0.49944224, 0.48458592,
        0.49955978, 0.49946432, 0.49963952, 0.49890647, 0.49959878,
        0.49947382, 0.46302313, 0.48441849, 0.49937845, 0.49953908],
       [0.49950104, 0.        , 0.49955125, 0.92321876, 0.49954632,
        0.49967291, 0.94780498, 0.49894457, 0.49906046, 0.49626317,
        0.49904561, 0.49846248, 0.99913587, 0.99390275, 0.98990256,
        0.49931895, 0.49945269, 0.49935381, 0.49905139, 0.49966268],
       [0.48847896, 0.49955125, 0.        , 0.97421614, 0.49940934,
        0.99852147, 0.99946847, 0.49932447, 0.49912705, 0.49534632,
        0.49957868, 0.49953941, 0.49969199, 0.90994243, 0.49918833,
        0.49950708, 0.49787027, 0.49917552, 0.49942267, 0.49967025],
       [0.4983853 , 0.92321876, 0.97421614, 0.        , 0.49949581,
        0.49961587, 0.74386297, 0.49910364, 0.49902085, 0.94833029,
        0.49863175, 0.49914215, 0.49933647, 0

In [20]:
from scipy.spatial.distance import squareform
from sklearn.metrics import silhouette_score

condensed_mat = squareform(dist_mat)
# NOTE: this string reuses the name of the scipy `linkage` function imported in
# the clustering import cell; later cells read this variable.
linkage = 'complete'

# Try every cluster count from 2 up to (number of sentences - 1) and record how
# good the resulting clustering is. The average silhouette width (to be
# maximised) is used as the quality metric; the Dunn index is an alternative.
sil_scores = []  # silhouette scores
for n_clust in range(2, len(dist_mat)):
    clusters = AgglomerativeClustering(n_clusters=n_clust, affinity='precomputed', linkage=linkage)
    clusters.fit_predict(dist_mat)

    silhouette_avg = silhouette_score(dist_mat, clusters.labels_, metric='precomputed')
    sil_scores.append(silhouette_avg)
    print(n_clust, ": ", silhouette_avg)

2 :  0.23558467124165486
3 :  0.12967654263553158
4 :  0.051369891268869025
5 :  0.05326604310169788
6 :  0.008931293223240483
7 :  0.009358228524803202
8 :  0.010816633461541044
9 :  0.01087853917149927
10 :  0.01084641815208133
11 :  0.010762902770904482
12 :  0.011976963285562485
13 :  0.011730216325523244
14 :  0.015628977832201178
15 :  0.017558837839946582
16 :  0.01752002806447993
17 :  0.01672578992395295
18 :  0.013563156111279645
19 :  0.007181803926500784


In [21]:
sil_scores_arr = np.asarray(sil_scores)
best_score = sil_scores_arr.max()
print('max avg silhouette score: ', best_score)
# Position 0 of the score list belongs to 2 clusters, hence the offset of 2.
OPTIMAL_K = np.flatnonzero(sil_scores_arr == best_score)[0] + 2
print('nclusters w max avg silhouette score: ', OPTIMAL_K)

max avg silhouette score:  0.23558467124165486
nclusters w max avg silhouette score:  2


In [25]:
# Cluster with a hand-picked number of clusters and list the sentences of each.
selected_k = 3
clusters = AgglomerativeClustering(n_clusters=selected_k, affinity='precomputed', linkage=linkage)
clusters.fit_predict(dist_mat)
for i in range(clusters.n_clusters_):
    indices = np.flatnonzero(clusters.labels_ == i)

    print(f'cluster {i} (size {len(indices)}):')
    print(indices)
    for sentence in sents_df['sentence'].iloc[indices]:
        display(sentence)
    print('\n')

cluster 0 (size 7):
[ 4  5  6 12 13 14 19]


'i imagine if i left my player untouched (no backlight) it could play for considerably more than 12 hours at a low volume level.'

'the battery, as others have said, also seems weaker than Creative suggests.'

'the battery life leaves a lot to be desired.'

'but my biggest gripe is still the extremely ugly design.'

'but, the creative zen micro is plagued by a serious, serious stupid and elemenatary design flaw.'

"i thought it could've been designed a bit better."

'the software that comes along with it is not very intuitive but once you are used to it,it  does the job well.'



cluster 1 (size 10):
[ 0  1  2  7  9 11 15 16 17 18]


'each battery lasts about 10-11 hours.'

"the zen micro is a sleek, stylish device that caters not just to 'hip' youngsters but to people of all walks of life who like to listen on the go without having to worry frequently about battery life, size, or the 'awkward' factor."

'battery life is exceptional, lasting longer than the 8-10 hours that seem reasonable with most music players on the market.'

'the earbud headphones are really nice.'

'Creative did an awesome job with the design.'

'it was a good player when it worked and i loved the design.'

'creative has excellent music software, a very responsive tech support group (they are quick on firmware updates, and make them publicly available), and usually a good array of accessories.'

'the software is simple and straight forward.'

'the software is very easy to use, and the transfer is super fast!'

"it's easy to use, and makes transfering your cd collection to the player a cinch."



cluster 2 (size 3):
[ 3  8 10]


'there are 2 things that need fixing first is the battery life. it will run for 6 hrs without problems with medium usage of the buttons.'

'the earbuds are great headphones. as good as my agk studio 240, well not as good but darn close.'

'the design is very nice, and the colors are too.'





In [26]:
# save clusters into JSON dict
# maps each cluster label to the (int) index labels of its member sentences.
clusters_dict = {}
for i in range(clusters.n_clusters_):
    indices = np.where(clusters.labels_ == i)[0]
    clusters_dict[i] = [int(df_piv.index[pos]) for pos in indices]

path = save_dir + '/' + 'clusters.json'
with open(path, 'w') as fp:
    json.dump(clusters_dict, fp)

In [27]:
# Compute the distance of one cluster to another cluster by the average pairwise distance
# of the elements in one cluster and another cluster.
for n_clusters in [selected_k]:
    print('------')
    print(f'\nno. of clusters = {n_clusters}')
    clusters = AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed', linkage=linkage)
    clusters.fit_predict(dist_mat)

    # find average distances between clusters and output a new distance matrix.
    # Only the upper triangle (i < j) is filled here.
    avg_dist_arr = np.zeros(shape=(clusters.n_clusters_, clusters.n_clusters_))
    for i in range(clusters.n_clusters_):
        indices = np.where(clusters.labels_ == i)[0]
        for j in range(i + 1, clusters.n_clusters_):
            other_indices = np.where(clusters.labels_ == j)[0]
            # np.ix_ selects the block of distances between the members of
            # cluster i (rows) and the members of cluster j (columns).
            cross_dists = dist_mat[np.ix_(indices, other_indices)]
            avg_dist_arr[i][j] = np.average(cross_dists)
    print(avg_dist_arr)

------

no. of clusters = 3
[[0.     0.7181 0.6384]
 [0.     0.     0.5762]
 [0.     0.     0.    ]]


In [28]:
# Mirror the upper-triangular averages into a full symmetric matrix and save it.
intercluster_dists = avg_dist_arr + avg_dist_arr.T
path = save_dir + '/' + 'clusterDistances.csv'
np.savetxt(path, intercluster_dists, delimiter=",")

In [29]:
# Show the symmetric cluster-to-cluster distance matrix.
print(intercluster_dists)

[[0.     0.7181 0.6384]
 [0.7181 0.     0.5762]
 [0.6384 0.5762 0.    ]]


## Visualise network of clusters

In [None]:
import networkx as nx
import string

In [None]:
# Build a weighted graph from a square matrix and export it as a .dot file.
# NOTE(review): the original cell read an undefined name `tmp`; the inter-cluster
# distance matrix is assumed to be the intended input -- confirm.
# from_numpy_matrix was removed in networkx 3.0; from_numpy_array replaces it.
G = nx.from_numpy_array(intercluster_dists)
print(G.nodes())
print(G.edges().data())
#G = nx.relabel_nodes(G, dict(zip(range(len(G.nodes())),string.ascii_uppercase)))

import pydot
from networkx.drawing.nx_pydot import write_dot

print("using package pydot")
path = save_dir + '/' + 'clusterViz.dot'
write_dot(G, path)
# you can then use the various graphviz CLI to generate the image. E.g. "neato -Tpng clusterViz.dot -o clusterViz.png"
# see https://www.graphviz.org/pdf/neatoguide.pdf for more info on modifying dot files.
# fdp or sfdp can be used instead of neato. 

## Summarize

In [9]:
# load data files

import numpy as np
# load indexed sentences:
sents_df = pd.read_csv("../data/test3/MP3Player.csv")

# load intersentence pairwise distance matrix.
# The first CSV column ('first_index') holds the row labels, not distances.
# Reading it as the index keeps the frame square, so the column at position k
# is the column of sentence k instead of being shifted by one.
intersentence = pd.read_csv("../data/test3/pairwiseMatrix.csv", index_col='first_index')

# load intercluster pairwise distance matrix.
intercluster_dists = np.loadtxt("../data/test3/clusterDistances.csv", delimiter=",")

# load clusters.
with open("../data/test3/clusters.json") as fp:
    clusters_dict = json.load(fp)

print("loaded data from disk.")

loaded data from disk.


In [3]:
from nltk.tokenize import RegexpTokenizer

def choose_longest(sentences):
    """
    Pick the sentence with the most word-like tokens.

    :param sentences: iterable of sentence strings.
    :return: the sentence with the highest token count (the first one on a tie),
        or None when no sentence contains any token.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    max_len = 0
    final_sent = None
    for sent in sentences:
        n_tokens = len(tokenizer.tokenize(sent))
        if n_tokens > max_len:
            # Track the token count: storing the character count here would make
            # later comparisons mix tokens with characters.
            max_len = n_tokens
            final_sent = sent
    return final_sent

from lexrank import LexRank
from lexrank.mappings.stopwords import STOPWORDS
def choose_salient(sentences):
    """
    Return the top-ranked sentence of the first document according to LexRank.

    :param sentences: list of documents, each document being a list of sentence
        strings. All documents are given to the LexRank constructor; only the
        first one is summarised.
    :return: the single sentence returned for a summary of size 1.
    """
    ranker = LexRank(sentences, stopwords=STOPWORDS['en'])
    first_document = sentences[0]
    return ranker.get_summary(first_document, summary_size=1, threshold=.1)[0]
    

def _pairwise_distance(intersentence, row_idx, col_idx):
    """Look up the distance of sentence ``row_idx`` (row position) to sentence ``col_idx`` (column label).

    The column is matched by label, either the integer itself or its string
    form (the form column names take after a round trip through CSV), so an
    extra leading label column cannot shift the lookup.
    """
    row = intersentence.iloc[row_idx]
    for label in (col_idx, str(col_idx)):
        if label in row.index:
            return row.loc[label]
    raise KeyError(f"no column for sentence {col_idx} in the pairwise distance matrix")


def summarize(clusters, intersentence, sent_df, word_limit=90, coherence_threshold=0.6, redundancy_threshold=0.3):
    """
    Build a summary by taking one representative sentence per cluster, largest cluster first.

    :param clusters: dict of clusters of the form {str(cluster_num): list(sentence_indices)}.
    :param intersentence: Pandas DataFrame of precomputed inter-sentence distance in pairwise matrix form.
    :param sent_df: Pandas DataFrame of indexed sentences.
    :param word_limit: maximum num of words for summary.
    :param coherence_threshold: maximum distance allowed for a sentence pair within the summary. valid values between 0-1.
    :param redundancy_threshold: minimum distance allowed for a sentence pair within the summary. valid values between 0-1.
    :return: list of the selected sentence indices, in the order they were added.
    :raises ValueError: if coherence_threshold is not greater than redundancy_threshold.

    We need a redundancy_threshold because the clustering algorithm does not always manage to group
    redundant sentences together?
    """
    if coherence_threshold <= redundancy_threshold:
        raise ValueError("coherence_threshold must be greater than redundancy_threshold")

    tokenizer = RegexpTokenizer(r'\w+') # tokenize into word-like tokens.
    summary = list()
    word_count = 0

    sorted_clusters = sorted(clusters.items(), key=lambda x: len(x[1]), reverse=True)  # iterable of pairs.

    for key, clust in sorted_clusters:
        if len(clust) == 0:
            # an empty cluster has no sentence to contribute.
            continue
        sentences = [sent_df.iloc[idx]['sentence'] for idx in clust]
        #selected = choose_longest(clust)
        selected = choose_salient([sentences])
        # Map the chosen text back through the cluster's own member list, so an
        # identical sentence elsewhere in the dataset cannot be picked by mistake.
        selected_idx = clust[sentences.index(selected)]
        tokens = tokenizer.tokenize(selected)
        if word_count + len(tokens) >= word_limit:
            # stop at the first representative that does not fit the budget.
            break

        # Largest and smallest distance to the sentences already in the summary;
        # with an empty summary the starting values let the first sentence through.
        max_dist = 0
        min_dist = 1
        for sent_idx in summary:
            dist = _pairwise_distance(intersentence, sent_idx, selected_idx)
            if dist > max_dist:
                max_dist = dist
            if dist < min_dist:
                min_dist = dist
        if max_dist < coherence_threshold and min_dist > redundancy_threshold:
            summary.append(selected_idx)
            word_count = word_count + len(tokens)
    return summary

In [4]:
# test lexrank algorithm.
# Collect the sentences of cluster '0' and hand them to choose_salient as a single document.
sentences = [sents_df.iloc[idx]['sentence'] for idx in clusters_dict['0']]
display(sentences)
choose_salient([sentences])

['i imagine if i left my player untouched (no backlight) it could play for considerably more than 12 hours at a low volume level.',
 'the battery, as others have said, also seems weaker than Creative suggests.',
 'the battery life leaves a lot to be desired.',
 'but my biggest gripe is still the extremely ugly design.',
 'but, the creative zen micro is plagued by a serious, serious stupid and elemenatary design flaw.',
 "i thought it could've been designed a bit better.",
 'the software that comes along with it is not very intuitive but once you are used to it,it  does the job well.']

'the software that comes along with it is not very intuitive but once you are used to it,it  does the job well.'

In [5]:
# todo: think of statistical way to determine the coherence_threshold and redundancy_threshold.
summary = summarize(clusters_dict, intersentence, sents_df, word_limit=90, coherence_threshold=0.6, redundancy_threshold=0.35)
print("no. of sentences in summary: ", len(summary))
print("no. of sentences in source: ", len(intersentence.index.values))
# Show the summary twice: first in selection order, then sorted by sentence index.
for ordering in (summary, sorted(summary)):
    print()
    for sent_idx in ordering:
        print(f"{sent_idx}: {sents_df.iloc[sent_idx]['sentence']}")

no. of sentences in summary:  2
no. of sentences in source:  20

18: it's easy to use, and makes transfering your cd collection to the player a cinch.
10: the design is very nice, and the colors are too.

10: the design is very nice, and the colors are too.
18: it's easy to use, and makes transfering your cd collection to the player a cinch.


## Multiple Summaries

In [None]:
def multi_summary(clusters, intercluster_dists, sent_df, word_limit=100, contradiction_threshold=0.6):
    """
    Build one summary per contentious representative sentence on top of a shared base summary.

    :param clusters: dict of clusters of the form {str(cluster_index): list(sentence_indices)}.
    :param intercluster_dists: square numpy.ndarray representing distances between each cluster.
    :param sent_df: Pandas DataFrame of indexed sentences.
    :param word_limit: maximum num of words for the base summary.
    :param contradiction_threshold: float value between 0 and 1, representing the value after which
    cluster pairs will be considered contradictory.
    :return: tuple (all_summaries, base_summary). base_summary lists the representative
    sentences of clusters that contradict no other cluster; each entry of all_summaries
    is the base summary plus one representative of a contradicted cluster.
    """
    # Representative sentence of every cluster, keyed by the integer cluster index
    # (json dicts store integer keys as str keys, hence the cast).
    top_sentences = {}
    for cluster_key, members in clusters.items():
        candidates = [sent_df.iloc[idx]['sentence'] for idx in members]
        representative = choose_salient([candidates])
        matching_rows = sent_df[sent_df.sentence == representative]
        top_sentences[int(cluster_key)] = matching_rows.index[0]

    # Separate clusters that contradict at least one other cluster from those
    # that contradict none (the base clusters).
    n_clusters = len(intercluster_dists)
    base_clusters = []
    contradicted_pairs = []
    contradicted_set = set()
    for i in range(n_clusters):
        for j in range(i + 1, n_clusters):
            if intercluster_dists[i][j] > contradiction_threshold:
                contradicted_pairs.append((i, j))
                contradicted_set.update((i, j))
        # every pair involving i has been examined by now.
        if i not in contradicted_set:
            base_clusters.append(i)

    # Base summary: representatives of the base clusters, largest cluster first,
    # stopping at the first one that does not fit the word budget.
    tokenizer = RegexpTokenizer(r'\w+')  # tokenize into word-like tokens.
    base_summary = []
    word_count = 0
    by_size = sorted(clusters.items(), key=lambda item: len(item[1]), reverse=True)
    for cluster_key, _members in by_size:
        cluster_no = int(cluster_key)
        if cluster_no not in base_clusters:
            continue
        sentence_idx = top_sentences[cluster_no]
        n_words = len(tokenizer.tokenize(sent_df.iloc[sentence_idx]['sentence']))
        if word_count + n_words >= word_limit:
            break
        base_summary.append(sentence_idx)
        word_count += n_words

    # One summary per contentious representative, each used at most once.
    all_summaries = []
    added_sentences = set()
    for clust1, clust2 in contradicted_pairs:
        sent1 = top_sentences[clust1]
        sent2 = top_sentences[clust2]
        for sentence_idx in (sent1, sent2):
            if sentence_idx not in added_sentences:
                all_summaries.append(base_summary + [sentence_idx])
                added_sentences.add(sentence_idx)
    print(base_summary)
    return all_summaries, base_summary

In [None]:
from IPython.display import Markdown
all_summaries, base_summary = multi_summary(clusters_dict, intercluster_dists, sents_df, word_limit=90, contradiction_threshold=0.6)

print(f"total no. of summaries = {len(all_summaries)}")
print()

for summary in all_summaries:
    print("no. of sentences in summary: ", len(summary))
    print("no. of sentences in source: ", len(intersentence.index.values))
    print()
    for sent_idx in sorted(summary):
        line = f"{sent_idx}: {sents_df.iloc[sent_idx]['sentence']}"
        # sentences that belong to the base summary are rendered in bold.
        display(Markdown(f"**{line}**" if sent_idx in base_summary else line))
    print()

## Reliability Ranking approach to multi-summarization.

In [6]:
# Show the clusters (keys are strings, values are lists of sentence indices).
clusters_dict

{'0': [4, 5, 6, 12, 13, 14, 19],
 '1': [0, 1, 2, 7, 9, 11, 15, 16, 17, 18],
 '2': [3, 8, 10]}

In [10]:
def rank_by_reliability(clusters_dict, intercluster, contra_threshold=0.5):
    """
    Score every cluster by its size relative to the clusters that contradict it.

    :param clusters_dict: dict of clusters. {str(cluster index): list(sentence indices)}.
    :param intercluster: square numpy.ndarray representing distances between each cluster.
    :param contra_threshold: two clusters count as contradicting each other when
        their distance is greater than this value.
    :return: list of (score, cluster index) tuples in cluster-index order. For a
        contradicted cluster the score is size / (size + total size of the
        contradicting clusters); for an uncontradicted cluster it is the size itself.
    """
    n_clusters = len(intercluster)
    sizes = [len(clusters_dict[str(idx)]) for idx in range(n_clusters)]

    ranked = []
    for cluster_idx in range(n_clusters):
        opposing = sum(
            sizes[other_idx]
            for other_idx in range(n_clusters)
            if intercluster[cluster_idx][other_idx] > contra_threshold
        )
        own = sizes[cluster_idx]
        score = own / (own + opposing) if opposing > 0 else own
        ranked.append((score, cluster_idx))
    return ranked

In [11]:
# Score the loaded clusters; cluster pairs further apart than 0.6 count as contradicting.
scores = rank_by_reliability(clusters_dict, intercluster_dists, contra_threshold=0.6)

In [12]:
# Go through the (score, cluster index) tuples from highest to lowest and show
# the rows of each cluster's sentences.
for score, idx in sorted(scores, reverse=True):
    print(f"cluster: {idx}")
    print(f"reliability score: {score}")
    display(sents_df.iloc[clusters_dict[str(idx)]])

cluster: 1
reliability score: 0.5882352941176471


Unnamed: 0,index,sentence
0,0,each battery lasts about 10-11 hours.
1,1,"the zen micro is a sleek, stylish device that ..."
2,2,"battery life is exceptional, lasting longer th..."
7,7,the earbud headphones are really nice.
9,9,Creative did an awesome job with the design.
11,11,it was a good player when it worked and i love...
15,15,"creative has excellent music software, a very ..."
16,16,the software is simple and straight forward.
17,17,"the software is very easy to use, and the tran..."
18,18,"it's easy to use, and makes transfering your c..."


cluster: 0
reliability score: 0.35


Unnamed: 0,index,sentence
4,4,i imagine if i left my player untouched (no ba...
5,5,"the battery, as others have said, also seems w..."
6,6,the battery life leaves a lot to be desired.
12,12,but my biggest gripe is still the extremely ug...
13,13,"but, the creative zen micro is plagued by a se..."
14,14,i thought it could've been designed a bit better.
19,19,the software that comes along with it is not v...


cluster: 2
reliability score: 0.3


Unnamed: 0,index,sentence
3,3,there are 2 things that need fixing first is t...
8,8,the earbuds are great headphones. as good as m...
10,10,"the design is very nice, and the colors are too."


In [13]:
def intra_clust_dis_mat(clust, pairwise):
    """
    Cut the distance matrix of a single cluster out of the global pairwise matrix.

    :param clust: list of sentence indices that form the cluster.
    :param pairwise: numpy.ndarray of universal pairwise dist matrix.
    :return: tuple (matrix, sorted_clust). Row/column k of the symmetric matrix
        belongs to sorted_clust[k]; the diagonal is zero.
    """
    ordered = sorted(clust)
    # position of each sentence index inside the sorted cluster (first occurrence).
    position = {}
    for pos, sent_idx in enumerate(ordered):
        position.setdefault(sent_idx, pos)

    size = len(clust)
    one_sided = np.zeros((size, size))
    # each unordered pair is written once, then mirrored by adding the transpose.
    for first, second in itertools.combinations(clust, 2):
        one_sided[position[first]][position[second]] = pairwise[first, second]
    return one_sided + one_sided.T, ordered

In [19]:
# Distance matrix restricted to cluster '0', together with the sentence index of each row/column.
mat, sorted_clust = intra_clust_dis_mat(clusters_dict['0'], dist_mat)
print(mat)
print(sorted_clust)

[[0.         0.49960732 0.49948128 0.49965078 0.4984464  0.4944854
  0.49954331]
 [0.49960732 0.         0.49902613 0.49962005 0.4995278  0.49956929
  0.49962768]
 [0.49948128 0.49902613 0.         0.49962461 0.49682008 0.49935964
  0.49957831]
 [0.49965078 0.49962005 0.49962461 0.         0.47973174 0.49902386
  0.49961127]
 [0.4984464  0.4995278  0.49682008 0.47973174 0.         0.49667249
  0.49952537]
 [0.4944854  0.49956929 0.49935964 0.49902386 0.49667249 0.
  0.49944676]
 [0.49954331 0.49962768 0.49957831 0.49961127 0.49952537 0.49944676
  0.        ]]
[4, 5, 6, 12, 13, 14, 19]


In [20]:
# Show the cluster-to-cluster distance matrix.
intercluster_dists

array([[0.        , 0.7181331 , 0.63836379],
       [0.7181331 , 0.        , 0.57616617],
       [0.63836379, 0.57616617, 0.        ]])

In [21]:
from lexrank.lexrank import LexRank
from lexrank.lexrank import degree_centrality_scores

In [34]:
def choose_salient(dis_mat, sorted_clust):
    """
    Pick the most central sentence of a cluster from its distance matrix.

    Note: this reuses the name of the LexRank-based ``choose_salient(sentences)``
    defined earlier in the notebook and has a different signature.

    :param dis_mat: disimilarity or distance matrix. every value is between 0-1, inclusive.
    :param sorted_clust: sorted list of sentence indices corresponding to the indexing of dis_mat.
    :return: the entry of sorted_clust whose degree-centrality score is highest.
    """
    similarity = 1 - dis_mat
    centrality = degree_centrality_scores(similarity, threshold=None)
    best_position = np.argmax(centrality)
    return sorted_clust[best_position]

In [36]:
# Show the text of the sentence chosen from `mat` / `sorted_clust`.
sents_df.iloc[choose_salient(mat, sorted_clust)]['sentence']

'but, the creative zen micro is plagued by a serious, serious stupid and elemenatary design flaw.'

In [37]:
# find top/representative sentence of each cluster.
top_sentences = dict()  # NOTE(review): never filled in this cell; the chosen sentences are only displayed.
for clust_idx, clust in clusters_dict.items():
    # distance matrix of this cluster alone, then the sentence picked from it.
    intraclust, sorted_clust = intra_clust_dis_mat(clust, dist_mat)
    display(sents_df.iloc[choose_salient(intraclust, sorted_clust)]['sentence'])

'but, the creative zen micro is plagued by a serious, serious stupid and elemenatary design flaw.'

'each battery lasts about 10-11 hours.'

'there are 2 things that need fixing first is the battery life. it will run for 6 hrs without problems with medium usage of the buttons.'