In [None]:
import pandas as pd
import numpy as np
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fetch the NLTK resources used below: the 'punkt' tokenizer data and the
# 'stopwords' corpus (read later through stopwords.words('english')).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when sentence-tokenizing — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Dataset source: https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification

In [None]:
# Load the BBC articles; the CSV path is relative, so the file must sit in the
# notebook's working directory.
df = pd.read_csv('bbc_text_cls.csv')

In [None]:
# Preview the first rows to check the available columns (text, labels).
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [None]:
# Pick one business article at random.  The fixed random_state makes the same
# article come back on every Restart & Run All, so the outputs further down
# stay reproducible; change the seed to summarise a different article.
# .loc selects rows and column in one step instead of chained indexing.
doc = df.loc[df.labels == 'business', 'text'].sample(random_state=0)

In [None]:
def wrap(x):
  """Wrap text to textwrap's default width, one source line at a time.

  With replace_whitespace=False, textwrap.fill treats embedded newlines as
  ordinary characters of the current line, so wrapping a multi-paragraph
  string in a single call leaves short, ragged lines after each paragraph
  break.  Each newline-separated line of `x` is therefore wrapped on its own
  and the pieces are joined back with newlines, which also keeps the blank
  lines between paragraphs.

  Args:
    x: The text to wrap; may contain newlines.

  Returns:
    The wrapped text as a single string.
  """
  return "\n".join(
      textwrap.fill(line, replace_whitespace=False, fix_sentence_endings=True)
      for line in x.split("\n"))

In [None]:
# Show the sampled article (headline included), wrapped for readability.
print(wrap(doc.iloc[0]))

Argentina closes $102.6bn debt swap

Argentina is set to close its
$102.6bn (£53.51bn) debt restructuring offer for bondholders later on
Friday, with the government hopeful that most creditors will accept
the deal.

The estimated loss to bondholders is up to 70% of the
original value of the bonds, yet the majority are expected to accept
the government's offer.  Argentina defaulted on its debt three years
ago, the biggest sovereign default in modern history.  Yesterday
Argentina's economy minister, Roberto Lavagna, said that he estimated
that the results of the restructuring would be ready around next
Thursday (3 March). Argentina's President, Nestor Kirchner, said on
Friday: "A year ago when we started the swap (negotiations), they told
us we were crazy, that we were irrational."  But he added that his
government was close to achieving: "The best debt renegotiation in
history."  The country has been in default on the $102.6bn - based on
an original debt of $81.8bn plus interest - for t

In [None]:
# Split once on the first newline and keep the remainder, i.e. the article
# without its headline line.
print(doc.iloc[0].split("\n", 1)[1])


Argentina is set to close its $102.6bn (£53.51bn) debt restructuring offer for bondholders later on Friday, with the government hopeful that most creditors will accept the deal.

The estimated loss to bondholders is up to 70% of the original value of the bonds, yet the majority are expected to accept the government's offer. Argentina defaulted on its debt three years ago, the biggest sovereign default in modern history. Yesterday Argentina's economy minister, Roberto Lavagna, said that he estimated that the results of the restructuring would be ready around next Thursday (3 March). Argentina's President, Nestor Kirchner, said on Friday: "A year ago when we started the swap (negotiations), they told us we were crazy, that we were irrational." But he added that his government was close to achieving: "The best debt renegotiation in history." The country has been in default on the $102.6bn - based on an original debt of $81.8bn plus interest - for the past three years. If the offer does n

In [None]:
# Drop the headline (everything up to the first newline) and strip the blank
# line that follows it; otherwise the first sentence starts with a newline,
# which shows up as a stray line break in the printed summary.
sents = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1].strip())

In [None]:
# Number of sentences found in the article body.
len(sents)

14

In [None]:
# TF-IDF features with English stop words removed; each sentence vector is
# scaled to unit L2 norm.
featurizer = TfidfVectorizer(norm='l2', stop_words=stopwords.words('english'))

In [None]:
# Fit the vectorizer on this article's sentences only and transform them;
# X has one row per sentence.
X = featurizer.fit_transform(sents)

In [None]:
# (number of sentences, number of distinct terms kept by the vectorizer)
X.shape

(14, 123)

Compute the sentence-vs-sentence similarity matrix. Each entry plays the role of a link between two web pages in PageRank.

We use similarity as the link weight: the magnitude of a link denotes how similar the two sentences are, on a scale from 0 to 1.

In [None]:
# Pairwise cosine similarity between sentence vectors: S[i, j] compares
# sentence i with sentence j.
S = cosine_similarity(X)

In [None]:
# Divide each row by its sum so every row adds up to 1, turning similarities
# into transition probabilities.
# NOTE(review): a row whose sum is 0 (e.g. a sentence left with no terms after
# stop-word removal) would produce NaNs here — confirm this cannot happen.
S /= S.sum(axis=1, keepdims=True)

Smoothing: mix in a uniform matrix so that edges with zero weight get a small non-zero probability.

In [None]:
# Uniform matrix with the same shape as S: every entry is 1 / len(S), so each
# row also sums to 1.
U = np.ones_like(S) / len(S)

Smoothed Similarity Matrix.

In [None]:
# Blend the similarity-based transitions with the uniform matrix; `factor` is
# the weight given to U.
# NOTE: this cell rebinds S, so running it more than once applies the
# smoothing repeatedly.
factor = 0.15
S = (1 - factor) * S + factor * U

Stationary Distribution: How?

PageRank models a time-homogeneous Markov chain. Because each row of $S$ sums to 1, the state distribution (written as a column vector) evolves as $p(s_t) = S^\top p(s_{t-1})$.

After infinitely many steps, we reach the limiting distribution.

$p(s_\infty) = S^\top p(s_\infty)$, since $\infty - 1$ is equivalent to $\infty$.

Thus, the stationary distribution $p(s^*)$ satisfies $S^\top p(s^*) = p(s^*)$.

In other words, the stationary distribution is an eigenvector of $S^\top$ whose eigenvalue is 1.

So, our next step is the eigendecomposition of $S^\top$.



In [None]:
# Decompose the transpose: eigenvectors of S.T are the left eigenvectors of S,
# which is what the stationary distribution of a row-normalised S requires.
eigenvals, eigenvecs = np.linalg.eig(S.T) #Transpose

In [None]:
# Inspect the eigenvalues; one of them should be 1.
eigenvals

array([1.        , 0.23661838, 0.27630462, 0.35460771, 0.70072197,
       0.69225848, 0.66274471, 0.47272816, 0.51033853, 0.52511247,
       0.54860038, 0.56578729, 0.58753904, 0.58233942])

In [None]:
# Inspect the eigenvectors; column k belongs to eigenvals[k].
eigenvecs[:]

Take the principal eigenvector (the one with eigenvalue 1) and normalise it so that its entries sum to 1.

In [None]:
# np.linalg.eig does not promise any ordering of its eigenvalues, so locate the
# one closest to 1 explicitly instead of assuming it sits in column 0.
principal_idx = np.argmin(np.abs(eigenvals - 1))
# Dividing by the sum makes the entries add up to 1 and removes the arbitrary
# sign/scale of the eigenvector.  .real discards the (numerically zero)
# imaginary part in case eig returned complex arrays.
limiting_distribution = (
    eigenvecs[:, principal_idx] / eigenvecs[:, principal_idx].sum()
).real

Each entry is the score of one sentence, i.e. how much weight that sentence carries when choosing the sentences for the summary.

In [None]:
# The stationary probabilities double as per-sentence scores.
scores = limiting_distribution

In [None]:
# Sentence indices ordered from highest to lowest score (ascending argsort of
# the negated scores).
sort_idx = np.argsort(-scores)

In [None]:
# Print the five highest-scoring sentences, each prefixed with its score and
# wrapped for readability.
print("Generated summary:")
for i in sort_idx[:5]:
  print(wrap("%.2f: %s" % (scores[i], sents[i])))

Generated summary:
0.10: 
Argentina is set to close its $102.6bn (£53.51bn) debt
restructuring offer for bondholders later on Friday, with the
government hopeful that most creditors will accept the deal.
0.08: The estimated loss to bondholders is up to 70% of the original
value of the bonds, yet the majority are expected to accept the
government's offer.
0.08: About 70% to 80% of bondholders are expected to accept the terms
of the offer.
0.08: Argentina defaulted on its debt three years ago, the biggest
sovereign default in modern history.
0.07: The country has been in default on the $102.6bn - based on an
original debt of $81.8bn plus interest - for the past three years.
