In [71]:
import pandas as pd
import numpy as np
import re
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from scipy.spatial import distance
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
!pip install sumy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [73]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [74]:
df = pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [122]:
doc = df[df.labels == 'business']['text'].sample(random_state=24)

In [123]:
def wrap(x):
  return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [125]:
print(wrap(doc.iloc[0]))

Singapore growth at 8.1% in 2004

Singapore's economy grew by 8.1% in
2004, its best performance since 2000, figures from the trade ministry
show.

The advance, the second-fastest in Asia after China, was led by
growth of 13.1% in the key manufacturing sector.  However, a slower-
than-expected fourth quarter points to more modest growth for the
trade-driven economy in 2005 as global technology demand falls back.
Slowdowns in the US and China could hit electronics exports, while the
tsunami disaster may effect the service sector.

Economic growth is
set to halve in Singapore this year to between 3% and 5%. In the
fourth quarter, the city state's gross domestic product (GDP) rose at
an annual rate of 2.4%. That was up from the third quarter, when it
fell 3.0%, but was well below analyst forecasts.  "I am surprised at
the weak fourth quarter number.  The main drag came from electronics,"
said Lian Chia Liang, economist at JP Morgan Chase.  Singapore's
economy had contracted over the summe

### **Text Rank Summary**
used for comparing different models


In [126]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
summarizer = TextRankSummarizer()
parser = PlaintextParser.from_string(
    doc.iloc[0].split("\n", 1)[1],
    Tokenizer("english"))
summary = summarizer(parser.document, sentences_count=5)
sum_=str()
for s in summary:
  sum_+=str(s)
sum_

"The advance, the second-fastest in Asia after China, was led by growth of 13.1% in the key manufacturing sector.However, a slower-than-expected fourth quarter points to more modest growth for the trade-driven economy in 2005 as global technology demand falls back.Slowdowns in the US and China could hit electronics exports, while the tsunami disaster may effect the service sector.In the fourth quarter, the city state's gross domestic product (GDP) rose at an annual rate of 2.4%.The economy's poor performance in the July to September period followed four consecutive quarters of double-digit growth as Singapore bounced back strongly from the effects of the deadly Sars virus in 2003."

### **TF-IDF Score to Summerize Text**

In [127]:
featurizer = TfidfVectorizer(stop_words=stopwords.words('english'),norm='l1')
X = featurizer.fit_transform(sents)
def get_sentence_score(tfidf_row):
  # return the average of the non-zero values
  # of the tf-idf vector representation of a sentence
  x = tfidf_row[tfidf_row != 0]
  return x.mean()


def summarize_tfidf(text):
  # extract sentences
  sents = nltk.sent_tokenize(text)

  # perform tf-idf
  X = featurizer.fit_transform(sents)

  # compute scores for each sentence
  scores = np.zeros(len(sents))
  for i in range(len(sents)):
    score = get_sentence_score(X[i,:])
    scores[i] = score
  
  # sort the scores
  sort_idx = np.argsort(-scores)

  # print summary
  sum_=str()
  for i in sort_idx[:5]:
    #print(wrap("%.2f: %s" % (scores[i], sents[i])))
    sum_+=sents[i]
  return sum_

In [128]:
sum_tfidf=summarize_tfidf(doc.iloc[0].split("\n", 1)[1])

In [129]:
print(wrap(sum_tfidf))

"I am surprised at the weak fourth quarter number.Economic growth is
set to halve in Singapore this year to between 3% and 5%.That was up
from the third quarter, when it fell 3.0%, but was well below analyst
forecasts.Singapore's economy had contracted over the summer, weighed
down by soaring oil prices.The advance, the second-fastest in Asia
after China, was led by growth of 13.1% in the key manufacturing
sector.


### **Cosine Similarity**

In [130]:
from sklearn.metrics.pairwise import cosine_similarity

def summarize_cosine(text, factor = 0.15):
  # extract sentences
  sents = nltk.sent_tokenize(text)

  # perform tf-idf
  featurizer = TfidfVectorizer(
      stop_words=stopwords.words('english'),
      norm='l1')
  X = featurizer.fit_transform(sents)

  # compute similarity matrix
  S = cosine_similarity(X)

  # normalize similarity matrix
  S /= S.sum(axis=1, keepdims=True)

  # uniform transition matrix
  U = np.ones_like(S) / len(S)

  # smoothed similarity matrix
  S = (1 - factor) * S + factor * U

  # find the limiting / stationary distribution
  eigenvals, eigenvecs = np.linalg.eig(S.T)

  # compute scores
  scores = eigenvecs[:,0] / eigenvecs[:,0].sum()
  
  # sort the scores
  sort_idx = np.argsort(-scores)

  # print summary
  sum_=str()
  for i in sort_idx[:5]:
    #print(wrap("%.2f: %s" % (scores[i], sents[i])))
    sum_+=sents[i]
  return sum_

In [131]:
sum_cos=summarize_cosine(doc.iloc[0].split("\n", 1)[1], factor = 0.15)

In [132]:
print(wrap(sum_cos))

However, a slower-than-expected fourth quarter points to more modest
growth for the trade-driven economy in 2005 as global technology
demand falls back.The economy's poor performance in the July to
September period followed four consecutive quarters of double-digit
growth as Singapore bounced back strongly from the effects of the
deadly Sars virus in 2003."I am surprised at the weak fourth quarter
number.
Singapore's economy grew by 8.1% in 2004, its best performance
since 2000, figures from the trade ministry show.Economic growth is
set to halve in Singapore this year to between 3% and 5%.


### **K-Mean Clustering**

In [133]:
sentence = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1])
corpus = []
for i in range(len(sentence)):
    sen = re.sub('[^a-zA-Z]', " ", sentence[i])  
    sen = sen.lower()                            
    sen = sen.split()                         
    sen = ' '.join([i for i in sen if i not in stopwords.words('english')])   
    corpus.append(sen)

all_words = [i.split() for i in corpus]
model = Word2Vec(all_words, min_count=1,vector_size=300)

sent_vector=[]
for i in corpus:  
    plus=0
    for j in i.split():
        plus+= model.wv[j]
    plus = plus/len(i.split())
    sent_vector.append(plus)

n_clusters = 5
kmeans = KMeans(n_clusters, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(sent_vector)

my_list=[]
for i in range(n_clusters):
    my_dict={} 
    for j in range(len(y_kmeans)):
        if y_kmeans[j]==i:
            my_dict[j] =  distance.euclidean(kmeans.cluster_centers_[i],sent_vector[j])
    min_distance = min(my_dict.values())
    my_list.append(min(my_dict, key=my_dict.get))
                            
for i in sorted(my_list):
    sum_kmean=sentence[i]

print(wrap(sum_kmean))

The economy's poor performance in the July to September period
followed four consecutive quarters of double-digit growth as Singapore
bounced back strongly from the effects of the deadly Sars virus in
2003.




### **Evaluation**
Bleu Score

In [134]:
from nltk.translate.bleu_score import sentence_bleu
t1=word_tokenize(sum_)
t2=word_tokenize(sum_tfidf)
t3=word_tokenize(sum_cos)
t4=word_tokenize(sum_kmean)
print(sentence_bleu(t1,t2,weights=(1,0,0,0)))
print(sentence_bleu(t1,t3,weights=(1,0,0,0)))
print(sentence_bleu(t1,t4,weights=(1,0,0,0)))

0.07317073170731705
0.0660377358490566
0.02941176470588235


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [135]:
from rouge import Rouge
rouge = Rouge()
s=rouge.get_scores(sum_kmean, sum_)
s[0]['rouge-1']['f']

0.49122806646968303