In [96]:
import pandas as pd
import numpy as np
import re
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from scipy.spatial import distance
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [97]:
!pip install sumy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [98]:
df = pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [99]:
doc = df[df.labels == 'business']['text'].sample(random_state=24)

In [100]:
def wrap(x):
  return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [101]:
print(wrap(doc.iloc[0]))

Singapore growth at 8.1% in 2004

Singapore's economy grew by 8.1% in
2004, its best performance since 2000, figures from the trade ministry
show.

The advance, the second-fastest in Asia after China, was led by
growth of 13.1% in the key manufacturing sector.  However, a slower-
than-expected fourth quarter points to more modest growth for the
trade-driven economy in 2005 as global technology demand falls back.
Slowdowns in the US and China could hit electronics exports, while the
tsunami disaster may effect the service sector.

Economic growth is
set to halve in Singapore this year to between 3% and 5%. In the
fourth quarter, the city state's gross domestic product (GDP) rose at
an annual rate of 2.4%. That was up from the third quarter, when it
fell 3.0%, but was well below analyst forecasts.  "I am surprised at
the weak fourth quarter number.  The main drag came from electronics,"
said Lian Chia Liang, economist at JP Morgan Chase.  Singapore's
economy had contracted over the summe

### **Text Rank Summary**
used for comparing different models


In [102]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
summarizer = TextRankSummarizer()
parser = PlaintextParser.from_string(
    doc.iloc[0].split("\n", 1)[1],
    Tokenizer("english"))
summary = summarizer(parser.document, sentences_count=5)
sum_=str()
for s in summary:
  sum_+=str(s)
sum_

"The advance, the second-fastest in Asia after China, was led by growth of 13.1% in the key manufacturing sector.However, a slower-than-expected fourth quarter points to more modest growth for the trade-driven economy in 2005 as global technology demand falls back.Slowdowns in the US and China could hit electronics exports, while the tsunami disaster may effect the service sector.In the fourth quarter, the city state's gross domestic product (GDP) rose at an annual rate of 2.4%.The economy's poor performance in the July to September period followed four consecutive quarters of double-digit growth as Singapore bounced back strongly from the effects of the deadly Sars virus in 2003."

### **TF-IDF Score to Summerize Text**

In [103]:
featurizer = TfidfVectorizer(stop_words=stopwords.words('english'),norm='l1')
# mean of tfidf score for each sencence
def get_sentence_score(tfidf_row):
  x = tfidf_row[tfidf_row != 0]
  return x.mean()

def summarize_tfidf(text):
  # tokenize sentences
  sents = nltk.sent_tokenize(text)
  # tf-idf score
  X = featurizer.fit_transform(sents)
  # scores for each sentence
  scores = np.zeros(len(sents))
  for i in range(len(sents)):
    score = get_sentence_score(X[i,:])
    scores[i] = score
  # sort the scores and pick top 5 sentences
  sort_idx = np.argsort(-scores)
  sort_idx=sort_idx[:5]
  sort_idx=np.sort(sort_idx)
  # generate summary
  sum_=str()
  for i in sort_idx[:5]:
    sum_+=sents[i]
  return sum_

In [104]:
sum_tfidf=summarize_tfidf(doc.iloc[0].split("\n", 1)[1])

In [105]:
print(wrap(sum_tfidf))

The advance, the second-fastest in Asia after China, was led by growth
of 13.1% in the key manufacturing sector.Economic growth is set to
halve in Singapore this year to between 3% and 5%.That was up from the
third quarter, when it fell 3.0%, but was well below analyst
forecasts."I am surprised at the weak fourth quarter
number.Singapore's economy had contracted over the summer, weighed
down by soaring oil prices.


### **Cosine Similarity**

In [106]:
from sklearn.metrics.pairwise import cosine_similarity

def summarize_cosine(text, factor = 0.15):
  # tokenize sentences
  sents = nltk.sent_tokenize(text)
  # calulate tf-idf score
  featurizer = TfidfVectorizer(stop_words=stopwords.words('english'),norm='l1')
  X = featurizer.fit_transform(sents)
  # compute similarity matrix
  S = cosine_similarity(X)
  # normalize similarity matrix
  S /= S.sum(axis=1, keepdims=True)
  # uniform transition matrix
  U = np.ones_like(S) / len(S)
  # smoothed similarity matrix
  S = (1 - factor) * S + factor * U
  # find the limiting / stationary distribution
  eigenvals, eigenvecs = np.linalg.eig(S.T)
  # compute scores
  scores = eigenvecs[:,0] / eigenvecs[:,0].sum()
  # sort the scores and pick top 5 sentences
  sort_idx = np.argsort(-scores)
  sort_idx=sort_idx[:5]
  sort_idx=np.sort(sort_idx)
  # generate summary
  sum_=str()
  for i in sort_idx[:5]:
    #print(wrap("%.2f: %s" % (scores[i], sents[i])))
    sum_+=sents[i]
  return sum_

In [107]:
sum_cos=summarize_cosine(doc.iloc[0].split("\n", 1)[1], factor = 0.15)

In [108]:
print(wrap(sum_cos))


Singapore's economy grew by 8.1% in 2004, its best performance since
2000, figures from the trade ministry show.However, a slower-than-
expected fourth quarter points to more modest growth for the trade-
driven economy in 2005 as global technology demand falls back.Economic
growth is set to halve in Singapore this year to between 3% and 5%."I
am surprised at the weak fourth quarter number.The economy's poor
performance in the July to September period followed four consecutive
quarters of double-digit growth as Singapore bounced back strongly
from the effects of the deadly Sars virus in 2003.


### **K-Mean Clustering**

In [109]:
sentence = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1])
corpus = []
for i in range(len(sentence)):
    sen = re.sub('[^a-zA-Z]', " ", sentence[i])  
    sen = sen.lower()                            
    sen = sen.split()                         
    sen = ' '.join([i for i in sen if i not in stopwords.words('english')])   
    corpus.append(sen)

all_words = [i.split() for i in corpus]
model = Word2Vec(all_words, min_count=1,vector_size=300)

sent_vector=[]
for i in corpus:  
    plus=0
    for j in i.split():
        plus+= model.wv[j]
    plus = plus/len(i.split())
    sent_vector.append(plus)

n_clusters = 5
kmeans = KMeans(n_clusters, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(sent_vector)

my_list=[]
for i in range(n_clusters):
    my_dict={} 
    for j in range(len(y_kmeans)):
        if y_kmeans[j]==i:
            my_dict[j] =  distance.euclidean(kmeans.cluster_centers_[i],sent_vector[j])
    min_distance = min(my_dict.values())
    my_list.append(min(my_dict, key=my_dict.get))
sum_kmean=str()                           
for i in sorted(my_list):
    sum_kmean+=sentence[i]

print(wrap(sum_kmean))


Singapore's economy grew by 8.1% in 2004, its best performance since
2000, figures from the trade ministry show.Economic growth is set to
halve in Singapore this year to between 3% and 5%.That was up from the
third quarter, when it fell 3.0%, but was well below analyst
forecasts."I am surprised at the weak fourth quarter number.The
economy's poor performance in the July to September period followed
four consecutive quarters of double-digit growth as Singapore bounced
back strongly from the effects of the deadly Sars virus in 2003.




### **Evaluation**
Bleu Score

In [111]:
from nltk.translate.bleu_score import sentence_bleu
t1=word_tokenize(sum_)
t2=word_tokenize(sum_tfidf)
t3=word_tokenize(sum_cos)
t4=word_tokenize(sum_kmean)
print("Bleu score of tfidf model: ",sentence_bleu(t1,t2,weights=(1,0,0,0)))
print("Bleu score of cosine similarity model: ",sentence_bleu(t1,t3,weights=(1,0,0,0)))
print("Bleu score of kmeans clustering model: ",sentence_bleu(t1,t4,weights=(1,0,0,0)))

Bleu score of tfidf model:  0.07142857142857141
Bleu score of cosine similarity model:  0.0673076923076923
Bleu score of kmeans clustering model:  0.0588235294117647
