# Text summarization
## 1. Importing necessary Libraries & Dataset

In [47]:
import pandas as pd
import os
import sys
import re
import unicodedata
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [48]:
df = pd.read_csv('../Data/amakuru.csv')


In [49]:
df.shape

(3182, 4)

In [50]:
df.head()

Unnamed: 0,Title,Body,Url,Category
0,Ubushakashatsi bwagaragaje uburyo bushya bwo k...,Abashakashatsi bagaragaje ko agakoko gatera ig...,https://m.igihe.com/ikoranabuhanga/article/ubu...,Tech
1,"Tuuza bundle, ipaki ya internet yakwifashishwa...",Sositeye y’Itumanaho ya MTN Rwanda izwiho cyan...,https://m.igihe.com/ikoranabuhanga/internet/ar...,Tech
2,Impamvu ‘Zoom’ yanikiye izindi porogaramu zifa...,"Mu buryo busanzwe, iyo abantu bavuze urubuga n...",https://m.igihe.com/ikoranabuhanga/internet/ar...,Tech
3,Huawei yashyize hanze telefone nshya za P40,"Uruganda rukora telefone rwo mu Bushinwa, Huaw...",https://m.igihe.com/ikoranabuhanga/article/hua...,Tech
4,U Rwanda ku rutonde rw’ibihugu Apple yaguriyem...,Uruganda rukora ibikoresho by’Ikoranabuhanga r...,https://m.igihe.com/ikoranabuhanga/article/u-r...,Tech


In [51]:
df = df.drop_duplicates(subset="Title", keep="first")

In [52]:
df = df.dropna(subset=['Category'])

## 2. Get Stopwords list

In [54]:
from lib.kinya_norm import normalize_text
from lib.helpers import getTopSortedKElements, saveWordsToFile, getStopwordsFromFile
corpus = df['Body']
corpus_norm = []

In [8]:
type(corpus)

In [55]:
for article in corpus:
    article = normalize_text(article)
    corpus_norm.append(article)

### 2.1 Create word count vector

In [56]:
# first approach with CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(tokenizer=word_tokenize)
word_count_vector = cv.fit_transform(corpus_norm)
word_count_vector

<1888x69417 sparse matrix of type '<class 'numpy.int64'>'
	with 453328 stored elements in Compressed Sparse Row format>

In [57]:
word_count_vector.shape

(1888, 69417)

### 2.2 Computer TF-IDF values

In [58]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [59]:
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(), columns=["idf_weights"])#

In [60]:
df_idf = df_idf.sort_values(by=['idf_weights'])

In [61]:
df_idf.shape

(69417, 1)

### 2.3 Save lowest IDF-values words to a text file as stopwords

In [62]:
idf_values_dict = dict(zip(cv.get_feature_names(), tfidf_transformer.idf_))
saveWordsToFile('stopwords_idfvalues.txt', getTopSortedKElements(idf_values_dict, 200))

In [63]:
stopwords = getStopwordsFromFile('stopwords_idfvalues.txt')

In [64]:
stopwords[:10]

['mu', 'na', 'ya', 'wa', 'ko', 'ku', 'muri', 'ni', 'ari', 'ngo']

## 3 Text summarization
### 3.1 Gathering the summarization functions
1. `kinyasum.summarize_text_` will be used to summarize texts using word frequency
2. `summariza.summariza_tfidf` will summarize texts using tf-idf weights
3. `summariza.summariza_textrank` will be used to summarize texts using TextRank

In [69]:
from lib.kinyasum import summarize_text_
from lib.summariza_tfidf import summariza_tfidf
from lib.summariza_textrank import summariza_textrank

### 3.2 Getting test articles

In [70]:
a1 = df.loc[10]
a1_title = a1['Title']
a1_body = a1['Body']

a2 = df.loc[15]
a2_title = a2['Title']
a2_body = a2['Body']

In [71]:
al_summary_ideal = """
Alibaba imenyerewe mu bucuruzi bwo kuri internet, yatangije serivisi y’ikoranabuhanga yo gusuzuma uburwayi ku buntu, hagamijwe kugabanya igitutu ku bitaro kirimo guturuka ku ikwirakwira rya virusi ya Coronavirus. Abakoresha iyi serivisi bazajya babasha gusuzumwa banabone amakuru kuri iki cyorezo ku gihe, hirya no hino mu gihugu binyuze ku rubuga rwa Alibaba ruzwi nka Taobao cyangwa kuri application ya telefoni yo kwishyuriraho yitwa Alipay. Kugera Saa Sita z’amanywa zo ku cyumweru, iyi serivisi yari imaze kwakira hafi abantu 400 000, nyuma y’amasaha 24 itangijwe.
"""
a2_summary_ideal = """
Isoko rya telefone zigezweho muri Afurika y’Iburasirazuba (EAC) biteganyijwe ko rizagabanyuka ku kigero cya 12% mu mezi atatu ya mbere ya 2020 na 3% mu mezi atatu azakurikiraho. Ibyo bivuze ko telefone za smartphone zinjira mu karere zizaba nke, igiciro cyazo kikazamuka kubera ibibazo biri mu gutumiza ibikoresho by’ikoranabuhanga bitewe n’icyorezo cya coronavirus. Bagaragaza ko coronavirus ishobora no kugira ingaruka ku gushyira hanze telefone nshya kuko hari bimwe mu bikoresho byifashishwa mu gukora telefone bitari kubasha kugera ku nganda kuko biva hanze y’u Bushinwa.
"""

### 3.3 Word Frequency Summarization

#### Summarization For  Article 1

In [33]:
len(list(dict.fromkeys(sent_tokenize(a1_body))))

7

In [79]:
a1_summary_wff = summarize_text_(a1_title, a1_body, 0.5, stopwords, False, False)

selected sentences: [0, 5, 1]


In [80]:
a1_summary_wtf = summarize_text_(a1_title, a1_body, 0.5, stopwords, True, False)

selected sentences: [0, 6, 1]


In [81]:
a1_summary_wft = summarize_text_(a1_title, a1_body, 0.5, stopwords, False, True)

selected sentences: [0, 1, 2]


In [82]:
a1_summary_wtt = summarize_text_(a1_title, a1_body, 0.5, stopwords, True, True)

selected sentences: [0, 1, 2]


#### Summarization For  Article 2

In [83]:
len(list(dict.fromkeys(sent_tokenize(a2_body))))

12

In [84]:
a2_summary_wff = summarize_text_(a2_title, a2_body, 0.3, stopwords, False, False)

selected sentences: [10, 0, 5]


In [85]:
a2_summary_wtf = summarize_text_(a2_title, a2_body, 0.3, stopwords, True, False)

selected sentences: [10, 0, 4]


In [86]:
a2_summary_wft = summarize_text_(a2_title, a2_body, 0.3, stopwords, False, True)

selected sentences: [0, 10, 5]


In [87]:
a2_summary_wtt = summarize_text_(a2_title, a2_body, 0.3, stopwords, True, True)

selected sentences: [0, 10, 4]


### 3.4 TF-IDF Summarization

#### Summarization For  Article 1

In [88]:
len(list(dict.fromkeys(sent_tokenize(a1_body))))

7

In [89]:
a1_summary_tdff = summariza_tfidf(a1_title, a1_body, 0.5, stopwords, score_title=False, score_position=False)

selected sentences: [0, 3, 6]


In [90]:
a1_summary_tdtf = summariza_tfidf(a1_title, a1_body, 0.5, stopwords, score_title=True, score_position=False)

selected sentences: [0, 1, 6]


In [91]:
a1_summary_tdft = summariza_tfidf(a1_title, a1_body, 0.5, stopwords, score_title=False, score_position=True)

selected sentences: [0, 1, 2]


In [92]:
a1_summary_tdtt = summariza_tfidf(a1_title, a1_body, 0.5, stopwords, score_title=True, score_position=True)

selected sentences: [0, 1, 2]


#### Article 2

In [93]:
a2_summary_tdff = summariza_tfidf(a2_title, a2_body, 0.3, stopwords, score_title=False, score_position=False)

selected sentences: [2, 6, 7]


In [94]:
a2_summary_tdtf = summariza_tfidf(a2_title, a2_body, 0.3, stopwords, score_title=True, score_position=False)

selected sentences: [0, 2, 4]


In [95]:
a2_summary_tdft = summariza_tfidf(a2_title, a2_body, 0.3, stopwords, score_title=False, score_position=True)

selected sentences: [1, 2, 6]


In [96]:
a2_summary_tdtt = summariza_tfidf(a2_title, a2_body, 0.3, stopwords, score_title=True, score_position=True)

selected sentences: [0, 1, 2]


### 3.5 Summarization with TextRank
#### article 1

In [97]:
a1_summary_tr = summariza_textrank(a1_body, 0.5, stopwords=stopwords)

selected sentences: [0, 1, 2]


#### article 2

In [98]:
a2_summary_tr = summariza_textrank(a2_body, 0.2, stopwords=stopwords)

selected sentences: [0, 5, 6]


## 4. Scoring

In [99]:
from rouge_score import rouge_scorer

In [100]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'])

In [101]:
# scores for word frequency based summarization on article 1
scores_a1_wf_ff = scorer.score(str(a1_summary_wff),
                      str(al_summary_ideal))
scores_a1_wf_tf = scorer.score(str(a1_summary_wtf),
                      str(al_summary_ideal))
scores_a1_wf_ft = scorer.score(str(a1_summary_wft),
                      str(al_summary_ideal))
scores_a1_wf_tt = scorer.score(str(a1_summary_wtt),
                      str(al_summary_ideal))

# scores for word frequency based summarization on article 2
scores_a2_wf_ff = scorer.score(str(a2_summary_wff),
                      str(a2_summary_ideal))
scores_a2_wf_tf = scorer.score(str(a2_summary_wtf),
                      str(a2_summary_ideal))
scores_a2_wf_ft = scorer.score(str(a2_summary_wft),
                      str(a2_summary_ideal))
scores_a2_wf_tt = scorer.score(str(a2_summary_wtt),
                      str(a2_summary_ideal))

# scores for tfidf based summarization on article 1
scores_a1_td_ff = scorer.score(str(a1_summary_tdff),
                      str(al_summary_ideal))
scores_a1_td_tf = scorer.score(str(a1_summary_tdtf),
                      str(al_summary_ideal))
scores_a1_td_ft = scorer.score(str(a1_summary_tdft),
                      str(al_summary_ideal))
scores_a1_td_tt = scorer.score(str(a1_summary_tdtt),
                      str(al_summary_ideal))

# scores for tfidf  based summarization on article 2
scores_a2_td_ff = scorer.score(str(a2_summary_tdff),
                      str(a2_summary_ideal))
scores_a2_td_tf = scorer.score(str(a2_summary_tdtf),
                      str(a2_summary_ideal))
scores_a2_td_ft = scorer.score(str(a2_summary_tdft),
                      str(a2_summary_ideal))
scores_a2_td_tt = scorer.score(str(a2_summary_tdtt),
                      str(a2_summary_ideal))


# scores for textrank based summarization on article 1

scores_a1_tr = scorer.score(str(a1_summary_tr),
                      str(al_summary_ideal))

scores_a2_tr = scorer.score(str(a2_summary_tr),
                      str(a2_summary_ideal))

# scores = scorer.score(str(j_summary_word_frequency),
#                       str(j_summary_ideal))

In [102]:
scores_a1_wf_ff

{'rouge1': Score(precision=0.7558139534883721, recall=0.8333333333333334, fmeasure=0.7926829268292682),
 'rouge2': Score(precision=0.7294117647058823, recall=0.8051948051948052, fmeasure=0.7654320987654321),
 'rougeL': Score(precision=0.7441860465116279, recall=0.8205128205128205, fmeasure=0.7804878048780488)}

In [103]:
scores_a1_wf_tf

{'rouge1': Score(precision=0.7558139534883721, recall=0.7647058823529411, fmeasure=0.7602339181286549),
 'rouge2': Score(precision=0.7294117647058823, recall=0.7380952380952381, fmeasure=0.7337278106508874),
 'rougeL': Score(precision=0.7441860465116279, recall=0.7529411764705882, fmeasure=0.7485380116959064)}

In [104]:
scores_a1_wf_ft

{'rouge1': Score(precision=0.8023255813953488, recall=0.7040816326530612, fmeasure=0.7500000000000001),
 'rouge2': Score(precision=0.7411764705882353, recall=0.6494845360824743, fmeasure=0.6923076923076924),
 'rougeL': Score(precision=0.7674418604651163, recall=0.673469387755102, fmeasure=0.7173913043478259)}

In [105]:
scores_a1_wf_tt

{'rouge1': Score(precision=0.8023255813953488, recall=0.7040816326530612, fmeasure=0.7500000000000001),
 'rouge2': Score(precision=0.7411764705882353, recall=0.6494845360824743, fmeasure=0.6923076923076924),
 'rougeL': Score(precision=0.7674418604651163, recall=0.673469387755102, fmeasure=0.7173913043478259)}

In [106]:
scores_a2_wf_ff

{'rouge1': Score(precision=0.38636363636363635, recall=0.5151515151515151, fmeasure=0.44155844155844154),
 'rouge2': Score(precision=0.3218390804597701, recall=0.4307692307692308, fmeasure=0.3684210526315789),
 'rougeL': Score(precision=0.36363636363636365, recall=0.48484848484848486, fmeasure=0.4155844155844156)}

In [107]:
scores_a2_wf_tf

{'rouge1': Score(precision=0.36363636363636365, recall=0.5245901639344263, fmeasure=0.4295302013422819),
 'rouge2': Score(precision=0.3218390804597701, recall=0.4666666666666667, fmeasure=0.380952380952381),
 'rougeL': Score(precision=0.3522727272727273, recall=0.5081967213114754, fmeasure=0.4161073825503356)}

In [108]:
scores_a2_wf_ft

{'rouge1': Score(precision=0.38636363636363635, recall=0.5151515151515151, fmeasure=0.44155844155844154),
 'rouge2': Score(precision=0.3218390804597701, recall=0.4307692307692308, fmeasure=0.3684210526315789),
 'rougeL': Score(precision=0.36363636363636365, recall=0.48484848484848486, fmeasure=0.4155844155844156)}

In [109]:
scores_a2_wf_tt

{'rouge1': Score(precision=0.36363636363636365, recall=0.5245901639344263, fmeasure=0.4295302013422819),
 'rouge2': Score(precision=0.3218390804597701, recall=0.4666666666666667, fmeasure=0.380952380952381),
 'rougeL': Score(precision=0.36363636363636365, recall=0.5245901639344263, fmeasure=0.4295302013422819)}

In [110]:
scores_a1_td_ff

{'rouge1': Score(precision=0.6162790697674418, recall=0.7361111111111112, fmeasure=0.6708860759493671),
 'rouge2': Score(precision=0.5764705882352941, recall=0.6901408450704225, fmeasure=0.6282051282051282),
 'rougeL': Score(precision=0.5930232558139535, recall=0.7083333333333334, fmeasure=0.6455696202531647)}

In [111]:
scores_a1_td_tf

{'rouge1': Score(precision=0.7558139534883721, recall=0.7647058823529411, fmeasure=0.7602339181286549),
 'rouge2': Score(precision=0.7411764705882353, recall=0.75, fmeasure=0.7455621301775148),
 'rougeL': Score(precision=0.7558139534883721, recall=0.7647058823529411, fmeasure=0.7602339181286549)}

In [112]:
scores_a1_td_ft

{'rouge1': Score(precision=0.8023255813953488, recall=0.7040816326530612, fmeasure=0.7500000000000001),
 'rouge2': Score(precision=0.7411764705882353, recall=0.6494845360824743, fmeasure=0.6923076923076924),
 'rougeL': Score(precision=0.7674418604651163, recall=0.673469387755102, fmeasure=0.7173913043478259)}

In [113]:
scores_a1_td_tt

{'rouge1': Score(precision=0.8023255813953488, recall=0.7040816326530612, fmeasure=0.7500000000000001),
 'rouge2': Score(precision=0.7411764705882353, recall=0.6494845360824743, fmeasure=0.6923076923076924),
 'rougeL': Score(precision=0.7674418604651163, recall=0.673469387755102, fmeasure=0.7173913043478259)}

In [114]:
scores_a2_td_ff

{'rouge1': Score(precision=0.36363636363636365, recall=0.3076923076923077, fmeasure=0.33333333333333337),
 'rouge2': Score(precision=0.09195402298850575, recall=0.07766990291262135, fmeasure=0.08421052631578947),
 'rougeL': Score(precision=0.19318181818181818, recall=0.16346153846153846, fmeasure=0.17708333333333331)}

In [115]:
scores_a2_td_tf

{'rouge1': Score(precision=0.4431818181818182, recall=0.4642857142857143, fmeasure=0.45348837209302323),
 'rouge2': Score(precision=0.3218390804597701, recall=0.3373493975903614, fmeasure=0.3294117647058823),
 'rougeL': Score(precision=0.4090909090909091, recall=0.42857142857142855, fmeasure=0.4186046511627907)}

In [116]:
scores_a2_td_ft

{'rouge1': Score(precision=0.5227272727272727, recall=0.46, fmeasure=0.4893617021276596),
 'rouge2': Score(precision=0.3448275862068966, recall=0.30303030303030304, fmeasure=0.32258064516129037),
 'rougeL': Score(precision=0.36363636363636365, recall=0.32, fmeasure=0.3404255319148936)}

In [117]:
scores_a2_td_tt

{'rouge1': Score(precision=0.6704545454545454, recall=0.6555555555555556, fmeasure=0.6629213483146068),
 'rouge2': Score(precision=0.632183908045977, recall=0.6179775280898876, fmeasure=0.625),
 'rougeL': Score(precision=0.6704545454545454, recall=0.6555555555555556, fmeasure=0.6629213483146068)}

In [118]:
scores_a1_tr

{'rouge1': Score(precision=0.8023255813953488, recall=0.7040816326530612, fmeasure=0.7500000000000001),
 'rouge2': Score(precision=0.7411764705882353, recall=0.6494845360824743, fmeasure=0.6923076923076924),
 'rougeL': Score(precision=0.7674418604651163, recall=0.673469387755102, fmeasure=0.7173913043478259)}

In [119]:
scores_a2_tr

{'rouge1': Score(precision=0.375, recall=0.5, fmeasure=0.42857142857142855),
 'rouge2': Score(precision=0.3218390804597701, recall=0.4307692307692308, fmeasure=0.3684210526315789),
 'rougeL': Score(precision=0.36363636363636365, recall=0.48484848484848486, fmeasure=0.4155844155844156)}