In [21]:
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import os
import sys
import pickle
import pandas as pd
# Put the notebook's parent directory on the import path (only once) so that
# packages located one level up can be imported by the cells below.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [22]:
import re
import unicodedata
from bs4 import BeautifulSoup

def normalize_omissions(text):
    """Expand apostrophe elisions by turning each apostrophe into "a ".

    The text is split on whitespace; every token that matches, from its
    first character, the pattern "optional word characters, an apostrophe
    (' or ’), optional word characters" is split on its apostrophes and the
    pieces are re-joined with "a ", e.g. "n’abahanga" -> "na abahanga".

    The substitution is applied with ``str.replace`` on the whole string,
    so every occurrence of a matching token is rewritten, including
    occurrences embedded inside other tokens.

    Args:
        text: input string.

    Returns:
        The string with the matching tokens expanded; all other characters
        (including the original whitespace) are left untouched.
    """
    elision = re.compile(r"\w*['’]\w*", re.IGNORECASE)
    result = text
    for word in text.split():
        # Tokens that do not start with the elision pattern are left alone.
        if elision.match(word) is None:
            continue
        expanded = ("a ").join(re.split(r"['’]", word))
        result = result.replace(word, expanded)
    return result


def strip_html_tags(text):
    """Return the visible text of an HTML fragment.

    ``<iframe>`` and ``<script>`` elements are removed together with their
    content, the remaining text is extracted with BeautifulSoup, and every
    run of carriage returns / line feeds is collapsed into a single "\\n".

    Args:
        text: HTML (or plain text) string.

    Returns:
        The extracted text with consecutive line breaks collapsed.
    """
    soup = BeautifulSoup(text, 'html.parser')
    # Drop embedded content that must not leak into the extracted text.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # A character class needs no "|" separators: the previous class
    # [\r|\n|\r\n] also matched a literal "|" and turned pipes into newlines.
    return re.sub(r'[\r\n]+', '\n', stripped_text)


def remove_accented_chars(text):
    """Fold ``text`` down to plain ASCII.

    The string is NFKD-normalised so that accented letters decompose into a
    base letter plus combining marks, then encoded to ASCII with errors
    ignored. Combining marks and any other character without an ASCII form
    (for example the curly apostrophe ’) are dropped.

    Args:
        text: input string.

    Returns:
        The ASCII-only version of ``text``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: input string.
        remove_digits: when True, digits are deleted as well.

    Returns:
        ``text`` with the unwanted characters removed (nothing is inserted
        in their place). Non-ASCII letters are removed too.
    """
    # The upper bound of the letter range must be "Z": the former "A-z"
    # range also covered [ \ ] ^ _ and the backtick, so those characters
    # slipped through the filter.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)


def remove_stop_words(text, stopwords):
    """Drop stop words from ``text``.

    The text is tokenised with ``word_tokenize``; tokens found in
    ``stopwords`` are discarded (exact, case-sensitive membership test) and
    the remaining tokens are joined with single spaces.

    Args:
        text: input string.
        stopwords: container of tokens to discard.

    Returns:
        The surviving tokens joined by one space each.
    """
    kept_tokens = [token for token in word_tokenize(text) if token not in stopwords]
    return " ".join(kept_tokens)


def normalize_text(doc, html_stripping=True, handle_omissions=True,
                   accented_char_removal=True, text_lower_case=True,
                   special_chars_removal=True, remove_digits=True,
                   stopwords_removal=False, stopwords=None):
    """Run the text-cleaning pipeline over ``doc``.

    Steps, in order (each optional step is controlled by its flag):

    1. ``html_stripping``: remove HTML markup via ``strip_html_tags``.
    2. ``handle_omissions``: expand apostrophe elisions via
       ``normalize_omissions``.
    3. ``accented_char_removal``: fold to ASCII via ``remove_accented_chars``.
    4. ``text_lower_case``: lower-case the text.
    5. Always: replace runs of CR / LF / TAB with one space.
    6. ``special_chars_removal``: pad ``{ } ( ) . ! -`` with spaces, then
       delete special characters via ``remove_special_characters``
       (``remove_digits`` is forwarded to it).
    7. Always: collapse runs of spaces into one space.
    8. ``stopwords_removal``: drop stop words via ``remove_stop_words``;
       skipped when ``stopwords`` is empty or None.

    Args:
        doc: text to normalise.
        html_stripping, handle_omissions, accented_char_removal,
        text_lower_case, special_chars_removal, stopwords_removal:
            booleans enabling the corresponding step.
        remove_digits: also delete digits during step 6.
        stopwords: container of stop words used by step 8.

    Returns:
        The normalised string.
    """
    # strip html
    if html_stripping:
        doc = strip_html_tags(doc)
    # Runs before the ASCII folding below, which drops non-ASCII characters
    # such as the curly apostrophe.
    if handle_omissions:
        doc = normalize_omissions(doc)
    # remove accented chars
    if accented_char_removal:
        doc = remove_accented_chars(doc)
    # lowercase the text
    if text_lower_case:
        doc = doc.lower()
    # Fold line breaks and tabs into a space. "|" is no longer part of the
    # class: inside [...] it is a literal pipe, not an alternation operator.
    doc = re.sub(r'[\r\n\t]+', ' ', doc)
    # remove special characters and digits
    if special_chars_removal:
        # Pad punctuation with spaces first so that the words on either side
        # stay separate once the punctuation itself is deleted. The hyphen is
        # placed last to make it literal; written as "(-)" it formed a range
        # from "(" to ")" and "-" was never padded.
        doc = re.sub(r'([{.()!}-])', r' \1 ', doc)
        doc = remove_special_characters(doc, remove_digits=remove_digits)
    # remove extra whitespace
    doc = re.sub(' +', ' ', doc)
    # remove stopwords
    if stopwords_removal and stopwords:
        doc = remove_stop_words(doc, stopwords)

    return doc


In [23]:
from lib.helpers import getTopSortedKElements, saveWordsToFile, getStopwordsFromFile
# Load the stop-word list through the project helper. The file name is
# relative to the notebook's working directory.
# NOTE(review): presumably a list/set of lower-case strings; confirm in lib.helpers.
stopwords = getStopwordsFromFile('stopwords_idfvalues.txt')

In [32]:
# Load the pickled articles. The context manager closes the file even when
# unpickling raises, which the manual open()/close() pair did not.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('../Data/arl', 'rb') as pfile:
    articles = pickle.load(pfile)

# Work on the first article and split the raw text into sentences.
# NOTE(review): assumes articles[0] is a plain string; confirm the pickle layout.
txt1 = articles[0]
sentences = sent_tokenize(txt1)
from summarize.summary import build_summary_using_tfidf
from lib.summariza_word_frequency import summariza_word_frequency
from lib.summariza_tfidf import summariza_tfidf

In [33]:
# build_summary_using_wordsFreq(txt1, stopwords, 3)
# Summarise the first article with the word-frequency summariser from lib.
# NOTE(review): the 3 is presumably the number of sentences to keep (the
# recorded output lists three indexes); confirm in lib.summariza_word_frequency.
summariza_word_frequency(txt1, 3, stopwords)

selected sentences: [9, 8, 0]


'Abashakashatsi bagaragaje ko agakoko gatera igituntu kugira ngo kabeho, gakenera intungamubiri zizwi nk’ubutare (fer). Markus Seeger, umwarimu mu ishami ry’ubuvuzi muri Kaminuza ya Zurich (UZH), akaba no mu itsinda ryakoze ubushakashatsi yavuze ko basanze ako gakoko gatera igituntu iyo kageze mu mubiri gacura utundi turemangingo intungamubiri z’ubutare. Seeger yavuze ko babashije guhagarika utunyangingo duto twitwa IrtAB dutwara ubutare tubujyana mu gakoko gatera igituntu, kadashobora gukura cyangwa kakaba kanapfa, bityo igituntu ntikizahaze abantu.'

In [34]:
# Summarise the same article with the tf-idf summariser from summarize.summary.
# Note the argument order here is (text, stopwords, 3), unlike the lib.* helpers.
build_summary_using_tfidf(txt1, stopwords, 3)

Chosen sentences: [4, 6, 10]
Ibi byagaragaje uburyo bushya bushobora kwifashishwa n’abahanga mu gukora imiti y’igituntu no kugihashya kitarangiza abantu.
Igituntu giterwa n’agakoko kazwi nka Mycobacterium tuberculosis, kandi kaboneka kenshi mu mubiri w’umuntu.
Ishami ry’Umuryango w’Abibumbye ryita ku Buzima (OMS), ryatangaje ko umwaka ushize abantu miliyoni 1.5 bishwe n’indwara zibasira ibihaha n’igituntu kirimo.


In [35]:
# Summarise the same article with the tf-idf summariser from lib.summariza_tfidf,
# for comparison with the two summaries above.
summariza_tfidf(txt1, 3, stopwords)



Summary:
Ibi byagaragaje uburyo bushya bushobora kwifashishwa n’abahanga mu gukora imiti y’igituntu no kugihashya kitarangiza abantu. Igituntu giterwa n’agakoko kazwi nka Mycobacterium tuberculosis, kandi kaboneka kenshi mu mubiri w’umuntu. Seeger yavuze ko babashije guhagarika utunyangingo duto twitwa IrtAB dutwara ubutare tubujyana mu gakoko gatera igituntu, kadashobora gukura cyangwa kakaba kanapfa, bityo igituntu ntikizahaze abantu.
selected sentences: f[4, 6, 9]


In [None]:
from nltk import FreqDist
# Normalise the whole article (stop words removed), tokenise it and count how
# often each token occurs.
norm_text = normalize_text(txt1, stopwords_removal=True, stopwords=stopwords)
tokens = word_tokenize(norm_text)
# Sentences are split from the raw, un-normalised article text.
sentences = sent_tokenize(txt1)
wordsFreq = FreqDist(tokens)
def score_sentence(sentence, cdict, stopwords):
    """Score a sentence as the sum of the weights of its content words.

    The sentence is normalised with ``normalize_text`` (stop words removed)
    and tokenised. Every token longer than one character that is not a stop
    word, either as-is or lower-cased, adds ``cdict[token.lower()]`` to the
    score.

    Note: a function with the same name is defined again in a later cell of
    this notebook; once that cell runs it replaces this definition.

    Args:
        sentence: raw sentence text.
        cdict: mapping from lower-cased word to its weight, indexed with
            ``cdict[word]`` (the notebook passes an nltk ``FreqDist``).
        stopwords: container of stop words.

    Returns:
        The accumulated score (0 when no token qualifies).
    """
    normalized = normalize_text(
        sentence, stopwords_removal=True, stopwords=stopwords)
    total = 0
    for token in word_tokenize(normalized):
        lowered = token.lower()
        # Skip stop words (either casing) and single-character tokens.
        if lowered in stopwords or token in stopwords or len(token) <= 1:
            continue
        total += cdict[lowered]
    return total
# Try the scorer on the second-to-last sentence and display the result.
score = score_sentence(sentences[-2], wordsFreq, stopwords)
score

In [None]:
# Build a DataFrame of raw term counts with one column per vocabulary term.
# NOTE(review): `cv` and `word_count_vector` are not assigned at notebook level
# in any visible cell (they only appear as locals inside getTFIDF), so this
# cell looks dependent on leftover kernel state; confirm it survives
# Restart & Run All.
# NOTE(review): CountVectorizer.get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of get_feature_names_out().
feature_names = cv.get_feature_names()
dfc = pd.DataFrame(word_count_vector.todense().tolist(), columns=feature_names)

In [None]:
# Display the counts of row 0 of dfc as a {term: count} dictionary.
dfc.T.to_dict('dict')[0]

In [None]:
def getTFIDF(text, stopwords=None):
    """Compute tf-idf weights for the terms of ``text``.

    The text is normalised with ``normalize_text`` (stop words removed when
    ``stopwords`` is non-empty), split with ``sent_tokenize``, vectorised
    with ``CountVectorizer(tokenizer=word_tokenize)`` and weighted with
    ``TfidfTransformer(smooth_idf=True, use_idf=True)``.

    Args:
        text: raw document text.
        stopwords: optional container of stop words.

    Returns:
        dict mapping each vocabulary term to its tf-idf weight in row 0 of
        the tf-idf matrix, i.e. the first element returned by
        ``sent_tokenize`` on the normalised text.

    NOTE(review): only row 0 is returned. If the normalised text ever splits
    into more than one sentence the remaining rows are discarded; confirm
    this is intended.
    """
    # preprocess the document
    doc = normalize_text(text, stopwords_removal=True, stopwords=stopwords)
    # get clean sentence tokens
    clean_sentence_tokens = sent_tokenize(doc)

    # get word counts for the words in the document
    cv = CountVectorizer(tokenizer=word_tokenize)
    word_count_vector = cv.fit_transform(clean_sentence_tokens)

    # Fit the idf weights and weight the same count matrix. Vectorising the
    # same sentences a second time would yield an identical matrix, so the
    # counts computed above are reused.
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tf_idf_vector = tfidf_transformer.fit_transform(word_count_vector)

    # get_feature_names() was removed in scikit-learn 1.2. Prefer the
    # replacement and fall back to the old name on releases that predate it.
    get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
    feature_names = list(get_names())

    df = pd.DataFrame(tf_idf_vector.todense().tolist(), columns=feature_names)

    return df.T.to_dict('dict')[0]


In [None]:
# Term -> tf-idf weight dictionary for the first article.
dff = getTFIDF(txt1, stopwords)
def score_sentence(sentence, tfidf_dict, stopwords):
    """Score a sentence as the sum of the tf-idf weights of its content words.

    The sentence is normalised with ``normalize_text`` (stop words removed)
    and tokenised. Every token longer than one character that is not a stop
    word, either as-is or lower-cased, contributes its weight from
    ``tfidf_dict``.

    Note: this redefinition replaces the frequency-based ``score_sentence``
    defined in an earlier cell of the notebook.

    Args:
        sentence: raw sentence text.
        tfidf_dict: mapping from lower-cased term to its tf-idf weight.
        stopwords: container of stop words.

    Returns:
        The accumulated score (0 when no token qualifies).
    """
    sentence_score = 0
    clean_sentence = normalize_text(sentence, stopwords_removal=True, stopwords=stopwords)
    for word in word_tokenize(clean_sentence):
        lowered = word.lower()
        if lowered not in stopwords and word not in stopwords and len(word) > 1:
            # A token missing from the mapping contributes nothing instead of
            # raising KeyError. The sentence is normalised and tokenised
            # separately from the document that produced tfidf_dict, so the
            # two vocabularies are not guaranteed to match.
            sentence_score += tfidf_dict.get(lowered, 0)
    return sentence_score

In [None]:
# Maps sentence index -> score; filled in by the loop in the next cell.
sentence_scores = {}
# from summarize.score import score_sentence

In [None]:
# Score every sentence of the article against the tf-idf dictionary, keyed by
# the sentence's position in `sentences`.
for i, sentence in enumerate(sentences):
    sentence_scores[i] = score_sentence(sentence, dff, stopwords)

In [None]:
# Display the {sentence index: score} mapping.
sentence_scores

In [None]:
# Rank (index, score) pairs from highest to lowest score. The following cell
# takes the leading entries as the "top" sentences, so an ascending sort would
# select the lowest-scoring sentences instead.
sorted_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Display the ranked (sentence index, score) pairs.
sorted_sentences

In [None]:
# Keep the sentence indexes of the first two entries of sorted_sentences.
# Slicing tolerates articles with fewer than two sentences, where indexing
# positions 0 and 1 directly raised IndexError.
# NOTE: which end of the ranking this selects depends on the sort order used
# to build sorted_sentences.
top_sentences_indexes = [index for index, _ in sorted_sentences[:2]]

In [None]:
# Display the selected sentence indexes.
top_sentences_indexes

In [None]:
# Print the original (un-normalised) text of each selected sentence, in the
# order the indexes were selected rather than document order.
for s in top_sentences_indexes:
    print(sentences[s])

In [None]:
# Display the full text of the article for comparison with the summary.
txt1

In [None]:
# from summarize.tfidf import getTFIDF

In [None]:
from summarize.summary import build_summary_using_tfidf

In [None]:
# Run the packaged tf-idf summariser from summarize.summary on the article.
build_summary_using_tfidf(txt1, stopwords, 3)

In [None]:
# NOTE(review): identical to the call in the previous cell; presumably a
# leftover re-run. Confirm whether this cell is still needed.
build_summary_using_tfidf(txt1, stopwords, 3)