In [1]:
from summa.preprocessing.textcleaner import clean_text_by_sentences
import os
import sys
# Put the parent of the current working directory on the import path so that
# packages living one level above the notebook can be imported.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [2]:
import pandas as pd

In [3]:
# Load the articles, keep only the first row for each Title, then drop rows
# that have no Category. The index is NOT reset afterwards, so row labels
# still refer to positions in the original CSV.
df = (
    pd.read_csv('../Data/Amakuru.csv')
    .drop_duplicates(subset="Title", keep="first")
    .dropna(subset=['Category'])
)

In [4]:
# Label-based lookup: the Body of the row whose index label is 0 (labels are
# the original CSV row numbers because the filtered frame is not re-indexed).
text_sample = df.loc[0, 'Body']

In [5]:
# Length of the sample text.
len(text_sample)

1498

In [6]:
# Split the sample into sentence units with summa's stock cleaner. Each unit
# exposes its processed form as `.token`, which the graph is built from below.
cleaned_sentences = clean_text_by_sentences(text_sample)

In [7]:
# Processed form of the first sentence. Note in the output that word endings
# are clipped (e.g. "bagaragaje" -> "bagaragaj", "ubutare" -> "ubutar") --
# presumably summa's default stemming, which was not designed for this language.
cleaned_sentences[0].token

"abashakashatsi bagaragaj ko agakoko gatera igituntu kugira ngo kabeho gakenera intungamubiri zizwi nk'ubutar fer"

In [8]:
# The unit's repr shows the original sentence next to its processed form.
cleaned_sentences[0]

Original unit: 'Abashakashatsi bagaragaje ko agakoko gatera igituntu kugira ngo kabeho, gakenera intungamubiri zizwi nk’ubutare (fer).' *-*-*-* Processed unit: 'abashakashatsi bagaragaj ko agakoko gatera igituntu kugira ngo kabeho gakenera intungamubiri zizwi nk'ubutar fer'

In [9]:
from summa.commons import build_graph
import networkx as nx
import networkx.drawing
import matplotlib.pyplot as plt

In [10]:
# summa graph built from each sentence's processed `.token`.
graph = build_graph([sentence.token for sentence in cleaned_sentences])
# Separate, initially empty networkx graph; later cells copy the summa graph's
# nodes and edges into it (it is the target of the commented-out
# draw_networkx cell further down).
G = nx.Graph()
G

<networkx.classes.graph.Graph at 0xa23f6eb50>

In [11]:
# Copy the summa graph's nodes into the networkx graph `G`.
# A bare `graph.nodes()` ahead of this call would be a discarded expression:
# only the last expression of a cell is displayed, so the call is made once,
# directly as the argument.
G.add_nodes_from(graph.nodes())

In [12]:
# Empty at this point (see the output): the graph has nodes only, until
# `_set_graph_edge_weights` is run on it in a later cell.
graph.edges()

[]

In [13]:
# Attribute lookup only (the method is not called): the displayed bound method
# confirms `graph` is a summa.graph.Graph, not a networkx graph.
graph.has_edge

<bound method Graph.has_edge of <summa.graph.Graph object at 0xa23d26e50>>

In [14]:
from summa.summarizer import _set_graph_edge_weights

In [15]:
# Called for its effect on `graph` (the return value is not used) --
# presumably fills in the weighted edges between sentence nodes; TODO confirm.
_set_graph_edge_weights(graph)

In [16]:
# Mirror the summa graph's edges into the networkx graph `G`.
G.add_edges_from(graph.edges())

In [17]:
# import matplotlib
# matplotlib.rcParams['figure.figsize'] = (20.0, 16.0)
# networkx.draw_networkx(G)

In [18]:
from summa.pagerank_weighted import pagerank_weighted_scipy as _pagerank

In [19]:
from summa.summarizer import _remove_unreachable_nodes
from summa.summarizer import _extract_most_important_sentences, _add_scores_to_sentences, _format_results

In [20]:
# Prune `graph` before ranking (return value unused). Judging by the name this
# drops nodes that have no usable edges -- TODO confirm against summa.
_remove_unreachable_nodes(graph)

In [21]:
# Score the remaining nodes with summa's weighted PageRank (the scipy variant,
# imported above under the alias `_pagerank`).
pagerank_scores = _pagerank(graph)

In [22]:
from lib.clean_sentences import clean_sentences
from lib.stopwords import STOPWORDS

In [23]:
# Same sample through the local `lib` cleaner with the local stopword list.
# Compare the processed units in the output with the stock cleaner's result
# above: function words are removed and word endings are left intact.
clean_sentences(text_sample, STOPWORDS)

[Original unit: 'Abashakashatsi bagaragaje ko agakoko gatera igituntu kugira ngo kabeho, gakenera intungamubiri zizwi nk’ubutare (fer)' *-*-*-* Processed unit: 'abashakashatsi agakoko gatera igituntu kabeho gakenera intungamubiri zizwi ubutare fer',
 Original unit: ' Mu gihe hahagaritswe inzira ijyana ubutare aho agakoko kari, igituntu ntigishibora gukura ngo gishegeshe abantu' *-*-*-* Processed unit: 'hahagaritswe ijyana ubutare agakoko kari igituntu ntigishibora gukura gishegeshe',
 Original unit: 'Ibi byagaragaje uburyo bushya bushobora kwifashishwa n’abahanga mu gukora imiti y’igituntu no kugihashya kitarangiza abantu' *-*-*-* Processed unit: 'byagaragaje bushya bushobora kwifashishwa igituntu kugihashya kitarangiza',
 Original unit: 'Abashakashatsi bagaragaje ko iyo agakoko gatera igituntu kabuze ubutare (fer) kadashobora gukura' *-*-*-* Processed unit: 'abashakashatsi agakoko gatera igituntu kabuze ubutare fer kadashobora gukura',
 Original unit: 'Igituntu giterwa n’agakoko kazwi

In [24]:
def summarize_original(text, ratio=0.2, split=False, scores=False, words=None):
    """Summarise ``text`` using summa's stock sentence cleaner.

    Pipeline: ``clean_text_by_sentences`` -> ``build_graph`` over each
    sentence's ``.token`` -> ``_set_graph_edge_weights`` ->
    ``_remove_unreachable_nodes`` -> weighted PageRank -> sentence selection.
    The selected sentences are put back in ``.index`` order before formatting.

    Args:
        text: Raw text to summarise.
        ratio: Forwarded to ``_extract_most_important_sentences``.
        split: Forwarded to ``_format_results``; also selects the type of the
            empty result (list when truthy, string otherwise).
        scores: Forwarded to ``_format_results``.
        words: Forwarded to ``_extract_most_important_sentences``.

    Returns:
        Whatever ``_format_results`` produces for the selected sentences, or
        an empty result (``[]`` if ``split`` else ``""``) when the graph has
        no nodes left to rank.
    """
    cleaned_sentences = clean_text_by_sentences(text)
    graph = build_graph([sentence.token for sentence in cleaned_sentences])
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph (empty text, or every node
    # pruned above), so return an empty summary instead of failing there.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    pagerank_scores = _pagerank(graph)
    _add_scores_to_sentences(cleaned_sentences, pagerank_scores)
    extracted_sentences = _extract_most_important_sentences(cleaned_sentences, ratio, words)
    # Restore the order in which the sentences appear in the text.
    extracted_sentences.sort(key=lambda s: s.index)
    return _format_results(extracted_sentences, split, scores)

In [25]:
# Stock pipeline on the first sample; 0.3 is the `ratio` argument.
summarize_original(text_sample, 0.3)

'Abashakashatsi bagaragaje ko iyo agakoko gatera igituntu kabuze ubutare (fer) kadashobora gukura.\nMarkus Seeger, umwarimu mu ishami ry’ubuvuzi muri Kaminuza ya Zurich (UZH), akaba no mu itsinda ryakoze ubushakashatsi yavuze ko basanze ako gakoko gatera igituntu iyo kageze mu mubiri gacura utundi turemangingo intungamubiri z’ubutare.\nSeeger yavuze ko babashije guhagarika utunyangingo duto twitwa IrtAB dutwara ubutare tubujyana mu gakoko gatera igituntu, kadashobora gukura cyangwa kakaba kanapfa, bityo igituntu ntikizahaze abantu.'

In [26]:
def summarize_custom(text, ratio=0.3, split=False, scores=False, words=None, stopwords=None):
    """Summarise ``text`` using the local ``clean_sentences`` preprocessor.

    Same ranking pipeline as the stock summariser (graph over each sentence's
    ``.token``, edge weights, pruning, weighted PageRank, selection), but the
    sentences come from ``clean_sentences(text, stopwords=stopwords)``.
    The selected sentences are put back in ``.index`` order before formatting.

    Args:
        text: Raw text to summarise.
        ratio: Forwarded to ``_extract_most_important_sentences``.
        split: Forwarded to ``_format_results``; also selects the type of the
            empty result (list when truthy, string otherwise).
        scores: Forwarded to ``_format_results``.
        words: Forwarded to ``_extract_most_important_sentences``.
        stopwords: Forwarded unchanged to ``clean_sentences`` (``None`` when
            the caller does not supply a list).

    Returns:
        Whatever ``_format_results`` produces for the selected sentences, or
        an empty result (``[]`` if ``split`` else ``""``) when the graph has
        no nodes left to rank.
    """
    cleaned_sentences = clean_sentences(text, stopwords=stopwords)
    graph = build_graph([sentence.token for sentence in cleaned_sentences])
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph (empty text, or every node
    # pruned above), so return an empty summary instead of failing there.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    pagerank_scores = _pagerank(graph)
    _add_scores_to_sentences(cleaned_sentences, pagerank_scores)
    extracted_sentences = _extract_most_important_sentences(cleaned_sentences, ratio, words)
    # Restore the order in which the sentences appear in the text.
    extracted_sentences.sort(key=lambda s: s.index)
    return _format_results(extracted_sentences, split, scores)

In [27]:
# Custom pipeline on the same sample, with ratio=0.4 and the local stopwords.
# NOTE(review): the ratio differs from the 0.3 used for summarize_original
# above, so the two outputs are not a like-for-like comparison.
summarize_custom(text_sample, 0.4, stopwords=STOPWORDS)

'Abashakashatsi bagaragaje ko agakoko gatera igituntu kugira ngo kabeho, gakenera intungamubiri zizwi nk’ubutare (fer)\n Mu gihe hahagaritswe inzira ijyana ubutare aho agakoko kari, igituntu ntigishibora gukura ngo gishegeshe abantu\nAbashakashatsi bagaragaje ko iyo agakoko gatera igituntu kabuze ubutare (fer) kadashobora gukura\nSeeger yavuze ko babashije guhagarika utunyangingo duto twitwa IrtAB dutwara ubutare tubujyana mu gakoko gatera igituntu, kadashobora gukura cyangwa kakaba kanapfa, bityo igituntu ntikizahaze abantu'

In [28]:
# Label-based lookup: 750 is an index label (original CSV row number), not a
# position in the filtered frame, since the index is never reset.
text_sample2 = df.loc[750, 'Body']

In [29]:
# Display the second sample in full.
text_sample2

'\n\nKu rubuga www.biocoiff.com bavuga ko ibumba ry’icyatsi ari ikintu cy’umwimerere bakura ku rutare ruvungagurika, rikaba rikoreshwa akenshi nk’umuti cyangwa se mu bijyanye no kwita ku ruhu.\nIbumba ry’icyatsi rikize ku butare butandukanye nka “fer” yongera amaraso, hakabamo na potassium, sodium, calcium na magnésium.\nHabaho ubwoko butandukanye bw’ibumba, rishobora kandi kugira amabara atandukanye bitewe n’aho ryavuye, gusa ibumba ryose umuntu yahitamo riba ryifitemo ikitwa “silice”, iyo silice rero igira uruhare rukomeye mu gufasha umubiri w’umuntu gukora neza.\nIkindi kandi silice igira uruhare mu gukomeza imisokoro no mu ngingo (articulations), silice kandi ituma uruhu rw’umuntu rugira ubuzima bwiza, igakomeza amenyo n’inzara, imisatsi, imitsi ndetse n’amagufa.\nSilice iboneka mu ibumba ry’icyatsi kandi, ifasha ubudahangarwa bw’umubiri gukora neza.\nIbumba ry’icyatsi rikoreshwa cyane cyane mu kwita ku ruhu rwo mu maso, rigafasha cyane abagira uruhu rwo mu maso ruyaga, aho bavanga

In [30]:
# Custom pipeline on the second sample, sized via words=50.
# NOTE(review): no `stopwords` is passed here, so clean_sentences receives
# stopwords=None -- unlike the earlier call that passed STOPWORDS. Confirm
# this is intended.
summarize_custom(text_sample2, words=50)

'Ibumba ry’icyatsi rikoreshwa cyane cyane mu kwita ku ruhu rwo mu maso, rigafasha cyane abagira uruhu rwo mu maso ruyaga, aho bavanga ibumba n’amazi bikamera nk’igikoma gifashe, nyuma bakagisiga mu maso kugira ngo bifashe uruhu kumera neza.\nIbumba ry’icyatsi kandi ryifashishwa mu kwita ku musatsi ndetse no mu gukesha amenyo.'

In [31]:
# Stock pipeline with the same words=50 setting, for comparison.
# NOTE(review): the saved output below contains no sentence from text_sample2
# as displayed above, so it looks stale (produced with a different
# text_sample2). Re-run the notebook from a fresh kernel before relying on it.
summarize_original(text_sample2, words=50)

'Uwahoze ari myugariro w’Ikipe y’Igihugu Amavubi, Nshimiyimana Canisius, yavuze ko ubwo Amavubi yakinaga n’Ikipe y’Igihugu ya Tunisia mu Gikombe cya Afurika cya 2004, we na bagenzi be bagize ubwoba bwo gusohoka muri Stade ya Radès ngo bajye kwishyushya.'