## Imports


In [2]:
import os
import pandas as pd
import numpy as np
import spacy
import time

from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [3]:
classes = os.listdir('BBC News Summary/News Articles')
articles_d = 'BBC News Summary/News Articles/'
summaries_d = 'BBC News Summary/Summaries/'

articles = []
summaries = []
file_arr = []
for cla in classes:
    files = os.listdir(articles_d + cla)
    for file in files:
        article_file_path = articles_d + cla + '/' + file
        summary_file_path = summaries_d + cla + '/' + file
        try:
            with open (article_file_path,'r') as f:
                articles.append('.'.join([line.rstrip() for line in f.readlines()]))
            with open (summary_file_path,'r') as f:
                summaries.append('.'.join([line.rstrip() for line in f.readlines()]))
            file_arr.append(cla + '/' + file)
        except:
            pass
            
df = pd.DataFrame({'File_path':file_arr,'Articles': articles,'Summaries':summaries})

In [4]:
import sumy
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer 
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

def summ(text):
    SENTENCES_COUNT = 3
    language = 'english'

    summary_full = []
    
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    
    summarizer = LexRankSummarizer(stemmer)
    
    summarizer.stop_words = get_stop_words(language)
    
    summary = []  
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary.append(str(sentence))

    summary = ' '.join(summary)
    summary_full.append(summary)
        
    return " ".join(summary_full)

## Result

In [5]:
from rouge import Rouge
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/maxim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
df["Sumy"] = df["Articles"].apply(summ)

rouge = Rouge()
scores = rouge.get_scores(df["Summaries"], df["Sumy"])
df_result = pd.DataFrame(scores)
df_result.head()

Unnamed: 0,rouge-1,rouge-2,rouge-l
0,"{'r': 0.6101694915254238, 'p': 0.6666666666666...","{'r': 0.5, 'p': 0.5671641791044776, 'f': 0.531...","{'r': 0.6101694915254238, 'p': 0.6666666666666..."
1,"{'r': 0.5945945945945946, 'p': 0.7719298245614...","{'r': 0.5212765957446809, 'p': 0.6805555555555...","{'r': 0.581081081081081, 'p': 0.75438596491228..."
2,"{'r': 0.8412698412698413, 'p': 0.5408163265306...","{'r': 0.6987951807228916, 'p': 0.453125, 'f': ...","{'r': 0.8412698412698413, 'p': 0.5408163265306..."
3,"{'r': 0.7540983606557377, 'p': 0.575, 'f': 0.6...","{'r': 0.6575342465753424, 'p': 0.4485981308411...","{'r': 0.7540983606557377, 'p': 0.575, 'f': 0.6..."
4,"{'r': 0.7391304347826086, 'p': 0.4857142857142...","{'r': 0.5842696629213483, 'p': 0.3823529411764...","{'r': 0.7391304347826086, 'p': 0.4857142857142..."


Score

In [7]:
hyps, refs = map(list, [df["Summaries"], df["Sumy"]])
rouge = Rouge()

scores = rouge.get_scores(hyps, refs, avg=True)
scores

{'rouge-1': {'r': 0.7162083823333302,
  'p': 0.49866631251393956,
  'f': 0.5666393362816899},
 'rouge-2': {'r': 0.6082675016633666,
  'p': 0.39049477115324116,
  'f': 0.4527170881098828},
 'rouge-l': {'r': 0.7059239769653619,
  'p': 0.4916010176505664,
  'f': 0.5585581807770834}}

In [10]:
pd.options.display.max_colwidth = 1000
df.head().get(['Sumy', 'Summaries'])

Unnamed: 0,Sumy,Summaries
0,"Blue beat U2 to top France honour..Irish band U2 have been honoured at France's biggest music awards, but were beaten to a prize by boy band Blue...U2 received a special achievement prize at the NRJ Music Awards, but Blue beat them to the international group award. US band Maroon 5 was named best new international artist, and took the best international song title for This Love. More than five million radio listeners voted in the awards.","US band Maroon 5 was named best new international artist, and took the best international song title for This Love.U2 received a special achievement prize at the NRJ Music Awards, but Blue beat them to the international group award.Singer Jenifer also took home two awards, for best French female singer and best French album.US pop act Black Eyed Peas picked up the best international album gong for Elephunk."
1,"Ethnic producers 'face barriers'..Minority ethnic led (Mel) production companies face barriers in succeeding in the film and television industries, research has suggested...The study, commissioned by Pact and the UK Film Council, included interviews with industry experts and individuals. The research indicated that about 10% of independent production companies in the UK are minority ethnic led...A minority ethnic led company is defined as one in which the majority of decision-making power rests with an individual or individuals from a minority ethnic group. The report also explored the problems faced by such companies when attempting to compete within the film and TV industries.","Minority ethnic led (Mel) production companies face barriers in succeeding in the film and television industries, research has suggested.The research indicated that about 10% of independent production companies in the UK are minority ethnic led.The research recommended that minority ethnic led companies could benefit from such positive actions as career training and business advice, plus improved communication within the film and TV sectors.The report also explored the problems faced by such companies when attempting to compete within the film and TV industries."
2,"Baghdad Blogger on big screen..A film based on the internet musings of the ""Baghdad Blogger"" has been shown at the Rotterdam Film Festival...The film has been directed by the man who calls himself Salam Pax, the author of the weblog about Iraqi life during and after the war. The movie version comes in the form of a series of shorts made by Pax on a hand-held camera. Baghdad Blogger is among a number of films about Iraq showcased at the Dutch festival, which runs until Sunday.","A film based on the internet musings of the ""Baghdad Blogger"" has been shown at the Rotterdam Film Festival.Baghdad Blogger is among a number of films about Iraq showcased at the Dutch festival, which runs until Sunday.The festival was also due to screen murdered Dutch film-maker Theo Van Gogh's film about the treatment of woman under Islam, but it was withdrawn due to safety fears.The film has been directed by the man who calls himself Salam Pax, the author of the weblog about Iraqi life during and after the war.Director Oday Rasheed made the film on discarded 1980s Kodak film taken from the remains for the former Ministry of Culture building.Van Gogh was shot and stabbed in November 2004, following death threats he received about his film Submission."
3,"But fans around the world have pitched in to pay for the advert, which had the headline ""Save Star Trek"". They are also asking the Sci-Fi Channel to pick it up from UPN and will stage a rally in Los Angeles on 25 February...The advert described the Star Trek franchise as a ""cultural icon"". Star Trek: Enterprise began in 2001 following other Star Trek spin-off series The Next Generation, Deep Space Nine and Voyager.","Star Trek: Enterprise began in 2001 following other Star Trek spin-off series The Next Generation, Deep Space Nine and Voyager.Star Trek fans have taken out a full-page ad in the Los Angeles Times in an attempt to persuade TV executives not to scrap Star Trek: Enterprise.Enterprise stars former Quantum Leap actor Scott Bakula as Captain Archer and is set before the original 1960s Star Trek series.But fans around the world have pitched in to pay for the advert, which had the headline ""Save Star Trek"".The advert described the Star Trek franchise as a ""cultural icon"".The 98th and final episode of Star Trek: Enterprise will air in the US on 13 May."
4,"DVD review: I, Robot..Only one man recognises that robots are a threat to humanity - but that's fine because it only takes one man to save the day in the thriller I, Robot...Will Smith co-stars alongside more CGI robots than you can count and as a thrill-a-minute kind of action film, it's perfectly adequate. You'll have forgotten it all tomorrow but you'll have a fun night with the film and all the extras. There is a one-disc version that has commentaries and a Making Of but the two-disc adds more.","Will Smith co-stars alongside more CGI robots than you can count and as a thrill-a-minute kind of action film, it's perfectly adequate.Unusually for this kind of film, the extras don't solely concentrate on the special effects.But as long as you're not expecting a documentary, live with it: King Arthur is a fun, exciting, totally shallow experience and looks excellent.Only one man recognises that robots are a threat to humanity - but that's fine because it only takes one man to save the day in the thriller I, Robot.Admit it, the fact that this is one of the shows Alan Titchmarsh left Ground Force to present did mean that you expected something equally frothy.It's at its best in its battle scenes which are well done and are also the best part of the Making Of extra."
