In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Summarize one text

def summarize(text):
    
    # Removing the first common part

    splitted_text = text.split('Numac')
    splitted_text = splitted_text[1]

    # removing symbols

    no_symboles = splitted_text.replace('\n',' ')
    no_symbols = no_symbols.replace('/','')
    no_symbols = no_symbols.replace('§','')
    
    # result = ''.join([i for i in trial if not i.isdigit()])
    
    
    # Creating Tokens

    nlp = spacy.load('nl_core_news_sm')
    text = no_symbols
    tokenized = nlp(text)
    tokens = [token.text for token in tokenized]
    
    # Removing stopwords
    stopwords = spacy.lang.nl.stop_words.STOP_WORDS
    
    # Lemmatizing the text
    text_no_stop = [lemma for lemma in tokens if lemma not in stopwords]
    cleaned = ' '.join(text_no_stop)
    
    # Removing 'begin, eerste, laatste,...' from the end of the text and creating our main text.
    index = cleaned.rfind('begin')
    
    corpus = cleaned[:index]
    
    # Starting the summarizer
    
    import transformers
    undisputed_best_model = transformers.MBartForConditionalGeneration.from_pretrained(
        "ml6team/mbart-large-cc25-cnn-dailymail-nl-finetune")
    
    tokenizer = transformers.MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    summarization_pipeline = transformers.pipeline(
        task="summarization",
        model=undisputed_best_model,
        tokenizer=tokenizer,
    )
    summarization_pipeline.model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
        "nl_XX"
    ]


    article = corpus
    summarization_pipeline(
        article,
        do_sample=True,
        top_p=0.75,
        top_k=50,
        # num_beams=4,
        min_length=50,
        early_stopping=True,
        truncation=True,
    )[0]["summary_text"]
    

In [6]:
def summarize_csv(filepath):
    df = pd.read_csv(filepath)

    # Remove the German Translations

    df = df[df["Text"].str.contains("Duitse vertaling")==False]

    # Remove empty text rows if any are left.

    df.dropna(axis = 0, how ='any', inplace = True)
    
    # Create a summary column
    
    df['Summary'] = ''
    
    # Loop over the columns
    
    for idx, row in df.iterrows():
        
        text = df.loc[idx,'Text']
        
        # Removing the first common part

        splitted_text = text.split('Numac')
        splitted_text = splitted_text[1]

        # Removing symbols.

        no_symbols = splitted_text.replace('\n',' ')
        no_symbols = no_symbols.replace('/','')
        no_symbols = no_symbols.replace('§','')
    
        # Removing 'begin, eerste, laatste,...' from the end of the text and creating our main text.
        
        index = no_symbols.rfind('begin')
    
        corpus = no_symbols[:index]
    
        # loading mBart finetune Model

        undisputed_best_model = transformers.MBartForConditionalGeneration.from_pretrained(
            "ml6team/mbart-large-cc25-cnn-dailymail-nl-finetune"
        )
        tokenizer = transformers.MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
        summarization_pipeline = transformers.pipeline(
            task="summarization",
            model=undisputed_best_model,
            tokenizer=tokenizer,
        )
        summarization_pipeline.model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
            "nl_XX"
        ]

        article = corpus 
        df.loc[idx,'Summary'] = summarization_pipeline(
            article,
            do_sample=True,
            top_p=0.75,
            top_k=50,
            # num_beams=4,
            min_length=50,
            early_stopping=True,
            truncation=True,
        )[0]["summary_text"]
        
    df = df.reset_index(drop=True)    
    df.to_excel("KPMG Tax Case - Summarized.xlsx", index=False)
    df.to_csv("KPMG Tax Case - CSV_Summarized.csv", index=False, encoding="utf-8-sig")

Unnamed: 0,Date,Title,Numac,Link FR,Link NL,Text,Cleaned Text,Summary_1,Summary_2
0,1/14/2020,REGION DE BRUXELLES-CAPITALE\nREGION DE BRUXEL...,2020010053,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...,,,
3,1/24/2020,MINISTERE DE LA COMMUNAUTE FRANCAISE\n20 DECEM...,2020010214,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\n\neinde eerste woord laatste woord\nPub...,,,
4,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\n20 JANVIER 20...,2020040138,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...,,,
5,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\n20 JANVIER 20...,2020020094,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...,,,
6,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\nAdministratio...,2020010193,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\n\neinde eerste woord laatste woord\nPub...,,,


In [37]:
def summarize

for idx, row in df.iterrows():
    string = df.loc[idx,'Text']
    # Removing the first common part

    splitted_text = string.split('Numac')
    splitted_text = splitted_text[1]

    # testing

    trial = splitted_text.replace('\n',' ')
    trial = trial.replace('/','')
    trial = trial.replace('§','')
    
    # result = ''.join([i for i in trial if not i.isdigit()])
    
    
    # Creating Tokens

    nlp = spacy.load('nl_core_news_sm')
    text = trial
    tokenized = nlp(text)
    tokens = [token.text for token in tokenized]

    stopwords = spacy.lang.nl.stop_words.STOP_WORDS

    text_no_stop = [lemma for lemma in tokens if lemma not in stopwords]
    cleaned = ' '.join(text_no_stop)

    index = cleaned.rfind('begin')
    
    corpus = trial[:index]
    
    df.loc[idx,'Cleaned Text'] = corpus


IndexError: list index out of range

In [61]:
# mBart finetune Model

import transformers
undisputed_best_model = transformers.MBartForConditionalGeneration.from_pretrained(
    "ml6team/mbart-large-cc25-cnn-dailymail-nl-finetune"
)
tokenizer = transformers.MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
summarization_pipeline = transformers.pipeline(
    task="summarization",
    model=undisputed_best_model,
    tokenizer=tokenizer,
)
summarization_pipeline.model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
    "nl_XX"
]


for idx, row in df.loc[500:].iterrows():
    
    article = df.loc[idx,'Cleaned Text']  
    df.loc[idx,'Summary_1'] = summarization_pipeline(
        article,
        do_sample=True,
        top_p=0.75,
        top_k=50,
        # num_beams=4,
        min_length=50,
        early_stopping=True,
        truncation=True,
    )[0]["summary_text"]

Your max_length is set to 1024, but you input_length is only 581. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=290)
Your max_length is set to 1024, but you input_length is only 582. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=291)
Your max_length is set to 1024, but you input_length is only 511. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=255)
Your max_length is set to 1024, but you input_length is only 353. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=176)
Your max_length is set to 1024, but you input_length is only 406. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=203)
Your max_length is set to 1024, but you input_length is only 457. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=228)
Your max_length is set to 1024, but you input_length is only 508

Your max_length is set to 1024, but you input_length is only 531. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=265)
Your max_length is set to 1024, but you input_length is only 978. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=489)
Your max_length is set to 1024, but you input_length is only 744. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=372)
Your max_length is set to 1024, but you input_length is only 116. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 1024, but you input_length is only 115. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 1024, but you input_length is only 726. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=363)
Your max_length is set to 1024, but you input_length is only 852. 