## Install Transformers Library
conda create --name myenv --no-default-packages

conda create --name myenv python

conda create --name myenv python=3.6

In [None]:
!pip install transformers

## Import Necessary Libraries

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, Markdown
from pathlib import Path

import numpy as np
import pandas as pd 

import torch
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

from nltk.tokenize import sent_tokenize

## Load Dataset

In [None]:
# Read the 2013-2017 training split and the 2018 validation split into DataFrames
train_df, valid_df = (
    pd.read_csv(csv_name)
    for csv_name in ('crypto_news_parsed_2013-2017_train.csv',
                     'crypto_news_parsed_2018_validation.csv')
)

In [None]:
train_df.head()

In [None]:
train_df['text'][0]

In [None]:
# Count missing (NaN) values per column
train_df.isna().sum()

In [None]:
# Reading empty strings behaves a bit differently locally and here, so any
# missing 'text' value is replaced with a single space.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment and may modify a temporary copy
# instead of train_df itself.
train_df['text'] = train_df['text'].fillna(' ')

In [None]:
train_df.isna().sum()

In [None]:
#check data shape
train_df.shape, valid_df.shape

## Text Preprocessing of the data

In [None]:
# Backslash escapes in Python strings stand for whitespace characters:
# "\t" is a tab, "\n" a newline and "\r" a carriage return.
def minimal_processing(s):
    """Trim *s* and flatten it onto one line.

    Surrounding whitespace is stripped, carriage returns are removed and each
    newline is turned into a single space.
    """
    trimmed = s.strip()
    without_cr = trimmed.replace('\r', '')
    return without_cr.replace('\n', ' ')

In [None]:
def extract_and_process_first_k_sent(text, k=3):
    """Return the first *k* sentences of *text*, cleaned and joined by spaces.

    Each kept sentence has surrounding spaces and dots removed, is lower-cased
    and then passed through ``minimal_processing``.

    Args:
        text: The article body. Non-string values (e.g. ``None`` or a NaN
            produced by pandas for a missing cell) are treated as empty.
        k: Maximum number of leading sentences to keep.

    Returns:
        The processed sentences joined by single spaces, or ``' '`` when there
        is nothing to tokenize.
    """
    # Missing cells arrive as float NaN / None; they cannot be tokenized, so
    # they get the same placeholder as an empty article.
    if not isinstance(text, str):
        return ' '

    sent_tok = sent_tokenize(text)
    if not sent_tok:
        return ' '

    return " ".join(
        minimal_processing(sent.strip(' .').lower()) for sent in sent_tok[:k]
    )

In [None]:
# Processing of the training texts is currently disabled:
# train_texts = train_df['text'].progress_apply(lambda text:extract_and_process_first_k_sent(text))

# Keep at most the first 10 processed sentences of every validation article
# (progress_apply is the progress-bar variant of apply enabled by tqdm.pandas()).
valid_texts = valid_df['text'].progress_apply(lambda text:extract_and_process_first_k_sent(text, k=10))

In [None]:
v_t=valid_texts[0:4]

In [None]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    torch_device = 'cuda:0'
else:
    torch_device = 'cpu'

In [None]:
torch_device

## Import BART Tokenizer and BART Model

In [None]:
# Load the pretrained "facebook/bart-large" tokenizer and conditional-generation
# model; the model is moved to torch_device.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large").to(torch_device)

## Extract Text And Title from DataSet

In [None]:
example_text = train_df.loc[0, 'text']
example_title = train_df.loc[0, 'title']

In [None]:
display(Markdown('> **Title:** ' + example_title))
display(Markdown('> **Text:** ' + example_text))

## Tokenization

In [None]:
# Tokenize the single example article as a batch of one.
# truncation=True makes the 128-token cap explicit; passing max_length alone
# leaves truncation to a deprecated fallback that emits a warning in recent
# transformers releases.
article_input_ids = tokenizer.batch_encode_plus(
    [example_text],
    return_tensors='pt',
    truncation=True,
    max_length=128,
)['input_ids'].to(torch_device)


In [None]:
# Generate a summary for the example article (inference only; no training
# happens here). Uses 4 beams, an output length between 5 and 20 tokens, and
# forbids repeating any 3-gram.
summary_ids = model.generate(article_input_ids,
                             num_beams=4,
                             length_penalty=2.0,
                             max_length=20,
                             min_length=5,
                             no_repeat_ngram_size=3)



In [None]:
# Decode the generated token ids back to text (special tokens skipped) and
# display the summary as Markdown
summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
display(Markdown('> **Summary:** ' + summary_txt))

## Generate Summaries for the Validation Data

In [None]:
bs = 2              # batch size
n_val_examples = 4  # articles to summarise; same count as `v_t = valid_texts[0:4]`

val_summaries = []

# The previous `range(0, 3, bs)` only covered 4 articles because the last
# batch overshot the bound; naming the count keeps the number of summaries
# fixed for any batch size.
for start in tqdm(range(0, n_val_examples, bs)):

    # Never read past the requested number of examples, whatever `bs` is
    batch_texts = valid_texts.iloc[start:min(start + bs, n_val_examples)].tolist()

    # `padding` / `truncation` replace the deprecated `pad_to_max_length` flag
    encoded = tokenizer.batch_encode_plus(batch_texts,
                                          return_tensors='pt',
                                          padding='max_length',
                                          truncation=True,
                                          max_length=512)

    # Pass the attention mask explicitly so padded positions are ignored
    summary_ids = model.generate(encoded['input_ids'].to(torch_device),
                                 attention_mask=encoded['attention_mask'].to(torch_device),
                                 num_beams=4,
                                 length_penalty=2.0,
                                 max_length=40,
                                 min_length=5,
                                 no_repeat_ngram_size=3)

    # One lower-cased summary string per generated sequence in the batch
    val_summaries.extend(tokenizer.decode(ids, skip_special_tokens=True).lower()
                         for ids in summary_ids)


In [None]:
val_summaries

In [None]:
valid_titles = valid_df['title'].str.lower().tolist()

## ROUGE: an evaluation metric for text summarization and machine translation
Reference: https://medium.com/@prabha88978/installation-working-process-of-rouge-1-5-5-6c0dfdca49e8

In [None]:
!pip install rouge

In [None]:
from rouge import Rouge
rouge = Rouge()

# Hypotheses are the first sentence (text before the first '.') of each
# generated summary.
first_sentences = [el.split('.')[0] for el in val_summaries]

# Summaries exist only for the first len(val_summaries) validation rows, so the
# reference titles are cut to the same length explicitly instead of relying on
# the scorer to pair up lists of different lengths.
scores = rouge.get_scores(hyps=first_sentences,
                          refs=valid_titles[:len(first_sentences)],
                          avg=True,
                          ignore_empty=True)

In [None]:
# Single headline score: unweighted mean of the 'f' entries (F-scores in the
# `rouge` package) for ROUGE-1, ROUGE-2 and ROUGE-L
final_metric = (scores['rouge-1']['f'] + scores['rouge-2']['f'] + scores['rouge-l']['f']) / 3
final_metric

In [None]:
print(len(valid_titles))
print(len(val_summaries))
print(len(valid_texts))

In [None]:
# Collect reference title, generated title and source text side by side.
# The 'title' column must hold the reference titles of the summarised rows;
# a scalar here would be broadcast to every row.
val_res_df = pd.DataFrame({'title': valid_titles[:len(val_summaries)],
                           'generated': val_summaries,
                           'text': v_t.values}).reset_index(drop=True)

In [None]:
# Per-example ROUGE scores (avg=False returns one score dict per pair).
# References are cut to the number of generated summaries so both lists line
# up explicitly rather than through the scorer's internal pairing.
val_rouge_scores = rouge.get_scores(hyps=val_summaries,
                                    refs=valid_titles[:len(val_summaries)],
                                    avg=False,
                                    ignore_empty=True)

In [None]:
# One DataFrame column per ROUGE variant, filled with the per-example 'f' score
for column, variant in (('rouge-1', 'rouge-1'), ('rouge-2', 'rouge-2'), ('rouge-L', 'rouge-l')):
    val_res_df[column] = [score[variant]['f'] for score in val_rouge_scores]

# Row-wise mean of the three variants
val_res_df['avg_rouge'] = (val_res_df['rouge-1'] + val_res_df['rouge-2'] + val_res_df['rouge-L']) / 3

In [None]:
val_res_df.head()

In [None]:
def print_result(row):
    """Display one result row between two 68-character rules.

    Shows, as Markdown, the average ROUGE rounded to 3 decimals, then the
    'title', 'text' and 'generated' entries of *row*.
    """
    rule = '_' * 68
    print(rule)
    display(Markdown('> **Rouge:** ' + str(round(row['avg_rouge'], 3))))
    # Title and text are converted with str() before being shown
    for label, key in (('Title', 'title'), ('Text', 'text')):
        display(Markdown('> **' + label + ':** ' + str(row[key])))
    display(Markdown('> **Generated:** ' + row['generated']))
    print(rule)

In [None]:
# Show the best-scoring rows: sort by average ROUGE, highest first, and print
# the rows returned by head()
for _, row in val_res_df.sort_values(by='avg_rouge', ascending=False).head().iterrows():
    print_result(row)

In [None]:
# Save the evaluated validation rows to CSV without the row index.
# `index` is documented as a boolean, so False is passed rather than relying on
# None being falsy.
val_res_df.to_csv('val_set_with_bart_generated_titles.csv', index=False)

In [None]:
a=256

In [None]:
b=256

In [None]:
# `is` compares object identity, not value; for equal ints the result depends
# on the interpreter's caching of small integers, so use `==` to compare values
a is b

In [10]:
a=255

In [11]:
b=255

In [12]:
a is b

True

In [None]:
id(a)

In [None]:
id(b)