In [None]:
# Install and import all the dependencies

!pip install contractions
!pip install transformers
!pip install datasets
!pip install rouge_score

import os
import pandas as pd
import contractions
import numpy as np
import torch
import tqdm
import codecs
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from torch import nn, optim
from transformers import AutoTokenizer, GPT2Model
from datasets import load_metric

In [None]:
# Access the Google Drive
# NOTE(review): `google.colab` is presumably only importable inside a Google Colab runtime — confirm before running elsewhere.

from google.colab import drive
# Mount point is '/content/drive'; the Drive contents are accessed below that folder.
drive.mount('/content/drive')

In [None]:
# To fill with the path where the data is stored.
# The path is relative to the current working directory, so the result depends on where the
# notebook is when this cell runs.
# NOTE(review): re-running the cell without restarting presumably fails, because the working
# directory has already been changed — confirm.

os.chdir("drive/My Drive/Colab Notebooks/Language")
# List the folder contents to check that the data files are visible
!ls

In [None]:
# Pre-processing of the data

def pre_process(input_filename, text_column_name, summary_column_name, output_filename):
    """
    Pre-processes a .csv file containing texts and summaries, by expanding english contractions
    and storing the obtained data in a new .csv file.

    Rows in which the text or the summary is missing are dropped. The output file contains the
    DataFrame index followed by two columns named 'Text' and 'Summary'.

    Parameters:
        input_filename (str)     : The name of the .csv file that is to be pre-processed
        text_column_name (str)   : The name of the column of the .csv file which contains the texts
        summary_column_name (str): The name of the column of the .csv file which contains the summaries
        output_filename (str)    : The name of the .csv file where the pre-processed data is to be stored
    """

    # pandas opens and closes the file by itself, no explicit handle is needed
    df_init = pd.read_csv(input_filename, encoding='utf-8')
    print("Init pandas file")

    # Remove all the useless columns, keep only the texts and the summaries
    df = (
        df_init[[summary_column_name, text_column_name]]
        .rename(columns={text_column_name: 'Initial_text'})
        .rename(columns={summary_column_name: 'Initial_summary'})
        .dropna()  # Remove rows with NaN values
    )
    print("Remove useless columns: Done")

    def _expand(value):
        # str() guards against cells that pandas parsed as numbers (same treatment for both columns)
        return ' '.join(contractions.fix(word) for word in str(value).split())

    # Contractions expansion, word by word
    df['Text'] = df['Initial_text'].apply(_expand)
    df['Summary'] = df['Initial_summary'].apply(_expand)
    print('Contractions expansion: Done')

    # Save the new .csv file (the index column is kept so the file layout stays unchanged)
    df[['Text', 'Summary']].to_csv(output_filename)

In [None]:
# The data is originally stored in 'Reviews.csv', the processed version is stored in 'Reviews_preprocessed.csv'.
# The texts are read from the 'Text' column and the summaries from the 'Summary' column.

pre_process('Reviews.csv', 'Text', 'Summary', 'Reviews_preprocessed.csv')

In [None]:
# Load the GPT-2 tokenizer, and give it a pad token

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# The end-of-sequence token is reused as the padding token, so padded positions carry the EOS token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Definition of the custom dataset class 

class SummarizationDataset(Dataset):
    """
    Dataset of tokenized (text, attention mask, summary) triples read from a pre-processed .csv file.

    The file must contain a 'Text' and a 'Summary' column. Every row is tokenized once, at
    construction time, with the module-level `tokenizer`. Texts are truncated or padded to
    `max_text_length` tokens and summaries to `max_summary_length` tokens.

    Parameters:
        filename (str)          : The name of the .csv file containing the texts and the summaries
        max_text_length (int)   : Number of tokens kept for each text
        max_summary_length (int): Number of tokens kept for each summary
        stop (int)              : Only the first `stop` rows of the file are used
    """

    def __init__(self, filename, max_text_length=900, max_summary_length=60, stop = 10000):
        self.max_text_length = max_text_length
        self.max_summary_length = max_summary_length

        print("Creating Mapping")

        # keep_default_na=False keeps cells such as "NA" or "null" as text instead of turning them
        # into float NaN values, which the tokenizer cannot process.
        df = pd.read_csv(filename, encoding='utf-8', keep_default_na=False).head(stop)

        # pandas has already consumed the header line: every row of `df` is a sample
        self.total_lines = len(df)

        pad_token_id = tokenizer.pad_token_id
        self.input_ids = torch.full((self.total_lines, self.max_text_length), pad_token_id, dtype = torch.long)
        self.label_ids = torch.zeros((self.total_lines, self.max_summary_length), dtype = torch.long)
        # Despite its name, this tensor stores the number of real (non padding) tokens of each text;
        # the actual mask is rebuilt from it in __getitem__.
        self.attention_mask = torch.zeros(self.total_lines, dtype = torch.long)

        # Processes the given file, by creating the summaries and texts tensors (each line of these tensors is a list of token indices)
        samples = zip(df['Text'], df['Summary'])
        for position, (text, summary) in enumerate(tqdm(samples, total=self.total_lines)):
            # The texts which are longer than max_text_length are truncated, the others are padded.
            # str() protects against cells parsed as numbers; any other tokenization error is raised
            # instead of silently reusing the tensors of the previous row.
            text_tokenizer = tokenizer(str(text), return_tensors="pt", max_length=self.max_text_length, truncation=True, padding='max_length')
            summary_tokenizer = tokenizer(str(summary), return_tensors="pt", max_length=self.max_summary_length, truncation=True, padding='max_length')

            self.input_ids[position] = text_tokenizer['input_ids'][0]
            # The tokenizer's own mask marks the real tokens; its sum is the text length.
            # (The pad token is also the EOS token, so the ids alone cannot be used to find the padding.)
            self.attention_mask[position] = text_tokenizer['attention_mask'].sum()
            self.label_ids[position] = summary_tokenizer['input_ids'][0]

        print(f'Processed {self.total_lines} lines.')

    def __len__(self):
        """Returns the number of samples stored in the dataset."""
        return self.total_lines

    def __getitem__(self, index):
        """
        Returns the sample stored at position `index`.

        Returns:
            input_ids      : tensor of max_text_length token indices (padded text)
            attention_item : tensor of shape (1, max_text_length), 1 on the real tokens and 0 on the padding
            label_ids      : tensor of max_summary_length token indices (padded summary)
        """
        attention_item = torch.zeros((1,self.max_text_length), dtype = torch.long)
        # Fill along the token axis (second axis); the first axis only has size 1
        attention_item[0, : int(self.attention_mask[index])] = 1
        return self.input_ids[index], attention_item, self.label_ids[index]

In [None]:
# Maximum length (in tokens) of the texts and of the summaries

max_text_length=500
max_summary_length=40
# Sum of both lengths; it is the tokenization length used by the `test` function of this notebook
max_length =  max_summary_length + max_text_length

In [None]:
# Creation of the dataset from the pre-processed file.
# `stop` bounds the number of rows taken from the file (here the first 50000).

dataset = SummarizationDataset('Reviews_preprocessed.csv', max_text_length=max_text_length, max_summary_length=max_summary_length, stop=50000)

In [None]:
# Split the data into train/validation/test datasets (80% / 10% / remaining samples)

SPLIT_SEED = 42

total_size = len(dataset)
training_size = int(0.8 * total_size)
validation_size = int(0.1 * total_size)
test_size = total_size - training_size - validation_size

# The split is seeded so that it is identical in every session. The model can be resumed from
# 'model.pt', and an unseeded split would let samples that were used for training in an earlier
# session end up in the validation or test sets of a later one.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
training_set, validation_set, test_set = random_split(
    dataset,
    [training_size, validation_size, test_size],
    generator=split_generator,
)

In [None]:
# Load the datasets in DataLoaders

batch_size = 4
# Only the training data is shuffled. The validation and test loaders keep a fixed order so that
# successive evaluations go through the same samples in the same order and stay comparable.
train_dataset_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True)
valid_dataset_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)
test_dataset_loader  = DataLoader(test_set , batch_size=1, shuffle=False)
print("train size : " , len(training_set))
print("validation size : " , len(validation_set))
print("test size : " , len(test_set))

In [None]:
# Load the model and prepare for training

lr = 1e-4
weight_decay = 0.01
epochs = 4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model: either GPT-2 if no model is stored, or 'model.pt' if you have already started to fine-tune and saved the latest version
model = GPT2Model.from_pretrained("gpt2")
try:
    model.load_state_dict(torch.load("model.pt", map_location=device))
except FileNotFoundError:
    # Only a missing checkpoint falls back to the pretrained weights. A corrupted or incompatible
    # 'model.pt' now raises instead of being silently ignored and later overwritten by training.
    print("No model find")

if torch.cuda.is_available():
    print("CUDA")
    model.cuda()

optimizer = optim.Adam(model.parameters(),lr=lr,weight_decay=weight_decay)
# 50256 is ignored in the loss. NOTE(review): presumably the id of the GPT-2 EOS token, which the
# tokenizer cell also uses as padding token — confirm against tokenizer.pad_token_id.
criterion = nn.CrossEntropyLoss(ignore_index=50256)
# The input embedding matrix is reused to project the hidden states onto the vocabulary.
# It is a parameter of `model`, so it follows the model when the model changes device; no separate
# .cuda() call is needed (such a call returns a copy and leaves this variable untouched).
word_embedding = model.wte.weight

In [None]:
# Define the function to compute and print the ROUGE scores

def rouge_scores(dataset_loader) :
    """
    Computes and prints the ROUGE-1, ROUGE-2 and ROUGE-L scores (mid F-measure, in percent) of the
    module-level `model` on every sample of the given loader.

    For each sample, the prediction is obtained by projecting the first `max_summary_length`
    hidden states onto the vocabulary with `word_embedding` and taking the most likely token at
    every position. The model is evaluated in eval mode without gradient tracking, and its previous
    training/eval mode is restored afterwards.

    Parameters:
        dataset_loader (DataLoader): Loader yielding (text, attention mask, summary) batches
    """
    predictions = []
    summaries = []

    # NOTE(review): `datasets.load_metric` is deprecated in recent releases of `datasets` (and
    # presumably removed in the 3.x series) — pin the version or migrate to the `evaluate` package.
    metric = load_metric("rouge")

    # Run on whichever device the model currently lives on (it is moved between GPU and CPU in this notebook)
    device = next(model.parameters()).device
    was_training = model.training
    model.eval() # disable dropout during the evaluation
    try:
        with torch.no_grad():
            for data in tqdm(dataset_loader):
                text, mask, summary = data
                text, mask = text.to(device), mask.to(device)
                hidden_states = model(input_ids=text, attention_mask=mask, output_hidden_states=False).last_hidden_state[:, :max_summary_length]
                scores = hidden_states @ word_embedding.T # compute the logits for every output
                prediction = torch.argmax(scores,dim=-1) # take the largest logits as prediction, one row per sample

                # Decode the complete predicted sequence of every sample of the batch
                for predicted_ids, summary_ids in zip(prediction, summary):
                    predictions.append(tokenizer.decode(predicted_ids,skip_special_tokens=True))
                    summaries.append(tokenizer.decode(summary_ids,skip_special_tokens=True))
    finally:
        model.train(was_training)

    result = metric.compute(predictions=predictions, references=summaries, use_stemmer=True)
    rouge1_result = result["rouge1"].mid.fmeasure * 100
    rouge2_result = result["rouge2"].mid.fmeasure * 100
    rougeL_result = result["rougeL"].mid.fmeasure * 100

    print("Rouge 1 : ",rouge1_result)
    print("Rouge 2 : ",rouge2_result)
    print("Rouge L : ",rougeL_result)


In [None]:
# Training of the model

# Validation scores before any update of this session, then a first checkpoint
rouge_scores(valid_dataset_loader)
torch.save(model.state_dict(),"model.pt")

for epoch in range(epochs):
    # Training mode is (re-)enabled once per epoch, whatever mode the evaluation left the model in
    model.train()
    for cpt, batch in enumerate(tqdm(train_dataset_loader), start=1):
        text, mask, summary = batch
        if torch.cuda.is_available(): # load the batch on the gpu if available
            text,mask ,summary= text.cuda(), mask.cuda(), summary.cuda()
        optimizer.zero_grad()
        output = model(input_ids=text, attention_mask=mask, output_hidden_states=False).last_hidden_state[:,:max_summary_length] # compute the output of the model
        scores = output @ word_embedding.T # compute the logits for every output
        loss = criterion(scores.view(-1, scores.size(-1)), summary.view(-1)) # compute the loss
        loss.backward() # backpropagation
        optimizer.step()
        if cpt % 5000 == 0:
            # loss.item() already returns a Python number, no explicit transfer to the cpu is needed
            print('Batch {}: {}'.format(cpt, loss.item()))

    # Checkpoint and validation scores at the end of every epoch
    torch.save(model.state_dict(),"model.pt")
    rouge_scores(valid_dataset_loader)

In [None]:
# Test the model on a few examples

# The examples are run on the cpu. `word_embedding` is a parameter of the model, so it is moved
# together with it (calling .cpu() on it separately would only return an unused copy).
if torch.cuda.is_available():
    model.cpu()

# The training loop leaves the model in training mode: switch to eval mode so that dropout
# does not make the predictions below random.
model.eval()

# Hand-picked review texts used as qualitative examples.
# Each one starts with a "summarize: " prefix. NOTE(review): the dataset class tokenizes the
# 'Text' column as it is, without such a prefix — confirm that the prefix is intended here.
texts = [
    "summarize: I recently tried this flavor/brand and was surprised at how delicious these chips are.  The best thing was that there were a lot of 'brown' chips in the bsg (my favorite), so I bought some more through amazon and shared with family and friends.  I am a little disappointed that there are not, so far, very many brown chips in these bags, but the flavor is still very good.  I like them better than the yogurt and green onion flavor because they do not seem to be as salty, and the onion flavor is better.  If you haven't eaten Kettle chips before, I recommend that you try a bag before buying bulk.  They are thicker and crunchier than Lays but just as fresh out of the bag.",
    "summarize: Great chips, great price.  Odds are that you will have them to yourself.  If you like your fish and chips soaked in malt vinegar, you will love these.  The best salt and vinegar chips I have ever had (and I love this flavor, mind you).  They are spectacular with deli sandwiches or on their own.  Since I have found that S&V is palatable to only the most intelligent of our species I know that my afternoon snack is all mine, unless of course I run into another heavy brained, hearty breathed like mind.  -Summary- If you like the taste of bitter sweet salty vinegar and a crispy chip to boot you wont pucker at the sight of these handily bagged morsels.  It is the first and only time I have committed to a whole case of chips and I will do it again.  Mmmmm... vinegar...",
    "summarize: These chips taste awesome. And unlike most other flavored chips, they actually make sure that plenty of the flavory salty goodness gets on each individual chip. Just don't pass gas near any pretty ladies after consumption. They'll totally know it was you.",
    "summarize: So I got this and tasted it strait out of the bottle, it tasted like smoky flavored milk - YUCK! I was depressed I was stuck with 4 bottles of this.  It sat on my shelf and I forgot about it. Last weekend I tasted a Zevia cream soda and was not pleased with the flavor, it tasted weird and smokey like the LorAnn oil I had gotten.  I tried to doctor up the soda with a little SF Vanilla Torani syrup and a tablespoon of heavy cream - IT TASTED AMAZING!!!  The odd smokiness was gone and it had a wonderful rich mouthfeel.<br /><br />So I wanted to see if this would help the LorAnn oil too.  I put a cup of whipping cream in a bowl and added a teaspoon of vanilla and tasted it, it tasted fine.  Then I added ONE DROP of the oil, it is very strong, and stirred it in - IT TASTED SO MUCH BETTER!  I whipped it up and put it on fruit and it was such a treat.<br /><br />Now I will always whip a drop of it with vanilla into desserts and whip cream.  It was such a happy accident to find out how to use it.  When added to vanilla it really adds a new depth of flavor and tastes so different than just out of the bottle or on its own in cream.<br /><br />It is VERY potent so only use a drop and increase after tasting, if you add too much it will ruin the recipe so use a light touch.",
]

# Reference summaries, listed in the same order as `texts` (they are paired by position below)
summaries = [
    "Best sour cream & onion chip I've had", 
    "So Delicious...Yet my companions wont touch them.",
    "So much flavor your farts will smell like sweet onions",
    "Do not taste from bottle! Mix with vanilla for true flavor.",
]

def test(example) :
    """
    Predicts a summary for a single text with the module-level `model`.

    The text is tokenized (truncated or padded to `max_length` tokens), the first
    `max_summary_length` hidden states are projected onto the vocabulary with `word_embedding`,
    and the most likely token of every position is decoded.

    Parameters:
        example (str): The text to summarize

    Returns:
        str: The decoded prediction, without special tokens
    """
    inputs =  tokenizer(example, return_tensors="pt", max_length=max_length, truncation=True, padding='max_length')
    # Put the inputs on the device of the model, wherever the model currently lives
    model_device = word_embedding.device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    model.eval() # dropout would otherwise make the prediction random after training
    with torch.no_grad(): # no gradient is needed for inference
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state[0,:max_summary_length]
        scores = last_hidden_states @ word_embedding.T
        prediction = torch.argmax(scores,dim=-1)

    token = tokenizer.decode(prediction,skip_special_tokens=True)
    return token

# Print the predicted summary next to the reference one, for every example
for example_text, reference_summary in zip(texts, summaries):
    predicted_summary = test(example_text)
    print('Predicted summary: {}'.format(predicted_summary))
    print('Original summary: {}'.format(reference_summary))

In [None]:
# ROUGE score on the test data

# The examples cell moves the model to the cpu, while `rouge_scores` sends the batches to the gpu
# whenever one is available: put the model back on the gpu first to avoid a device mismatch
# (and a very slow evaluation).
if torch.cuda.is_available():
    model.cuda()

rouge_scores(test_dataset_loader)

100%|██████████| 5001/5001 [03:17<00:00, 25.37it/s]


Rouge 1 :  6.0435119658275
Rouge 2 :  0.0
Rouge L :  6.057487858627633
