In [1]:
# connect to google colab
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [7]:
# base path
DATA_PATH = './drive/MyDrive/fyp-code/codes/data/ecpe/'
DEST_PATH = './drive/MyDrive/fyp-code/codes/data/subtasks/'

In [3]:
# load transformers
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 4.1 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 7.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 40.9 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 48.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 52.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [4]:
# load sentencepiece library
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.2 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [5]:
# usual import
import pandas as pd
import numpy as np
from tqdm import tqdm

# to load the pegasus model
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# check if it is running with GPU or not
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


## Import and Load Models
We will be trying out 2 variant of pegasus models, both trained with different text corpus
- pegasus-xsum
- pegasus-reddit_tifu

In [6]:
# load tokenizers
tokenizer_xsum = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
tokenizer_reddit = PegasusTokenizer.from_pretrained("google/pegasus-reddit_tifu")

# load the models
model_xsum = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to(device)
model_reddit = PegasusForConditionalGeneration.from_pretrained("google/pegasus-reddit_tifu").to(device)

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

## Load the datasets

### Import Short Text

In [8]:
short_data = pd.read_csv(DATA_PATH+'ecpe_cleaned_short_data.csv')[['text_cleaned_ecpe']]
short_data.head(3)

Unnamed: 0,text_cleaned_ecpe
0,I get to spend New Year is home again alone an...
1,"Depressed and lonely /: Stuck in a deep, never..."
2,Learning to pretend to have a good time had be...


### Import Long Text

In [9]:
long_data = pd.read_csv(DATA_PATH+'ecpe_cleaned_long_data.csv')[['text_cleaned_ecpe']]
long_data.head(3)

Unnamed: 0,text_cleaned_ecpe
0,Just another night. Another night of feeling l...
1,Is it possible to fake depression? I have been...
2,Imagine being attractive Imagine what it would...


## Helper function to summarize text

In [10]:
# helper function to summarize the text
def summarize(text, tokenizer_type, model_type):
    # create tokens - number representation of our text
    tokens = tokenizer_type(text, truncation=True, padding="longest", return_tensors="pt").to(device)
    
    # summarize the text
    summary = model_type.generate(**tokens)

    return tokenizer_type.decode(summary[0])

## Perform Abstractive Summarization

In [12]:
# try on one piece of data

# original text
print(long_data.text_cleaned_ecpe[10])
print()

# summarized text - xsum
summary_xsum = summarize(long_data.text_cleaned_ecpe[10], tokenizer_xsum, model_xsum)
print(summary_xsum)
print()

# summarized text - reddit-tifu
summary_reddit = summarize(long_data.text_cleaned_ecpe[10], tokenizer_reddit, model_reddit)
print(summary_reddit)

Hi. I am new to the forums. I was going to lurk for a bit before I posted, but there is so many of us, (I mean, I knew. but I did not know.. you know? it seems like the thing to do. I am a 24 year old male and I have had problems with depression for as long as I can remember, though it is become worse over the last year or so. I was pretty heavily involved in music and martial arts for years, but I am no longer motivated to do either. I am still going to school, though Hi. I am new to the forums. I was going to lurk for a bit before I posted, but there is so many of us, (I mean, I knew. but I did not know.. you know? it seems like the thing to do. I am a 24 year old male and I have had problems with depression for as long as I can remember, though it is become worse over the last year or so. I was pretty heavily involved in music and martial arts for years, but I am no longer motivated to do either. I am still going to school, though I have no real direction and I am afraid that my lac

## Begin the pegasus text summarization that is trained on xsum dataset

In [14]:
# short data
short_data_summaries_xsum = []

# get the predicted summary for the whole short text dataset
for sentence in tqdm(short_data.text_cleaned_ecpe.tolist()):
    summary = summarize(sentence, tokenizer_xsum, model_xsum)
    short_data_summaries_xsum.append(summary)

100%|██████████| 843/843 [18:39<00:00,  1.33s/it]


In [24]:
# create a dataframe to store the text summaries
summary_short_df = pd.DataFrame()
summary_short_df['text_cleaned'] = short_data.text_cleaned_ecpe
summary_short_df['text_summarized_xsum'] = short_data_summaries_xsum
summary_short_df.head()

Unnamed: 0,text_cleaned,text_summarized_xsum
0,I get to spend New Year is home again alone an...,It's that time of year again.
1,"Depressed and lonely /: Stuck in a deep, never...","Depressed and lonely /: Stuck in a deep, never..."
2,Learning to pretend to have a good time had be...,When I was a child I used to pretend to be som...
3,So far he stop texting meafter I said somethin...,I've been trying to get my boyfriend to stop t...
4,*sigh* ?? I have not cried so muchI am in so m...,I have not cried so much as I have done in so ...


In [16]:
# long data
long_data_summaries_xsum = []

# get the predicted summary for the whole short text dataset
for sentence in tqdm(long_data.text_cleaned_ecpe.tolist()):
    summary = summarize(sentence, tokenizer_xsum, model_xsum)
    long_data_summaries_xsum.append(summary)

100%|██████████| 1437/1437 [41:07<00:00,  1.72s/it]


In [18]:
# create a dataframe to store the text summaries
summary_long_df = pd.DataFrame()
summary_long_df['text_cleaned'] = long_data.text_cleaned_ecpe
summary_long_df['text_summarized_xsum'] = long_data_summaries_xsum
summary_long_df.head()

Unnamed: 0,text_cleaned,text_summarized_xsum
0,Just another night. Another night of feeling l...,Just another night.
1,Is it possible to fake depression? I have been...,I have a question about depression.
2,Imagine being attractive Imagine what it would...,I wish I was born attractive.
3,"Best moment to have anxiety It is 3:30am, I am...",The best moment for thinking of smoking when y...
4,"hi, I am a 21 year-old male from the uk, over ...",I have been suffering from depression for the ...


## Begin the pegasus text summarization that is trained on reddit-tifu dataset

In [19]:
# short data
short_data_summaries_reddit = []

# get the predicted summary for the whole short text dataset
for sentence in tqdm(short_data.text_cleaned_ecpe.tolist()):
    summary = summarize(sentence, tokenizer_reddit, model_reddit)
    short_data_summaries_reddit.append(summary)

100%|██████████| 843/843 [1:17:06<00:00,  5.49s/it]


In [26]:
# create a dataframe to store the text summaries
summary_short_df['text_summarized_reddit'] = short_data_summaries_reddit
summary_short_df.head()

Unnamed: 0,text_cleaned,text_summarized_xsum,text_summarized_reddit
0,I get to spend New Year is home again alone an...,It's that time of year again.,⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇...
1,"Depressed and lonely /: Stuck in a deep, never...","Depressed and lonely /: Stuck in a deep, never...",⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇...
2,Learning to pretend to have a good time had be...,When I was a child I used to pretend to be som...,"pretend to have a good time, pretend to have a..."
3,So far he stop texting meafter I said somethin...,I've been trying to get my boyfriend to stop t...,so far he stop texting meafter i said somethin...
4,*sigh* ?? I have not cried so muchI am in so m...,I have not cried so much as I have done in so ...,i have not cried so much i am in so much pain ...


In [27]:
# long data
long_data_summaries_reddit = []

# get the predicted summary for the whole short text dataset
for sentence in tqdm(long_data.text_cleaned_ecpe.tolist()):
    summary = summarize(sentence, tokenizer_reddit, model_reddit)
    long_data_summaries_reddit.append(summary)

100%|██████████| 1437/1437 [1:31:26<00:00,  3.82s/it]


In [28]:
# create a dataframe to store the text summaries
summary_long_df['text_summarized_reddit'] = long_data_summaries_reddit
summary_long_df.head()

Unnamed: 0,text_cleaned,text_summarized_xsum,text_summarized_reddit
0,Just another night. Another night of feeling l...,Just another night.,the love of your life does not love you anywhe...
1,Is it possible to fake depression? I have been...,I have a question about depression.,i have been feeling bad for about 7 months now...
2,Imagine being attractive Imagine what it would...,I wish I was born attractive.,i wish i was born attractive.. i wish i was bo...
3,"Best moment to have anxiety It is 3:30am, I am...",The best moment for thinking of smoking when y...,"don't smoke in the middle of the night, it is ..."
4,"hi, I am a 21 year-old male from the uk, over ...",I have been suffering from depression for the ...,i feel like i will die in the next few months ...


## Save the dataframe into csv

In [29]:
summary_short_df.to_csv(DEST_PATH+'subtasks_text_summarization_abstractive_short_data.csv', index=False)

In [30]:
summary_long_df.to_csv(DEST_PATH+'subtasks_text_summarization_abstractive_long_data.csv', index=False)