In [1]:
# Mount Google Drive into the Colab runtime so the dataset files are reachable
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Base directories on the mounted Drive (paths are relative to the current working directory)
_DATA_ROOT = './drive/MyDrive/fyp-code/codes/data/'
DATA_PATH = _DATA_ROOT + 'ecpe/'            # ECPE input CSVs and the files generated by this notebook
SUMMARIZED_PATH = _DATA_ROOT + 'subtasks/'  # extractive-summarization CSV read further below

In [3]:
# Libraries
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import csv
import string

# Load the spaCy pipeline 'en_core_web_sm'; `en` is the parser the clause-splitting
# code in this notebook calls on every text entry
en = spacy.load('en_core_web_sm')

In [4]:
!pip install deplacy
import deplacy

Collecting deplacy
  Downloading deplacy-2.0.0-py3-none-any.whl (22 kB)
Installing collected packages: deplacy
Successfully installed deplacy-2.0.0


# ECPE on the long dataset 
Load the ecpe long dataset

In [None]:
# Read the long dataset and keep just the cleaned text column (as a one-column DataFrame)
ecpe_long_csv = DATA_PATH + 'ecpe_cleaned_long_data.csv'
ecpe_data = pd.read_csv(ecpe_long_csv)[['text_cleaned_ecpe']]
ecpe_data

Unnamed: 0,text_cleaned_ecpe
0,Just another night. Another night of feeling l...
1,Is it possible to fake depression? I have been...
2,Imagine being attractive Imagine what it would...
3,"Best moment to have anxiety It is 3:30am, I am..."
4,"hi, I am a 21 year-old male from the uk, over ..."
...,...
1432,I never asked to be born..seems unfair we cann...
1433,I hate the way I look I have pretty much alway...
1434,Anyone else feel guilty about people checking ...
1435,"Need help with suicidal wife, please. My wife ..."


In [None]:
# Show the first cleaned entry as an example of the raw input text
ecpe_data['text_cleaned_ecpe'][0]

'Just another night. Another night of feeling lonely and just wondering what I did wrong in life to deserve this unhappiness. I have never felt a pain stronger than being rejected by the love of your life. The person who gives you a purpose. The person who is supposed to make everything better. You would give your life for this person and they just do not love you anywhere near the same.'

In [None]:
# Number of entries (rows) in the long dataset
ecpe_data.shape[0]

1437

## Splitting the text data on one text entry (tryout)

In [None]:
# Tryout: split one sample post into clause-like chunks using the dependency parse.
# Alternative sample sentence, kept for reference:
#text = "This all encompassing experience wore off for a moment and in that moment, my awareness came gasping to the surface of the hallucination and I was able to consider momentarily that I had killed myself by taking an outrageous dose of an online drug and this was the most pathetic death experience of all time."
text = "My cat just got hit by a car I cannot even think about it too much. My family is a little sad but are kinda telling me to get over it. I have been in such a terrible depressive spell for the last few weeks and this just ruined everything. When older pets die by natural causes I do not cry as much as because I know we gave them the best life and they lived very long but my suki was only 2 years old she was so fat and adorable and I cannot believe she is gone just because of a bad driver. She always came to me when I was upset she would lay on top of me and snuggle. I wish I had more time with her. now I have almost nothing that makes me smile"

doc = en(text)
deplacy.render(doc)  # print the dependency tree for visual inspection

seen = set()  # tokens already assigned to a conjunct chunk
chunks = []   # (index of the chunk's head token, chunk text)

for sentence in doc.sents:
    # each token attached to the sentence root as `conj` heads its own chunk
    conj_heads = [tok for tok in sentence.root.children if tok.dep_ == 'conj']
    for conj_head in conj_heads:
        subtree_tokens = list(conj_head.subtree)
        seen.update(subtree_tokens)
        chunks.append((conj_head.i, ' '.join(tok.text for tok in subtree_tokens)))

    # the tokens not claimed by any conjunct form the chunk anchored at the root
    remainder = ' '.join(tok.text for tok in sentence if tok not in seen)
    chunks.append((sentence.root.i, remainder))

# restore document order, then keep only the chunk texts
chunks.sort(key=lambda pair: pair[0])
chunk_list = [chunk_text for position, chunk_text in chunks]

My         DET   <╗                               poss
cat        NOUN  ═╝<════════════════════════╗     nsubjpass
just       ADV   <════════════════════════╗ ║     advmod
got        VERB  <══════════════════════╗ ║ ║     auxpass
hit        VERB  ═════════════════════╗═╝═╝═╝═╗   ROOT
by         ADP   ═══════════════════╗<╝       ║   agent
a          DET   <════════════════╗ ║         ║   det
car        NOUN  ═══════════════╗═╝<╝         ║   pobj
I          PRON  <════════════╗ ║             ║   nsubj
can        VERB  <══════════╗ ║ ║             ║   aux
not        PART  <════════╗ ║ ║ ║             ║   neg
even       ADV   <══════╗ ║ ║ ║ ║             ║   advmod
think      VERB  ═══╗═╗═╝═╝═╝═╝<╝             ║   relcl
about      ADP   ═╗<╝ ║                       ║   prep
it         PRON  <╝   ║                       ║   pobj
too        ADV   <╗   ║                       ║   advmod
much       ADV   ═╝<══╝                       ║   advmod
.          PUNCT <════════════════════════════╝  

In [None]:
# try out on one entry
print(chunk_list)

# Append this example to the tryout text file. Layout per entry:
#   "<counter> <number of clauses>"
#   " (0, 0),"
#   one "<clause id>,null,null,<clause>" line per clause
# The `with` block flushes and closes the file even if a write fails, so the
# data on disk no longer depends on a separate cell calling f.close().
counter = 3  # id written in the header line of this entry
with open(f'{DATA_PATH}ecpe_test_one_example.txt', 'a+') as f:
    f.write(f'{counter} {len(chunk_list)}\n')
    f.write(' (0, 0),\n')
    # clause ids start at 1, matching the full-dataset export cells in this notebook
    for clause_id, clause in enumerate(chunk_list, start=1):
        f.write(f'{clause_id},null,null,{clause}\n')

In [None]:
# Make sure the tryout file handle `f` is closed so buffered writes reach disk
# (calling close() on an already-closed file is a no-op)
f.close()

## Helper function to get separate clauses in the entries

In [9]:
def get_separate_clauses(text):
    """Split `text` into clause-like chunks based on the spaCy dependency parse.

    For every sentence, each token attached to the sentence root with the
    `conj` dependency becomes the head of its own chunk (all tokens of its
    subtree). The remaining tokens of the sentence form one more chunk,
    anchored at the root.

    Args:
        text: string parsed with the module-level spaCy pipeline `en`.

    Returns:
        list of str: chunk texts (token texts joined by single spaces),
        ordered by the token index of each chunk's head.
    """
    doc = en(text)

    claimed = set()       # tokens already placed in a conjunct chunk
    indexed_chunks = []   # (head token index, chunk text)

    for sentence in doc.sents:
        conj_heads = [tok for tok in sentence.root.children if tok.dep_ == 'conj']
        for conj_head in conj_heads:
            subtree_tokens = list(conj_head.subtree)
            claimed.update(subtree_tokens)
            indexed_chunks.append(
                (conj_head.i, ' '.join(tok.text for tok in subtree_tokens))
            )

        # everything not claimed by a conjunct belongs to the root's chunk
        remainder = ' '.join(tok.text for tok in sentence if tok not in claimed)
        indexed_chunks.append((sentence.root.i, remainder))

    # put the chunks back into document order before dropping the indices
    indexed_chunks.sort(key=lambda pair: pair[0])
    return [chunk_text for _, chunk_text in indexed_chunks]

In [10]:
# Demo: delete every character listed in string.punctuation from a sample string
s = "string. With. Punctuation?"
demo_punct_table = str.maketrans('', '', string.punctuation)
out = s.translate(demo_punct_table)
out

'string With Punctuation'

## Split the full text data into its separate clauses 

In [None]:
# split all the text data into clauses and save as ecpe_raw for further annotations later

# translation table that deletes every character in string.punctuation
# (built once and reused for all clauses)
punct_table = str.maketrans('', '', string.punctuation)

# 'w' instead of 'a+': re-running the cell rewrites the file rather than appending a
# duplicate copy of the whole dataset. NOTE: this overwrites an existing ecpe_raw.txt.
# The `with` block closes the file even if parsing fails midway.
with open(f'{DATA_PATH}ecpe_raw.txt', 'w', encoding='UTF8') as f:
    for i in tqdm(range(len(ecpe_data))):
        # positional access, so the loop does not depend on the index labels
        chunk_list = get_separate_clauses(ecpe_data.text_cleaned_ecpe.iloc[i])

        # strip punctuation from every clause
        chunk_list_no_punct = [chunk_str.translate(punct_table) for chunk_str in chunk_list]

        # per entry: "<entry number> <number of clauses>", a constant " (0, 0)," line,
        # then one "<clause id>,null,null,<clause>" line per clause (ids are 1-based)
        counter = i + 1
        f.write(f'{counter} {len(chunk_list)}\n')
        f.write(' (0, 0),\n')
        for index, clause in enumerate(chunk_list_no_punct):
            f.write(f'{index+1},null,null,{clause}\n')

100%|██████████| 1437/1437 [01:06<00:00, 21.53it/s]


## Write into csv file

In [None]:
# translation table that deletes every character in string.punctuation
# (built once and reused for all clauses)
punct_table = str.maketrans('', '', string.punctuation)

# 'w' instead of 'a+': re-running the cell rewrites the CSV rather than appending a
# duplicate copy of every row. NOTE: this overwrites an existing file.
with open(f'{DATA_PATH}ecpe_raw_testing.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    for i in tqdm(range(len(ecpe_data))):
        # positional access, so the loop does not depend on the index labels
        chunk_list = get_separate_clauses(ecpe_data.text_cleaned_ecpe.iloc[i])

        # strip punctuation from every clause
        chunk_list_no_punct = [chunk_str.translate(punct_table) for chunk_str in chunk_list]

        # one row per clause: 1-based clause id, two 'null' columns, clause text
        for index, clause in enumerate(chunk_list_no_punct):
            data = [index+1, 'null', 'null', clause]
            writer.writerow(data)

100%|██████████| 1437/1437 [01:03<00:00, 22.58it/s]


# ECPE on the short dataset 
Load the ecpe short dataset

In [None]:
# Read the short dataset and keep just the cleaned text column (as a one-column DataFrame)
ecpe_short_csv = DATA_PATH + 'ecpe_cleaned_short_data.csv'
ecpe_data_short = pd.read_csv(ecpe_short_csv)[['text_cleaned_ecpe']]
ecpe_data_short

Unnamed: 0,text_cleaned_ecpe
0,I get to spend New Year is home again alone an...
1,"Depressed and lonely /: Stuck in a deep, never..."
2,Learning to pretend to have a good time had be...
3,So far he stop texting meafter I said somethin...
4,*sigh* ?? I have not cried so muchI am in so m...
...,...
838,I am going to try and get some sleep. Not sure...
839,I cannot Take it ??
840,Fuck surgery. Fuck laying here watching days p...
841,Depression is like wearing a one of those weig...


## Remove entries that have fewer than 15 words

In [None]:
# Keep entries with at least 15 whitespace-separated words, then renumber the rows from 0
short_word_counts = ecpe_data_short['text_cleaned_ecpe'].str.split().str.len()
ecpe_data_short_clean = ecpe_data_short[short_word_counts >= 15].reset_index(drop=True)
ecpe_data_short_clean

Unnamed: 0,text_cleaned_ecpe
0,Learning to pretend to have a good time had be...
1,So far he stop texting meafter I said somethin...
2,Thank god the last presentation is over! tomor...
3,"No, I am not wouldepressed because of the weat..."
4,Nobody cares in real life and nobody cares her...
...,...
478,Been super maintaining control & not comfort e...
479,How many times I went through Hell? I cannot r...
480,I am going to try and get some sleep. Not sure...
481,Fuck surgery. Fuck laying here watching days p...


## Split the full text data into its separate clauses 

In [None]:
# split all the short text data into clauses and save for further annotations later

# translation table that deletes every character in string.punctuation
# (built once and reused for all clauses)
punct_table = str.maketrans('', '', string.punctuation)

# 'w' instead of 'a+': re-running the cell rewrites the file rather than appending a
# duplicate copy of the whole dataset. NOTE: this overwrites an existing ecpe_raw_short.txt.
# The `with` block closes the file even if parsing fails midway.
with open(f'{DATA_PATH}ecpe_raw_short.txt', 'w', encoding='UTF8') as f:
    for i in tqdm(range(len(ecpe_data_short_clean))):
        # positional access, so the loop does not depend on the index labels
        chunk_list = get_separate_clauses(ecpe_data_short_clean.text_cleaned_ecpe.iloc[i])

        # strip punctuation from every clause
        chunk_list_no_punct = [chunk_str.translate(punct_table) for chunk_str in chunk_list]

        # per entry: "<entry number> <number of clauses>", a constant " (0, 0)," line,
        # then one "<clause id>,null,null,<clause>" line per clause (ids are 1-based)
        counter = i + 1
        f.write(f'{counter} {len(chunk_list)}\n')
        f.write(' (0, 0),\n')
        for index, clause in enumerate(chunk_list_no_punct):
            f.write(f'{index+1},null,null,{clause}\n')

100%|██████████| 483/483 [00:06<00:00, 71.75it/s]


## Write into csv file

In [None]:
# translation table that deletes every character in string.punctuation
# (built once and reused for all clauses)
punct_table = str.maketrans('', '', string.punctuation)

# 'w' instead of 'a+': re-running the cell rewrites the CSV rather than appending a
# duplicate copy of every row. NOTE: this overwrites an existing file.
with open(f'{DATA_PATH}ecpe_raw_short.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    for i in tqdm(range(len(ecpe_data_short_clean))):
        # positional access, so the loop does not depend on the index labels
        chunk_list = get_separate_clauses(ecpe_data_short_clean.text_cleaned_ecpe.iloc[i])

        # strip punctuation from every clause
        chunk_list_no_punct = [chunk_str.translate(punct_table) for chunk_str in chunk_list]

        # one row per clause: 1-based clause id, two 'null' columns, clause text
        for index, clause in enumerate(chunk_list_no_punct):
            data = [index+1, 'null', 'null', clause]
            writer.writerow(data)

100%|██████████| 483/483 [00:06<00:00, 79.73it/s]


# ECPE on the SUMMARIZED long dataset using extractive summarization done earlier

In [5]:
# Read the extractive-summarization output and keep just the summarized text column
summarized_long_csv = SUMMARIZED_PATH + 'subtasks_text_summarization_extractive_long_data.csv'
ecpe_data_long_summarized = pd.read_csv(summarized_long_csv)[['text_summarized']]
ecpe_data_long_summarized

Unnamed: 0,text_summarized
0,You would give your life for this person and t...
1,Is it possible to fake depression?\nI am just ...
2,-
3,The middle of the night is the best moment for...
4,"hi, I am a 21 year-old male from the uk, over ..."
...,...
1432,-
1433,I will be ugly no matter what so I guess hair ...
1434,-
1435,"Need help with suicidal wife, please.\nI told ..."


## Remove entries that have fewer than 15 words

In [7]:
# Keep summaries with at least 15 whitespace-separated words, renumber the rows from 0,
# and replace newline characters inside the text with spaces
summary_word_counts = ecpe_data_long_summarized['text_summarized'].str.split().str.len()
ecpe_data_long_summarized_clean = (
    ecpe_data_long_summarized[summary_word_counts >= 15]
    .reset_index(drop=True)
    .replace(r'\n', ' ', regex=True)
)
ecpe_data_long_summarized_clean

Unnamed: 0,text_summarized
0,You would give your life for this person and t...
1,Is it possible to fake depression? I am just t...
2,The middle of the night is the best moment for...
3,"hi, I am a 21 year-old male from the uk, over ..."
4,I was just curious what your feelings are on l...
...,...
1128,"Little sister has depression, what do I do? I ..."
1129,Been independent since 17 I have been living w...
1130,I will be ugly no matter what so I guess hair ...
1131,"Need help with suicidal wife, please. I told h..."


## Split the full text data into its separate clauses 
Only save entries that have at most 5 clauses

In [14]:
# split all the summarized text data into clauses and save for further annotations later

# counts how many entries are actually written after the clause-count filter
data_counter = 0

# translation table that deletes every character in string.punctuation
# (built once and reused for all clauses)
punct_table = str.maketrans('', '', string.punctuation)

# 'w' instead of 'a+': re-running the cell rewrites the file rather than appending a
# duplicate copy of the whole dataset. NOTE: this overwrites an existing file.
# The `with` block closes the file even if parsing fails midway.
with open(f'{DATA_PATH}ecpe_raw_long_summarized_max_5_clauses.txt', 'w', encoding='UTF8') as f:
    for i in tqdm(range(len(ecpe_data_long_summarized_clean))):
        # positional access, so the loop does not depend on the index labels
        chunk_list = get_separate_clauses(ecpe_data_long_summarized_clean.text_summarized.iloc[i])

        # only save those entries with not more than 5 clauses
        if len(chunk_list) > 5:
            continue

        # strip punctuation from every clause
        chunk_list_no_punct = [chunk_str.translate(punct_table) for chunk_str in chunk_list]

        # per entry: "<entry number> <number of clauses>", a constant " (0, 0)," line,
        # then one "<clause id>,null,null,<clause>" line per clause (ids are 1-based).
        # `counter` is the entry's position in the filtered frame, so skipped entries
        # leave gaps in the numbering.
        counter = i + 1
        data_counter += 1
        f.write(f'{counter} {len(chunk_list)}\n')
        f.write(' (0, 0),\n')
        for index, clause in enumerate(chunk_list_no_punct):
            f.write(f'{index+1},null,null,{clause}\n')

100%|██████████| 1133/1133 [00:22<00:00, 51.46it/s]


In [15]:
# Number of entries that passed the "not more than 5 clauses" filter in the export cell above
data_counter

724

## Write into CSV file

In [16]:
# counts how many entries are actually written after the clause-count filter
data_counter = 0

# translation table that deletes every character in string.punctuation
# (built once and reused for all clauses)
punct_table = str.maketrans('', '', string.punctuation)

# 'w' instead of 'a+': re-running the cell rewrites the CSV rather than appending a
# duplicate copy of every row. NOTE: this overwrites an existing file.
with open(f'{DATA_PATH}ecpe_raw_long_summarized_max_5_clauses.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    for i in tqdm(range(len(ecpe_data_long_summarized_clean))):
        # positional access, so the loop does not depend on the index labels
        chunk_list = get_separate_clauses(ecpe_data_long_summarized_clean.text_summarized.iloc[i])

        # only save those entries with not more than 5 clauses
        if len(chunk_list) > 5:
            continue

        # strip punctuation from every clause
        chunk_list_no_punct = [chunk_str.translate(punct_table) for chunk_str in chunk_list]

        # one row per clause: 1-based clause id, two 'null' columns, clause text
        data_counter += 1
        for index, clause in enumerate(chunk_list_no_punct):
            data = [index+1, 'null', 'null', clause]
            writer.writerow(data)

100%|██████████| 1133/1133 [00:21<00:00, 51.76it/s]


In [17]:
# Number of entries that passed the "not more than 5 clauses" filter in the CSV export cell above
data_counter

724