In [1]:
# connect to google colab
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.2.3-py3-none-any.whl (342 kB)
[?25l[K     |█                               | 10 kB 22.6 MB/s eta 0:00:01[K     |██                              | 20 kB 28.6 MB/s eta 0:00:01[K     |██▉                             | 30 kB 32.5 MB/s eta 0:00:01[K     |███▉                            | 40 kB 35.4 MB/s eta 0:00:01[K     |████▉                           | 51 kB 33.3 MB/s eta 0:00:01[K     |█████▊                          | 61 kB 29.7 MB/s eta 0:00:01[K     |██████▊                         | 71 kB 26.6 MB/s eta 0:00:01[K     |███████▋                        | 81 kB 28.5 MB/s eta 0:00:01[K     |████████▋                       | 92 kB 28.9 MB/s eta 0:00:01[K     |█████████▋                      | 102 kB 29.5 MB/s eta 0:00:01[K     |██████████▌                     | 112 kB 29.5 MB/s eta 0:00:01[K     |███████████▌                    | 122 kB 29.5 MB/s eta 0:00:01[K     |████████████▌                   | 133 kB 29.5 MB/s eta 0:0

In [1]:
# base path
DATA_PATH = './drive/MyDrive/fyp-code/codes/data/emotion_intensity/'
DEST_PATH = './drive/MyDrive/fyp-code/codes/data/subtasks/'

In [2]:
# usual import
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import stanza
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 

In [3]:
# download English model
stanza.download('en') 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json:   0%|   …

2021-08-28 06:53:12 INFO: Downloading default packages for language: en (English)...
2021-08-28 06:53:13 INFO: File exists: /root/stanza_resources/en/default.zip.
2021-08-28 06:53:17 INFO: Finished downloading models and saved to /root/stanza_resources.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Import Short Text

In [4]:
short_data = pd.read_csv(DATA_PATH+'emotion_intensity_depressed_clean_short_data_vader_t2e.csv')[['text_cleaned_t2e_vader']]
short_data.head()

Unnamed: 0,text_cleaned_t2e_vader
0,I get to spend New Year is home again alone an...
1,"Depressed and lonely Stuck in a deep, never en..."
2,Learning to pretend to have a good time had be...
3,So far he stop texting me after I said somethi...
4,sigh ?? I have not cried so much I am in so mu...


In [5]:
short_data_list = list(short_data['text_cleaned_t2e_vader'])
print(len(short_data_list))
short_data_list[:5]

834


['I get to spend New Year is home again alone and lonely. ??',
 'Depressed and lonely Stuck in a deep, never ending hole',
 'Learning to pretend to have a good time had become a natural skill. I hope one day it is genuine',
 'So far he stop texting me after I said something so hopefully he doe not show up at my house',
 'sigh ?? I have not cried so much I am in so much pain']

## Import Long Text

In [6]:
long_data = pd.read_csv(DATA_PATH+'emotion_intensity_depressed_clean_long_data_vader_t2e.csv')[['text_cleaned_t2e_vader']]
long_data.head()

Unnamed: 0,text_cleaned_t2e_vader
0,Just another night. Another night of feeling l...
1,Is it possible to fake depression? I have been...
2,Imagine being attractive Imagine what it would...
3,"Best moment to have anxiety It is am, I am tir..."
4,"hi, I am a year-old male from the uk, over the..."


In [7]:
long_data_list = list(long_data['text_cleaned_t2e_vader'])
print(len(long_data_list))
long_data_list[:5]

1436


['Just another night. Another night of feeling lonely and just wondering what I did wrong in life to deserve this unhappiness. I have never felt a pain stronger than being rejected by the love of your life. The person who give you a purpose. The person who is supposed to make everything better. You would give your life for this person and they just do not love you anywhere near the same.',
 'Is it possible to fake depression? I have been feeling bad for about month now. There are period where I cry on a daily basis, I never feel like talking, I struggle concentrating and collecting thoughts, I feel such an extreme sadness inside of me also in happy situations, I am exhausted all the time even if I sleep hour straight, I often have headaches, I am completely alone and I find it difficult to do anything. There are night where it is unbearable. I think about my situation I am completely alone, I am failing at anything because I am so weak I cannot handle my feelings, I feel guilt for ever

## Aspect based sentiment portion

In [8]:
stop_words = set(stopwords.words('english'))
nlp = stanza.Pipeline('en')

2021-08-28 06:53:18 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-08-28 06:53:18 INFO: Use device: cpu
2021-08-28 06:53:18 INFO: Loading: tokenize
2021-08-28 06:53:18 INFO: Loading: pos
2021-08-28 06:53:18 INFO: Loading: lemma
2021-08-28 06:53:18 INFO: Loading: depparse
2021-08-28 06:53:19 INFO: Loading: sentiment
2021-08-28 06:53:19 INFO: Loading: ner
2021-08-28 06:53:20 INFO: Done loading processors!


In [9]:
# a function to get the aspect based sentiments
def aspect_conversion(text):
    # lower case the text and tokenize the Sentence
    text = text.lower()
    sentList = nltk.sent_tokenize(text)

    # perform pos tagging
    for line in sentList:
        txt_list = nltk.word_tokenize(line)
        taggedList = nltk.pos_tag(txt_list)

    newwordList = []
    flag = 0
    for i in range(0,len(taggedList)-1):
        if (taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"):
            newwordList.append(taggedList[i][0]+taggedList[i+1][0])
            flag=1
        else:
            if(flag==1):
                flag=0
                continue
            newwordList.append(taggedList[i][0])
            if(i==len(taggedList)-2):
                newwordList.append(taggedList[i+1][0])
    finaltxt = ' '.join(word for word in newwordList)
    
    new_txt_list = nltk.word_tokenize(finaltxt)
    wordsList = [w for w in new_txt_list if not w in stop_words]
    taggedList = nltk.pos_tag(wordsList)
    
    doc = nlp(finaltxt)
    dep_node = []
    for dep_edge in doc.sentences[0].dependencies:
        dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])
        #print(len(dep_node))
    for j in range(0, len(dep_node)):
        #print(j)
        try:
            if (int(dep_node[j][1]) != 0):
                dep_node[j][1] = newwordList[(int(dep_node[j][1]) - 1)]
        except:
            print("Error")

    # only select those sublists from the dep_node 
    # that could probably contain the features
    featureList = []
    categoriesList = []

    for i in taggedList:
        if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
            featureList.append(list(i))
            categoriesList.append(i[0])

    # now using dep_node list and the featureList we will determine to 
    # which of the words these features in the feature list are related to
    fcluster = []
    for i in featureList:
        filist = []
        for j in dep_node:
            if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                if(j[0]==i[0]):
                    filist.append(j[1])
                else:
                    filist.append(j[0])
        fcluster.append([i[0], filist])

    finalcluster = []
    dic = {}
    for i in featureList:
        dic[i[0]] = i[1]
    for i in fcluster:
        if(dic[i[0]]=="NN"):
            finalcluster.append(i)
    return finalcluster

In [19]:
# absa test code
print(aspect_conversion(short_data_list[1]))

[['hole', ['deep', 'ending']]]


## ABSA on short text

In [10]:
# check on the sublist to see if code is working
absa_list = []

for k in tqdm(short_data_list):
    try:
        x = aspect_conversion(k)
        #print(x)
        absa_list.append(x)
    except:
        #print('\nError occurred')
        absa_list.append(['None'])
        continue

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)
  5%|▌         | 45/834 [00:10<03:12,  4.10it/s]

Error
Error
Error
Error
Error


  7%|▋         | 57/834 [00:13<04:00,  3.23it/s]

Error


 15%|█▌        | 128/834 [00:28<01:56,  6.07it/s]

Error
Error
Error
Error
Error
Error


 24%|██▎       | 197/834 [00:43<02:00,  5.27it/s]

Error
Error


 27%|██▋       | 226/834 [00:49<02:28,  4.10it/s]

Error
Error
Error
Error
Error


 28%|██▊       | 237/834 [00:52<01:55,  5.17it/s]

Error
Error
Error


 41%|████      | 343/834 [01:13<01:08,  7.16it/s]

Error
Error


 45%|████▍     | 372/834 [01:19<01:49,  4.21it/s]

Error
Error


 48%|████▊     | 400/834 [01:24<01:22,  5.23it/s]

Error
Error
Error
Error


 60%|█████▉    | 498/834 [01:44<01:16,  4.39it/s]

Error


 64%|██████▍   | 532/834 [01:50<00:45,  6.57it/s]

Error
Error


 66%|██████▌   | 547/834 [01:53<01:03,  4.52it/s]

Error


 68%|██████▊   | 568/834 [01:57<00:57,  4.60it/s]

Error
Error
Error
Error
Error
Error


 73%|███████▎  | 612/834 [02:06<00:52,  4.25it/s]

Error


 75%|███████▍  | 624/834 [02:09<00:47,  4.44it/s]

Error


 75%|███████▌  | 626/834 [02:10<00:58,  3.58it/s]

Error
Error
Error


 75%|███████▌  | 629/834 [02:10<00:39,  5.26it/s]

Error


 78%|███████▊  | 652/834 [02:15<00:32,  5.64it/s]

Error


 84%|████████▎ | 697/834 [02:26<00:32,  4.26it/s]

Error
Error
Error


 88%|████████▊ | 735/834 [02:34<00:19,  5.10it/s]

Error


 98%|█████████▊| 817/834 [02:51<00:02,  6.00it/s]

Error
Error


100%|██████████| 834/834 [02:55<00:00,  4.75it/s]

Error
Error





In [16]:
len(absa_list)

834

In [14]:
absa_list[:5]

[['None'],
 [['hole', ['deep', 'ending']]],
 [['hope', ['i']], ['day', []]],
 [['texting', ['stop', 'me']],
  ['something', ['said']],
  ['show', ['so', 'hopefully', 'he', 'not']],
  ['house', []]],
 [['pain', ['i', 'much']]]]

In [18]:
# store the ABSA short text into the dataframe
absa_short_df = pd.DataFrame()
absa_short_df['text_cleaned'] = short_data['text_cleaned_t2e_vader']
# dummy initiation
absa_short_df['absa'] = 0
# convert the column type to object to take in a list
absa_short_df['absa'] = absa_short_df['absa'].astype('object')
for i, j in enumerate(absa_list):
    absa_short_df.at[i,'absa'] = j
absa_short_df.head()

Unnamed: 0,text_cleaned,absa
0,I get to spend New Year is home again alone an...,[None]
1,"Depressed and lonely Stuck in a deep, never en...","[[hole, [deep, ending]]]"
2,Learning to pretend to have a good time had be...,"[[hope, [i]], [day, []]]"
3,So far he stop texting me after I said somethi...,"[[texting, [stop, me]], [something, [said]], [..."
4,sigh ?? I have not cried so much I am in so mu...,"[[pain, [i, much]]]"


## ABSA on long text

In [20]:
# check on the sublist to see if code is working
absa_list = []

for k in tqdm(long_data_list):
    try:
        x = aspect_conversion(k)
        #print(x)
        absa_list.append(x)
    except:
        #print('\nError occurred')
        absa_list.append(['None'])
        continue

  5%|▌         | 76/1436 [00:23<11:18,  2.00it/s]

Error


  7%|▋         | 100/1436 [00:30<08:38,  2.58it/s]

Error
Error
Error
Error
Error


  8%|▊         | 112/1436 [00:34<09:40,  2.28it/s]

Error
Error
Error
Error
Error
Error
Error


 12%|█▏        | 171/1436 [00:52<14:28,  1.46it/s]

Error
Error
Error
Error


 18%|█▊        | 255/1436 [01:19<06:20,  3.11it/s]

Error
Error
Error
Error


 21%|██        | 295/1436 [01:34<04:56,  3.85it/s]

Error


 26%|██▌       | 369/1436 [01:59<05:47,  3.07it/s]

Error
Error


 28%|██▊       | 401/1436 [02:08<09:59,  1.73it/s]

Error
Error


 34%|███▍      | 495/1436 [02:44<04:59,  3.14it/s]

Error
Error


 45%|████▌     | 649/1436 [03:30<02:47,  4.71it/s]

Error
Error


 59%|█████▊    | 843/1436 [04:35<03:47,  2.61it/s]

Error
Error
Error
Error
Error


 61%|██████    | 877/1436 [04:46<03:21,  2.78it/s]

Error
Error
Error


 70%|███████   | 1008/1436 [05:29<01:58,  3.62it/s]

Error
Error
Error


 75%|███████▍  | 1074/1436 [05:57<01:37,  3.70it/s]

Error


 78%|███████▊  | 1117/1436 [06:11<01:51,  2.86it/s]

Error


 80%|████████  | 1152/1436 [06:22<01:40,  2.83it/s]

Error
Error
Error


 85%|████████▍ | 1219/1436 [06:46<00:57,  3.76it/s]

Error


 86%|████████▌ | 1229/1436 [06:50<01:48,  1.90it/s]

Error


 91%|█████████ | 1300/1436 [07:19<01:39,  1.37it/s]

Error
Error
Error
Error
Error


 94%|█████████▍| 1356/1436 [07:38<00:35,  2.28it/s]

Error
Error
Error
Error


100%|█████████▉| 1434/1436 [07:59<00:00,  2.71it/s]

Error


100%|██████████| 1436/1436 [07:59<00:00,  2.99it/s]


In [21]:
len(absa_list)

1436

In [22]:
absa_list[:5]

[[['life', ['give']], ['person', []]],
 [['responsibility', ['take']], ['failure', ['am']]],
 [['wa', []], ['fuck', []]],
 [['daynight', ['nighteveryone']],
  ['nighteveryone', ['good', 'daynight', 'have']]],
 [['anyone', []]]]

In [23]:
# store the ABSA long text into the dataframe
absa_long_df = pd.DataFrame()
absa_long_df['text_cleaned'] = long_data['text_cleaned_t2e_vader']
# dummy initiation
absa_long_df['absa'] = 0
# convert the column type to object to take in a list
absa_long_df['absa'] = absa_long_df['absa'].astype('object')
for i, j in enumerate(absa_list):
    absa_long_df.at[i,'absa'] = j
absa_long_df.head()

Unnamed: 0,text_cleaned,absa
0,Just another night. Another night of feeling l...,"[[life, [give]], [person, []]]"
1,Is it possible to fake depression? I have been...,"[[responsibility, [take]], [failure, [am]]]"
2,Imagine being attractive Imagine what it would...,"[[wa, []], [fuck, []]]"
3,"Best moment to have anxiety It is am, I am tir...","[[daynight, [nighteveryone]], [nighteveryone, ..."
4,"hi, I am a year-old male from the uk, over the...","[[anyone, []]]"


## Save the dataframe as csv

In [24]:
absa_short_df.to_csv(DEST_PATH+'subtasks_text_absa_short_data.csv', index=False)
absa_long_df.to_csv(DEST_PATH+'subtasks_text_absa_long_data.csv', index=False)