In [1]:
!pip install pandas==1.3.4
!pip install transformers==4.12.5
!pip install datasets==1.15.1

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
!pip install ipywidgets
!pip install IProgress

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
import os
import re
import pickle

import pandas as pd

import numpy as np
import torch

import transformers
from transformers import BertTokenizer

import datasets
from datasets import concatenate_datasets
from datasets import Dataset
from datasets import ClassLabel
from datasets import DatasetDict

In [4]:
print('pandas:\t\t', pd.__version__)
print('transformers:\t', transformers.__version__)
print('datasets:\t', datasets.__version__)

pandas:		 1.3.4
transformers:	 4.12.5
datasets:	 1.15.1


## Load Data

In [5]:
dataset_df = pd.read_pickle("/notebooks/KURI-BERT/data/pe_dataset_w_essay_position_pos_tags_pickle")

In [6]:
dataset_df = dataset_df.drop_duplicates(subset=['label_and_comp_idxs', 'text'], keep='first').reset_index(drop=True)

In [7]:
dataset_df = dataset_df.sort_values(['essay_nr', 'argument_bound_1'], ascending=[1, 1]).reset_index(drop=True)

In [8]:
dataset_df

Unnamed: 0,essay_nr,component_id,label_and_comp_idxs,text,label_x,label_ComponentType,relation_SupportAttack,label_RelationType,label_LinkedNotLinked,split_y,...,is_last_in_para,nr_preceeding_comps_in_para,nr_following_comps_in_para,structural_fts_as_text,structural_fts_as_text_combined,para_ratio,first_or_last,strct_fts_w_position_in_essay,component_pos_tags,strct_fts_essay_position_pos_tags
0,essay001,T1,MajorClaim 503 575,we should attach more importance to cooperatio...,MajorClaim,MajorClaim,[],,Linked,TRAIN,...,1,0,0,Topic: Should students be taught to compete or...,Topic: Should students be taught to compete or...,0.25,1,Topic: Should students be taught to compete or...,"Part Of Speech tags: PRON, VERB, VERB, ADJ, NO...",Topic: Should students be taught to compete or...
1,essay001,T3,Claim 591 714,"through cooperation, children can learn about ...",Claim,Claim,[],Support,Linked,TRAIN,...,0,0,3,Topic: Should students be taught to compete or...,Topic: Should students be taught to compete or...,0.50,0,Topic: Should students be taught to compete or...,"Part Of Speech tags: ADP, NOUN, PUNCT, NOUN, V...",Topic: Should students be taught to compete or...
2,essay001,T4,Premise 716 851,What we acquired from team work is not only ho...,Premise,Premise,[],Support,NotLinked,TRAIN,...,0,1,2,Topic: Should students be taught to compete or...,Topic: Should students be taught to compete or...,0.50,0,Topic: Should students be taught to compete or...,"Part Of Speech tags: PRON, PRON, VERB, ADP, NO...",Topic: Should students be taught to compete or...
3,essay001,T5,Premise 853 1086,"During the process of cooperation, children ca...",Premise,Premise,[],Support,NotLinked,TRAIN,...,0,2,1,Topic: Should students be taught to compete or...,Topic: Should students be taught to compete or...,0.50,0,Topic: Should students be taught to compete or...,"Part Of Speech tags: ADP, DET, NOUN, ADP, NOUN...",Topic: Should students be taught to compete or...
4,essay001,T6,Premise 1088 1191,All of these skills help them to get on well w...,Premise,Premise,[],Support,NotLinked,TRAIN,...,1,3,0,Topic: Should students be taught to compete or...,Topic: Should students be taught to compete or...,0.50,0,Topic: Should students be taught to compete or...,"Part Of Speech tags: DET, ADP, DET, NOUN, VERB...",Topic: Should students be taught to compete or...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5958,essay402,T11,Premise 1275 1339,indirectly they will learn how to socialize ea...,Premise,Premise,[],Support,NotLinked,TRAIN,...,0,4,3,Topic: Children should studying hard or playin...,Topic: Children should studying hard or playin...,0.75,0,Topic: Children should studying hard or playin...,"Part Of Speech tags: ADV, PRON, VERB, VERB, AD...",Topic: Children should studying hard or playin...
5959,essay402,T12,Premise 1341 1388,That will make children getting lots of friends,Premise,Premise,[],Support,NotLinked,TRAIN,...,0,5,2,Topic: Children should studying hard or playin...,Topic: Children should studying hard or playin...,0.75,0,Topic: Children should studying hard or playin...,"Part Of Speech tags: DET, VERB, VERB, NOUN, VE...",Topic: Children should studying hard or playin...
5960,essay402,T13,Premise 1393 1436,they can contribute positively to community,Premise,Premise,[],Support,Linked,TRAIN,...,0,6,1,Topic: Children should studying hard or playin...,Topic: Children should studying hard or playin...,0.75,0,Topic: Children should studying hard or playin...,"Part Of Speech tags: PRON, VERB, VERB, ADV, AD...",Topic: Children should studying hard or playin...
5961,essay402,T14,Premise 1448 1525,playing sport makes children getting healthy a...,Premise,Premise,[],Support,NotLinked,TRAIN,...,1,7,0,Topic: Children should studying hard or playin...,Topic: Children should studying hard or playin...,0.75,0,Topic: Children should studying hard or playin...,"Part Of Speech tags: VERB, NOUN, VERB, NOUN, V...",Topic: Children should studying hard or playin...


In [9]:
dataset_df.label_RelationType.value_counts()

Support    4745
            730
Attack      488
Name: label_RelationType, dtype: int64

In [10]:
dataset_df = dataset_df[dataset_df.label_RelationType != '']

In [11]:
dataset_df.label_RelationType.value_counts()

Support    4745
Attack      488
Name: label_RelationType, dtype: int64

In [12]:
dataset_df = dataset_df.reset_index(drop=True)

In [13]:
dataset_df.columns

Index(['essay_nr', 'component_id', 'label_and_comp_idxs', 'text', 'label_x',
       'label_ComponentType', 'relation_SupportAttack', 'label_RelationType',
       'label_LinkedNotLinked', 'split_y', 'essay', 'argument_bound_1',
       'argument_bound_2', 'argument_id', 'sentence', 'paragraph', 'para_nr',
       'total_paras', 'token_count', 'token_count_covering_para',
       'tokens_count_covering_sentence', 'preceeding_tokens_in_sentence_count',
       'succeeding_tokens_in_sentence_count', 'token_ratio',
       'relative_position_in_para_char', 'is_in_intro',
       'relative_position_in_para_token', 'is_in_conclusion',
       'is_first_in_para', 'is_last_in_para', 'nr_preceeding_comps_in_para',
       'nr_following_comps_in_para', 'structural_fts_as_text',
       'structural_fts_as_text_combined', 'para_ratio', 'first_or_last',
       'strct_fts_w_position_in_essay', 'component_pos_tags',
       'strct_fts_essay_position_pos_tags'],
      dtype='object')

In [14]:
dataset_df['paragraph'][1926] = 'Based on the reasons demonstrated above, I prefer to express my opinion in person because the advantages of this are qualitatively greater than that of the opposite. In other words, I think the best way of communication is by means of face-to-face. It is not to say, of course, that other points of view are completely without merit. Still, I am firmly convinced that the reasons I proposed in favor of my own sentiment are much more tenable and cogent.'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_df['paragraph'][1926] = 'Based on the reasons demonstrated above, I prefer to express my opinion in person because the advantages of this are qualitatively greater than that of the opposite. In other words, I think the best way of communication is by means of face-to-face. It is not to say, of course, that other points of view are completely without merit. Still, I am firmly convinced that the reasons I proposed in favor of my own sentiment are much more tenable and cogent.'


In [15]:
dataset_df['paragraph'][4831] = 'Finally, although I prefer to live in a big city, I can not help but assure that this is a matter of personal taste, to be clear, I would love to spend some days in countryside to recharge my soul and relieve myself from daily stress.'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_df['paragraph'][4831] = 'Finally, although I prefer to live in a big city, I can not help but assure that this is a matter of personal taste, to be clear, I would love to spend some days in countryside to recharge my soul and relieve myself from daily stress.'


In [16]:
dataset_df['paragraph'][3908] = 'I do believe that great success requires taking great risks. If you want to gain very high profits in investments, you should use great and very high amounts of money in very risky financial decisions and dealings in which you may lose much amount of money. If you want to achieve a remarkable success in an important exam, you should risk studying all the time and sacrificing your free time and your favorite hobbies. The more you take risks, the greater your successes will be. If you don’t take any risks, you will have an ordinary life with average successes. That’s why it is important to take risks.'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_df['paragraph'][3908] = 'I do believe that great success requires taking great risks. If you want to gain very high profits in investments, you should use great and very high amounts of money in very risky financial decisions and dealings in which you may lose much amount of money. If you want to achieve a remarkable success in an important exam, you should risk studying all the time and sacrificing your free time and your favorite hobbies. The more you take risks, the greater your successes will be. If you don’t take any risks, you will have an ordinary life with average successes. That’s why it is important to take risks.'


In [17]:
dataset_df['paragraph'][4306] = 'In conclusion, experiences that shape my thinking and the way to live my life are teach me to be a better person. Because I could find a better way to solve problems. Besides, experiences could shape me to be more aware about my health. Moreover, these experiences could make me to make plan for my future life. These experiences taught me to change my life to be a better person.'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_df['paragraph'][4306] = 'In conclusion, experiences that shape my thinking and the way to live my life are teach me to be a better person. Because I could find a better way to solve problems. Besides, experiences could shape me to be more aware about my health. Moreover, these experiences could make me to make plan for my future life. These experiences taught me to change my life to be a better person.'


In [18]:
dataset_df.columns

Index(['essay_nr', 'component_id', 'label_and_comp_idxs', 'text', 'label_x',
       'label_ComponentType', 'relation_SupportAttack', 'label_RelationType',
       'label_LinkedNotLinked', 'split_y', 'essay', 'argument_bound_1',
       'argument_bound_2', 'argument_id', 'sentence', 'paragraph', 'para_nr',
       'total_paras', 'token_count', 'token_count_covering_para',
       'tokens_count_covering_sentence', 'preceeding_tokens_in_sentence_count',
       'succeeding_tokens_in_sentence_count', 'token_ratio',
       'relative_position_in_para_char', 'is_in_intro',
       'relative_position_in_para_token', 'is_in_conclusion',
       'is_first_in_para', 'is_last_in_para', 'nr_preceeding_comps_in_para',
       'nr_following_comps_in_para', 'structural_fts_as_text',
       'structural_fts_as_text_combined', 'para_ratio', 'first_or_last',
       'strct_fts_w_position_in_essay', 'component_pos_tags',
       'strct_fts_essay_position_pos_tags'],
      dtype='object')

In [19]:
dataset_df.structural_fts_as_text_combined[0]

'Topic: Should students be taught to compete or to cooperate?, Sentence: First of all, through cooperation, children can learn about interpersonal skills which are significant in the future life of all students, Structural Features: 2, Yes, No, No, No'

#### get ams

In [20]:
with open("am_list.txt") as file_am:
    list_of_ams = []
    for line in file_am:
        list_of_ams.append(line.rstrip('\n'))

In [21]:
def am_in_prefix(list_a, list_b):
    
    equal_flag = 0
    
    for i in range(len(list_a)):
        
        sub_list = list_a[i:i+len(list_b)]
        
        if sub_list == list_b:
            
            equal_flag = 1
            break
        
        else:
            
            equal_flag = 0
            
    return equal_flag

In [22]:
def get_am(x, list_of_ams):
    
    am = []
    
    sentence = x.sentence
    component = x.text
    
    prefix = sentence[:sentence.find(component)]
    prefix = re.findall(r'\s|,|[^,\s]+', prefix) 
    prefix = [item for item in prefix if item != " "]
    prefix = [word.casefold() for word in prefix]
    
    list_of_ams = [am.casefold() for am in list_of_ams]
    list_of_ams = list(set(list_of_ams))
  
    for am_phrase in list_of_ams:
        
        am_words_list = re.findall(r'\s|,|[^,\s]+', am_phrase)
        am_words_list = [item for item in am_words_list if item != " "]
        
        if am_in_prefix(prefix, am_words_list) and len(am) == 0:
            
            am.append(am_phrase)
    
    return am         

In [23]:
dataset_df['am'] = dataset_df.apply(lambda x: get_am(x, list_of_ams), axis=1)

In [24]:
dataset_df['am']

0            [first of all]
1                        []
2                        []
3                        []
4       [on the other hand]
               ...         
5228                     []
5229              [because]
5230                     []
5231                   [so]
5232             [secondly]
Name: am, Length: 5233, dtype: object

### do group by for every required column

In [25]:
paras_df = dataset_df.groupby("paragraph", sort=False)

In [26]:
paras_df

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc70a794940>

In [27]:
df_by_paras_text = paras_df["text"].apply(list)

In [28]:
df_by_paras_text

paragraph
First of all, through cooperation, children can learn about interpersonal skills which are significant in the future life of all students. What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others. During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred. All of these skills help them to get on well with other people and will benefit them for the whole life.                                                                                                                                                                                             [through cooperation, children can learn about...
On the other hand, the significance of competition is that how to become more excellence to gain the victory. Hence it is always said th

In [29]:
df_by_paras_text = df_by_paras_text.reset_index()

In [30]:
df_by_paras_text

Unnamed: 0,paragraph,text
0,"First of all, through cooperation, children ca...","[through cooperation, children can learn about..."
1,"On the other hand, the significance of competi...",[the significance of competition is that how t...
2,"Firstly, maintaining one’s cultural identity i...",[maintaining one’s cultural identity is a key ...
3,"Secondly, it is crucial to keep one’s identity...","[it is crucial to keep one’s identity, they ne..."
4,"To conclude, although there are opposing ideas...",[there are opposing ideas of neglecting one’s ...
...,...,...
1343,"Of course, some celebrities, e.g. writers, pas...","[some celebrities, e.g. writers, passionate en..."
1344,The first reason why the father's role should ...,[The first reason why the father's role should...
1345,The second reason why I believe that fatherood...,[The second reason why I believe that fatheroo...
1346,"On the other hand, studying hard will give chi...",[studying hard will give children a better fut...


In [31]:
df_by_paras_labels = paras_df["label_RelationType"].apply(list).reset_index()

In [32]:
# Nice.

In [33]:
df_by_paras_split = paras_df['split_y'].first()

In [34]:
df_by_paras_split = df_by_paras_split.reset_index()

In [35]:
df_by_paras_split

Unnamed: 0,paragraph,split_y
0,"First of all, through cooperation, children ca...",TRAIN
1,"On the other hand, the significance of competi...",TRAIN
2,"Firstly, maintaining one’s cultural identity i...",TRAIN
3,"Secondly, it is crucial to keep one’s identity...",TRAIN
4,"To conclude, although there are opposing ideas...",TRAIN
...,...,...
1343,"Of course, some celebrities, e.g. writers, pas...",TRAIN
1344,The first reason why the father's role should ...,TRAIN
1345,The second reason why I believe that fatherood...,TRAIN
1346,"On the other hand, studying hard will give chi...",TRAIN


In [36]:
df_by_paras_ams = paras_df["am"].apply(list)

In [37]:
df_by_paras_ams

paragraph
First of all, through cooperation, children can learn about interpersonal skills which are significant in the future life of all students. What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others. During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred. All of these skills help them to get on well with other people and will benefit them for the whole life.                                                                                                                                                                                                                  [[first of all], [], [], []]
On the other hand, the significance of competition is that how to become more excellence to gain the victory. Hence it is always said th

In [38]:
df_by_paras_ams = df_by_paras_ams.reset_index()

In [39]:
df_by_paras_ams

Unnamed: 0,paragraph,am
0,"First of all, through cooperation, children ca...","[[first of all], [], [], []]"
1,"On the other hand, the significance of competi...","[[on the other hand], [hence], [however], [], []]"
2,"Firstly, maintaining one’s cultural identity i...","[[firstly], [], [], [], [thus]]"
3,"Secondly, it is crucial to keep one’s identity...","[[secondly], [secondly], [for instance], [], [..."
4,"To conclude, although there are opposing ideas...",[[although ]]
...,...,...
1343,"Of course, some celebrities, e.g. writers, pas...","[[], []]"
1344,The first reason why the father's role should ...,"[[], [], [for example]]"
1345,The second reason why I believe that fatherood...,"[[], [in my opinion], [for instance], [], [i b..."
1346,"On the other hand, studying hard will give chi...","[[on the other hand], [], [for instance], [], []]"


In [40]:
# get essays

In [41]:
df_by_paras_essay = paras_df['essay_nr'].first()

In [42]:
df_by_paras_essay = df_by_paras_essay.reset_index()

In [43]:
df_by_paras_essay

Unnamed: 0,paragraph,essay_nr
0,"First of all, through cooperation, children ca...",essay001
1,"On the other hand, the significance of competi...",essay001
2,"Firstly, maintaining one’s cultural identity i...",essay002
3,"Secondly, it is crucial to keep one’s identity...",essay002
4,"To conclude, although there are opposing ideas...",essay002
...,...,...
1343,"Of course, some celebrities, e.g. writers, pas...",essay399
1344,The first reason why the father's role should ...,essay401
1345,The second reason why I believe that fatherood...,essay401
1346,"On the other hand, studying hard will give chi...",essay402


In [44]:
# make the first two DFs lists of lists

In [45]:
def make_acs_l_o_l(x):
    
    list_acs = x.text
    
    new_list = [[ac] for ac in list_acs]
    
    return new_list

In [46]:
df_by_paras_text['paragraph_components_list'] = df_by_paras_text.apply(lambda x: make_acs_l_o_l(x), axis=1)

In [47]:
# ok now for the second labels df

In [48]:
def make_labels_l_o_l(x):
    
    list_labels = x.label_RelationType
    
    new_list = [[label] for label in list_labels]
    
    return new_list

In [49]:
df_by_paras_labels['paragraph_labels_list'] = df_by_paras_labels.apply(lambda x: make_labels_l_o_l(x), axis=1)

In [50]:
# ok seems correct.

In [51]:
# new df for strct fts

In [52]:
# df_by_paras_strct_fts = paras_df["structural_fts_as_text"].apply(list)
df_by_paras_strct_fts = paras_df["structural_fts_as_text_combined"].apply(list)

In [53]:
df_by_paras_strct_fts = df_by_paras_strct_fts.reset_index(drop=True)

In [54]:
# now combine the dataframes

In [55]:
df = pd.concat([df_by_paras_text, df_by_paras_labels, df_by_paras_ams, df_by_paras_split, df_by_paras_essay, df_by_paras_strct_fts], axis=1)

In [56]:
df

Unnamed: 0,paragraph,text,paragraph_components_list,paragraph.1,label_RelationType,paragraph_labels_list,paragraph.2,am,paragraph.3,split_y,paragraph.4,essay_nr,structural_fts_as_text_combined
0,"First of all, through cooperation, children ca...","[through cooperation, children can learn about...","[[through cooperation, children can learn abou...","First of all, through cooperation, children ca...","[Support, Support, Support, Support]","[[Support], [Support], [Support], [Support]]","First of all, through cooperation, children ca...","[[first of all], [], [], []]","First of all, through cooperation, children ca...",TRAIN,"First of all, through cooperation, children ca...",essay001,[Topic: Should students be taught to compete o...
1,"On the other hand, the significance of competi...",[the significance of competition is that how t...,[[the significance of competition is that how ...,"On the other hand, the significance of competi...","[Support, Attack, Support, Support, Support]","[[Support], [Attack], [Support], [Support], [S...","On the other hand, the significance of competi...","[[on the other hand], [hence], [however], [], []]","On the other hand, the significance of competi...",TRAIN,"On the other hand, the significance of competi...",essay001,[Topic: Should students be taught to compete o...
2,"Firstly, maintaining one’s cultural identity i...",[maintaining one’s cultural identity is a key ...,[[maintaining one’s cultural identity is a key...,"Firstly, maintaining one’s cultural identity i...","[Support, Support, Support, Support, Support]","[[Support], [Support], [Support], [Support], [...","Firstly, maintaining one’s cultural identity i...","[[firstly], [], [], [], [thus]]","Firstly, maintaining one’s cultural identity i...",TRAIN,"Firstly, maintaining one’s cultural identity i...",essay002,[Topic: More people are migrating to other cou...
3,"Secondly, it is crucial to keep one’s identity...","[it is crucial to keep one’s identity, they ne...","[[it is crucial to keep one’s identity], [they...","Secondly, it is crucial to keep one’s identity...","[Support, Support, Support, Support, Support, ...","[[Support], [Support], [Support], [Support], [...","Secondly, it is crucial to keep one’s identity...","[[secondly], [secondly], [for instance], [], [...","Secondly, it is crucial to keep one’s identity...",TRAIN,"Secondly, it is crucial to keep one’s identity...",essay002,[Topic: More people are migrating to other cou...
4,"To conclude, although there are opposing ideas...",[there are opposing ideas of neglecting one’s ...,[[there are opposing ideas of neglecting one’s...,"To conclude, although there are opposing ideas...",[Attack],[[Attack]],"To conclude, although there are opposing ideas...",[[although ]],"To conclude, although there are opposing ideas...",TRAIN,"To conclude, although there are opposing ideas...",essay002,[Topic: More people are migrating to other cou...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1343,"Of course, some celebrities, e.g. writers, pas...","[some celebrities, e.g. writers, passionate en...","[[some celebrities, e.g. writers, passionate e...","Of course, some celebrities, e.g. writers, pas...","[Support, Support]","[[Support], [Support]]","Of course, some celebrities, e.g. writers, pas...","[[], []]","Of course, some celebrities, e.g. writers, pas...",TRAIN,"Of course, some celebrities, e.g. writers, pas...",essay399,"[Topic: Drugs, alcohol and messy sex lives, Se..."
1344,The first reason why the father's role should ...,[The first reason why the father's role should...,[[The first reason why the father's role shoul...,The first reason why the father's role should ...,"[Support, Support, Support]","[[Support], [Support], [Support]]",The first reason why the father's role should ...,"[[], [], [for example]]",The first reason why the father's role should ...,TRAIN,The first reason why the father's role should ...,essay401,[Topic: Fatherhood should be as present as mot...
1345,The second reason why I believe that fatherood...,[The second reason why I believe that fatheroo...,[[The second reason why I believe that fathero...,The second reason why I believe that fatherood...,"[Support, Support, Support, Support, Support]","[[Support], [Support], [Support], [Support], [...",The second reason why I believe that fatherood...,"[[], [in my opinion], [for instance], [], [i b...",The second reason why I believe that fatherood...,TRAIN,The second reason why I believe that fatherood...,essay401,[Topic: Fatherhood should be as present as mot...
1346,"On the other hand, studying hard will give chi...",[studying hard will give children a better fut...,[[studying hard will give children a better fu...,"On the other hand, studying hard will give chi...","[Support, Support, Support, Support, Support]","[[Support], [Support], [Support], [Support], [...","On the other hand, studying hard will give chi...","[[on the other hand], [], [for instance], [], []]","On the other hand, studying hard will give chi...",TRAIN,"On the other hand, studying hard will give chi...",essay402,[Topic: Children should studying hard or playi...


In [57]:
# df['structural_fts_as_text'][6]

In [58]:
df = df.loc[:,~df.columns.duplicated()].copy()

In [59]:
df

Unnamed: 0,paragraph,text,paragraph_components_list,label_RelationType,paragraph_labels_list,am,split_y,essay_nr,structural_fts_as_text_combined
0,"First of all, through cooperation, children ca...","[through cooperation, children can learn about...","[[through cooperation, children can learn abou...","[Support, Support, Support, Support]","[[Support], [Support], [Support], [Support]]","[[first of all], [], [], []]",TRAIN,essay001,[Topic: Should students be taught to compete o...
1,"On the other hand, the significance of competi...",[the significance of competition is that how t...,[[the significance of competition is that how ...,"[Support, Attack, Support, Support, Support]","[[Support], [Attack], [Support], [Support], [S...","[[on the other hand], [hence], [however], [], []]",TRAIN,essay001,[Topic: Should students be taught to compete o...
2,"Firstly, maintaining one’s cultural identity i...",[maintaining one’s cultural identity is a key ...,[[maintaining one’s cultural identity is a key...,"[Support, Support, Support, Support, Support]","[[Support], [Support], [Support], [Support], [...","[[firstly], [], [], [], [thus]]",TRAIN,essay002,[Topic: More people are migrating to other cou...
3,"Secondly, it is crucial to keep one’s identity...","[it is crucial to keep one’s identity, they ne...","[[it is crucial to keep one’s identity], [they...","[Support, Support, Support, Support, Support, ...","[[Support], [Support], [Support], [Support], [...","[[secondly], [secondly], [for instance], [], [...",TRAIN,essay002,[Topic: More people are migrating to other cou...
4,"To conclude, although there are opposing ideas...",[there are opposing ideas of neglecting one’s ...,[[there are opposing ideas of neglecting one’s...,[Attack],[[Attack]],[[although ]],TRAIN,essay002,[Topic: More people are migrating to other cou...
...,...,...,...,...,...,...,...,...,...
1343,"Of course, some celebrities, e.g. writers, pas...","[some celebrities, e.g. writers, passionate en...","[[some celebrities, e.g. writers, passionate e...","[Support, Support]","[[Support], [Support]]","[[], []]",TRAIN,essay399,"[Topic: Drugs, alcohol and messy sex lives, Se..."
1344,The first reason why the father's role should ...,[The first reason why the father's role should...,[[The first reason why the father's role shoul...,"[Support, Support, Support]","[[Support], [Support], [Support]]","[[], [], [for example]]",TRAIN,essay401,[Topic: Fatherhood should be as present as mot...
1345,The second reason why I believe that fatherood...,[The second reason why I believe that fatheroo...,[[The second reason why I believe that fathero...,"[Support, Support, Support, Support, Support]","[[Support], [Support], [Support], [Support], [...","[[], [in my opinion], [for instance], [], [i b...",TRAIN,essay401,[Topic: Fatherhood should be as present as mot...
1346,"On the other hand, studying hard will give chi...",[studying hard will give children a better fut...,[[studying hard will give children a better fu...,"[Support, Support, Support, Support, Support]","[[Support], [Support], [Support], [Support], [...","[[on the other hand], [], [for instance], [], []]",TRAIN,essay402,[Topic: Children should studying hard or playi...


In [60]:
df = df.drop(columns=['text', 'label_RelationType'])

In [61]:
df = df.rename(columns={"am": "paragraph_markers_list"})

In [62]:
df = df.rename(columns={"split_y": "split"})

In [63]:
df

Unnamed: 0,paragraph,paragraph_components_list,paragraph_labels_list,paragraph_markers_list,split,essay_nr,structural_fts_as_text_combined
0,"First of all, through cooperation, children ca...","[[through cooperation, children can learn abou...","[[Support], [Support], [Support], [Support]]","[[first of all], [], [], []]",TRAIN,essay001,[Topic: Should students be taught to compete o...
1,"On the other hand, the significance of competi...",[[the significance of competition is that how ...,"[[Support], [Attack], [Support], [Support], [S...","[[on the other hand], [hence], [however], [], []]",TRAIN,essay001,[Topic: Should students be taught to compete o...
2,"Firstly, maintaining one’s cultural identity i...",[[maintaining one’s cultural identity is a key...,"[[Support], [Support], [Support], [Support], [...","[[firstly], [], [], [], [thus]]",TRAIN,essay002,[Topic: More people are migrating to other cou...
3,"Secondly, it is crucial to keep one’s identity...","[[it is crucial to keep one’s identity], [they...","[[Support], [Support], [Support], [Support], [...","[[secondly], [secondly], [for instance], [], [...",TRAIN,essay002,[Topic: More people are migrating to other cou...
4,"To conclude, although there are opposing ideas...",[[there are opposing ideas of neglecting one’s...,[[Attack]],[[although ]],TRAIN,essay002,[Topic: More people are migrating to other cou...
...,...,...,...,...,...,...,...
1343,"Of course, some celebrities, e.g. writers, pas...","[[some celebrities, e.g. writers, passionate e...","[[Support], [Support]]","[[], []]",TRAIN,essay399,"[Topic: Drugs, alcohol and messy sex lives, Se..."
1344,The first reason why the father's role should ...,[[The first reason why the father's role shoul...,"[[Support], [Support], [Support]]","[[], [], [for example]]",TRAIN,essay401,[Topic: Fatherhood should be as present as mot...
1345,The second reason why I believe that fatherood...,[[The second reason why I believe that fathero...,"[[Support], [Support], [Support], [Support], [...","[[], [in my opinion], [for instance], [], [i b...",TRAIN,essay401,[Topic: Fatherhood should be as present as mot...
1346,"On the other hand, studying hard will give chi...",[[studying hard will give children a better fu...,"[[Support], [Support], [Support], [Support], [...","[[on the other hand], [], [for instance], [], []]",TRAIN,essay402,[Topic: Children should studying hard or playi...


In [64]:
def is_equal(x):
    
    # flag = 0
    
    list_ac = x.paragraph_components_list
    list_am = x.paragraph_markers_list
    
    return len(list_ac) == len(list_am)

In [65]:
df['ac=am'] = df.apply(lambda x: is_equal(x), axis=1)

In [66]:
df['ac=am'].value_counts()

True    1348
Name: ac=am, dtype: int64

In [67]:
def is_equal_label(x):
    
    flag = 0
    
    list_ac = x.paragraph_components_list
    list_am = x.paragraph_labels_list
    
    if len(list_ac) == len(list_am):
        
        flag = 1
        
    else:
        
        flag = 0
        
    return flag

In [68]:
df['ac=lab'] = df.apply(lambda x: is_equal_label(x), axis=1)

In [69]:
df['ac=lab'].value_counts()

1    1348
Name: ac=lab, dtype: int64

In [70]:
# ok both correct.

In [71]:
df = df.drop(columns=df.columns[-1])

In [72]:
df = df.drop(columns=df.columns[-1])

In [73]:
df

Unnamed: 0,paragraph,paragraph_components_list,paragraph_labels_list,paragraph_markers_list,split,essay_nr,structural_fts_as_text_combined
0,"First of all, through cooperation, children ca...","[[through cooperation, children can learn abou...","[[Support], [Support], [Support], [Support]]","[[first of all], [], [], []]",TRAIN,essay001,[Topic: Should students be taught to compete o...
1,"On the other hand, the significance of competi...",[[the significance of competition is that how ...,"[[Support], [Attack], [Support], [Support], [S...","[[on the other hand], [hence], [however], [], []]",TRAIN,essay001,[Topic: Should students be taught to compete o...
2,"Firstly, maintaining one’s cultural identity i...",[[maintaining one’s cultural identity is a key...,"[[Support], [Support], [Support], [Support], [...","[[firstly], [], [], [], [thus]]",TRAIN,essay002,[Topic: More people are migrating to other cou...
3,"Secondly, it is crucial to keep one’s identity...","[[it is crucial to keep one’s identity], [they...","[[Support], [Support], [Support], [Support], [...","[[secondly], [secondly], [for instance], [], [...",TRAIN,essay002,[Topic: More people are migrating to other cou...
4,"To conclude, although there are opposing ideas...",[[there are opposing ideas of neglecting one’s...,[[Attack]],[[although ]],TRAIN,essay002,[Topic: More people are migrating to other cou...
...,...,...,...,...,...,...,...
1343,"Of course, some celebrities, e.g. writers, pas...","[[some celebrities, e.g. writers, passionate e...","[[Support], [Support]]","[[], []]",TRAIN,essay399,"[Topic: Drugs, alcohol and messy sex lives, Se..."
1344,The first reason why the father's role should ...,[[The first reason why the father's role shoul...,"[[Support], [Support], [Support]]","[[], [], [for example]]",TRAIN,essay401,[Topic: Fatherhood should be as present as mot...
1345,The second reason why I believe that fatherood...,[[The second reason why I believe that fathero...,"[[Support], [Support], [Support], [Support], [...","[[], [in my opinion], [for instance], [], [i b...",TRAIN,essay401,[Topic: Fatherhood should be as present as mot...
1346,"On the other hand, studying hard will give chi...",[[studying hard will give children a better fu...,"[[Support], [Support], [Support], [Support], [...","[[on the other hand], [], [for instance], [], []]",TRAIN,essay402,[Topic: Children should studying hard or playi...


In [74]:
def get_numeric_labels(x):
    
    numeric_labels = []
    
    string_labels = x.paragraph_labels_list
    
    for label in string_labels:
        
        if label == ['Support']:
            
            numeric_labels.append(0)
            
        elif label == ['Attack']:
            
            numeric_labels.append(1)
            
            
    return numeric_labels

In [75]:
df['paragraph_labels_numeric'] = df.apply(lambda x: get_numeric_labels(x), axis=1)

In [76]:
# correct the five problematic things in the DF.

In [77]:
df['paragraph_components_list'][409] = [['when children take jobs, they tend to be more responsible'],
 ['whether they can earn money or not will depend on their effectiveness and attitudes in working'],
 ['To keep their jobs, children will have to try their best to finish their duties'],
 ['if a child works as a shop assistant, he will have to keep an eye on good and products, calculate prices correctly and take responsibilities for any lost']]

In [78]:
df['paragraph_components_list'][473] = [['children need human interactions'],
 ['No matter how advanced technology will be, it is clear that a computer will never be able to communicate with its pupils'],
 ["As a recent interview with Josh Trumm – the founder of the 'Teachers Global Group'- reveals, human interactions will never be imitated by computers: 'This verbal and non-verbal communication between teachers and children is the first step towards a proper education"],
 ["Without someone who can explain and understand their problems, pupils cannot actually learn'"],
 ['only a teacher can feel and connect with pupils, be a friend and a model']]

In [79]:
df['paragraph_components_list'][771] = [['the individual should finance his or her education'],
 ['it would cause too much cost from taxpayers and the government']]

In [80]:
df['paragraph_components_list'][1061] = [['as parents are involved in the same environment'],
 ['they could thus support much more to their children, such as domain knowledge and industry-related information'],
 ['my father is a software programmer, and he indeed teaches me a lot in programming and some computer knowledge'],
 ['Whenever I met a difficulty, I would reach him for consulting'],
 ['he has many experiences'],
 ['I could save enormous time in finding solution'],
 ['This truly makes me more competitive than my colleagues'],
 ['If I worked in the different field from my father, I would have faced much difficulty than I do now']]

In [81]:
df['paragraph_components_list'][1470] = [['With rapid development of the internet, people are able to enjoy quick electronic communication via internet'],
 ['as lots of chatting apps available online such as "Weichat", people tend to send instant messages free of charge by using their phones rather than face to face communication'],
 ['mobile phones have shortened the distance of communication'],
 ['mobile phones have shortened the distance of communication']]

In [82]:
df['paragraph_components_list'][1470] = [['With rapid development of the internet, people are able to enjoy quick electronic communication via internet'],
  ['as lots of chatting apps available online such as "Weichat", people tend to send instant messages free of charge by using their phones rather than face to face communication'],
  ['mobile phones have shortened the distance of communication']]

In [83]:
df['paragraph_markers_list'][1470] = [[], ['for instance'], ['therefore,']]

In [84]:
df

Unnamed: 0,paragraph,paragraph_components_list,paragraph_labels_list,paragraph_markers_list,split,essay_nr,structural_fts_as_text_combined,paragraph_labels_numeric
0,"First of all, through cooperation, children ca...","[[through cooperation, children can learn abou...","[[Support], [Support], [Support], [Support]]","[[first of all], [], [], []]",TRAIN,essay001,[Topic: Should students be taught to compete o...,"[0, 0, 0, 0]"
1,"On the other hand, the significance of competi...",[[the significance of competition is that how ...,"[[Support], [Attack], [Support], [Support], [S...","[[on the other hand], [hence], [however], [], []]",TRAIN,essay001,[Topic: Should students be taught to compete o...,"[0, 1, 0, 0, 0]"
2,"Firstly, maintaining one’s cultural identity i...",[[maintaining one’s cultural identity is a key...,"[[Support], [Support], [Support], [Support], [...","[[firstly], [], [], [], [thus]]",TRAIN,essay002,[Topic: More people are migrating to other cou...,"[0, 0, 0, 0, 0]"
3,"Secondly, it is crucial to keep one’s identity...","[[it is crucial to keep one’s identity], [they...","[[Support], [Support], [Support], [Support], [...","[[secondly], [secondly], [for instance], [], [...",TRAIN,essay002,[Topic: More people are migrating to other cou...,"[0, 0, 0, 0, 0, 0]"
4,"To conclude, although there are opposing ideas...",[[there are opposing ideas of neglecting one’s...,[[Attack]],[[although ]],TRAIN,essay002,[Topic: More people are migrating to other cou...,[1]
...,...,...,...,...,...,...,...,...
1343,"Of course, some celebrities, e.g. writers, pas...","[[some celebrities, e.g. writers, passionate e...","[[Support], [Support]]","[[], []]",TRAIN,essay399,"[Topic: Drugs, alcohol and messy sex lives, Se...","[0, 0]"
1344,The first reason why the father's role should ...,[[The first reason why the father's role shoul...,"[[Support], [Support], [Support]]","[[], [], [for example]]",TRAIN,essay401,[Topic: Fatherhood should be as present as mot...,"[0, 0, 0]"
1345,The second reason why I believe that fatherood...,[[The second reason why I believe that fathero...,"[[Support], [Support], [Support], [Support], [...","[[], [in my opinion], [for instance], [], [i b...",TRAIN,essay401,[Topic: Fatherhood should be as present as mot...,"[0, 0, 0, 0, 0]"
1346,"On the other hand, studying hard will give chi...",[[studying hard will give children a better fu...,"[[Support], [Support], [Support], [Support], [...","[[on the other hand], [], [for instance], [], []]",TRAIN,essay402,[Topic: Children should studying hard or playi...,"[0, 0, 0, 0, 0]"


In [85]:
df.to_pickle('pe_dataset_for_stance_recog_kuri')

In [None]:
# pickle the df here
# then run it in the new clean mr cabessa dataset NB to get the nice dataset
# then use that in the kuri architecture on the correct notebook.

In [85]:
# get ACs

In [86]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [87]:
def get_ac_spans(x):
    
    paragraph = x.paragraph
    paragraph_tokenized = tokenizer.tokenize(paragraph)
    
    ac_list = x.paragraph_components_list
    
    ac_spans = []        
                
    for ac in ac_list:            

        ac_tokenized = tokenizer.tokenize(ac[0])
        
        for i in range(len(paragraph_tokenized)):
            
            span = (i, i+len(ac_tokenized)-1)
            
            if paragraph_tokenized[span[0]:span[1]+1] == ac_tokenized and span not in ac_spans:
                
                ac_spans.append(span)    
                 
    return ac_spans

In [88]:
df['ac_spans'] = df.apply(lambda x: get_ac_spans(x), axis=1)

In [89]:
ac_spans_list = df['ac_spans'].tolist()

In [90]:
def get_inter_span_intervals(ac_spans):
    
    if ac_spans[0][0] == 0:
        
        ac_inter_spans = [(0, 0)]
        
    else:
        
        ac_inter_spans = [(0, ac_spans[0][0]-1)]
      
    ac_inter_spans += [(ac_spans[i][1] + 1, ac_spans[i+1][0]-1) for i in range(len(ac_spans)-1)]
    
    return ac_inter_spans

In [91]:
def get_am_spans(x):
    
    
    # df_idx = df.index[df['paragraph'] == x.paragraph]
    # print(df_idx[0])
    paragraph = x.paragraph
    paragraph_tokenized = tokenizer.tokenize(paragraph)
    
    am_list = x.paragraph_markers_list
    
    ac_spans = x.ac_spans
    ac_inter_spans = get_inter_span_intervals(ac_spans)
    # print(ac_inter_spans)
    am_spans = [(-1, -1)] * len(am_list)
                
    for idx, am in enumerate(am_list):
        print('idx:', idx)
        print('ac inter spans:', len(ac_inter_spans))
        inter_span =  ac_inter_spans[idx] 
        
        if am:
            
            am_tokenized = tokenizer.tokenize(am[0])

            for i in range(inter_span[0], inter_span[1]):

                ac_prefix = paragraph_tokenized[inter_span[0]:inter_span[1]]

                span = (i, i+len(am_tokenized)-1)

                if paragraph_tokenized[span[0]:span[1]+1] == am_tokenized:
                
                    am_spans[idx] = span
    
    return am_spans

In [92]:
df['am_spans'] = df.apply(lambda x: get_am_spans(x), axis=1)

idx: 0
ac inter spans: 4
idx: 1
ac inter spans: 4
idx: 2
ac inter spans: 4
idx: 3
ac inter spans: 4
idx: 0
ac inter spans: 5
idx: 1
ac inter spans: 5
idx: 2
ac inter spans: 5
idx: 3
ac inter spans: 5
idx: 4
ac inter spans: 5
idx: 0
ac inter spans: 5
idx: 1
ac inter spans: 5
idx: 2
ac inter spans: 5
idx: 3
ac inter spans: 5
idx: 4
ac inter spans: 5
idx: 0
ac inter spans: 6
idx: 1
ac inter spans: 6
idx: 2
ac inter spans: 6
idx: 3
ac inter spans: 6
idx: 4
ac inter spans: 6
idx: 5
ac inter spans: 6
idx: 0
ac inter spans: 1
idx: 0
ac inter spans: 4
idx: 1
ac inter spans: 4
idx: 2
ac inter spans: 4
idx: 3
ac inter spans: 4
idx: 0
ac inter spans: 4
idx: 1
ac inter spans: 4
idx: 2
ac inter spans: 4
idx: 3
ac inter spans: 4
idx: 0
ac inter spans: 1
idx: 0
ac inter spans: 4
idx: 1
ac inter spans: 4
idx: 2
ac inter spans: 4
idx: 3
ac inter spans: 4
idx: 0
ac inter spans: 4
idx: 1
ac inter spans: 4
idx: 2
ac inter spans: 4
idx: 3
ac inter spans: 4
idx: 0
ac inter spans: 2
idx: 1
ac inter spans: 2


IndexError: list index out of range

In [None]:
# correct! 79 to 387 is the correction work we need.
# SANITY CHECK BELOW

In [None]:
df['am_spans'], df['ac_spans']

In [None]:
def sanity_check(x):
    
    return len(x.ac_spans) == len(x.am_spans) 

In [None]:
df['sanity'] = df.apply(lambda x: sanity_check(x), axis=1)

In [None]:
df['sanity'].value_counts()

In [None]:
# nice! correct. now all am_spans and ac_spans are equal in length.

In [None]:
df = df.drop(columns=df.columns[-1])

In [None]:
df

In [None]:
df = df.rename(columns={'am_spans' : 'paragraph_am_spans', 'ac_spans' : 'paragraph_ac_spans', 'paragraph_labels_numeric' : 'paragraph_labels'})

In [None]:
df

In [None]:
# sanity check: check if nr of paragraph strcts fts are the same as nr of paragraph ac spans for all paragraphs

In [None]:
def sanity_check(x):
    
    return len(x.structural_fts_as_text) == len(x.paragraph_ac_spans) 

In [None]:
# df['sanity'] = df.apply(lambda x: sanity_check(x), axis=1)

In [None]:
#df['sanity'].value_counts()

In [None]:
# def sanity_check(x):
    
#     return len(x.paragraph_components_list) == len(x.structural_fts_as_text) 

In [None]:
#df['sanity_2'] = df.apply(lambda x: sanity_check(x), axis=1)

In [None]:
#df['sanity_2'].value_counts()

In [None]:
# ok nice. both sanity checks done.
# now delete these extra columns

In [None]:
# df = df.drop(columns=df.columns[-1])
# df = df.drop(columns=df.columns[-1])

In [None]:
df

In [None]:
df = df.rename(columns={'structural_fts_as_text_combined' : 'paragraph_fts_as_txt'})

In [None]:
# now create the two new columns needed.
# first strcts as fts span.
# second get the paragraph concated with strct fts.

In [None]:
df

In [None]:
df.columns

In [None]:
df = df.rename(columns={"paragraph_fts_as_txt": "paragraph_fts_as_txt_list"})

In [None]:
df.columns

In [None]:
df['paragraph_components_list'][769] = [['it is a personal choice to go to a college'],
 ['Not everyone wants to go to a college'],
 ['People who want to make more money, and live a better life than other people decide to take a college education'],
 ['those people should make the financial investment in their own choice; not the government']]

# hackey correction to equalize acs and spans.

In [None]:
df['paragraph_markers_list'][769] = [[], [], [], ['therefore,']]

# hack to correct unequal am lists

In [None]:
# df['paragraph_fts_as_txt_list'][769] = ['Topic: Should the Government Provide Free College?, Sentence: Second, it is a personal choice to go to a college, Para Number: 3, First in Para: Yes, Last in Para: No, Is in Introduction: No, Is in Conclusion: No',
#  'Topic: Should the Government Provide Free College?, Sentence: Not everyone wants to go to a college, Para Number: 3, First in Para: No, Last in Para: No, Is in Introduction: No, Is in Conclusion: No',
#  'Topic: Should the Government Provide Free College?, Sentence: People who want to make more money, and live a better life than other people decide to take a college education, Para Number: 3, First in Para: No, Last in Para: No, Is in Introduction: No, Is in Conclusion: No',
#  'Topic: Should the Government Provide Free College?, Sentence: Therefore, those people should make the financial investment in their own choice; not the government., Para Number: 3, First in Para: No, Last in Para: No, Is in Introduction: No, Is in Conclusion: No']

In [None]:
# df['paragraph_fts_as_txt_list'][786] = ['Topic: Farmland, housing and industry or saving land for animals?, Sentence: Nowadays, there is a prevailing opinion that human needs for farmland, housing and industry are more important than saving land for endangered animals, Para Number: 1, First in Para: Yes, Last in Para: No, Is in Introduction: Yes, Is in Conclusion: No',
#  'Topic: Farmland, housing and industry or saving land for animals?, Sentence: Nowadays, there is a prevailing opinion that human needs for farmland, housing and industry are more important than saving land for endangered animals, Para Number: 1, First in Para: No, Last in Para: Yes, Is in Introduction: Yes, Is in Conclusion: No']

# # correction for exactly similar component

In [None]:
df['paragraph_fts_as_txt_list'][1717]

In [None]:
def get_paras_w_fts(x):
    
    paragraph = x.paragraph
    para_fts_as_txt = x.paragraph_fts_as_txt_list
    para_acs = x.paragraph_components_list
    para_ams = x.paragraph_markers_list
    # para_fts_as_txt_spans = x.paragraph_fts_as_txt_spans
    
    # print("len of list:", len(para_fts_as_txt))
    
    for am, ac, ac_fts_as_txt in zip(para_ams, para_acs, para_fts_as_txt):
        

        print('printy: ', ac_fts_as_txt)
        
        para_nr_idx = ac_fts_as_txt.index("Structural Features:")
        ac_fts_as_txt = ac_fts_as_txt[para_nr_idx:]
        
        fts_as_txt_new = ' [SEP] ' + ac_fts_as_txt + ' [SEP]'
        
        paragraph = paragraph.replace(ac[0], ac[0] + fts_as_txt_new, 1)
        # paragraph = new_para

        
    return paragraph    

In [None]:
# problematic. because of the last line, there is repetition.
# oh no. maybe it's correct.

In [None]:
df['paragraph_w_fts_as_txt'] = df.apply(lambda x: get_paras_w_fts(x), axis=1)

In [None]:
df['paragraph_w_fts_as_txt'][0]

In [None]:
df['paragraph_w_fts_as_txt'][786] = 'Nowadays, there is a prevailing opinion that human needs for farmland, housing and industry are more important [SEP] Structural Features: 1, Yes, Yes, Yes, No [SEP] than saving land for endangered animals. People who disagree with the point dispute that the decreasing of land for endangered animals will bring damage to ecological balance. As far as I am concerned, I agree with the opinion human needs for farmland, housing and industry are more important [SEP] 1, No, Yes, Yes, No [SEP]. The reasons are based on the following aspects.'

In [None]:
# now. 
# 1. find am_spans
# 2. find ac_spans
# 3. find fts_spans

In [None]:
def get_ac_spans_new(x):
    
    paragraph = x.paragraph_w_fts_as_txt
    paragraph_tokenized = tokenizer.tokenize(paragraph)
    
    ac_list = x.paragraph_components_list
    
    ac_spans = []        
                
    for ac in ac_list:            

        ac_tokenized = tokenizer.tokenize(ac[0])
        
        for i in range(len(paragraph_tokenized)):
            
            span = (i, i+len(ac_tokenized)-1)
            
            if paragraph_tokenized[span[0]:span[1]+1] == ac_tokenized and span not in ac_spans:
                
                ac_spans.append(span)    
                 
    return ac_spans

In [None]:
df['ac_spans_new'] = df.apply(lambda x: get_ac_spans_new(x), axis=1)

In [None]:
def sanity_check(x):
    
    return len(x.ac_spans_new) == len(x.paragraph_components_list) 

In [None]:
df['sanity_new'] = df.apply(lambda x: sanity_check(x), axis=1)

In [None]:
df['sanity_new'].value_counts()

In [None]:
def get_inter_span_intervals_new(ac_spans):
    
    if ac_spans[0][0] == 0:
        
        ac_inter_spans = [(0, 0)]
        
    else:
        
        ac_inter_spans = [(0, ac_spans[0][0]-1)]
      
    ac_inter_spans += [(ac_spans[i][1] + 1, ac_spans[i+1][0]-1) for i in range(len(ac_spans)-1)]
    
    return ac_inter_spans

In [None]:
def get_am_spans_new(x):
    
    
    # df_idx = df.index[df['paragraph_w_fts_as_txt'] == x.paragraph_w_fts_as_txt]
    # print(df_idx[0])
    paragraph = x.paragraph_w_fts_as_txt
    paragraph_tokenized = tokenizer.tokenize(paragraph)
    
    am_list = x.paragraph_markers_list
    
    ac_spans = x.ac_spans_new
    ac_inter_spans = get_inter_span_intervals_new(ac_spans)
    # print(ac_inter_spans)
    am_spans = [(-1, -1)] * len(am_list)
                
    for idx, am in enumerate(am_list):
        
        inter_span =  ac_inter_spans[idx] 
        
        if am:
            
            am_tokenized = tokenizer.tokenize(am[0])

            for i in range(inter_span[0], inter_span[1]):

                ac_prefix = paragraph_tokenized[inter_span[0]:inter_span[1]]

                span = (i, i+len(am_tokenized)-1)

                if paragraph_tokenized[span[0]:span[1]+1] == am_tokenized:
                
                    am_spans[idx] = span
    
    return am_spans

In [None]:
df['am_spans_new'] = df.apply(lambda x: get_am_spans_new(x), axis=1)

In [None]:
df['am_spans_new']

In [None]:
df.columns

In [None]:
df['paragraph_fts_as_txt_list']

In [None]:
def get_fts_spans_new(x):
    
    paragraph = x.paragraph_w_fts_as_txt
    paragraph_tokenized = tokenizer.tokenize(paragraph)
    
    fts_list = x.paragraph_fts_as_txt_list
    
    fts_spans = []        
                
    for fts in fts_list:    
        
        fts_str_idx = fts.index("Structural Features:")
        fts = fts[fts_str_idx:]

        fts_tokenized = tokenizer.tokenize(fts)
        
        for i in range(len(paragraph_tokenized)):
            
            span = (i, i+len(fts_tokenized)-1)
            
            if paragraph_tokenized[span[0]:span[1]+1] == fts_tokenized and span not in fts_spans:
                
                fts_spans.append(span)    
                 
    return fts_spans

In [None]:
df['feature_spans_new'] = df.apply(lambda x: get_fts_spans_new(x), axis=1)

In [None]:
df['feature_spans_new']

In [None]:
def sanity_check(x):
    
    return len(x.feature_spans_new) == len(x.paragraph_components_list) 

In [None]:
df['sanity_new'] = df.apply(lambda x: sanity_check(x), axis=1)

In [None]:
df['sanity_new'].value_counts()

In [None]:
df.feature_spans_new[786]

In [None]:
df.paragraph_components_list[786]

In [None]:
df.paragraph_w_fts_as_txt[786]

In [None]:
df.ac_spans_new[786], df.am_spans_new[786]

#### dataset

In [None]:
train_df = df[df.split=='TRAIN'].reset_index(drop=True)
test_df = df[df.split=='TEST'].reset_index(drop=True)

In [None]:
dataset_train = Dataset.from_pandas(train_df)
dataset_test = Dataset.from_pandas(test_df)

In [None]:
train_val_datasets = dataset_train.train_test_split(train_size=0.8)
dataset_train = train_val_datasets['train']
dataset_val = train_val_datasets['test']

In [None]:
dataset = DatasetDict({"train": dataset_train, "test": dataset_test, "validation": dataset_val})

In [None]:
dataset

In [None]:
dataset['train']['paragraph_labels'][0]

In [None]:
# torch.save(dataset, os.path.join("/notebooks/KURI-BERT/notebooks/full_formula_w_fts/Link_Identification_Task", 'pe_dataset_for_bert_minus_w_fts_combined_link_task.pt'))