# setup

In [None]:
from IPython.display import HTML, display

def set_css():
  """Display a <style> element that sets `white-space: pre-wrap` on <pre> blocks."""
  wrap_rule = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_rule)


# Run set_css before every cell execution so the rule is re-emitted each time.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
import os
import re
import pickle
from itertools import product
from collections import Counter
import random

import spacy
import numpy as np
import pandas as pd


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# operation preserving

## complex

### preprocess data

In [None]:
# import stop words
# https://gist.github.com/sebleier/554280
# nltk+SQL

stop_words_list = ["a","about","above","after","again","against","ain","all","am","an","and","any","are","aren","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can","couldn","couldn't","d","did","didn","didn't","do","does","doesn","doesn't","doing","don","don't","down","during","each","few","for","from","further","had","hadn","hadn't","has","hasn","hasn't","have","haven","haven't","having","he","her","here","hers","herself","him","himself","his","how","i","if","in","into","is","isn","isn't","it","it's","its","itself","just","ll","m","ma","me","mightn","mightn't","more","most","mustn","mustn't","my","myself","needn","needn't","no","nor","not","now","o","of","off","on","once","only","or","other","our","ours","ourselves","out","over","own","re","s","same","shan","shan't","she","she's","should","should've","shouldn","shouldn't","so","some","such","t","than","that","that'll","the","their","theirs","them","themselves","then","there","these","they","this","those","through","to","too","under","until","up","ve","very","was","wasn","wasn't","we","were","weren","weren't","what","when","where","which","while","who","whom","why","will","with","won","won't","wouldn","wouldn't","y","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","could","he'd","he'll","he's","here's","how's","i'd","i'll","i'm","i've","let's","ought","she'd","she'll","that's","there's","they'd","they'll","they're","they've","we'd","we'll","we're","we've","what's","when's","where's","who's","why's","would","able","abst","accordance","according","accordingly","across","act","actually","added","adj","affected","affecting","affects","afterwards","ah","almost","alone","along","already","also","although","always","among","amongst","announce","another","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","apparently","approximately","arent","arise","around","aside","ask","asking","auth","available","away","awfully","b"
,"back","became","become","becomes","becoming","beforehand","begin","beginning","beginnings","begins","behind","believe","beside","besides","beyond","biol","brief","briefly","c","ca","came","cannot","can't","cause","causes","certain","certainly","co","com","come","comes","contain","containing","contains","couldnt","date","different","done","downwards","due","e","ed","edu","effect","eg","eight","eighty","either","else","elsewhere","end","ending","enough","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","except","f","far","ff","fifth","first","five","fix","followed","following","follows","former","formerly","forth","found","four","furthermore","g","gave","get","gets","getting","give","given","gives","giving","go","goes","gone","got","gotten","h","happens","hardly","hed","hence","hereafter","hereby","herein","heres","hereupon","hes","hi","hid","hither","home","howbeit","however","hundred","id","ie","im","immediate","immediately","importance","important","inc","indeed","index","information","instead","invention","inward","itd","it'll","j","k","keep","keeps","kept","kg","km","know","known","knows","l","largely","last","lately","later","latter","latterly","least","less","lest","let","lets","like","liked","likely","line","little","'ll","look","looking","looks","ltd","made","mainly","make","makes","many","may","maybe","mean","means","meantime","meanwhile","merely","mg","might","million","miss","ml","moreover","mostly","mr","mrs","much","mug","must","n","na","name","namely","nay","nd","near","nearly","necessarily","necessary","need","needs","neither","never","nevertheless","new","next","nine","ninety","nobody","non","none","nonetheless","noone","normally","nos","noted","nothing","nowhere","obtain","obtained","obviously","often","oh","ok","okay","old","omitted","one","ones","onto","ord","others","otherwise","outside","overall","owing","p","page","pages","part","particular","particularly","past","per","perhaps","placed","pleas
e","plus","poorly","possible","possibly","potentially","pp","predominantly","present","previously","primarily","probably","promptly","proud","provides","put","q","que","quickly","quite","qv","r","ran","rather","rd","readily","really","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research","respectively","resulted","resulting","results","right","run","said","saw","say","saying","says","sec","section","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sent","seven","several","shall","shed","shes","show","showed","shown","showns","shows","significant","significantly","similar","similarly","since","six","slightly","somebody","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","still","stop","strongly","sub","substantially","successfully","sufficiently","suggest","sup","sure","take","taken","taking","tell","tends","th","thank","thanks","thanx","thats","that've","thence","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","thereto","thereupon","there've","theyd","theyre","think","thou","though","thoughh","thousand","throug","throughout","thru","thus","til","tip","together","took","toward","towards","tried","tries","truly","try","trying","ts","twice","two","u","un","unfortunately","unless","unlike","unlikely","unto","upon","ups","us","use","used","useful","usefully","usefulness","uses","using","usually","v","value","various","'ve","via","viz","vol","vols","vs","w","want","wants","wasnt","way","wed","welcome","went","werent","whatever","what'll","whats","whence","whenever","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","whim","whither","whod","whoever","whole","who'll","whomever","whos","whose","widely","willing","wish","within","without","wont","words","world","wouldnt","www","x","yes","yet","youd","youre","z","zero","a's","ain
't","allow","allows","apart","appear","appreciate","appropriate","associated","best","better","c'mon","c's","cant","changes","clearly","concerning","consequently","consider","considering","corresponding","course","currently","definitely","described","despite","entirely","exactly","example","going","greetings","hello","help","hopefully","ignored","inasmuch","indicate","indicated","indicates","inner","insofar","it'd","keep","keeps","novel","presumably","reasonably","second","secondly","sensible","serious","seriously","sure","t's","third","thorough","thoroughly","three","well","wonder"]

In [None]:
# Sentence-initial form of every stop word.
# capitalize() rather than title(): title() also upper-cases the letter that
# follows an apostrophe ("don't" -> "Don'T"), so capitalised contractions
# would never equal the form that actually appears in text ("Don't").
cap_stop_words_list = [word.capitalize() for word in stop_words_list]

In [None]:
# Capitalised variants first, then the lowercase originals; duplicates are kept.
total_stop_words_list = [*cap_stop_words_list, *stop_words_list]
len(total_stop_words_list)

1568

In [None]:
# load sentence for processing
#
# Each "<phase>.<language>" file under data_dir is read into a module-level
# list named "<phase>_<language>" (e.g. valid_complex), one entry per line.
# The names are collected in data_names so later cells can look the lists up
# through globals().

PHASES = ['valid', 'test']
LANGUAGES = ['complex']
data_names = []

data_dir = '/content/drive/MyDrive/muss/resources/datasets/asset/'

for phase, language in product(PHASES, LANGUAGES):

  data_name = phase + '_' + language
  data_names.append(data_name)

  with open(data_dir+phase+'.'+language) as f:
    # Bind the list directly instead of exec()-ing "<name> = <repr of list>":
    # same global as before, without turning file contents into source code.
    # The with-block closes the file, so no explicit close() is needed.
    globals()[data_name] = f.read().splitlines()

In [None]:
# Remove the first word of a sentence when its lowercase form is a stop word.
# The lists are edited in place, so running this cell again strips a further word.

for index, file_name in enumerate(data_names):

    data = globals()[file_name]

    for count in range(len(data)):

        sentence = data[count]
        first_word = sentence.partition(' ')[0]

        if first_word.lower() not in stop_words_list:
            continue

        # Cut the word together with the single space that follows it.
        data[count] = sentence[len(first_word) + 1:]

### NER

In [None]:
# spaCy pipeline used by find_all_NE() below to obtain entity spans (doc.ents).
nlp = spacy.load("en_core_web_sm")

def substringSieve(NER_list,re_NER):
  '''
  Merge two lists of strings and keep only those not contained in a longer one.

  Candidates are visited from longest to shortest (ties keep their merged
  order, NER_list entries before re_NER entries); a candidate is kept unless
  it is a substring of something already kept, which also removes repeats.

  Adapted from https://stackoverflow.com/questions/21720199/python-remove-any-element-from-a-list-of-strings-that-is-a-substring-of-anothe
  '''
  candidates = sorted(list(NER_list) + list(re_NER), key=len, reverse=True)

  kept = []
  for candidate in candidates:
      covered = False
      for longer in kept:
          if candidate in longer:
              covered = True
              break
      if not covered:
          kept.append(candidate)

  return kept

def find_all_NE(text):
  """Collect candidate named entities found in `text`.

  Two sources are merged:
    * the entity spans reported by the spaCy pipeline `nlp` (doc.ents), and
    * regex matches for runs of capitalised words (which may be linked by
      "in" / "of" / "the") or runs of digits.
  The merged candidates go through substringSieve, which drops any candidate
  contained in a longer one.

  Args:
    text: the sentence to scan.

  Returns:
    A list of strings; empty when neither source finds anything.
  """
  doc = nlp(text)

  NER_list = [t.text for t in doc.ents]

  # Two corrections to the pattern:
  #  * [A-Za-z] instead of [A-z]: the A-z range also spans the ASCII
  #    characters [ \ ] ^ _ ` that sit between "Z" and "a", so brackets and
  #    underscores were being absorbed into entity names.
  #  * [0-9]+ instead of (?:[0-9])*: the starred form matches the empty string
  #    at every position, filling the result with '' entries (and leaving
  #    [''] behind for sentences without any entity).
  re_NER = re.findall(r"[A-Z][A-Za-z]*(?:(?=\s)\s[A-Z][A-Za-z]*)*(?:(?!\s)\W[A-Z][A-Za-z]*)*(?:(?:\sin(?=\s))*(?:\sof(?=\s))*(?:\sthe(?=\s))*(?:(?=\s[A-Z])\s[A-Z][A-Za-z]*)*)*|[0-9]+",text)

  return substringSieve(NER_list,re_NER)

In [None]:
data_names


['valid_complex', 'test_complex']

In [None]:
# data_NER[k] holds, for the k-th name in data_names, one entity list per sentence.
data_NER = []

for index, file_name in enumerate(data_names):

    data = globals()[file_name]

    found_NER_list = []

    for count, sentence in enumerate(data):

        found_NER_list.append(find_all_NE(sentence))

        # Progress marker every 1000 sentences.
        if not count % 1000:
            print(file_name,count)

    data_NER.append(found_NER_list)

# Keep the raw extraction on disk so it does not have to be recomputed.
with open('/content/drive/MyDrive/muss/qualitative/NER_list_asset_0810_complex', 'wb') as fp:
    pickle.dump(data_NER, fp)

valid_complex 0
valid_complex 1000
test_complex 0


In [None]:
# Reload the entity lists written by the extraction cell above (same path).
# NOTE(review): pickle.load can execute code embedded in the file; only load pickles you produced yourself.
with open ('/content/drive/MyDrive/muss/qualitative/NER_list_asset_0810_complex', 'rb') as fp:
    data_NER = pickle.load(fp)

In [None]:
# Drop every extracted entity that is exactly a stop word (capitalised or lowercase).
for j, per_split in enumerate(data_NER):
  for i, entities in enumerate(per_split):
    per_split[i] = [NE for NE in entities if NE not in total_stop_words_list]

In [None]:
# Save the current data_NER under a separate "0825" file name (the "0810" file written earlier is left as is).
with open('/content/drive/MyDrive/muss/qualitative/NER_list_asset_0825_complex', 'wb') as fp:
    pickle.dump(data_NER, fp)    

In [None]:
# load original sentence
#
# Each split is read again from disk into a module-level list named
# "<phase>_<language>_ori" (e.g. valid_complex_ori), separate from the lists
# that had their leading stop word removed.

PHASES = ['valid', 'test']
LANGUAGES = ['complex']

data_dir = '/content/drive/MyDrive/muss/resources/datasets/asset/'

for phase, language in product(PHASES, LANGUAGES):
  data_name = phase + '_' + language + '_ori'

  with open(data_dir+phase+'.'+language) as f:
    # Direct binding replaces exec("<name> = <repr of list>"): same global,
    # no generated source code. The with-block closes the file on exit.
    globals()[data_name] = f.read().splitlines()

In [None]:
data_names

['valid_complex', 'test_complex']

In [None]:
import pandas as pd

# For every split, pair each original sentence with its entity list and publish
# the result as a global DataFrame named "<data_name>_df".
for index, data_name in enumerate(data_names):
  file_name = data_name + '_ori'

  # zip() stops at the shorter input, so both lists are expected to be aligned.
  data_tuples = list(zip(globals()[file_name], data_NER[index]))

  df_name = data_name + '_df'

  globals()[df_name] = pd.DataFrame(data_tuples, columns=['ori_text','NER']).reset_index(drop=True)

  file_name = data_name.replace('_','.')

In [None]:
valid_complex_df

Unnamed: 0,ori_text,NER
0,"Adjacent counties are Marin (to the south), Me...","[Contra Costa, Mendocino, Adjacent, Solano, Ma..."
1,A Georgian inscription around the drum attests...,[Georgian]
2,They would later return to the revived series ...,"[Christmas Special ""The Next Doctor, the Cyber..."
3,"Jameson's autobiography, How to Make Love Like...","[A Cautionary Tale, August 17, 2004, Make Love..."
4,It is particularly famous for the cultivation ...,[]
...,...,...
1995,On the 15 November 2003 a Sprinter train trave...,"[November 2003, Sprinter, Ballarat, Ballan, Go..."
1996,A console manufacturer is a company that manuf...,[]
1997,Links with poverty and crime Those who are fun...,[Links]
1998,"The term ""union council"" may be used for local...",[]


In [None]:
# Persist both frames (columns: ori_text, NER); these files are read back in the
# "prefix different number of NE" and "both" sections.
valid_complex_df.to_pickle("/content/drive/MyDrive/muss/valid_complex_df_0825.pkl")
test_complex_df.to_pickle("/content/drive/MyDrive/muss/test_complex_df_0825.pkl")

### dataset create

#### prefix all NE

In [None]:
# Output folder for the "all NE" dataset. The writing cell below opens files inside it
# without creating it, so the folder has to exist already.
target_dir = '/content/drive/MyDrive/muss/resources/datasets/token_asset_0825_complex'

In [None]:
import pandas as pd


# For every split, write one line per sentence in the form
#   " <NEXT_NE> e1 <NEXT_NE> e2 ... <SENT_START> original sentence"
# to target_dir/<phase>.<language>.
for index,data_name in enumerate(data_names):
  file_name = data_name+'_ori'

  data_tuples = list(zip(globals()[file_name],data_NER[index]))
  df = pd.DataFrame(data_tuples, columns=['ori_text','NER'])
  file_name = data_name.replace('_','.')

  with open(target_dir+'/'+file_name, 'w') as f:

    # row_index, not index: reusing `index` here overwrote the split counter
    # of the outer loop while the rows were being written.
    for row_index, row in df.iterrows():

        sentence = row.ori_text
        NER = row.NER

        sentence_w_control_token =  " <NEXT_NE> " + " <NEXT_NE> ".join(NER) + " <SENT_START> " + sentence

        # Print a sample line every 10000 rows (in practice: the first row of each split).
        if row_index%10000==0:
            print(row_index)
            print(sentence_w_control_token)

        f.write("{}\n".format(sentence_w_control_token))

  # No explicit f.close(): the with-block has already closed the file.


0
 <NEXT_NE> Contra Costa <NEXT_NE> Mendocino <NEXT_NE> Adjacent <NEXT_NE> Solano <NEXT_NE> Marin <NEXT_NE> Lake <NEXT_NE> Napa <SENT_START> Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).
0
 <NEXT_NE> Afro-Arab Abbala <NEXT_NE> Janjaweed <NEXT_NE> Sudanese <NEXT_NE> Rizeigat <SENT_START> One side of the armed conflicts is composed mainly of the Sudanese military and the Janjaweed, a Sudanese militia group recruited mostly from the Afro-Arab Abbala tribes of the northern Rizeigat region in Sudan.


#### prefix different number of NE

In [None]:

# Read back the frames stored with DataFrame.to_pickle (columns: ori_text, NER).
valid_df = pd.read_pickle('/content/drive/MyDrive/muss/valid_complex_df_0825.pkl')
test_df = pd.read_pickle('/content/drive/MyDrive/muss/test_complex_df_0825.pkl')

In [None]:
# Split names; make_dataset derives each output file name by replacing "_" with ".".
data_names = ['valid_complex', 'test_complex']

In [None]:
# Names of the global DataFrames, position-aligned with data_names;
# make_dataset below resolves them through globals().
df_list = ['valid_df','test_df']

In [None]:
def make_dataset(folder_name,data_names,num_NE,all_word=False,all_word_random=False):
    """Write a control-token-prefixed copy of every split into a new dataset folder.

    Each output line has the form
        " <NEXT_NE> item1 <NEXT_NE> item2 ... <SENT_START> original sentence"
    where the items are either the first `num_NE` entities of the row or, in
    all-word mode, every space-separated word of the sentence.

    The frames are not passed in: for the k-th entry of `data_names` the
    DataFrame is looked up as globals()[df_list[k]] and must provide the
    columns `ori_text` and `NER`.

    Args:
        folder_name: name of the folder created under .../muss/resources/datasets/.
        data_names: split names such as 'valid_complex'; "_" becomes "." in the file name.
        num_NE: maximum number of entities kept in the prefix (ignored in all-word mode).
        all_word: when true, prefix with the words of the sentence instead of entities.
        all_word_random: when true (all-word mode only), shuffle the words first;
            `random` is not seeded here, so the order differs between runs.

    Raises:
        FileExistsError: if the target folder already exists (os.makedirs is
            called without exist_ok, so an existing dataset is never overwritten).
    """
    target_dir = '/content/drive/MyDrive/muss/resources/datasets/'+folder_name
    os.makedirs(target_dir)

    for split_index, data_name in enumerate(data_names):
        df = globals()[df_list[split_index]]
        file_name = data_name.replace('_','.')

        with open(target_dir+'/'+file_name, 'w') as f:
            # row_index is kept distinct from split_index; sharing one name made
            # the row loop overwrite the split counter.
            for row_index, row in df.iterrows():
                sentence = row.ori_text

                if all_word:
                    prefix_items = sentence.split(' ')
                    if all_word_random:
                        random.shuffle(prefix_items)
                else:
                    # An empty slice joins to '', which yields the same
                    # " <NEXT_NE>  <SENT_START> " prefix the separate
                    # "no entities" branch used to build.
                    prefix_items = row.NER[:num_NE]

                sentence_w_control_token = " <NEXT_NE> " + " <NEXT_NE> ".join(prefix_items) + " <SENT_START> " + sentence

                # Print every 100th line as a sample.
                if row_index%100==0:
                    print(row_index)
                    print(sentence_w_control_token)

                f.write("{}\n".format(sentence_w_control_token))
        # The with-block closes the file; no explicit close() is required.


In [None]:
# create dataset with different number of named entities in prefix
# NOTE(review): the folder name says "all_words_random", but all_word and all_word_random
# are left at their False defaults, so this call writes the entity-prefix variant
# (up to 999 entities per sentence) — confirm which one was intended.
make_dataset('asset_complex_all_words_random_0828',data_names,999)

## simple

### NER

In [None]:
# load sentence for preprocessing
#
# For each of the 10 reference files ("<phase>.simple.<i>"): load both splits,
# strip a leading stop word from every sentence, extract entities, and append
# the result so that simple_NER_list[i][split][sentence] is a list of entities.
# This cell rebinds data_names, data_NER, valid_simple and test_simple.

PHASES = ['valid', 'test']
LANGUAGES = ['simple']

data_dir = '/content/drive/MyDrive/muss/resources/datasets/asset/'

simple_NER_list = []

for i in range(10):
    print('i',i)
    data_names = []

    for phase, language in product(PHASES, LANGUAGES):
        data_name = phase + '_' + language
        data_names.append(data_name)

        with open(data_dir+phase+'.'+language + '.' +str(i)) as f:
            # Direct binding replaces exec("<name> = <repr of list>"); the
            # with-block closes the file, so no explicit close() is needed.
            globals()[data_name] = f.read().splitlines()

    print('BEFORE: valid_simple[5]',valid_simple[5])

    # preprocess - delete first word in sentences if it is in stop words list
    for index,file_name in enumerate(data_names):  # data_names ['valid_simple', 'test_simple']

        data = globals()[file_name]

        for count, sentence in enumerate(data):

            first_word = sentence.split(' ')[0]

            if first_word.lower() in stop_words_list:
                # delete it (the word plus the space after it)
                data[count] = sentence[len(first_word)+1:]

    print('AFTER: valid_simple[5]',valid_simple[5])

    # find NE
    data_NER = []

    for index,file_name in enumerate(data_names):

        data = globals()[file_name]

        found_NER_list = []
        for count, sentence in enumerate(data):

            found_NER_list += [find_all_NE(sentence)]

            if count%1000 == 0:
                print(file_name,count)

        data_NER += [found_NER_list]

    simple_NER_list += [data_NER]
    print(len(simple_NER_list))

i 0
BEFORE: valid_simple[5] After years in the tag team division, Hardy took on The Undertaker in a ladder match for the Undisputed Championship.
AFTER: valid_simple[5] years in the tag team division, Hardy took on The Undertaker in a ladder match for the Undisputed Championship.
valid_simple 0
valid_simple 1000
test_simple 0
1
i 1
BEFORE: valid_simple[5] After years in the tag team, Hardy took on The Undertaker in a Championship match.
AFTER: valid_simple[5] years in the tag team, Hardy took on The Undertaker in a Championship match.
valid_simple 0
valid_simple 1000
test_simple 0
2
i 2
BEFORE: valid_simple[5] Hardy was previously in a tag team and fought The Undertaker in a ladder match for the championship belt.
AFTER: valid_simple[5] Hardy was previously in a tag team and fought The Undertaker in a ladder match for the championship belt.
valid_simple 0
valid_simple 1000
test_simple 0
3
i 3
BEFORE: valid_simple[5] Hardy took on The Undertaker. It was a ladder match for the Undisputed

In [None]:
# Save the per-reference entity lists (simple_NER_list) produced by the loop above.
with open('/content/drive/MyDrive/muss/qualitative/NER_list_asset_0810_simple', 'wb') as fp:
    pickle.dump(simple_NER_list, fp)    

In [None]:
# Reload simple_NER_list from the file written in the previous cell.
# NOTE(review): pickle.load can execute code embedded in the file; only load pickles you produced yourself.
with open ('/content/drive/MyDrive/muss/qualitative/NER_list_asset_0810_simple', 'rb') as fp:
    simple_NER_list = pickle.load(fp)

In [None]:
# delete stop words
# Indices: j = reference file, i = split, n = sentence.
# The per-sentence before/after prints were removed: they emit two lines for
# every sentence of every reference, enough for the notebook to truncate the
# output. To spot-check, print a handful of entries after the loop instead.
for j in range(len(simple_NER_list)):
  for i in range(len(simple_NER_list[j])):
    for n in range(len(simple_NER_list[j][i])):
      simple_NER_list[j][i][n] = [NE for NE in simple_NER_list[j][i][n] if NE not in total_stop_words_list]


[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
before: ['Companion of Honour', '1988', 'CH']
after: ['Companion of Honour', '1988', 'CH']
before: ['Loeche', 'Swiss', 'Onyx']
after: ['Loeche', 'Swiss', 'Onyx']
before: ['matchbook']
after: ['matchbook']
before: ['first']
after: []
before: ['Commune']
after: ['Commune']
before: ['ThunderClan in The Sight', 'Twolegs in Dawn', 'Graystripe', 'English', 'three']
after: ['ThunderClan in The Sight', 'Twolegs in Dawn', 'Graystripe', 'English']
before: ['Samovar & Porter', 'Americans', 'each day', 'Syrians', '1994', '84']
after: ['Samovar & Porter', 'Americans', 'each day', 'Syrians', '1994', '84']
before: ['']
after: ['']
before: ['4', '5']
after: ['4', '5']
before: ['Australian National University', 'David Lindenmeyer', 'Leadbeater', 'Dr']
after: ['Australian National University', 'David Lindenmeyer', 'Leadbeater', 'Dr']
before: ['Montreal Canadians', 'Quebec', 'Canada']
after: ['Montreal Canadians', 'Quebec', 'Canada']
before: ['Small']
after: ['Sma

In [None]:
# Save the current simple_NER_list under a separate "0825" file name.
with open('/content/drive/MyDrive/muss/qualitative/NER_list_asset_0825_simple', 'wb') as fp:
    pickle.dump(simple_NER_list, fp)    

### find NE that appear at least 3 times among 10 references

In [None]:
def remove_substring(item_list):
  """Return the strings of `item_list` that are not contained in a longer kept string.

  Strings are visited from longest to shortest (ties keep their input order);
  one is kept unless it is a substring of a string kept before it, which also
  removes exact repeats.

  Args:
    item_list: iterable of strings. It is not modified.

  Returns:
    A new list, ordered from longest to shortest.
  """
  out = []

  # sorted() works on a copy; the in-place .sort() used previously reordered
  # the caller's list as a side effect.
  for s in sorted(item_list, key=len, reverse=True):
      if not any(s in o for o in out):
          out.append(s)

  return out

In [None]:
# For every valid-split sentence, keep the entities that occur at least 3 times
# across all reference files. simple_NER_list[i][0] is the valid split of
# reference i (splits are stored in the order valid, test).
valid_intersection = []

# Sizes come from the data rather than the literals 2000 / 10, so a split or
# reference set of a different size is neither truncated nor indexed out of range.
for j in range(len(simple_NER_list[0][0])):

  all_NE_list = []

  for i in range(len(simple_NER_list)):

    all_NE_list.extend(simple_NER_list[i][0][j])

  c = Counter(all_NE_list)

  # Iterate the Counter (first-seen order) instead of going through set():
  # set order for strings changes between interpreter runs, and that order
  # decides ties in remove_substring and therefore which entities a later
  # [:num_NE] slice keeps.
  unique_NE = [x for x, times in c.items() if times >= 3 and x != '']

  unique_NE = [x for x in unique_NE if x not in cap_stop_words_list]

  valid_intersection.append(remove_substring(unique_NE))

  

In [None]:
# Sanity check: list the kept entities that are still stop words, to confirm
# that nothing significant slipped through.
# NOTE(review): test_intersection is only built in a later cell, so on a fresh
# kernel (Restart & Run All) this cell raises NameError — run it after that cell.
for lists in test_intersection:
  for x in lists:
    if x in total_stop_words_list:
      print(x)

two
first
two
three
four
first
seven
first
one
first
first
one
two
two
four
two
second
first
one
six
one
four
two
one
four
first
first
one
three
first
second
one
first
second
first
first
five
one
two
one
one
one


In [None]:
# Start the per-split container with the valid split at position 0.
intersection = [valid_intersection]

In [None]:
# For every test-split sentence, keep the entities that occur at least 3 times
# across all reference files. simple_NER_list[i][1] is the test split of
# reference i (splits are stored in the order valid, test).
test_intersection = []

for j in range(len(simple_NER_list[0][1])):

  all_NE_list = []

  # Number of references taken from the data rather than the literal 10.
  for i in range(len(simple_NER_list)):

    all_NE_list.extend(simple_NER_list[i][1][j])

  c = Counter(all_NE_list)

  # Iterate the Counter (first-seen order) instead of going through set():
  # set order for strings changes between interpreter runs, and that order
  # decides ties in remove_substring and therefore which entities a later
  # [:num_NE] slice keeps.
  unique_NE = [x for x, times in c.items() if times >= 3 and x != '']

  unique_NE = [x for x in unique_NE if x not in cap_stop_words_list]

  test_intersection.append(remove_substring(unique_NE))
  

In [None]:
# Add the test split after the valid split. Not idempotent: running this cell
# again appends test_intersection a second time.
intersection += [test_intersection]

In [None]:
# Persist the per-split intersection lists built in the cells above.
with open('/content/drive/MyDrive/muss/qualitative/asset_3_times_intersection_0825', 'wb') as fp:
    pickle.dump(intersection, fp)    

### dataset create

In [None]:
# load original sentence
#
# Re-reads the complex sentences into "<phase>_<language>_ori" globals and
# resets data_names to the complex split names (the NER section above had
# rebound it to the simple names).

PHASES = ['valid', 'test']
LANGUAGES = ['complex']
data_names = []

data_dir = '/content/drive/MyDrive/muss/resources/datasets/asset/'

for phase, language in product(PHASES, LANGUAGES):
  data_name = phase + '_' + language + '_ori'
  data_names.append(phase + '_' + language)

  with open(data_dir+phase+'.'+language) as f:
    # Direct binding replaces exec("<name> = <repr of list>"): same global,
    # no generated source code. The with-block closes the file on exit.
    globals()[data_name] = f.read().splitlines()

In [None]:
data_names

['valid_complex', 'test_complex']

In [None]:
import pandas as pd

# Pair each original complex sentence with its reference-intersection entities
# and publish the frame as the global "<data_name>_df". This rebinds
# valid_complex_df / test_complex_df, which an earlier section created with
# different columns.
for index, data_name in enumerate(data_names):
  file_name = data_name + '_ori'

  # zip() stops at the shorter input, so both lists are expected to be aligned.
  data_tuples = list(zip(globals()[file_name], intersection[index]))

  df_name = data_name + '_df'
  print(df_name)

  globals()[df_name] = pd.DataFrame(data_tuples, columns=['complex','NER']).reset_index(drop=True)

  file_name = data_name.replace('_','.')

valid_complex_df
test_complex_df


In [None]:
valid_complex_df

Unnamed: 0,complex,NER
0,"Adjacent counties are Marin (to the south), Me...","[Mendocino, Lake, Contra Costa, Adjacent, Sola..."
1,A Georgian inscription around the drum attests...,[Georgian]
2,They would later return to the revived series ...,"[Christmas Special ""The Next Doctor, the Cyber..."
3,"Jameson's autobiography, How to Make Love Like...","[A Cautionary Tale, August 17, 2004, Make Love..."
4,It is particularly famous for the cultivation ...,[]
...,...,...
1995,On the 15 November 2003 a Sprinter train trave...,"[November 15, 2003, November 2003, Ballarat, S..."
1996,A console manufacturer is a company that manuf...,[]
1997,Links with poverty and crime Those who are fun...,[Links]
1998,"The term ""union council"" may be used for local...",[]


In [None]:
test_complex_df

Unnamed: 0,complex,NER
0,One side of the armed conflicts is composed ma...,"[Afro-Arab Abbala, The Janjaweed, Rizeigat, Su..."
1,"Jeddah is the principal gateway to Mecca, Isla...","[Muslims, Jeddah, Islam, Mecca]"
2,The Great Dark Spot is thought to represent a ...,"[Great Dark Spot, Neptune]"
3,"His next work, Saturday, follows an especially...",[Saturday]
4,"The tarantula, the trickster character, spun a...",[east]
...,...,...
354,Although the name suggests that they are locat...,"[Parts of the Bernese Alps, Bernese Oberland, ..."
355,"There he had one daughter, later baptized as M...","[Mary Ann Fisher Power, one]"
356,"During an interview, Edward Gorey mentioned th...","[Edward Gorey, Bawden, one]"
357,The string can vibrate in different modes just...,"[electron, photon]"


In [None]:
# Persist the intersection frames (columns: complex, NER).
# NOTE(review): at this point valid_complex_df / test_complex_df hold the reference-intersection
# entities, not the complex-side extraction saved earlier under the same variable names; the
# "0810" suffix also differs from the "0825" used for the intersection pickle — confirm the naming.
valid_complex_df.to_pickle("/content/drive/MyDrive/muss/qualitative/asset_valid_df_0810.pkl")
test_complex_df.to_pickle("/content/drive/MyDrive/muss/qualitative/asset_test_df_0810.pkl")

In [None]:
data_names

['valid_complex', 'test_complex']

In [None]:
def make_dataset(folder_name,data_names,intersection,num_NE,all_word=False,all_word_random=False):
    """Write a control-token-prefixed copy of every split into a new dataset folder.

    Each output line has the form
        " <NEXT_NE> item1 <NEXT_NE> item2 ... <SENT_START> original sentence"
    where the items are either the first `num_NE` entities of the sentence or,
    in all-word mode, every space-separated word of the sentence.

    This definition replaces the earlier make_dataset in this notebook, which
    took no `intersection` argument. The sentences are not passed in: for each
    name in `data_names` they are read from the global list "<data_name>_ori".

    Args:
        folder_name: name of the folder created under .../muss/resources/datasets/.
        data_names: split names such as 'valid_complex'; "_" becomes "." in the file name.
        intersection: per split (same order as data_names), one entity list per sentence.
        num_NE: maximum number of entities kept in the prefix (ignored in all-word mode).
        all_word: when true, prefix with the words of the sentence instead of entities.
        all_word_random: when true (all-word mode only), shuffle the words first;
            `random` is not seeded here, so the order differs between runs.

    Raises:
        FileExistsError: if the target folder already exists (os.makedirs is
            called without exist_ok, so an existing dataset is never overwritten).
    """
    target_dir = '/content/drive/MyDrive/muss/resources/datasets/'+folder_name
    os.makedirs(target_dir)

    for split_index, data_name in enumerate(data_names):
        file_name = data_name+'_ori'
        data_tuples = list(zip(globals()[file_name],intersection[split_index]))
        df = pd.DataFrame(data_tuples, columns=['ori_text','NER'])
        file_name = data_name.replace('_','.')

        with open(target_dir+'/'+file_name, 'w') as f:
            # row_index is kept distinct from split_index; sharing one name made
            # the row loop overwrite the split counter.
            for row_index, row in df.iterrows():
                sentence = row.ori_text

                if all_word:
                    prefix_items = sentence.split(' ')
                    if all_word_random:
                        random.shuffle(prefix_items)
                else:
                    # An empty slice joins to '', which yields the same
                    # " <NEXT_NE>  <SENT_START> " prefix the separate
                    # "no entities" branch used to build.
                    prefix_items = row.NER[:num_NE]

                sentence_w_control_token = " <NEXT_NE> " + " <NEXT_NE> ".join(prefix_items) + " <SENT_START> " + sentence

                # Print every 100th line as a sample.
                if row_index%100==0:
                    print(row_index)
                    print(sentence_w_control_token)

                f.write("{}\n".format(sentence_w_control_token))
        # The with-block closes the file; no explicit close() is required.


In [None]:
# num_NE=0 keeps no entities, so every line gets the empty prefix " <NEXT_NE>  <SENT_START> ".
make_dataset('asset_simple_0NE_0825',data_names,intersection,0)

0
 <NEXT_NE>  <SENT_START> Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).
100
 <NEXT_NE>  <SENT_START> It aims to be fast and lightweight, while still being visually appealing and easy to use.
200
 <NEXT_NE>  <SENT_START> Warrington is a large town, borough and unitary authority area of Cheshire, England.
300
 <NEXT_NE>  <SENT_START> Alkaloids are produced by a large variety of organisms, including bacteria, fungi, plants, and animals and are part of the group of natural products (also called secondary metabolites).
400
 <NEXT_NE>  <SENT_START> Leizu shared her discoveries with others and the knowledge became widespread in China.
500
 <NEXT_NE>  <SENT_START> Earth is the only place in the universe where life has been confirmed to exist.
600
 <NEXT_NE>  <SENT_START> A synonym for "lolcat" is cat macro, since the images are a type of image macro.
700
 <NEXT_NE>  <SENT_START> Follo

## both

In [None]:

# Complex-side frames (columns: ori_text, NER), as stored with DataFrame.to_pickle.
valid_df = pd.read_pickle('/content/drive/MyDrive/muss/valid_complex_df_0825.pkl')
test_df = pd.read_pickle('/content/drive/MyDrive/muss/test_complex_df_0825.pkl')

In [None]:

# Reference-intersection frames (columns: complex, NER), as stored with DataFrame.to_pickle.
simple_valid_df = pd.read_pickle('/content/drive/MyDrive/muss/qualitative/asset_valid_df_0810.pkl')
simple_test_df = pd.read_pickle('/content/drive/MyDrive/muss/qualitative/asset_test_df_0810.pkl')

In [None]:
# Preview only: rename() returns a new frame and the result is not assigned,
# so valid_df itself keeps its "NER" column.
valid_df.rename(columns={"NER": "NER_complex"})

In [None]:
# Preview only: rename() returns a new frame and the result is not assigned,
# so simple_valid_df itself keeps its "NER" column.
simple_valid_df.rename(columns={"NER": "NER_simple"})

In [None]:
# Side-by-side view for the valid split: sentence, complex-side entities,
# reference-side entities. Columns are joined on the row index.
com_sim_valid_df = pd.concat(
    [
        valid_df.rename(columns={"NER": "NER_complex"}),
        simple_valid_df.drop(columns=['complex']).rename(columns={"NER": "NER_simple"}),
    ],
    axis=1,
)
com_sim_valid_df

Unnamed: 0,ori_text,NER_complex,NER_simple
0,"Adjacent counties are Marin (to the south), Me...","[Contra Costa, Mendocino, Adjacent, Solano, Ma...","[Mendocino, Lake, Contra Costa, Adjacent, Sola..."
1,A Georgian inscription around the drum attests...,[Georgian],[Georgian]
2,They would later return to the revived series ...,"[Christmas Special ""The Next Doctor, the Cyber...","[Christmas Special ""The Next Doctor, the Cyber..."
3,"Jameson's autobiography, How to Make Love Like...","[A Cautionary Tale, August 17, 2004, Make Love...","[A Cautionary Tale, August 17, 2004, Make Love..."
4,It is particularly famous for the cultivation ...,[],[]
...,...,...,...
1995,On the 15 November 2003 a Sprinter train trave...,"[November 2003, Sprinter, Ballarat, Ballan, Go...","[November 15, 2003, November 2003, Ballarat, S..."
1996,A console manufacturer is a company that manuf...,[],[]
1997,Links with poverty and crime Those who are fun...,[Links],[Links]
1998,"The term ""union council"" may be used for local...",[],[]


In [None]:
# Side-by-side view for the test split: sentence, complex-side entities,
# reference-side entities. Columns are joined on the row index.
com_sim_test_df = pd.concat(
    [
        test_df.rename(columns={"NER": "NER_complex"}),
        simple_test_df.drop(columns=['complex']).rename(columns={"NER": "NER_simple"}),
    ],
    axis=1,
)
com_sim_test_df

Unnamed: 0,ori_text,NER_complex,NER_simple
0,One side of the armed conflicts is composed ma...,"[Afro-Arab Abbala, Janjaweed, Sudanese, Rizeigat]","[Afro-Arab Abbala, The Janjaweed, Rizeigat, Su..."
1,"Jeddah is the principal gateway to Mecca, Isla...","[Muslims, Jeddah, Mecca, Islam]","[Muslims, Jeddah, Islam, Mecca]"
2,The Great Dark Spot is thought to represent a ...,"[Great Dark Spot, Neptune]","[Great Dark Spot, Neptune]"
3,"His next work, Saturday, follows an especially...",[Saturday],[Saturday]
4,"The tarantula, the trickster character, spun a...",[east],[east]
...,...,...,...
354,Although the name suggests that they are locat...,"[the Bernese Alps, Bernese Oberland, Obwalden,...","[Parts of the Bernese Alps, Bernese Oberland, ..."
355,"There he had one daughter, later baptized as M...",[Mary Ann Fisher Power],"[Mary Ann Fisher Power, one]"
356,"During an interview, Edward Gorey mentioned th...","[Edward Gorey, Bawden]","[Edward Gorey, Bawden, one]"
357,The string can vibrate in different modes just...,"[electron, photon]","[electron, photon]"


In [None]:
def substringSieve(NER_list,re_NER):
  """Merge two lists of strings and drop every string contained in a longer kept one.

  The strings of both inputs are pooled and visited from longest to shortest
  (ties keep their pooled order: NER_list first, then re_NER). A string is kept
  only if it is not a substring of any string kept so far, which also removes
  exact duplicates.

  Returns the kept strings, longest first. Empty inputs give an empty list.
  """
  pooled = [*NER_list, *re_NER]

  kept = []
  # Longest first: anything that could contain `candidate` has already been decided.
  for candidate in sorted(pooled, key=len, reverse=True):
    already_covered = False
    for longer in kept:
      if candidate in longer:
        already_covered = True
        break
    if not already_covered:
      kept.append(candidate)

  return kept

In [None]:
stop_words_list = ["a","about","above","after","again","against","ain","all","am","an","and","any","are","aren","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can","couldn","couldn't","d","did","didn","didn't","do","does","doesn","doesn't","doing","don","don't","down","during","each","few","for","from","further","had","hadn","hadn't","has","hasn","hasn't","have","haven","haven't","having","he","her","here","hers","herself","him","himself","his","how","i","if","in","into","is","isn","isn't","it","it's","its","itself","just","ll","m","ma","me","mightn","mightn't","more","most","mustn","mustn't","my","myself","needn","needn't","no","nor","not","now","o","of","off","on","once","only","or","other","our","ours","ourselves","out","over","own","re","s","same","shan","shan't","she","she's","should","should've","shouldn","shouldn't","so","some","such","t","than","that","that'll","the","their","theirs","them","themselves","then","there","these","they","this","those","through","to","too","under","until","up","ve","very","was","wasn","wasn't","we","were","weren","weren't","what","when","where","which","while","who","whom","why","will","with","won","won't","wouldn","wouldn't","y","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","could","he'd","he'll","he's","here's","how's","i'd","i'll","i'm","i've","let's","ought","she'd","she'll","that's","there's","they'd","they'll","they're","they've","we'd","we'll","we're","we've","what's","when's","where's","who's","why's","would","able","abst","accordance","according","accordingly","across","act","actually","added","adj","affected","affecting","affects","afterwards","ah","almost","alone","along","already","also","although","always","among","amongst","announce","another","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","apparently","approximately","arent","arise","around","aside","ask","asking","auth","available","away","awfully","b"
,"back","became","become","becomes","becoming","beforehand","begin","beginning","beginnings","begins","behind","believe","beside","besides","beyond","biol","brief","briefly","c","ca","came","cannot","can't","cause","causes","certain","certainly","co","com","come","comes","contain","containing","contains","couldnt","date","different","done","downwards","due","e","ed","edu","effect","eg","eight","eighty","either","else","elsewhere","end","ending","enough","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","except","f","far","ff","fifth","first","five","fix","followed","following","follows","former","formerly","forth","found","four","furthermore","g","gave","get","gets","getting","give","given","gives","giving","go","goes","gone","got","gotten","h","happens","hardly","hed","hence","hereafter","hereby","herein","heres","hereupon","hes","hi","hid","hither","home","howbeit","however","hundred","id","ie","im","immediate","immediately","importance","important","inc","indeed","index","information","instead","invention","inward","itd","it'll","j","k","keep","keeps","kept","kg","km","know","known","knows","l","largely","last","lately","later","latter","latterly","least","less","lest","let","lets","like","liked","likely","line","little","'ll","look","looking","looks","ltd","made","mainly","make","makes","many","may","maybe","mean","means","meantime","meanwhile","merely","mg","might","million","miss","ml","moreover","mostly","mr","mrs","much","mug","must","n","na","name","namely","nay","nd","near","nearly","necessarily","necessary","need","needs","neither","never","nevertheless","new","next","nine","ninety","nobody","non","none","nonetheless","noone","normally","nos","noted","nothing","nowhere","obtain","obtained","obviously","often","oh","ok","okay","old","omitted","one","ones","onto","ord","others","otherwise","outside","overall","owing","p","page","pages","part","particular","particularly","past","per","perhaps","placed","pleas
e","plus","poorly","possible","possibly","potentially","pp","predominantly","present","previously","primarily","probably","promptly","proud","provides","put","q","que","quickly","quite","qv","r","ran","rather","rd","readily","really","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research","respectively","resulted","resulting","results","right","run","said","saw","say","saying","says","sec","section","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sent","seven","several","shall","shed","shes","show","showed","shown","showns","shows","significant","significantly","similar","similarly","since","six","slightly","somebody","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","still","stop","strongly","sub","substantially","successfully","sufficiently","suggest","sup","sure","take","taken","taking","tell","tends","th","thank","thanks","thanx","thats","that've","thence","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","thereto","thereupon","there've","theyd","theyre","think","thou","though","thoughh","thousand","throug","throughout","thru","thus","til","tip","together","took","toward","towards","tried","tries","truly","try","trying","ts","twice","two","u","un","unfortunately","unless","unlike","unlikely","unto","upon","ups","us","use","used","useful","usefully","usefulness","uses","using","usually","v","value","various","'ve","via","viz","vol","vols","vs","w","want","wants","wasnt","way","wed","welcome","went","werent","whatever","what'll","whats","whence","whenever","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","whim","whither","whod","whoever","whole","who'll","whomever","whos","whose","widely","willing","wish","within","without","wont","words","world","wouldnt","www","x","yes","yet","youd","youre","z","zero","a's","ain
't","allow","allows","apart","appear","appreciate","appropriate","associated","best","better","c'mon","c's","cant","changes","clearly","concerning","consequently","consider","considering","corresponding","course","currently","definitely","described","despite","entirely","exactly","example","going","greetings","hello","help","hopefully","ignored","inasmuch","indicate","indicated","indicates","inner","insofar","it'd","keep","keeps","novel","presumably","reasonably","second","secondly","sensible","serious","seriously","sure","t's","third","thorough","thoroughly","three","well","wonder"]
# Title-cased variants ("the" -> "The") so capitalised stop words are matched as well.
cap_stop_words_list = [word.title() for word in stop_words_list]
# Lower-case entries first, followed by their title-cased counterparts.
stop_words_all = stop_words_list + cap_stop_words_list
len(stop_words_all)

1568

In [None]:
# Names of the module-level DataFrames to process; they are resolved through globals().
df_name = ['com_sim_valid_df','com_sim_test_df']

In [None]:
# For every sentence of every frame named in `df_name`, work out which named entities
# are shared by the complex-side and the simple-side NER lists (`intersect_NE`), and
# how much of each side that shared set covers (`complex_ratio` / `simple_ratio`).
intersect_NE = []

# NOTE(review): the original cell appended to `complex_ratio` / `simple_ratio` without
# creating them, so it only ran if some earlier cell had defined them. Lists that
# already exist are reused unchanged (results are appended, as before); otherwise fresh
# lists are created so the cell also runs on a clean kernel. Confirm whether they
# should instead be reset here, like `intersect_NE`.
complex_ratio = globals().get('complex_ratio', [])
simple_ratio = globals().get('simple_ratio', [])

for i in range(len(df_name)):
  tmp_complex_ratio = []
  tmp_simple_ratio = []
  tmp_NE = []

  # Resolve the frame once per split instead of three times per row.
  current_df = globals()[df_name[i]]

  for j in range(len(current_df)):

    if j % 10000 == 0:
      print(i,j)

    # Label-based lookup, as in the original; assumes the frame has a default
    # 0..n-1 index -- TODO confirm.
    a = current_df.NER_complex[j].copy()
    b = current_df.NER_simple[j].copy()

    # Entities of one side that are equal to, or contained in, an entity of the other side.
    a_in_b = [x for x in a for y in b if x in y]
    b_in_a = [x for x in b for y in a if x in y]

    # Token-level pass: split every entity on spaces so that entities whose boundaries
    # differ between the two sides can still be matched word by word.
    c = [token for entity in a for token in entity.split(' ')]  # e.g.  ['Germany', 'Stadtpfeifer', 'Italy', 'Pifferi', 'Stadspijpers', 'Holland']
    d = [token for entity in b for token in entity.split(' ')]  # e.g. ['Stadtpfeifer', 'in', 'Germany', 'Pifferi', 'in', 'Italy']

    intersection_1 = [x for ele in c for x in d if x in ele]  # tokens of d that are (sub)strings of a token of c
    intersection_2 = [x for ele in d for x in c if x in ele]  # tokens of c that are (sub)strings of a token of d

    # Merge entity-level and token-level matches, keeping only the longest spans,
    # then discard matches that are plain stop words.
    final_output = substringSieve(substringSieve(a_in_b,b_in_a),substringSieve(intersection_1,intersection_2))
    FINAL = [element for element in final_output if element not in stop_words_all]

    tmp_NE.append(FINAL)

    # Ratios are only recorded for sentences with at least one shared entity, so the
    # ratio lists can be shorter than the frame.
    if len(FINAL) != 0:

      # Distinct non-stop-word tokens on each side.
      cc = list(set([element for element in c if element not in stop_words_all]))
      dd = list(set([element for element in d if element not in stop_words_all]))

      # How many of those tokens appear verbatim as an element of FINAL.
      complex_in_inter_num = sum([x in FINAL for x in cc])
      simple_in_inter_num = sum([x in FINAL for x in dd])

      # NOTE(review): the numerator counts tokens while the denominator counts
      # entities, so the ratio can exceed 1 -- confirm this is intended.
      if len(a) != 0:
        complex_in_inter_ratio = complex_in_inter_num/len(a)
      else:
        complex_in_inter_ratio = np.nan

      if len(b) != 0:
        simple_in_inter_ratio = simple_in_inter_num/len(b)
      else:
        simple_in_inter_ratio = np.nan

      tmp_complex_ratio.append(complex_in_inter_ratio)
      tmp_simple_ratio.append(simple_in_inter_ratio)

  # One entry per processed frame, in `df_name` order.
  complex_ratio.append(tmp_complex_ratio)
  simple_ratio.append(tmp_simple_ratio)
  intersect_NE.append(tmp_NE)
    
  

0 0
1 0


In [None]:
len(intersect_NE)

2

In [None]:
# intersect_NE[0] holds the shared-entity lists of the first frame in df_name (validation).
com_sim_valid_df['co_occur_NER'] = intersect_NE[0]
com_sim_valid_df

Unnamed: 0,ori_text,NER_complex,NER_simple,co_occur_NER
0,"Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).","[Contra Costa, Mendocino, Adjacent, Solano, Marin, Lake, Napa]","[Mendocino, Lake, Contra Costa, Adjacent, Solano, Marin, Napa]","[Contra Costa, Mendocino, Adjacent, Solano, Marin, Lake, Napa]"
1,A Georgian inscription around the drum attests his name.,[Georgian],[Georgian],[Georgian]
2,"They would later return to the revived series in the 2008 Christmas Special ""The Next Doctor"", introducing two new variants of the race; the Cyber-Shades and the Cyber-King.","[Christmas Special ""The Next Doctor, the Cyber-Shades, the Cyber-King, 2008]","[Christmas Special ""The Next Doctor, the Cyber-Shades, the Cyber-King, 2008, two]","[Christmas Special ""The Next Doctor, the Cyber-Shades, the Cyber-King, 2008]"
3,"Jameson's autobiography, How to Make Love Like a Porn Star: A Cautionary Tale was published August 17, 2004.","[A Cautionary Tale, August 17, 2004, Make Love Like, Porn Star, Jameson]","[A Cautionary Tale, August 17, 2004, Make Love Like, Porn Star, Jameson]","[A Cautionary Tale, August 17, 2004, Make Love Like, Porn Star, Jameson]"
4,It is particularly famous for the cultivation of kiwifruit.,[],[],[]
...,...,...,...,...
1995,On the 15 November 2003 a Sprinter train travelling to Ballarat was derailed between Ballan and Gordon injuring 61 people when it hit a stationary car on a country railway crossing.,"[November 2003, Sprinter, Ballarat, Ballan, Gordon, 15, 61]","[November 15, 2003, November 2003, Ballarat, Sprinter, Ballan, Gordon, 61]","[November 2003, Sprinter, Ballarat, Ballan, Gordon, 15, 61]"
1996,A console manufacturer is a company that manufactures and distributes video game consoles.,[],[],[]
1997,"Links with poverty and crime Those who are functionally illiterate may be subject to social intimidation, health risks, stress, low income, and other pitfalls associated with their reading and writing deficits.",[Links],[Links],[Links]
1998,"The term ""union council"" may be used for localities that are part of cities.",[],[],[]


In [None]:
# intersect_NE[1] holds the shared-entity lists of the second frame in df_name (test).
com_sim_test_df['co_occur_NER'] = intersect_NE[1]
com_sim_test_df

Unnamed: 0,ori_text,NER_complex,NER_simple,co_occur_NER
0,"One side of the armed conflicts is composed mainly of the Sudanese military and the Janjaweed, a Sudanese militia group recruited mostly from the Afro-Arab Abbala tribes of the northern Rizeigat region in Sudan.","[Afro-Arab Abbala, Janjaweed, Sudanese, Rizeigat]","[Afro-Arab Abbala, The Janjaweed, Rizeigat, Sudanese]","[Afro-Arab Abbala, Janjaweed, Sudanese, Rizeigat]"
1,"Jeddah is the principal gateway to Mecca, Islam's holiest city, which able-bodied Muslims are required to visit at least once in their lifetime.","[Muslims, Jeddah, Mecca, Islam]","[Muslims, Jeddah, Islam, Mecca]","[Muslims, Jeddah, Mecca, Islam]"
2,The Great Dark Spot is thought to represent a hole in the methane cloud deck of Neptune.,"[Great Dark Spot, Neptune]","[Great Dark Spot, Neptune]","[Great Dark Spot, Neptune]"
3,"His next work, Saturday, follows an especially eventful day in the life of a successful neurosurgeon.",[Saturday],[Saturday],[Saturday]
4,"The tarantula, the trickster character, spun a black cord and, attaching it to the ball, crawled away fast to the east, pulling on the cord with all his strength.",[east],[east],[east]
...,...,...,...,...
354,"Although the name suggests that they are located in the Bernese Oberland region of the canton of Bern, portions of the Bernese Alps are in the adjacent cantons of Valais, Lucerne, Obwalden, Fribourg and Vaud.","[the Bernese Alps, Bernese Oberland, Obwalden, Fribourg, Lucerne, Valais, Vaud]","[Parts of the Bernese Alps, Bernese Oberland, Fribourg, Obwalden, Lucerne, Valais, Vaud]","[the Bernese Alps, Bernese Oberland, Obwalden, Fribourg, Lucerne, Valais, Vaud]"
355,"There he had one daughter, later baptized as Mary Ann Fisher Power, to Ann (e) Power.",[Mary Ann Fisher Power],"[Mary Ann Fisher Power, one]",[Mary Ann Fisher Power]
356,"During an interview, Edward Gorey mentioned that Bawden was one of his favorite artists, lamenting the fact that not many people remembered or knew about this fine artist.","[Edward Gorey, Bawden]","[Edward Gorey, Bawden, one]","[Edward Gorey, Bawden]"
357,"The string can vibrate in different modes just as a guitar string can produce different notes, and every mode appears as a different particle: electron, photon, gluon, etc.","[electron, photon]","[electron, photon]","[electron, photon]"


### create dataset

In [None]:
target_dir = '/content/drive/MyDrive/muss/resources/datasets/asset_both_0912'
# os.makedirs instead of `!mkdir`: it creates missing parent folders and does not fail
# when the directory already exists, so the cell can be re-run safely.
os.makedirs(target_dir, exist_ok=True)

In [None]:
# Output file stems; '_' is replaced by '.' when the files are written (valid_complex -> valid.complex).
data_names = ['valid_complex','test_complex']

In [None]:
# Names of the frames written out, in the same order as data_names; resolved through globals().
df_list = ['com_sim_valid_df','com_sim_test_df']

In [None]:
# Write one line per sentence for each split, prefixed with its control tokens:
#   " <NEXT_NE> e1 <NEXT_NE> e2 ... <SENT_START> sentence"
# The outer counter is named split_idx so the row loop no longer shadows it.
for split_idx, data_name in enumerate(data_names):

  chosen_df = globals()[df_list[split_idx]]
  file_name = data_name.replace('_','.')

  # The `with` block closes the file on exit; the extra f.close() that followed the loop
  # was redundant (and a NameError when no split is processed), so it is gone.
  # The encoding is pinned so the output does not depend on the platform default.
  with open(os.path.join(target_dir, file_name), 'w', encoding='utf-8') as f:

    for row_idx, row in chosen_df.iterrows():

      sentence = row.ori_text
      NER = row.co_occur_NER

      if isinstance(NER, float):
        # A float here is NaN, i.e. no entity list: emit only the bare control tokens.
        sentence_w_control_token = " <NEXT_NE> " + " <SENT_START> " + sentence
      else:
        sentence_w_control_token = " <NEXT_NE> " +" <NEXT_NE> ".join(NER) + " <SENT_START> " + sentence

      # Progress / sanity print for the first row and every 10000th row label.
      if row_idx%10000==0:
        print(row_idx)
        print(sentence_w_control_token)

      f.write("{}\n".format(sentence_w_control_token))


0
 <NEXT_NE> Contra Costa <NEXT_NE> Mendocino <NEXT_NE> Adjacent <NEXT_NE> Solano <NEXT_NE> Marin <NEXT_NE> Lake <NEXT_NE> Napa <SENT_START> Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).
0
 <NEXT_NE> Afro-Arab Abbala <NEXT_NE> Janjaweed <NEXT_NE> Sudanese <NEXT_NE> Rizeigat <SENT_START> One side of the armed conflicts is composed mainly of the Sudanese military and the Janjaweed, a Sudanese militia group recruited mostly from the Afro-Arab Abbala tribes of the northern Rizeigat region in Sudan.


# operation lexical simplification

In [None]:
import requests
import json
import ast
from multiprocessing import Pool


In [None]:
# API
# The endpoint and key are read from the environment so that no credential is stored
# in (or leaked through) the notebook. A missing variable raises KeyError immediately.
# NOTE(review): the two variable names are placeholders chosen in this edit -- rename
# them to whatever your deployment uses.
url = os.environ['WORD_DIFFICULTY_API_URL']

headers = {
    'Content-Type': 'application/json',
    'x-api-key' : os.environ['WORD_DIFFICULTY_API_KEY'],
}

In [None]:
# request word level difficulty
def get_data(text):
    """POST a single text to `url` (using the module-level `headers`) and return the decoded JSON reply.

    The request body is a JSON list with one item: {'id': 'test', 'text': text}.
    """
    payload = json.dumps([{'id':'test','text':text}])
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json()

In [None]:
# Query the API for every validation sentence, five worker processes at a time.
with Pool(5) as p:
    valid_result = p.map(get_data, valid_df['ori_text'])

# Fixes: the original dumped `result`, a name this notebook section never defines, instead
# of `valid_result`, and opened the folder itself (path ending in '/') for writing.
# NOTE(review): the file name is a placeholder chosen in this edit -- adjust as needed.
with open('/content/drive/MyDrive/muss/qualitative/asset_valid_word_difficulty_raw', 'wb') as fp:
    pickle.dump(valid_result, fp)

In [None]:
# Query the API for every test sentence, five worker processes at a time.
with Pool(5) as p:
    test_result = p.map(get_data, test_df['ori_text'])

# Fixes: the original dumped `result`, a name this notebook section never defines, instead
# of `test_result`, and opened the folder itself (path ending in '/') for writing.
# NOTE(review): the file name is a placeholder chosen in this edit -- adjust as needed.
with open('/content/drive/MyDrive/muss/qualitative/asset_test_word_difficulty_raw', 'wb') as fp:
    pickle.dump(test_result, fp)

In [None]:
# turn a raw API response body into a flat table
from pandas import json_normalize

def process_file(f):
  """Decode the JSON text `f` and flatten it into a DataFrame.

  Nested objects become dot-separated columns (e.g. 'read.A1'), as produced by
  pandas.json_normalize.
  """
  return json_normalize(json.loads(f))

In [None]:
# Turn every API response into a one-response frame. Responses without a 'body' are
# reported and replaced by an all-NaN frame with the same columns, so positions stay
# aligned with the input sentences.
#
# The original built that placeholder from valid_result[i-1]['body'], which silently
# read the LAST response when i == 0 and raised KeyError when the previous response
# had failed too. The placeholder is now derived from the first successful response.
valid_frames = []
failed_positions = []   # positions whose response had no 'body'
nan_frame = None        # all-NaN copy of the first successful frame

for i, response in enumerate(valid_result):

  if 'body' not in response.keys():
    print('error',i)
    valid_frames.append(None)   # placeholder slot, filled in below
    failed_positions.append(i)
  else:
    frame = process_file(response['body'])
    if nan_frame is None:
      nan_frame = frame.copy()
      nan_frame.loc[:] = np.nan
    valid_frames.append(frame)

if failed_positions and nan_frame is None:
  raise ValueError('no response contains a body; cannot build placeholder rows')

for i in failed_positions:
  valid_frames[i] = nan_frame.copy()

valid_df_ = pd.concat(valid_frames)


In [None]:
# Turn every API response into a one-response frame. Responses without a 'body' are
# reported and replaced by an all-NaN frame with the same columns, so positions stay
# aligned with the input sentences.
#
# The original built that placeholder from test_result[i-1]['body'], which silently
# read the LAST response when i == 0 and raised KeyError when the previous response
# had failed too. The placeholder is now derived from the first successful response.
test_frames = []
failed_positions = []   # positions whose response had no 'body'
nan_frame = None        # all-NaN copy of the first successful frame

for i, response in enumerate(test_result):

  if 'body' not in response.keys():
    print('error',i)
    test_frames.append(None)   # placeholder slot, filled in below
    failed_positions.append(i)
  else:
    frame = process_file(response['body'])
    if nan_frame is None:
      nan_frame = frame.copy()
      nan_frame.loc[:] = np.nan
    test_frames.append(frame)

if failed_positions and nan_frame is None:
  raise ValueError('no response contains a body; cannot build placeholder rows')

for i in failed_positions:
  test_frames[i] = nan_frame.copy()

test_df_ = pd.concat(test_frames)


In [None]:
test_df_

Unnamed: 0,id,read_level,words,read.A1,read.B1,read.B2,read.C1,read.overall
0,test,B2,"[{'id': 1, 'text': 'One', 'start_idx': 0, 'end...","[29.629629629629626, B2]","[14.814814814814813, B2]","[18.51851851851852, C1]","[0.0, A1]",B2
0,test,B1,"[{'id': 1, 'text': 'Jeddah', 'start_idx': 0, '...","[45.83333333333333, B1]","[12.5, B1]","[16.666666666666664, C1]","[0.0, A1]",B1
0,test,C1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...","[52.94117647058824, B1]","[23.52941176470588, C1]","[11.76470588235294, B2]","[0.0, A1]",C1
0,test,C1,"[{'id': 1, 'text': 'His', 'start_idx': 0, 'end...","[68.75, A2]","[18.75, C1]","[6.25, B1]","[0.0, A1]",C1
0,test,A1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...","[45.83333333333333, B1]","[4.166666666666666, A1]","[29.166666666666668, C2]","[0.0, A1]",A1
...,...,...,...,...,...,...,...,...
0,test,B2,"[{'id': 1, 'text': 'Although', 'start_idx': 0,...","[32.0, B2]","[16.0, B2]","[12.0, B2]","[8.0, C2]",B2
0,test,C2,"[{'id': 1, 'text': 'There', 'start_idx': 0, 'e...","[35.714285714285715, B2]","[7.142857142857142, A2]","[7.142857142857142, B1]","[7.142857142857142, C2]",C2
0,test,A1,"[{'id': 1, 'text': 'During', 'start_idx': 0, '...","[53.84615384615385, B1]","[3.8461538461538463, A1]","[3.8461538461538463, A2]","[0.0, A1]",A1
0,test,B1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...","[45.45454545454545, B1]","[13.636363636363635, B1]","[0.0, A1]","[0.0, A1]",B1


In [None]:
# Drop the request id and the read.* summary columns, then prefix the remaining
# columns with 'complex_'.
new_valid_df_ = (
    valid_df_
    .drop(columns=['id','read.A1','read.B1','read.B2','read.C1','read.overall'])
    .add_prefix('complex_')
)
new_test_df_ = (
    test_df_
    .drop(columns=['id','read.A1','read.B1','read.B2','read.C1','read.overall'])
    .add_prefix('complex_')
)

## validation set

In [None]:
# Renumber the API rows 0..n-1 so they line up with valid_df, join the two frames
# column-wise, and discard the helper 'index' column created by reset_index().
valid_final_df = pd.concat(
    [
        new_valid_df_.reset_index(),
        valid_df.drop(columns=['NER']),
    ],
    axis=1,
).drop(columns='index')
valid_final_df

Unnamed: 0,complex_read_level,complex_words,ori_text
0,A1,"[{'id': 1, 'text': 'Adjacent', 'start_idx': 0,...","Adjacent counties are Marin (to the south), Me..."
1,B1,"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i...",A Georgian inscription around the drum attests...
2,A1,"[{'id': 1, 'text': 'They', 'start_idx': 0, 'en...",They would later return to the revived series ...
3,B1,"[{'id': 1, 'text': 'Jameson', 'start_idx': 0, ...","Jameson's autobiography, How to Make Love Like..."
4,B2,"[{'id': 1, 'text': 'It', 'start_idx': 0, 'end_...",It is particularly famous for the cultivation ...
...,...,...,...
1995,B1,"[{'id': 1, 'text': 'On', 'start_idx': 0, 'end_...",On the 15 November 2003 a Sprinter train trave...
1996,A1,"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i...",A console manufacturer is a company that manuf...
1997,B1,"[{'id': 1, 'text': 'Links', 'start_idx': 0, 'e...",Links with poverty and crime Those who are fun...
1998,A1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...","The term ""union council"" may be used for local..."


In [None]:
def get_word_lists(row,difficulty_level_list_like):
  """Collect the words of a row whose difficulty level is among the requested ones.

  row: mapping/Series whose 'complex_words' entry is an iterable of dicts with
       at least the keys 'read_level' and 'text'.
  difficulty_level_list_like: container of accepted 'read_level' values.

  Returns the matching 'text' values in their original order, or np.nan when
  no word matches.
  """
  matching_words = [
      word['text']
      for word in row['complex_words']
      if word['read_level'] in difficulty_level_list_like
  ]

  return matching_words if matching_words else np.nan

In [None]:
# One entry per row of valid_final_df: its C2/C1/B2 words, or NaN when there are none.
C2C1B2_word_lists = [
    get_word_lists(row,['C2','C1','B2'])
    for _, row in valid_final_df.iterrows()
]

In [None]:
# Store the per-row hard-word lists as a new column (adds the column to valid_final_df in place).
valid_final_df['C2C1B2_in_complex'] = C2C1B2_word_lists

In [None]:
valid_final_df

Unnamed: 0,complex_read_level,complex_words,ori_text,C2C1B2_in_complex
0,A1,"[{'id': 1, 'text': 'Adjacent', 'start_idx': 0,...","Adjacent counties are Marin (to the south), Me...","[Adjacent, counties, Mendocino, Napa, Solano, ..."
1,B1,"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i...",A Georgian inscription around the drum attests...,"[Georgian, attests]"
2,A1,"[{'id': 1, 'text': 'They', 'start_idx': 0, 'en...",They would later return to the revived series ...,"[revived, Special, Doctor, Cyber, Shades, Cyber]"
3,B1,"[{'id': 1, 'text': 'Jameson', 'start_idx': 0, ...","Jameson's autobiography, How to Make Love Like...","[Jameson, Love, Porn, Cautionary, Tale]"
4,B2,"[{'id': 1, 'text': 'It', 'start_idx': 0, 'end_...",It is particularly famous for the cultivation ...,[cultivation]
...,...,...,...,...
1995,B1,"[{'id': 1, 'text': 'On', 'start_idx': 0, 'end_...",On the 15 November 2003 a Sprinter train trave...,"[Sprinter, derailed, Ballan, Gordon, stationary]"
1996,A1,"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i...",A console manufacturer is a company that manuf...,"[console, manufacturer, manufactures, distribu..."
1997,B1,"[{'id': 1, 'text': 'Links', 'start_idx': 0, 'e...",Links with poverty and crime Those who are fun...,"[Links, poverty, crime, functionally, illitera..."
1998,A1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...","The term ""union council"" may be used for local...",[localities]


In [None]:
# named entities not filtered out

# Checkpoint valid_final_df (C2C1B2_in_complex still contains named entities) to Drive.
with open('/content/drive/MyDrive/muss/qualitative/ABCD_asset_valid_C2C1B2', 'wb') as fp:
    pickle.dump(valid_final_df, fp)

In [None]:
# filter out hard words that are detected named entities
 
# Reload the frame from the path the previous cell pickled valid_final_df to.
# pickle.load can execute arbitrary code: only load files this notebook produced.
with open ('/content/drive/MyDrive/muss/qualitative/ABCD_asset_valid_C2C1B2', 'rb') as fp:
    valid_df_ABCD = pickle.load(fp)


In [None]:
# Join the NER frame with the reloaded word-level annotations (column-wise, matched on
# the index); the level, hard-word and duplicate text columns are dropped first.
FINAL_VALID_DF = pd.concat(
    [
        com_sim_valid_df,
        valid_df_ABCD.drop(columns=['complex_read_level','C2C1B2_in_complex','ori_text']),
    ],
    axis=1,
)
FINAL_VALID_DF

Unnamed: 0,ori_text,NER_complex,NER_simple,complex_words
0,"Adjacent counties are Marin (to the south), Me...","[Contra Costa, Mendocino, Adjacent, Solano, Ma...","[Mendocino, Lake, Contra Costa, Adjacent, Sola...","[{'id': 1, 'text': 'Adjacent', 'start_idx': 0,..."
1,A Georgian inscription around the drum attests...,[Georgian],[Georgian],"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i..."
2,They would later return to the revived series ...,"[Christmas Special ""The Next Doctor, the Cyber...","[Christmas Special ""The Next Doctor, the Cyber...","[{'id': 1, 'text': 'They', 'start_idx': 0, 'en..."
3,"Jameson's autobiography, How to Make Love Like...","[A Cautionary Tale, August 17, 2004, Make Love...","[A Cautionary Tale, August 17, 2004, Make Love...","[{'id': 1, 'text': 'Jameson', 'start_idx': 0, ..."
4,It is particularly famous for the cultivation ...,[],[],"[{'id': 1, 'text': 'It', 'start_idx': 0, 'end_..."
...,...,...,...,...
1995,On the 15 November 2003 a Sprinter train trave...,"[November 2003, Sprinter, Ballarat, Ballan, Go...","[November 15, 2003, November 2003, Ballarat, S...","[{'id': 1, 'text': 'On', 'start_idx': 0, 'end_..."
1996,A console manufacturer is a company that manuf...,[],[],"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i..."
1997,Links with poverty and crime Those who are fun...,[Links],[Links],"[{'id': 1, 'text': 'Links', 'start_idx': 0, 'e..."
1998,"The term ""union council"" may be used for local...",[],[],"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end..."


In [None]:
def get_word_lists(row,difficulty_level_list_like):
  """Collect the hard words of a row that are not part of any detected named entity.

  This cell rebinds the name get_word_lists; unlike the unfiltered variant it also
  reads row['NER_complex'].

  row: mapping/Series with
       'complex_words' - iterable of dicts with the keys 'read_level' and 'text';
       'NER_complex'   - iterable of entity strings.
  difficulty_level_list_like: container of accepted 'read_level' values.

  Returns the matching 'text' values in their original order, or np.nan when
  nothing is left.
  """
  kept_words = []

  for word in row['complex_words']:
    if word['read_level'] not in difficulty_level_list_like:
      continue

    text = word['text']
    # Substring test: the word is skipped when it occurs anywhere inside an entity
    # string, not only when it equals a whole entity.
    if any(text in entity for entity in row['NER_complex']):
      continue

    kept_words.append(text)

  return kept_words if kept_words else np.nan

In [None]:
# One entry per row of FINAL_VALID_DF: its C2/C1/B2 words, or NaN when there are none.
C2C1B2_word_lists = [
    get_word_lists(row,['C2','C1','B2'])
    for _, row in FINAL_VALID_DF.iterrows()
]

In [None]:
# Store the per-row hard-word lists as a new column (adds the column to FINAL_VALID_DF in place).
FINAL_VALID_DF['C2C1B2_in_complex'] = C2C1B2_word_lists

In [None]:
FINAL_VALID_DF

Unnamed: 0,ori_text,NER_complex,NER_simple,complex_words,C2C1B2_in_complex
0,"Adjacent counties are Marin (to the south), Me...","[Contra Costa, Mendocino, Adjacent, Solano, Ma...","[Mendocino, Lake, Contra Costa, Adjacent, Sola...","[{'id': 1, 'text': 'Adjacent', 'start_idx': 0,...",[counties]
1,A Georgian inscription around the drum attests...,[Georgian],[Georgian],"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i...",[attests]
2,They would later return to the revived series ...,"[Christmas Special ""The Next Doctor, the Cyber...","[Christmas Special ""The Next Doctor, the Cyber...","[{'id': 1, 'text': 'They', 'start_idx': 0, 'en...",[revived]
3,"Jameson's autobiography, How to Make Love Like...","[A Cautionary Tale, August 17, 2004, Make Love...","[A Cautionary Tale, August 17, 2004, Make Love...","[{'id': 1, 'text': 'Jameson', 'start_idx': 0, ...",
4,It is particularly famous for the cultivation ...,[],[],"[{'id': 1, 'text': 'It', 'start_idx': 0, 'end_...",[cultivation]
...,...,...,...,...,...
1995,On the 15 November 2003 a Sprinter train trave...,"[November 2003, Sprinter, Ballarat, Ballan, Go...","[November 15, 2003, November 2003, Ballarat, S...","[{'id': 1, 'text': 'On', 'start_idx': 0, 'end_...","[derailed, stationary]"
1996,A console manufacturer is a company that manuf...,[],[],"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i...","[console, manufacturer, manufactures, distribu..."
1997,Links with poverty and crime Those who are fun...,[Links],[Links],"[{'id': 1, 'text': 'Links', 'start_idx': 0, 'e...","[poverty, crime, functionally, illiterate, int..."
1998,"The term ""union council"" may be used for local...",[],[],"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...",[localities]


In [None]:
# Persist the final validation frame (including the C2C1B2_in_complex column) to Drive.
with open('/content/drive/MyDrive/muss/qualitative/asset_ABCD_valid_0911', 'wb') as fp:
    pickle.dump(FINAL_VALID_DF, fp)

## test set

In [None]:
# Renumber the API rows 0..n-1 so they line up with test_df, join the two frames
# column-wise, and discard the helper 'index' column created by reset_index().
test_final_df = pd.concat(
    [
        new_test_df_.reset_index(),
        test_df.drop(columns=['NER']),
    ],
    axis=1,
).drop(columns='index')
test_final_df

Unnamed: 0,complex_read_level,complex_words,ori_text
0,B2,"[{'id': 1, 'text': 'One', 'start_idx': 0, 'end...",One side of the armed conflicts is composed ma...
1,B1,"[{'id': 1, 'text': 'Jeddah', 'start_idx': 0, '...","Jeddah is the principal gateway to Mecca, Isla..."
2,C1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...",The Great Dark Spot is thought to represent a ...
3,C1,"[{'id': 1, 'text': 'His', 'start_idx': 0, 'end...","His next work, Saturday, follows an especially..."
4,A1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...","The tarantula, the trickster character, spun a..."
...,...,...,...
354,B2,"[{'id': 1, 'text': 'Although', 'start_idx': 0,...",Although the name suggests that they are locat...
355,C2,"[{'id': 1, 'text': 'There', 'start_idx': 0, 'e...","There he had one daughter, later baptized as M..."
356,A1,"[{'id': 1, 'text': 'During', 'start_idx': 0, '...","During an interview, Edward Gorey mentioned th..."
357,B1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...",The string can vibrate in different modes just...


In [None]:
def get_word_lists(row,difficulty_level_list_like):
  """Collect the words of a row whose difficulty level is among the requested ones.

  This cell rebinds the name get_word_lists; this variant does not look at named entities.

  row: mapping/Series whose 'complex_words' entry is an iterable of dicts with
       at least the keys 'read_level' and 'text'.
  difficulty_level_list_like: container of accepted 'read_level' values.

  Returns the matching 'text' values in their original order, or np.nan when
  no word matches.
  """
  matching_words = [
      word['text']
      for word in row['complex_words']
      if word['read_level'] in difficulty_level_list_like
  ]

  return matching_words if matching_words else np.nan

In [None]:
# One entry per row of test_final_df: its C2/C1/B2 words, or NaN when there are none.
C2C1B2_word_lists = [
    get_word_lists(row,['C2','C1','B2'])
    for _, row in test_final_df.iterrows()
]

In [None]:
# Store the per-row hard-word lists as a new column (adds the column to test_final_df in place).
test_final_df['C2C1B2_in_complex'] = C2C1B2_word_lists

In [None]:
test_final_df

Unnamed: 0,complex_read_level,complex_words,ori_text,C2C1B2_in_complex
0,B2,"[{'id': 1, 'text': 'One', 'start_idx': 0, 'end...",One side of the armed conflicts is composed ma...,"[conflicts, mainly, Sudanese, Janjaweed, Sudan..."
1,B1,"[{'id': 1, 'text': 'Jeddah', 'start_idx': 0, '...","Jeddah is the principal gateway to Mecca, Isla...","[Jeddah, Mecca, 's, holiest, able, bodied, lif..."
2,C1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...",The Great Dark Spot is thought to represent a ...,"[methane, Neptune]"
3,C1,"[{'id': 1, 'text': 'His', 'start_idx': 0, 'end...","His next work, Saturday, follows an especially...",[neurosurgeon]
4,A1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...","The tarantula, the trickster character, spun a...","[trickster, spun, cord, attaching, crawled, aw..."
...,...,...,...,...
354,B2,"[{'id': 1, 'text': 'Although', 'start_idx': 0,...",Although the name suggests that they are locat...,"[suggests, canton, Bern, Alps, cantons, Valais..."
355,C2,"[{'id': 1, 'text': 'There', 'start_idx': 0, 'e...","There he had one daughter, later baptized as M...","[baptized, Ann]"
356,A1,"[{'id': 1, 'text': 'During', 'start_idx': 0, '...","During an interview, Edward Gorey mentioned th...","[Edward, Gorey, lamenting]"
357,B1,"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...",The string can vibrate in different modes just...,"[vibrate, photon, gluon]"


In [None]:
# Checkpoint test_final_df (C2C1B2_in_complex still contains named entities) to Drive.
with open('/content/drive/MyDrive/muss/qualitative/ABCD_asset_test_C2C1B2', 'wb') as fp:
    pickle.dump(test_final_df, fp)

In [None]:
# Reload the frame from the path the previous cell pickled test_final_df to.
# pickle.load can execute arbitrary code: only load files this notebook produced.
with open ('/content/drive/MyDrive/muss/qualitative/ABCD_asset_test_C2C1B2', 'rb') as fp:
    test_df_ABCD = pickle.load(fp)


In [None]:
# Join the NER frame with the reloaded word-level annotations (column-wise, matched on
# the index); the level, hard-word and duplicate text columns are dropped first.
FINAL_TEST_DF = pd.concat(
    [
        com_sim_test_df,
        test_df_ABCD.drop(columns=['complex_read_level','C2C1B2_in_complex','ori_text']),
    ],
    axis=1,
)
FINAL_TEST_DF

Unnamed: 0,ori_text,NER_complex,NER_simple,complex_words
0,One side of the armed conflicts is composed ma...,"[Afro-Arab Abbala, Janjaweed, Sudanese, Rizeigat]","[Afro-Arab Abbala, The Janjaweed, Rizeigat, Su...","[{'id': 1, 'text': 'One', 'start_idx': 0, 'end..."
1,"Jeddah is the principal gateway to Mecca, Isla...","[Muslims, Jeddah, Mecca, Islam]","[Muslims, Jeddah, Islam, Mecca]","[{'id': 1, 'text': 'Jeddah', 'start_idx': 0, '..."
2,The Great Dark Spot is thought to represent a ...,"[Great Dark Spot, Neptune]","[Great Dark Spot, Neptune]","[{'id': 1, 'text': 'The', 'start_idx': 0, 'end..."
3,"His next work, Saturday, follows an especially...",[Saturday],[Saturday],"[{'id': 1, 'text': 'His', 'start_idx': 0, 'end..."
4,"The tarantula, the trickster character, spun a...",[east],[east],"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end..."
...,...,...,...,...
354,Although the name suggests that they are locat...,"[the Bernese Alps, Bernese Oberland, Obwalden,...","[Parts of the Bernese Alps, Bernese Oberland, ...","[{'id': 1, 'text': 'Although', 'start_idx': 0,..."
355,"There he had one daughter, later baptized as M...",[Mary Ann Fisher Power],"[Mary Ann Fisher Power, one]","[{'id': 1, 'text': 'There', 'start_idx': 0, 'e..."
356,"During an interview, Edward Gorey mentioned th...","[Edward Gorey, Bawden]","[Edward Gorey, Bawden, one]","[{'id': 1, 'text': 'During', 'start_idx': 0, '..."
357,The string can vibrate in different modes just...,"[electron, photon]","[electron, photon]","[{'id': 1, 'text': 'The', 'start_idx': 0, 'end..."


In [None]:
def get_word_lists(row,difficulty_level_list_like):
  """Collect the difficult words of one row that are not part of a named entity.

  Args:
    row: mapping / DataFrame row with
      - 'complex_words': iterable of dicts carrying at least 'text' and 'read_level'
      - 'NER_complex': iterable of entity strings
    difficulty_level_list_like: container of read levels to keep, e.g. ['C2','C1','B2'].

  Returns:
    The matching word texts in their original order, or np.nan when none qualify.

  Note:
    The entity filter is a substring test (`text in entity`): a word is dropped
    whenever it occurs inside any entity string, not only on a whole-token match.
  """
  kept_words = [
      token['text']
      for token in row['complex_words']
      if token['read_level'] in difficulty_level_list_like
      and not any(token['text'] in entity for entity in row['NER_complex'])
  ]

  # An empty result is reported as NaN rather than [].
  return kept_words if kept_words else np.nan

In [None]:
# One entry per row of FINAL_TEST_DF: its C2/C1/B2 words outside named entities
# (np.nan when a row has none).
C2C1B2_word_lists = [
    get_word_lists(test_row, ['C2','C1','B2'])
    for _, test_row in FINAL_TEST_DF.iterrows()
]

In [None]:
# Attach the per-row word lists as a new column (mutates FINAL_TEST_DF in place).
# Rows without qualifying words hold NaN, which later cells detect via isinstance(..., float).
FINAL_TEST_DF['C2C1B2_in_complex'] = C2C1B2_word_lists

In [None]:
# Display the frame to inspect the new C2C1B2_in_complex column.
FINAL_TEST_DF

Unnamed: 0,ori_text,NER_complex,NER_simple,complex_words,C2C1B2_in_complex
0,One side of the armed conflicts is composed ma...,"[Afro-Arab Abbala, Janjaweed, Sudanese, Rizeigat]","[Afro-Arab Abbala, The Janjaweed, Rizeigat, Su...","[{'id': 1, 'text': 'One', 'start_idx': 0, 'end...","[conflicts, mainly, militia, mostly, tribes]"
1,"Jeddah is the principal gateway to Mecca, Isla...","[Muslims, Jeddah, Mecca, Islam]","[Muslims, Jeddah, Islam, Mecca]","[{'id': 1, 'text': 'Jeddah', 'start_idx': 0, '...","['s, holiest, able, bodied, lifetime]"
2,The Great Dark Spot is thought to represent a ...,"[Great Dark Spot, Neptune]","[Great Dark Spot, Neptune]","[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...",[methane]
3,"His next work, Saturday, follows an especially...",[Saturday],[Saturday],"[{'id': 1, 'text': 'His', 'start_idx': 0, 'end...",[neurosurgeon]
4,"The tarantula, the trickster character, spun a...",[east],[east],"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...","[trickster, spun, cord, attaching, crawled, aw..."
...,...,...,...,...,...
354,Although the name suggests that they are locat...,"[the Bernese Alps, Bernese Oberland, Obwalden,...","[Parts of the Bernese Alps, Bernese Oberland, ...","[{'id': 1, 'text': 'Although', 'start_idx': 0,...","[suggests, canton, cantons]"
355,"There he had one daughter, later baptized as M...",[Mary Ann Fisher Power],"[Mary Ann Fisher Power, one]","[{'id': 1, 'text': 'There', 'start_idx': 0, 'e...",[baptized]
356,"During an interview, Edward Gorey mentioned th...","[Edward Gorey, Bawden]","[Edward Gorey, Bawden, one]","[{'id': 1, 'text': 'During', 'start_idx': 0, '...",[lamenting]
357,The string can vibrate in different modes just...,"[electron, photon]","[electron, photon]","[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...","[vibrate, gluon]"


In [None]:
# Persist the finished test frame; it is read back later with pd.read_pickle from this same path.
with open('/content/drive/MyDrive/muss/qualitative/asset_ABCD_test_0911', 'wb') as fp:
    pickle.dump(FINAL_TEST_DF, fp)

## create dataset (all hard words in prefix)

In [None]:
target_dir = '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_C1C2B2'
# Pure-Python replacement for `!mkdir $target_dir`: creates missing parent folders
# and is a no-op when the folder already exists, so the cell can be re-run safely.
os.makedirs(target_dir, exist_ok=True)

In [None]:
# Output file stems; '_' is replaced by '.' when the files are written (e.g. 'valid.complex').
data_names = ['valid_complex','test_complex']

In [None]:
# Names of the global DataFrames, position-aligned with data_names; resolved via globals().
df_list = ['FINAL_VALID_DF','FINAL_TEST_DF']

In [None]:
# Write one file per split into target_dir. Every line has the form
#   " <NEXT_DIFFICULT_WORD> w1 <NEXT_DIFFICULT_WORD> w2 ... <SENT_START> sentence"
# data_names[i] is paired with the global DataFrame named by df_list[i].
for split_pos, data_name in enumerate(data_names):

  chosen_df = globals()[df_list[split_pos]]
  file_name = data_name.replace('_','.')

  # The with-block closes the file on exit; no explicit close() is needed.
  with open(target_dir+'/'+file_name, 'w') as f:

    for row_idx, row in chosen_df.iterrows():

        sentence = row.ori_text
        hard_words = row.C2C1B2_in_complex

        # Rows without difficult words hold NaN (a float): emit an empty prefix for them.
        if isinstance(hard_words, float):
          hard_words = []

        sentence_w_control_token = " <NEXT_DIFFICULT_WORD> " + " <NEXT_DIFFICULT_WORD> ".join(hard_words) + " <SENT_START> " + sentence

        # Print a sample line for every row index that is a multiple of 10000.
        if row_idx%10000==0:
            print(row_idx)
            print(sentence_w_control_token)

        f.write("{}\n".format(sentence_w_control_token))


0
 <NEXT_DIFFICULT_WORD> counties <SENT_START> Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).
0
 <NEXT_DIFFICULT_WORD> conflicts <NEXT_DIFFICULT_WORD> mainly <NEXT_DIFFICULT_WORD> militia <NEXT_DIFFICULT_WORD> mostly <NEXT_DIFFICULT_WORD> tribes <SENT_START> One side of the armed conflicts is composed mainly of the Sudanese military and the Janjaweed, a Sudanese militia group recruited mostly from the Afro-Arab Abbala tribes of the northern Rizeigat region in Sudan.


## dataset with different numbers of ABCD words

In [None]:
def make_dataset(folder_name,data_names,num_NE,all_word=False,all_word_random=False,seed=None):
    """Write one difficult-word-prefixed text file per entry of ``data_names``.

    Every output line has the form
    `` <NEXT_DIFFICULT_WORD> w1 <NEXT_DIFFICULT_WORD> w2 ... <SENT_START> sentence``.

    NOTE(review): a later cell of this notebook defines another ``make_dataset``
    with a different signature; whichever definition ran last is the one in effect.

    Args:
        folder_name: sub-folder created under the Drive datasets directory.
        data_names: file stems such as 'test_complex' ('_' becomes '.' in the file name).
            The i-th stem is paired with the i-th name in the module-level ``df_list``,
            which is resolved through ``globals()``.
        num_NE: upper bound on how many words of ``C2C1B2_in_complex`` go into the
            prefix (used as a slice bound). Not read when ``all_word`` is true.
        all_word: when true, every space-separated token of the sentence is put
            into the prefix instead of the difficult-word list.
        all_word_random: when true (together with ``all_word``), the tokens are shuffled.
        seed: optional seed for the shuffle. ``None`` (default) keeps the previous
            behavior of using the module-level ``random`` state.

    Raises:
        FileExistsError: if the target folder already exists (``exist_ok`` is not set,
            so an existing dataset folder is never written into).
    """
    target_dir = '/content/drive/MyDrive/muss/resources/datasets/'+folder_name
    os.makedirs(target_dir)

    # One RNG for the whole call so a given seed reproduces the full dataset.
    shuffle = random.Random(seed).shuffle if seed is not None else random.shuffle

    for df_position, data_name in enumerate(data_names):
        df = globals()[df_list[df_position]]
        file_name = data_name.replace('_','.')

        # The with-block closes the file on exit; no explicit close() is needed.
        with open(target_dir+'/'+file_name, 'w') as f:
            for row_index, row in df.iterrows():
                sentence = row.ori_text

                if all_word:
                    prefix_words = sentence.split(' ')
                    if all_word_random:
                        shuffle(prefix_words)
                else:
                    hard_words = row.C2C1B2_in_complex
                    # Rows without difficult words hold NaN (a float); treat as empty.
                    # An empty list joins to "", giving the same line the explicit
                    # empty-prefix branches used to build.
                    prefix_words = [] if isinstance(hard_words, float) else hard_words[:num_NE]

                sentence_w_control_token = " <NEXT_DIFFICULT_WORD> " + " <NEXT_DIFFICULT_WORD> ".join(prefix_words) + " <SENT_START> " + sentence

                # Print a sample line for every row index that is a multiple of 100.
                if row_index%100==0:
                    print(row_index)
                    print(sentence_w_control_token)

                f.write("{}\n".format(sentence_w_control_token))


In [None]:
# All sentence tokens, shuffled, go into the prefix. num_NE (9999) is not read when all_word=True.
make_dataset('asset_ABCD_all_words_random_0911',data_names,9999,all_word=True,all_word_random=True)

0
 <NEXT_DIFFICULT_WORD> Mendocino <NEXT_DIFFICULT_WORD> (to <NEXT_DIFFICULT_WORD> southeast). <NEXT_DIFFICULT_WORD> are <NEXT_DIFFICULT_WORD> the <NEXT_DIFFICULT_WORD> Napa <NEXT_DIFFICULT_WORD> Marin <NEXT_DIFFICULT_WORD> (northeast), <NEXT_DIFFICULT_WORD> Costa <NEXT_DIFFICULT_WORD> Lake <NEXT_DIFFICULT_WORD> (to <NEXT_DIFFICULT_WORD> Contra <NEXT_DIFFICULT_WORD> and <NEXT_DIFFICULT_WORD> and <NEXT_DIFFICULT_WORD> south), <NEXT_DIFFICULT_WORD> the <NEXT_DIFFICULT_WORD> (to <NEXT_DIFFICULT_WORD> counties <NEXT_DIFFICULT_WORD> (to <NEXT_DIFFICULT_WORD> Adjacent <NEXT_DIFFICULT_WORD> the <NEXT_DIFFICULT_WORD> east), <NEXT_DIFFICULT_WORD> the <NEXT_DIFFICULT_WORD> north), <NEXT_DIFFICULT_WORD> Solano <SENT_START> Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).
100
 <NEXT_DIFFICULT_WORD> appealing <NEXT_DIFFICULT_WORD> and <NEXT_DIFFICULT_WORD> aims <NEXT_DIFFICULT_WORD> easy <NEXT

# operation preserving + lexical simplification


In [None]:
# Reload the finished validation and test frames from their Drive pickles.
# NOTE: read_pickle unpickles the file, which can execute arbitrary code; only load files you wrote yourself.
FINAL_VALID_DF = pd.read_pickle("/content/drive/MyDrive/muss/qualitative/asset_ABCD_valid_0911")
FINAL_TEST_DF = pd.read_pickle("/content/drive/MyDrive/muss/qualitative/asset_ABCD_test_0911")


In [None]:
# Display the validation frame to check the loaded columns.
FINAL_VALID_DF

Unnamed: 0,ori_text,NER_complex,NER_simple,complex_words,C2C1B2_in_complex
0,"Adjacent counties are Marin (to the south), Me...","[Contra Costa, Mendocino, Adjacent, Solano, Ma...","[Mendocino, Lake, Contra Costa, Adjacent, Sola...","[{'id': 1, 'text': 'Adjacent', 'start_idx': 0,...",[counties]
1,A Georgian inscription around the drum attests...,[Georgian],[Georgian],"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i...",[attests]
2,They would later return to the revived series ...,"[Christmas Special ""The Next Doctor, the Cyber...","[Christmas Special ""The Next Doctor, the Cyber...","[{'id': 1, 'text': 'They', 'start_idx': 0, 'en...",[revived]
3,"Jameson's autobiography, How to Make Love Like...","[A Cautionary Tale, August 17, 2004, Make Love...","[A Cautionary Tale, August 17, 2004, Make Love...","[{'id': 1, 'text': 'Jameson', 'start_idx': 0, ...",
4,It is particularly famous for the cultivation ...,[],[],"[{'id': 1, 'text': 'It', 'start_idx': 0, 'end_...",[cultivation]
...,...,...,...,...,...
1995,On the 15 November 2003 a Sprinter train trave...,"[November 2003, Sprinter, Ballarat, Ballan, Go...","[November 15, 2003, November 2003, Ballarat, S...","[{'id': 1, 'text': 'On', 'start_idx': 0, 'end_...","[derailed, stationary]"
1996,A console manufacturer is a company that manuf...,[],[],"[{'id': 1, 'text': 'A', 'start_idx': 0, 'end_i...","[console, manufacturer, manufactures, distribu..."
1997,Links with poverty and crime Those who are fun...,[Links],[Links],"[{'id': 1, 'text': 'Links', 'start_idx': 0, 'e...","[poverty, crime, functionally, illiterate, int..."
1998,"The term ""union council"" may be used for local...",[],[],"[{'id': 1, 'text': 'The', 'start_idx': 0, 'end...",[localities]


## create dataset (all in prefix)

In [None]:
target_dir = '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER'
# Pure-Python replacement for `!mkdir $target_dir`: creates missing parent folders
# and is a no-op when the folder already exists, so the cell can be re-run safely.
os.makedirs(target_dir, exist_ok=True)

In [None]:
# Output file stems; '_' is replaced by '.' when the files are written (e.g. 'valid.complex').
data_names = ['valid_complex','test_complex']

In [None]:
# Names of the global DataFrames, position-aligned with data_names; resolved via globals().
df_list = ['FINAL_VALID_DF','FINAL_TEST_DF']

In [None]:
# Write one file per split into target_dir. Every line has the form
#   " <NEXT_NE> e1 <NEXT_NE> e2 ... <NEXT_DIFFICULT_WORD> w1 ... <SENT_START> sentence"
# data_names[i] is paired with the global DataFrame named by df_list[i].
for split_pos, data_name in enumerate(data_names):

  chosen_df = globals()[df_list[split_pos]]
  file_name = data_name.replace('_','.')

  # The with-block closes the file on exit; no explicit close() is needed.
  with open(target_dir+'/'+file_name, 'w') as f:

    for row_idx, row in chosen_df.iterrows():

        sentence = row.ori_text
        NER = row.NER_simple
        hard_word = row.C2C1B2_in_complex

        # NOTE(review): rows whose difficult-word entry is NaN (a float) are skipped
        # entirely, so the output has fewer lines than the frame has rows. Confirm this
        # is intended — any line-aligned companion file would no longer match.
        if isinstance(hard_word, float):
          continue

        sentence_w_control_token = " <NEXT_NE> "+" <NEXT_NE> ".join(NER)+" <NEXT_DIFFICULT_WORD> "+" <NEXT_DIFFICULT_WORD> ".join(hard_word)+ " <SENT_START> " + sentence

        # Print a sample line for every row index that is a multiple of 10000.
        if row_idx%10000==0:
            print(row_idx)
            print(sentence_w_control_token)

        f.write("{}\n".format(sentence_w_control_token))


0
 <NEXT_NE> Mendocino, Lake <NEXT_NE> Contra Costa <NEXT_NE> Adjacent <NEXT_NE> Solano <NEXT_NE> Marin <NEXT_NE> Napa <NEXT_DIFFICULT_WORD> counties <SENT_START> Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).


In [None]:
# Restrict the following cells to the test split only.
data_names = ['test_complex']

In [None]:
# Global DataFrame name paired with data_names[0]; resolved via globals().
df_list = ['FINAL_TEST_DF']

In [None]:
# Rewrite the split file(s) in target_dir, this time keeping every row: rows without
# difficult words get an empty difficult-word prefix. Every line has the form
#   " <NEXT_NE> e1 <NEXT_NE> e2 ... <NEXT_DIFFICULT_WORD> w1 ... <SENT_START> sentence"
for split_pos, data_name in enumerate(data_names):

  chosen_df = globals()[df_list[split_pos]]
  file_name = data_name.replace('_','.')

  # The with-block closes the file on exit; no explicit close() is needed.
  with open(target_dir+'/'+file_name, 'w') as f:

    for row_idx, row in chosen_df.iterrows():

        sentence = row.ori_text
        NER = row.NER_simple
        hard_word = row.C2C1B2_in_complex

        # Rows without difficult words hold NaN (a float); treat as an empty list.
        if isinstance(hard_word, float):
          hard_word=[]

        sentence_w_control_token = " <NEXT_NE> "+" <NEXT_NE> ".join(NER)+" <NEXT_DIFFICULT_WORD> "+" <NEXT_DIFFICULT_WORD> ".join(hard_word)+ " <SENT_START> " + sentence

        # Print a sample line for every row index that is a multiple of 10000.
        if row_idx%10000==0:
            print(row_idx)
            print(sentence_w_control_token)

        f.write("{}\n".format(sentence_w_control_token))


0
 <NEXT_NE> Mendocino, Lake <NEXT_NE> Contra Costa <NEXT_NE> Adjacent <NEXT_NE> Solano <NEXT_NE> Marin <NEXT_NE> Napa <NEXT_DIFFICULT_WORD> counties <SENT_START> Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).
0
 <NEXT_NE> Afro-Arab Abbala <NEXT_NE> The Janjaweed <NEXT_NE> Rizeigat <NEXT_NE> Sudanese <NEXT_DIFFICULT_WORD> conflicts <NEXT_DIFFICULT_WORD> mainly <NEXT_DIFFICULT_WORD> militia <NEXT_DIFFICULT_WORD> mostly <NEXT_DIFFICULT_WORD> tribes <SENT_START> One side of the armed conflicts is composed mainly of the Sudanese military and the Janjaweed, a Sudanese militia group recruited mostly from the Afro-Arab Abbala tribes of the northern Rizeigat region in Sudan.


## dataset with different numbers of NER & ABCD words

In [None]:
# Check which splits / frames the module-level lists currently point at;
# make_dataset below reads df_list from globals.
data_names,df_list

(['test_complex'], ['FINAL_TEST_DF'])

In [None]:
def make_dataset(folder_name,data_names,num_NE,num_ABCD):
    """Write one file per split with named-entity and difficult-word prefixes.

    Every output line has the form
    `` <NEXT_NE> e1 <NEXT_NE> e2 ... <NEXT_DIFFICULT_WORD> w1 ... <SENT_START> sentence``.

    NOTE(review): an earlier cell of this notebook defines another ``make_dataset``
    with a different signature; running this cell replaces it.

    Args:
        folder_name: sub-folder created under the Drive datasets directory.
        data_names: file stems such as 'test_complex' ('_' becomes '.' in the file name).
            The i-th stem is paired with the i-th name in the module-level ``df_list``,
            which is resolved through ``globals()``.
        num_NE: maximum number of entities from ``NER_simple`` kept in the prefix,
            or ``'all'`` for no limit.
        num_ABCD: maximum number of words from ``C2C1B2_in_complex`` kept in the prefix,
            or ``'all'`` for no limit.

        The two limits are applied independently. Passing ``'all'`` for both, or a
        number for both, is supported; previously those combinations raised.

    Raises:
        FileExistsError: if the target folder already exists (``exist_ok`` is not set,
            so an existing dataset folder is never written into).
    """
    # 'all' means "no limit"; None as a slice bound keeps the whole list.
    ne_limit = None if num_NE == 'all' else num_NE
    abcd_limit = None if num_ABCD == 'all' else num_ABCD

    target_dir = '/content/drive/MyDrive/muss/resources/datasets/'+folder_name
    os.makedirs(target_dir)

    for df_position, data_name in enumerate(data_names):
        df = globals()[df_list[df_position]]
        file_name = data_name.replace('_','.')

        # The with-block closes the file on exit; no explicit close() is needed.
        with open(target_dir+'/'+file_name, 'w') as f:
            for row_index, row in df.iterrows():

                sentence = row.ori_text
                NER = row.NER_simple
                hard_word = row.C2C1B2_in_complex

                # Rows without difficult words hold NaN (a float); treat as an empty list.
                if isinstance(hard_word, float):
                    hard_word = []

                # Empty lists join to "", which yields the same line the explicit
                # empty-list branches used to build.
                NE_remained = NER[:ne_limit]
                hard_word_remained = hard_word[:abcd_limit]
                sentence_w_control_token = " <NEXT_NE> "+" <NEXT_NE> ".join(NE_remained)+" <NEXT_DIFFICULT_WORD> "+" <NEXT_DIFFICULT_WORD> ".join(hard_word_remained)+ " <SENT_START> " + sentence

                # Print a sample line for every row index that is a multiple of 100.
                if row_index%100==0:
                    print(row_index)
                    print(sentence_w_control_token)

                f.write("{}\n".format(sentence_w_control_token))


In [None]:
# Keep all named entities ('all') and at most 6 difficult words per sentence.
make_dataset('asset_ABCD_NER_fix_NE_6CERF',data_names,'all',6)

0
 <NEXT_NE> Afro-Arab Abbala <NEXT_NE> The Janjaweed <NEXT_NE> Rizeigat <NEXT_NE> Sudanese <NEXT_DIFFICULT_WORD> conflicts <NEXT_DIFFICULT_WORD> mainly <NEXT_DIFFICULT_WORD> militia <NEXT_DIFFICULT_WORD> mostly <NEXT_DIFFICULT_WORD> tribes <SENT_START> One side of the armed conflicts is composed mainly of the Sudanese military and the Janjaweed, a Sudanese militia group recruited mostly from the Afro-Arab Abbala tribes of the northern Rizeigat region in Sudan.
100
 <NEXT_NE>  <NEXT_DIFFICULT_WORD> dislodge <NEXT_DIFFICULT_WORD> sink <NEXT_DIFFICULT_WORD> digest <SENT_START> It will then dislodge itself and sink back to the river bed in order to digest its food and wait for its next meal.
200
 <NEXT_NE> the centuries <NEXT_NE> Christian <NEXT_DIFFICULT_WORD> proposed <NEXT_DIFFICULT_WORD> rejected <NEXT_DIFFICULT_WORD> mainstream <SENT_START> Alternative views on the subject have been proposed throughout the centuries (see below), but all were rejected by mainstream Christian bodies.
3