In [None]:
!pip install qwikidata -q
!pip install datasets -q
!pip install transformers -q
!pip install seqeval -q
!pip install emoji -q

[K     |████████████████████████████████| 451 kB 4.8 MB/s 
[K     |████████████████████████████████| 132 kB 61.5 MB/s 
[K     |████████████████████████████████| 212 kB 68.4 MB/s 
[K     |████████████████████████████████| 182 kB 48.2 MB/s 
[K     |████████████████████████████████| 127 kB 76.8 MB/s 
[K     |████████████████████████████████| 5.8 MB 5.0 MB/s 
[K     |████████████████████████████████| 7.6 MB 50.1 MB/s 
[K     |████████████████████████████████| 43 kB 1.3 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 240 kB 4.7 MB/s 
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone


# Functions

In [None]:
import re
def process_hashtag(input_text: str) -> str:
    """Expand camelCase hashtags into space-separated words.

    Every ``#tag`` whose first character after ``#`` is a lowercase ASCII
    letter is replaced by its camel-case pieces joined with single spaces;
    the leading ``#`` is dropped. Hashtags that start with any other
    character do not match the pattern and are left untouched.

    Args:
        input_text: text that may contain hashtags.

    Returns:
        The text with every matching hashtag expanded.
    """
    def _split_camel_case(match):
        # Drop the leading '#', then cut before every ASCII uppercase letter.
        tag_body = match.group().lstrip('#')
        pieces = re.findall('[A-Z][^A-Z]*|[a-z][^A-Z]*', tag_body)
        return ' '.join(pieces)

    hashtag_pattern = re.compile(r'#[a-z]\S*')
    return hashtag_pattern.sub(_split_camel_case, input_text)

def clean_entity(entity):
  """Normalise an entity mention: expand hashtags, drop '#' and tidy spaces.

  Steps, in order: re-attach a '#' separated from its word ('# tag' ->
  '#tag'), expand camelCase hashtags with process_hashtag, remove every
  remaining '#', collapse runs of spaces to one space, and strip the ends.

  Args:
    entity: the mention text (str).

  Returns:
    The cleaned mention (str).
  """
  entity = entity.replace('# ','#')
  entity = process_hashtag(entity)
  entity = entity.replace('#','')
  # One replace('  ',' ') pass only halves a run of spaces: three or more
  # consecutive spaces would still leave a double space behind, so repeat
  # until none is left.
  while '  ' in entity:
    entity = entity.replace('  ',' ')
  return entity.strip()


def get_dataset_entities(df, tok):
  """Collect the distinct, cleaned mentions of one entity type in a corpus.

  Args:
    df: mapping/DataFrame with a 'tokens' and a 'ner_tags' entry; both are
      iterated in parallel, one item per sentence, each item being a
      sequence of tokens / BIO tags.
    tok: entity type without the BIO prefix (the code looks for 'B-'+tok
      and 'I-'+tok).

  Returns:
    A set with every mention, lower-cased and passed through clean_entity.
  """
  begin_tag = 'B-'+tok
  inside_tag = 'I-'+tok
  entities = set()

  def flush(words):
    # Store the mention accumulated so far (if any).
    text = ' '.join(words)
    if text != '':
      entities.add(clean_entity(text.lower()))

  for tokens, tags in zip(df['tokens'], df['ner_tags']):
    current = []
    for token, tag in zip(tokens, tags):
      if tag == begin_tag:
        # A new mention starts: store the previous one first, otherwise two
        # consecutive B- mentions would overwrite each other.
        flush(current)
        current = [token]
      elif tag == inside_tag:
        current.append(token)
      else:
        flush(current)
        current = []
    # A mention that ends with the sentence must be stored here; it must
    # neither be lost nor leak into the next sentence.
    flush(current)

  return entities

In [None]:



def get_samples(dataset):
  """Return (and echo) every sentence that carries at least one non-'O' tag.

  Args:
    dataset: DataFrame with 'tokens' and 'ner_tags' columns (one sentence
      per row).

  Returns:
    A list of [tokens, tags] pairs, in row order, for the rows whose tag
    sequence is not made up exclusively of 'O'. Each selected pair is also
    printed (tokens first, then tags).
  """
  annotated = []
  for tokens, tags in zip(dataset['tokens'], dataset['ner_tags']):
    if any(tag != 'O' for tag in tags):
      print(tokens)
      print(tags)
      annotated.append([tokens, tags])
  return annotated

In [None]:
import pandas as pd

def read_bio_dataset(dir):
  """Read a BIO file ("token ... tag" per line, blank line between sentences).

  For every non-blank line the first space-separated field is the token and
  the last one is the tag.

  Args:
    dir: path of the UTF-8 text file to read (despite the name it is a file
      path, not a directory).

  Returns:
    A DataFrame with columns 'tokens' and 'ner_tags'; each row holds the
    token list and the tag list of one sentence.
  """
  tok = []  #Aux list of tokens for current sentence
  bio = []  #Aux list of ner tags for current sentence
  df_list = []  #Final list with all the information

  with open(dir,'r',encoding='utf-8') as file:
    for line in file:
      content = line.rstrip('\r\n')

      if content.strip() == '':
        # Sentence separator. Repeated or leading blank lines are skipped
        # instead of being stored as a '\n' token with an empty tag.
        if tok:
          df_list.append([tok,bio])
          tok = []
          bio = []
        continue

      # The newline is removed before splitting so that a line with a single
      # field does not keep it inside the token.
      fields = content.split(' ')
      tok.append(fields[0])
      bio.append(fields[-1])

  # Keep the last sentence when the file does not end with a blank line.
  if tok:
    df_list.append([tok,bio])

  return pd.DataFrame(df_list, columns=['tokens','ner_tags'])

def write_bio_dataset(dataset,outputfile):
  """Write a corpus in BIO text format.

  Each token is written as "<token> <tag>" on its own line and every
  sentence is followed by one empty line.

  Args:
    dataset: DataFrame with 'tokens' and 'ner_tags' columns (one sentence
      per row).
    outputfile: path of the UTF-8 file to create or overwrite.
  """
  with open(outputfile, 'w',encoding='utf-8') as f:
    for toks, tags in zip(dataset['tokens'], dataset['ner_tags']):
      sentence_lines = [str(tok)+' '+str(tag)+'\n' for tok,tag in zip(toks,tags)]
      f.writelines(sentence_lines)
      f.write('\n')

import pandas as pd

def read_entities(dir):
  """Read a UTF-8 text file and return its lines as a list of stripped strings.

  Args:
    dir: path of the file to read (a file path, despite the name).

  Returns:
    A plain Python list (not a DataFrame) with one entry per line of the
    file, each stripped of surrounding whitespace. Blank lines are kept as
    empty strings.
  """
  with open(dir,'r',encoding='utf-8') as file:
    return [str(line).strip() for line in file]

In [None]:
training_data = read_bio_dataset('train_spacy.txt')

In [None]:
valid_data = read_bio_dataset('valid_spacy.txt')

In [None]:
training_data

In [None]:
import pandas as pd
import emoji

def is_valid_token(tok):
  """Tell whether a token should be kept when cleaning the corpus.

  A token is rejected when it is empty, is a lone quote or '#' character,
  contains 'http' or '@', contains the literal text 'u200d', or is an emoji
  according to emoji.is_emoji.

  Args:
    tok: the token text (str).

  Returns:
    False for tokens that must be dropped, True otherwise.
  """
  # Tokens that are empty or consist of a single quote / hashtag marker.
  # The original tested the opening curly quote twice; the closing one is
  # now covered as well.
  if tok in ('', '\'', '#', '"', '“', '”'):
    return False
  # Links and user mentions.
  if 'http' in tok or '@' in tok:
    return False
  # NOTE(review): this matches the literal characters 'u200d', presumably
  # the residue of an escaped zero-width joiner - confirm against the data.
  if 'u200d' in tok:
    return False
  if emoji.is_emoji(tok):
    return False

  return True


def is_valid_char(c):
  """Tell whether a single character may stay inside a cleaned token.

  Args:
    c: a one-character string.

  Returns:
    False when the code point is above 252 or the character is one of
    '#', the opening curly quote, an apostrophe, a double quote or '@';
    True otherwise.
  """
  banned = ('#', '“', '\'', '"', '@')
  return not (ord(c) > 252 or c in banned)


def clean_data(dataset):
  """Clean every sentence of a corpus in place and drop unusable rows.

  For each row, tokens rejected by is_valid_token are removed together with
  their tag, the remaining tokens keep only the characters accepted by
  is_valid_char, and tokens that end up empty are removed as well.
  A row is dropped when no token survives, or when fewer than four tokens
  survive and the original tag sequence is all 'O'.

  Args:
    dataset: DataFrame with 'tokens' and 'ner_tags' columns. It is modified
      in place (columns rewritten, rows dropped, index reset).

  Returns:
    The same DataFrame object that was passed in.
  """
  cleaned_tokens = []
  cleaned_tags = []
  rows_delete = []

  for index, toks, tags in zip(dataset.index, dataset['tokens'], dataset['ner_tags']):
    new_tok = []
    new_tags = []

    ## for each token
    for t, l in zip(toks, tags):
      if not is_valid_token(t):
        continue

      st = ''.join(c for c in t if is_valid_char(c))
      if st == '':
        continue
      new_tok.append(st)
      new_tags.append(l)

    cleaned_tokens.append(new_tok)
    cleaned_tags.append(new_tags)

    # The "all O" test deliberately looks at the original tags, as before.
    if len(new_tok) == 0 or (len(new_tok) < 4 and all(element == 'O' for element in tags)):
      rows_delete.append(index)

  # Assign the cleaned columns explicitly. Writing to the row yielded by
  # iterrows() is not guaranteed to reach the DataFrame (the row can be a
  # copy), so the cleaning could be silently lost.
  dataset['tokens'] = pd.Series(cleaned_tokens, index=dataset.index, dtype=object)
  dataset['ner_tags'] = pd.Series(cleaned_tags, index=dataset.index, dtype=object)

  dataset.drop(rows_delete, axis=0, inplace=True)
  dataset.reset_index(inplace=True, drop=True)

  return dataset


In [None]:
training_data_clean = clean_data(training_data)
valid_data_clean = clean_data(valid_data)

In [None]:
training_data_clean

In [None]:
write_bio_dataset(training_data_clean,'train_clean.txt')
write_bio_dataset(valid_data_clean,'valid_clean.txt')

In [None]:
def count_entities(dataset,tag):
  """Count the rows whose second column contains ``tag``.

  Note that this counts rows (sentences), not individual mentions: a row
  with several occurrences of ``tag`` is counted once.

  Args:
    dataset: DataFrame whose second column holds the tag sequence of each
      sentence.
    tag: the tag to look for, e.g. 'B-PROFESION'.

  Returns:
    The number of rows for which ``tag in <second column value>`` is true.
  """
  # .iloc selects the second column by position explicitly; the former
  # row[1] relied on the deprecated positional fallback of Series.__getitem__
  # on a label-indexed row.
  return sum(1 for tags in dataset.iloc[:, 1] if tag in tags)

# Destroy corpus

In [None]:
# Reproducible random subsets (10 %, 30 %, 50 %) of the cleaned training
# data, re-indexed from 0.
training_data_pruned_10 = training_data_clean.sample(frac = 0.1,random_state=8).reset_index(drop=True)
training_data_pruned_30 = training_data_clean.sample(frac = 0.3,random_state=8).reset_index(drop=True)
training_data_pruned_50 = training_data_clean.sample(frac = 0.5,random_state=8).reset_index(drop=True)

In [None]:
print(count_entities(training_data_pruned_10,'B-PROFESION'))
print(count_entities(training_data_pruned_30,'B-PROFESION'))
print(count_entities(training_data_pruned_50,'B-PROFESION'))



124
361
596


In [None]:
write_bio_dataset(training_data_pruned_10,'train_10.txt')
write_bio_dataset(training_data_pruned_30,'train_30.txt')
write_bio_dataset(training_data_pruned_50,'train_50.txt')

In [None]:
# Start from an empty 'pruned' folder. It has to exist afterwards because the
# following cells write their TSV files into it; -f keeps the command quiet
# when the folder is not there yet.
!rm -rf pruned
!mkdir -p pruned

In [None]:
training_data_pruned_10

In [None]:
training_data_pruned_10.to_csv('pruned/training_10.tsv', sep="\t",index=False,encoding='utf8')
training_data_pruned_30.to_csv('pruned/training_30.tsv', sep="\t",index=False,encoding='utf8')
training_data_pruned_50.to_csv('pruned/training_50.tsv', sep="\t",index=False,encoding='utf8')
training_data.to_csv('pruned/training_or.tsv', sep="\t",index=False,encoding='utf8')

In [None]:
training_data_pruned_10 = read_bio_dataset('train_10.txt')
training_data_pruned_30 = read_bio_dataset('train_30.txt')
training_data_pruned_50 = read_bio_dataset('train_50.txt')
training_data = read_bio_dataset('train_clean.txt')


# List of entities

In [None]:
total_entities =get_dataset_entities(training_data,'PROFESION')

# WIKIDATA FUNCTIONS

In [None]:
!pip install qwikidata -q

  Building wheel for qwikidata (setup.py) ... [?25l[?25hdone


In [None]:
from qwikidata.sparql import return_sparql_query_results

def get_concept_code(term):
  """Return the Wikidata id of an item whose Spanish label is exactly ``term``.

  Args:
    term: the label to look up (str).

  Returns:
    The last path segment of the first matching item URI (e.g. 'Q901'), or
    None when nothing matches or the query fails (failures are printed).
  """
  try:
        query = '''
        select ?item 
        where{ ?item rdfs:label '#TERM'@es
        }
        '''
        # The term is placed inside a single-quoted SPARQL literal, so
        # backslashes, quotes and line breaks must be escaped; otherwise a
        # label such as "d'artagnan" produces a malformed query.
        escaped_term = (term.replace('\\', '\\\\')
                            .replace("'", "\\'")
                            .replace('\n', '\\n')
                            .replace('\r', '\\r'))
        query = query.replace('#TERM',escaped_term)

        query_res = return_sparql_query_results(query)
        bindings = query_res['results']['bindings']
        if not bindings:
            # No item carries this label: an expected outcome, not an error.
            return None

        return bindings[0]['item']['value'].split('/')[-1]

  except Exception as e:
        print("Exception:",e)
        return None

def get_description(code,lang):
  """Return the schema:description of a Wikidata item in one language.

  Args:
    code: Wikidata item id, e.g. 'Q901'.
    lang: language code used to filter the description, e.g. 'es'.

  Returns:
    The first description found, or None when the item has no description
    in that language or the query fails (failures are printed).
  """
  try:
        query = '''
        select ?label
        where{
          
            
            wd:#ENTITY schema:description ?label filter (lang(?label) = "#LANG").
        }
        
        '''
        query = query.replace('#ENTITY',code).replace('#LANG',lang)

        query_res = return_sparql_query_results(query)
        bindings = query_res['results']['bindings']
        if not bindings:
            # A missing description is a normal result; returning None here
            # avoids reporting it as "Exception: list index out of range".
            return None

        return bindings[0]['label']['value']

  except Exception as e:
        print("Exception:",e)
        return None


def get_related_properties(code, rel, lang):
    """Return the labels of the items linked from an item through a property.

    Args:
        code: Wikidata id of the subject item, e.g. 'Q901'.
        rel: Wikidata property id used as ``wdt:`` predicate, e.g. 'P279'.
        lang: language code used to filter the labels, e.g. 'es'.

    Returns:
        The list of labels found (possibly empty). When the query fails the
        exception is printed and an empty list is returned.
    """
    query_template = '''
        select ?label
        where{
            wd:#ENTITY wdt:#REL ?item .
            
            ?item rdfs:label ?label filter (lang(?label) = "#LANG").
        }

        '''
    try:
        query = query_template.replace('#ENTITY',code).replace('#LANG',lang).replace('#REL',rel)
        query_res = return_sparql_query_results(query)
        return [binding['label']['value'] for binding in query_res['results']['bindings']]
    except Exception as e:
        print("Exception:",e)
        return []


def search_child(entity, relation, lang, limit):
    """Return labels of the items that point to ``entity`` through ``relation``.

    The previous body was a debugging leftover: its query was hard-coded to
    the label 'científico', ignored every parameter, and read a 'rel'
    binding that the query never selected, so it always ended in the
    exception branch. It now runs the parameterised query, matching the
    search_child defined again further down in this notebook.

    Args:
        entity: Wikidata id of the target item, e.g. 'Q28640'.
        relation: Wikidata property id, e.g. 'P31'.
        lang: language code used to filter the labels, e.g. 'es'.
        limit: maximum number of results requested.

    Returns:
        The list of labels found, or None when the query fails (the
        exception is printed).
    """
    res = []
    try:
        query = '''
        select ?label
        where{
            ?item wdt:#RELATION wd:#ENTITY .

            ?item rdfs:label ?label filter (lang(?label) = "#LANG").
        }

        limit #LIMIT
        '''
        query = query.replace('#ENTITY',entity).replace('#LIMIT',str(limit)).replace('#RELATION',relation).replace('#LANG',lang)

        query_res = return_sparql_query_results(query)
        for i in query_res['results']['bindings']:
            res.append(i['label']['value'])

        return res

    except Exception as e:
        print("Exception:",e)
        return None

#search_child('Q28640','P31','es',20)
print(get_concept_code('científico'))
print(get_description('Q901','es'))
print(get_related_properties('Q901','P279','es'))
print(get_related_properties('Q160131','P1056','es'))
print(get_related_properties('Q160131','P2283','es'))





Q901
persona dedicada al estudio de una ciencia
['erudito']
['pan']
['calor', 'panadería', 'horno']


In [None]:
def produce_nlp_sentences(term):
  """Build short Spanish sentences about ``term`` from Wikidata facts.

  The term is resolved with get_concept_code; the description and the values
  of a few properties are then turned into sentences. Entity mentions are
  wrapped in square brackets so that they can be BIO-tagged later.

  Args:
    term: Spanish label of the concept, e.g. 'panadero'.

  Returns:
    A list of sentences (possibly empty). It is empty when the term cannot
    be resolved to a Wikidata item.
  """
  code = get_concept_code(term)
  if code is None:
    return []

  res = []
  tagged_term = '['+term+']'

  def related(prop):
    # Spanish labels of the items reached through ``prop``, as strings.
    return [str(v) for v in get_related_properties(code, prop, 'es')]

  def bracketed(values):
    return ['['+v+']' for v in values]

  description = get_description(code,'es')
  if description is not None:
    res.append(tagged_term+' es '+description)

  parents = related('P279')  # queried as the "is a type of" relation
  if parents:
    res.append(tagged_term+' es un tipo de '+' y de '.join(bracketed(parents)))

  products = related('P1056')  # what the profession produces
  if products:
    res.append(tagged_term+' produce '+' y '.join(products))

  tools = related('P2283')  # what the profession uses
  if tools:
    res.append(tagged_term+' usa '+' y '.join(tools))

  feminine = related('P2521')  # feminine form(s)
  if feminine:
    # The feminine labels are the subject of the sentence; the original
    # built them and then wrote tagged_term on both sides instead.
    res.append(' y '.join(bracketed(feminine))+' es la forma femenina de '+tagged_term)

  fields = related('P425')  # field of the occupation
  if fields:
    res.append(' y '.join(fields)+' es el ámbito de ocupación de '+tagged_term)

  return res

print(produce_nlp_sentences('científico'))


['[científico] es persona dedicada al estudio de una ciencia', '[científico] es un tipo de [erudito]', '[científico] usa método científico', 'ciencia es el ámbito de ocupación de [científico]']


In [None]:
# Wikidata properties used by produce_nlp_sentences (kept as notes; as bare
# names these two lines raised NameError when the cell was run):
#   P2521 -> femenina (feminine form)
#   P425  -> ámbito de la ocupación (field of the occupation)

In [None]:
print(produce_nlp_sentences('panadero'))


['[panadero] es persona que prepara o vende pan', '[panadero] produce pan', '[panadero] usa calor y horno y panadería', 'horneado es el ámbito de ocupación de [panadero]']


In [None]:
print(produce_nlp_sentences('cocinero'))


['[cocinero] es persona que cocina por oficio y profesión', '[cocinero] es un tipo de [artesano]', 'cocinar y cocina es el ámbito de ocupación de [cocinero]']


In [None]:
print(produce_nlp_sentences('actor'))


['[actor] es persona que actúa en una producción fílmica, televisiva, teatral o radial', '[actor] es un tipo de [artista escénico]', 'actuación es el ámbito de ocupación de [actor]']


In [None]:
from nltk.tokenize.toktok import ToktokTokenizer
toktok = ToktokTokenizer()
res =toktok.tokenize('[banquero de finanzas] es aquel sujeto al banco que trabaja con [banqueros]')
res

def annotate_sentence_bio(sentence,tag):
  """Turn a tokenised sentence with [bracketed] mentions into BIO labels.

  The '[' and ']' tokens delimit a mention and are removed from the output.
  The first token after '[' gets 'B-'+tag, the following ones up to ']' get
  'I-'+tag, and every token outside brackets gets 'O'.

  Args:
    sentence: iterable of tokens in which '[' and ']' are separate tokens.
    tag: entity type without BIO prefix, e.g. 'PROFESION'.

  Returns:
    A (tokens, labels) tuple of two lists of equal length.
  """
  OUTSIDE, AT_START, WITHIN = 0, 1, 2
  tok = []
  lab = []
  state = OUTSIDE
  for piece in sentence:
    if piece == '[':
      state = AT_START
    elif piece == ']':
      state = OUTSIDE
    else:
      tok.append(piece)
      if state == OUTSIDE:
        lab.append('O')
      elif state == AT_START:
        lab.append('B-'+tag)
        state = WITHIN
      else:
        lab.append('I-'+tag)

  return tok, lab
annotate_sentence_bio(res,'PROFESION')


(['banquero',
  'de',
  'finanzas',
  'es',
  'aquel',
  'sujeto',
  'al',
  'banco',
  'que',
  'trabaja',
  'con',
  'banqueros'],
 ['B-PROFESION',
  'I-PROFESION',
  'I-PROFESION',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PROFESION'])

In [None]:
produce_nlp_sentences('enfermero especialista gerontológico')

Exception: list index out of range


['enfermero especialista gerontológico es un tipo de enfermero especialista']

In [None]:
from nltk.tokenize.toktok import ToktokTokenizer

def generate_bio_sentences(sentences,term):
  """Tokenise sentences and tag tokens that look like parts of ``term``.

  Tagging is a substring heuristic: a token containing the first word of
  the term gets 'B-PROFESION', a token containing any of the remaining
  words gets 'I-PROFESION', every other token gets 'O'.

  Args:
    sentences: list of sentence strings.
    term: the profession, words separated by single spaces.

  Returns:
    A list with one [tokens, tags] pair per input sentence, in order.
  """
  toktok = ToktokTokenizer()
  res=[]
  if sentences != []:
    words = term.split(' ')
    wrd_first = words[0]
    wrd_next = words[1:]

    def tag_for(token):
      if wrd_first in token:
        return 'B-PROFESION'
      if any(item in token for item in wrd_next):
        return 'I-PROFESION'
      return 'O'

    # One result per sentence. The original collected the results inside the
    # tagging loop, so earlier sentences were appended again on every pass.
    for sentence in sentences:
      tokens = toktok.tokenize(sentence)
      res.append([tokens, [tag_for(token) for token in tokens]])
  return res


In [None]:
term= 'director de finanzas'
var= produce_nlp_sentences(term)
generate_bio_sentences(var,term)

In [None]:
#Wikidata query that searches the "child" of an entity given a relation in a language with a word limit
import requests
from bs4 import BeautifulSoup
import re
from nltk.stem import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
from qwikidata.sparql import return_sparql_query_results

def search_child(entity, relation, lang, limit):
    """Return labels of the items that point to ``entity`` through ``relation``.

    Args:
        entity: Wikidata id of the target item, e.g. 'Q28640'.
        relation: Wikidata property id, e.g. 'P31'.
        lang: language code used to filter the labels, e.g. 'es'.
        limit: maximum number of results requested.

    Returns:
        The list of labels found, or None when the query fails (the
        exception is printed).
    """
    query_template = '''
        select ?label
        where{
            ?item wdt:#RELATION wd:#ENTITY .
            
            ?item rdfs:label ?label filter (lang(?label) = "#LANG").
        }
        
        limit #LIMIT
        '''
    try:
        query = (query_template
                 .replace('#ENTITY',entity)
                 .replace('#LIMIT',str(limit))
                 .replace('#RELATION',relation)
                 .replace('#LANG',lang))
        query_res = return_sparql_query_results(query)
        return [binding['label']['value'] for binding in query_res['results']['bindings']]
    except Exception as e:
        print("Exception:",e)
        return None



#Gets a profession and generates a list with the tokens and ner_tags corresponding
def treat_prof_list(str):
  """Split a profession on single spaces and tag it as one BIO mention.

  Args:
    str: the profession text (the parameter name shadows the built-in).

  Returns:
    [tokens, tags]: the first token is tagged 'B-PROFESION' and every
    following token 'I-PROFESION'.
  """
  words = str.split(' ')
  tags = ['B-PROFESION' if position == 0 else 'I-PROFESION'
          for position in range(len(words))]
  return [words, tags]



#Generates a list with 1000 professions tokenized and with ner_tags
def gen_prof():
  """Fetch up to 1000 profession labels from Wikidata, tokenised and tagged.

  Returns:
    A list of [tokens, tags] pairs built with treat_prof_list; an empty
    list when the Wikidata query fails.
  """
  prof_list = search_child('Q28640','P31','es',1000)
  if prof_list is None:
    # search_child reports a failed query with None; mapping over it would
    # raise TypeError.
    return []
  return list(map(treat_prof_list, prof_list))



def find_prof_sentences(num,list_entities):
  """Collect BIO-tagged sentences mentioning professions from es.wikipedia.

  For every profession the Spanish Wikipedia page with that title is
  downloaded, its paragraphs are split on '.', and the pieces containing the
  profession are tagged with annotate_sentence_bio.

  Args:
    num: number of sentences wanted. It is capped at the number of
      professions available.
    list_entities: professions to look up. When empty, up to num*3
      professions are requested from Wikidata with search_child.

  Returns:
    A list of [tokens, tags] pairs. It can hold a few more than ``num``
    items (collection stops after the page that crosses the limit) or fewer
    when the pages do not provide enough sentences.
  """
  toktok = ToktokTokenizer()
  lang = 'es'

  # Once a matching sentence is found at a position beyond ``rep`` in the
  # page's sentence list, the rest of that page is not scanned.
  rep = round((num**(1/2)/2))

  if len(list_entities) >0:
    proflist = list_entities
  else:
    # search_child returns None when the query fails.
    proflist = search_child('Q28640','P31','es',num*3) or []
  num = min(num, len(proflist))

  # Removes "(...)" and "[...]" spans; raw string so that the backslashes
  # are not treated as (invalid) string escapes.
  bracket_pattern = re.compile(r'[\(\[].*?[\)\]]')

  res = []
  while len(res) < num:
    size_before_pass = len(res)

    for wrd in proflist:
      elem = wrd.replace(' ','_').capitalize()

      try:
        response = requests.get(f'https://{lang}.wikipedia.org/wiki/{elem}', timeout=30)
      except requests.RequestException as e:
        # A network problem on one page should not abort the whole run.
        print("Exception:",e)
        continue

      if response.status_code != 200:
        continue
      soup = BeautifulSoup(response.content, 'html.parser')

      text = []
      for paragraph in soup.find_all('p'):
        p = bracket_pattern.sub('', paragraph.text).strip()
        if p.find('may refer to:')==-1 and p.find('\\')==-1 and p.find('{')==-1:
          text.extend(p.split('.'))

      temp_res = []
      for count,i in enumerate(text):
        if i.find(wrd)!=-1:
          temp_res.append(i.strip().replace(wrd, '['+wrd+']'))

          if count>rep:
            break

      for sent in temp_res:
        toks,labs = annotate_sentence_bio(toktok.tokenize(sent),'PROFESION')
        res.append([toks,labs])

      if (len(res) > num):
        return res

    if len(res) == size_before_pass:
      # A full pass over every profession added nothing (pages missing,
      # network down, no matching sentence): another pass would loop forever.
      break
  return res


def get_wikipedia_aug_dataset(n,list_entities=[]):
  """Wrap the output of find_prof_sentences in a DataFrame.

  Args:
    n: number of sentences requested from find_prof_sentences.
    list_entities: professions to look up; passed through unchanged (the
      default list is never modified here).

  Returns:
    A DataFrame with columns 'tokens' and 'ner_tags', one sentence per row.
  """
  sentence_rows = find_prof_sentences(n,list_entities)
  return pd.DataFrame(sentence_rows, columns=['tokens','ner_tags'])

In [None]:
# NOTE(review): total_candidates is only assigned in later cells of this
# notebook (built from the vector filter / read back from
# 'wikipedia_entities.txt'), so this cell fails on a fresh kernel when the
# cells are run top to bottom.
print(total_candidates)

In [None]:
res=get_wikipedia_aug_dataset(10,total_candidates)


In [None]:
res

Unnamed: 0,tokens,ner_tags
0,"[Un, científico, ​, y, a, su, vez, de, scienti...","[O, B-PROFESION, O, O, O, O, O, O, O, O, O, O,..."
1,"[En, un, sentido, más, restringido, ,, un, cie...","[O, O, O, O, O, O, O, B-PROFESION, O, O, O, O,..."
2,"[Un, astronauta, ​, o, cosmonauta​, es, una, p...","[O, B-PROFESION, O, O, O, O, O, O, O, O, O, O,..."
3,"[Con, la, llegada, de, los, viajes, comerciale...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-P..."
4,"[Ser, astronauta, es, una, certificación, que,...","[O, B-PROFESION, O, O, O, O, O, O, O, O]"
5,"[En, la, historia, de, Japón, ,, los, ninja, ​...","[O, O, O, O, O, O, O, B-PROFESION, O, O, O, O,..."
6,"[Pocas, organizaciones, militares, han, sido, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
7,"[El, término, árbitro, es, usado, en, varios, ...","[O, O, B-PROFESION, O, O, O, O, O]"
8,"[Un, compositor, de, canciones, es, aquella, p...","[O, B-PROFESION, I-PROFESION, I-PROFESION, O, ..."
9,"[​, En, el, caso, de, que, una, persona, escri...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [None]:
### Look up professions: Spanish labels of up to 5000 Wikidata items with P31 = Q28640
wikidata_profesions = search_child('Q28640','P31','es',5000)


In [None]:
len(wikidata_profesions)

2689

In [None]:
wikidata_profesions[480:500]

In [None]:
la= []
vc=[]
for w in wikidata_profesions:
  a= w.split(' ')[0]
  if a in  wordvectors.vocab:
    
      vc.append(wordvectors.get_vector(a))
      la.append(w)


In [None]:
la2= []
vc2=[]
for w in total_entities:
  a= w.split(' ')[0]
  if a in wordvectors.vocab:
    if not a in la2:
      vc2.append(wordvectors.get_vector(a))
      la2.append(a)


In [None]:
len(la2)

359

In [None]:
wordvectors.similarity(la[0],la2[0])

In [None]:
print(la2)

['científica', 'jueces', 'cajera', 'colegiados', 'wedding', 'vicepresidente', 'académicos', 'delegada', 'gestores', 'futbolistas', 'ama', 'pediatra', 'vp', 'madero', 'copresidente', 'historiadores', 'trabajadores', 'compositor', 'jefe', 'médico', 'número', 'monitor', 'sanitarios', 'diputada', 'expertos', 'arzobispo', 'presidentes', 'doctoras', 'dirigentes', 'maestro', 'fiscal', 'médicos', 'actores', 'policía', 'presentadora', 'subdelegado', 'cirujanos', 'funcionaria', 'reportera', 'ministro', 'rastreadores', 'cuerpos', 'portavoz', 'políticas', 'delegado', 'político', 'directivo', 'camioneros', 'rey', 'servidores', 'consejero', 'director', 'experto', 'juez', 'personal', 'arrendatarios', 'narco', 'secretario', 'persona', 'presidenta', 'profesionales', 'jugadora', 'profesor', 'profesora', 'directora', 'rastreador', 'socorrista', 'políticos', 'investigadora', 'senadores', 'dr', 'rabino', 'monjas', 'marinos', 'policías', 'primer', 'modelos', 'equipo', 'mujeres', 'divulgador', 'oculista', 'm

In [None]:
def get_total_semantic(candidate, originals):
  """Measure how close a candidate is to a list of reference words.

  Only the first space-separated word of ``candidate`` is compared, using
  the similarity of the global ``wordvectors`` model.

  Args:
    candidate: candidate text.
    originals: iterable of reference words.

  Returns:
    (best, count): the highest similarity among those strictly above 0.70
    and how many similarities exceeded 0.70; (0, 0) when none did.
  """
  strong = []
  for original in originals:
    score = wordvectors.similarity(candidate.split(' ')[0], original)
    if score > 0.70:
      strong.append(score)

  if not strong:
    return 0, 0
  return max(strong), len(strong)

In [None]:
get_total_semantic(la[755],la2)

jugador de hockey sobre hielo


(0.7337443, 5)

In [None]:
total_candidates=[]
for l in la:
  val,rep= get_total_semantic(l,la2)
  if val>0 and rep>0:
    total_candidates.append(l)


In [None]:
len(total_candidates)

852

In [None]:
total_candidates[9]

'técnico de ascensores'

In [None]:
total_candidates

In [None]:
import codecs

# Save the selected candidates, one per line. The `with` block closes the
# file (and therefore flushes it) even when a write fails part-way.
with codecs.open("wikipedia_entities.txt", "w", "utf-8") as file:
  for c in total_candidates:
    file.write(c+'\n')

In [None]:
total_candidates= read_entities('wikipedia_entities.txt')

In [None]:
total_candidates[9]

'técnico de ascensores'

In [None]:
'personal de farmacia' in wikidata_profesions

True

In [None]:
wordvectors.most_similar('jugador_futbol')

# Vector filter

In [None]:
!wget https://zenodo.org/record/3234051/files/embeddings-l-model.vec?download=1


--2022-12-01 08:57:01--  https://zenodo.org/record/3234051/files/embeddings-l-model.vec?download=1
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3446609027 (3.2G) [application/octet-stream]
Saving to: ‘embeddings-l-model.vec?download=1’


2022-12-01 09:16:07 (2.87 MB/s) - ‘embeddings-l-model.vec?download=1’ saved [3446609027/3446609027]



In [None]:


from gensim.models.keyedvectors import KeyedVectors
wordvectors = KeyedVectors.load_word2vec_format('embeddings-l-model.vec?download=1', limit=100000)


In [None]:
wordvectors.similarity('perro','gato')


0.8025179

In [None]:
scores = ['hola', 'mundo', 'que', 'tal', 'somos']
filtered = filter(lambda score: len(score) > 4, scores)

print(list(filtered))

['mundo', 'somos']


In [None]:
# wordvectors.   <- incomplete expression (left over from tab completion);
# commented out because it is a SyntaxError and stops "Run All".

In [None]:

def substitute_synonyms(list_tokens,list_tags,list_new_toks,max):
  """Replace some 'O' tokens of a sentence by similar words from another one.

  Replacement candidates are the entries of ``list_new_toks`` longer than
  three characters that are in the vocabulary of the global ``wordvectors``
  model. A token is replaced when it is tagged 'O', is longer than three
  characters, is in the vocabulary, and fewer than ``max`` replacements have
  been made; the first candidate whose similarity to the token exceeds 0.35
  is used and then removed from the candidates. Each replacement is traced
  with print.

  Args:
    list_tokens: tokens of the sentence to modify.
    list_tags: tags of those tokens.
    list_new_toks: tokens offered as replacements.
    max: maximum number of replacements (the name shadows the built-in).

  Returns:
    (tokens, tags): two new lists; the tags are copied unchanged.
  """
  pool = [word for word in list_new_toks
          if len(word) > 3 and word in wordvectors.vocab]

  out_tokens = []
  out_tags = []
  replaced = 0

  for tok,lab in zip(list_tokens,list_tags):
    chosen = tok
    if lab == 'O' and len(tok)>3 and replaced<max and tok in wordvectors.vocab:
      for ft in pool:
        val = wordvectors.similarity(ft,tok)
        if val > 0.35:
          print(tok)
          print(val)
          print(ft)
          chosen = ft
          replaced = replaced+1
          print(pool)
          print(chosen)
          pool.remove(str(chosen))
          break

    out_tokens.append(chosen)
    out_tags.append(lab)

  return out_tokens,out_tags


In [None]:
# Try substitute_synonyms on training sentence 2111, taking replacement
# candidates from the third row of `dd`.
tokens= training_data.iloc[2111]['tokens']
labels= training_data.iloc[2111]['ner_tags']
# NOTE(review): `dd` is not assigned in any cell visible in this notebook -
# confirm which DataFrame was meant; as it stands this line raises NameError
# on a fresh kernel.
tokens2= dd.iloc[2]['tokens']
substitute_synonyms(tokens,labels,tokens2,3)

In [None]:
tokens

In [None]:
clus= ['presidente','ministro','doctor','médico','payaso','malabarista']

In [None]:
lis_cluster= []
for c in clus:
  lis_cluster.append(wordvectors.get_vector(c))

In [None]:
from sklearn.cluster import KMeans
km_2 = KMeans(n_clusters=3)
labels = km_2.fit(lis_cluster).labels_ 

In [None]:
labels

array([2, 2, 1, 1, 0, 0], dtype=int32)

## Create vectors

In [None]:
import numpy as np


In [None]:
from numpy import dot, float32 as REAL, empty, memmap as np_memmap, \
    double, array, zeros, vstack, sqrt, newaxis, integer, \
    ndarray, sum as np_sum, prod, argmax, divide as np_divide
import numpy as np
from gensim import utils, matutils

def calculate_vector(term):
  """Build a unit-length vector for a (multi-word) term.

  The term is split on single spaces; words longer than three characters
  that are in the vocabulary of the global ``wordvectors`` model are looked
  up, their vectors are averaged, and the mean is normalised with
  matutils.unitvec.

  Args:
    term: the text to embed.

  Returns:
    The normalised mean vector, or np.zeros(300) when no word of the term
    qualifies.
  """
  vectors = [
      wordvectors.get_vector(word)
      for word in term.split(' ')
      if len(word) > 3 and word in wordvectors.vocab
  ]

  if not vectors:
    return np.zeros(300)

  return matutils.unitvec(array(vectors).mean(axis=0))

  

# Augmentation Process

In [None]:
training_data

In [None]:
wikidata_profesions

In [None]:
lis_total_entities= total_candidates#list(wikidata_profesions)
pointer_entities=0

#import random
#random.seed(1)
#random.shuffle(lis_total_entities)

In [None]:
def get_random_profesion():
  """Return the next profession of the global list, cycling round-robin.

  Despite the name, nothing is random: the global ``pointer_entities`` is
  the position of the next item of ``lis_total_entities`` and wraps to 0
  when it reaches the end of the list.

  Returns:
    The profession at the current position; the global pointer is advanced
    by one.
  """
  global lis_total_entities
  global pointer_entities
  position = pointer_entities if pointer_entities < len(lis_total_entities) else 0
  pointer_entities = position + 1
  return lis_total_entities[position]

def get_ocurrences_positions(labels,types):
  """Locate every mention of one entity type in a BIO tag sequence.

  A mention is a 'B-'+types tag followed by any number of 'I-'+types tags.

  Args:
    labels: sequence of BIO tags of one sentence.
    types: entity type without BIO prefix, e.g. 'PROFESION'.

  Returns:
    A list of (start, end) index pairs (end exclusive), ordered by start.
    The previous implementation returned them ordered by mention length,
    which made replace_mention skip mentions, and it cut mentions longer
    than seven tokens.
  """
  begin_tag = 'B-'+types
  inside_tag = 'I-'+types

  lis_occurrences = []
  total = len(labels)
  position = 0
  while position < total:
    if labels[position] != begin_tag:
      position += 1
      continue

    # Extend the span over the I- tags that directly follow the B- tag.
    end = position + 1
    while end < total and labels[end] == inside_tag:
      end += 1

    lis_occurrences.append((position, end))
    position = end

  return lis_occurrences


def replace_mention(tokens,labels,typ):
  """Replace every mention of an entity type by a freshly generated entity.

  Mentions are located with get_ocurrences_positions(labels, typ); each one
  is replaced by the tokens and tags returned by create_new_entity(), which
  is called without arguments (``typ`` is only used to find the mentions).

  Args:
    tokens: tokens of the sentence.
    labels: BIO tags of the sentence.
    typ: entity type without BIO prefix, e.g. 'PROFESION'.

  Returns:
    (tokens, labels): new lists with the mentions replaced. When the
    sentence has no mention, the original two objects are returned as-is.
  """
  occurrences = get_ocurrences_positions(labels,typ)

  if len(occurrences) ==0:
    return tokens, labels

  # The sentence is rebuilt left to right, so the spans must be visited in
  # position order; sorting here makes that independent of the order in
  # which they are reported.
  occurrences = sorted(occurrences)

  new_tokens=[]
  new_labels=[]
  cursor = 0
  for start, end in occurrences:
    # Copy the untouched stretch before the mention...
    new_tokens.extend(tokens[cursor:start])
    new_labels.extend(labels[cursor:start])
    # ...then put a new entity in place of the mention.
    n_t,n_l= create_new_entity()
    new_tokens.extend(n_t)
    new_labels.extend(n_l)
    cursor = end

  new_tokens.extend(tokens[cursor:])
  new_labels.extend(labels[cursor:])

  return new_tokens,new_labels



def create_new_entity():
  """Take the next profession and return it as a BIO-tagged mention.

  The profession comes from get_random_profesion() and is split on single
  spaces.

  Returns:
    (tokens, labels): the first token is tagged 'B-PROFESION', the
    remaining ones 'I-PROFESION'.
  """
  typ= 'PROFESION'
  tokens = get_random_profesion().split(' ')
  labels = ['B-'+typ] + ['I-'+typ] * (len(tokens)-1)
  return tokens,labels

#var=190
#print(training_data['tokens'][var])
#replace_mention(training_data['tokens'][var],training_data['ner_tags'][var],'PROFESION')


# SENTENCE CREATION


In [None]:
res=get_wikipedia_aug_dataset(5,total_candidates)


In [None]:
res

In [None]:
res

In [None]:
augment = get_wikipedia_aug_dataset(2000,total_candidates)


In [None]:
augment_data = augment

In [None]:
len(total_candidates)

852

In [None]:
len(augment)

855

In [None]:
augment_data_cleaned= clean_data(augment_data)


In [None]:
augment_data_cleaned

In [None]:
training_data_10 = read_bio_dataset('train_10.txt')
training_data_30 = read_bio_dataset('train_30.txt')
training_data_50 = read_bio_dataset('train_50.txt')

training_data = read_bio_dataset('train_clean.txt')

In [None]:
training_data_10_sc = pd.concat( [training_data_10, augment_data_cleaned])
training_data_30_sc = pd.concat( [training_data_30, augment_data_cleaned])
training_data_50_sc = pd.concat( [training_data_50, augment_data_cleaned])
training_data_or_sc = pd.concat( [training_data, augment_data_cleaned])


training_data_10_sc.reset_index(inplace=True, drop=True)
training_data_30_sc.reset_index(inplace=True, drop=True)
training_data_50_sc.reset_index(inplace=True, drop=True)
training_data_or_sc.reset_index(inplace=True, drop=True)

In [None]:
training_data_10_sc

In [None]:
write_bio_dataset(training_data_10_sc,'train_10_sc.txt')
write_bio_dataset(training_data_30_sc,'train_30_sc.txt')
write_bio_dataset(training_data_50_sc,'train_50_sc.txt')
write_bio_dataset(training_data_or_sc,'train_or_sc.txt')

## Code For Mention Replacement

In [None]:
def mention_replacement(dataset, length):
  """Create augmented sentences by swapping the mentions of existing ones.

  The rows that contain at least one non-'O' tag are visited in order,
  cycling back to the first one as often as needed, and each visit produces
  one new sentence through replace_mention(..., 'PROFESION'). The global
  ``pointer_entities`` is reset to 0 first, so replacements start at the
  beginning of the profession list.

  Args:
    dataset: DataFrame with 'tokens' and 'ner_tags' columns.
    length: number of sentences to create; sentences are produced while
      the count is below this value (it may be a float).

  Returns:
    A DataFrame with columns 'tokens' and 'ner_tags'. It is empty when no
    row of ``dataset`` contains an entity.
  """
  global pointer_entities
  pointer_entities=0

  columns = ['tokens', 'ner_tags']

  # Only sentences with an entity can be augmented. Selecting them up front
  # also covers the last row (the former wrap-around test skipped it) and
  # avoids looping forever when no row has an entity.
  candidates = [
      (tokens, tags)
      for tokens, tags in zip(dataset['tokens'], dataset['ner_tags'])
      if not all(element == 'O' for element in tags)
  ]
  if not candidates:
    return pd.DataFrame(columns=columns)

  # Rows are gathered in a list and converted once; concatenating a
  # DataFrame per sentence is quadratic.
  records = []
  counter=0
  while counter < length:
    t, l = candidates[counter % len(candidates)]
    n_t,n_l= replace_mention(t,l,'PROFESION')
    records.append({'tokens' : n_t, 'ner_tags' : n_l})
    counter+=1

  return pd.DataFrame(records, columns=columns)
      

  

In [None]:
aug_mr = pd.DataFrame(columns = ['tokens', 'ner_tags'])

In [None]:
n_t,n_l=replace_mention(['hola','mundo','presidente'],['O','O','B-PROFESION'],'PROFESION')
new_df = pd.DataFrame([{'tokens' : n_t, 'ner_tags' : n_l}])

In [None]:
aug_mr = pd.concat([aug_mr, new_df], ignore_index=True)

In [None]:
aug_mr

In [None]:
len(training_data_pruned_10)

1358

In [None]:
training_data_pruned_10

In [None]:
augment10 = mention_replacement(training_data_pruned_10, len(training_data)-len(training_data_pruned_10))


In [None]:
total_candidates

In [None]:
augmentDouble.iloc[1]['tokens']

In [None]:
augment10.iloc[0]

In [None]:
augment30 = mention_replacement(training_data_pruned_30, len(training_data)-len(training_data_pruned_30))
augment50 = mention_replacement(training_data_pruned_50, len(training_data)-len(training_data_pruned_50))
augmentDouble = mention_replacement(training_data, len(training_data))

In [None]:
# NOTE(review): training_data_or is not assigned in any cell visible in this
# notebook (only training_data and training_data_or_sc / _or_mr are) -
# confirm which frame was meant. Also note that len(...)/2 is a float; the
# target length is only used in a `counter < length` comparison.
augmentor = mention_replacement(training_data_or, len(training_data_or)/2)

In [None]:
training_data_or_mr = pd.concat( [training_data_or, augmentor])
training_data_or_mr.reset_index(inplace=True, drop=True)



In [None]:
write_bio_dataset(training_data_or_mr,'drive/MyDrive/CorpusProfner/train_or_mr.txt')

In [None]:
len(augmentor)

5525

In [None]:
len(augment50)+len(training_data_pruned_50)

11051

In [None]:
training_data_10_mr = pd.concat( [training_data_pruned_10, augment10])
training_data_30_mr = pd.concat( [training_data_pruned_30, augment30])
training_data_50_mr = pd.concat( [training_data_pruned_50, augment50])
training_data_do_mr = pd.concat( [training_data, augmentDouble])


training_data_10_mr.reset_index(inplace=True, drop=True)
training_data_30_mr.reset_index(inplace=True, drop=True)
training_data_50_mr.reset_index(inplace=True, drop=True)
training_data_do_mr.reset_index(inplace=True, drop=True)

In [None]:
training_data_10_mr

In [None]:
training_data_10_mr.to_csv('training_10_mr.tsv', sep="\t",index=False)
training_data_30_mr.to_csv('training_30_mr.tsv', sep="\t",index=False)
training_data_50_mr.to_csv('training_50_mr.tsv', sep="\t",index=False)
training_data_do_mr.to_csv('training_do_mr.tsv', sep="\t",index=False)

In [None]:
training_data.to_csv('training_or.tsv', sep="\t",index=False)


In [None]:
write_bio_dataset(training_data_10_mr,'train_10_mr.txt')
write_bio_dataset(training_data_30_mr,'train_30_mr.txt')
write_bio_dataset(training_data_50_mr,'train_50_mr.txt')
write_bio_dataset(training_data_do_mr,'train_or_mr.txt')

# Complete


In [None]:
augment10

In [None]:
len(training_data_10_mr)

In [None]:
training_data_10_t = pd.concat( [training_data_10_mr, augment_data_cleaned])
training_data_30_t = pd.concat( [training_data_30_mr, augment_data_cleaned])
training_data_50_t = pd.concat( [training_data_50_mr, augment_data_cleaned])
training_data_do_t = pd.concat( [training_data_do_mr, augment_data_cleaned])

training_data_10_t.reset_index(inplace=True, drop=True)
training_data_30_t.reset_index(inplace=True, drop=True)
training_data_50_t.reset_index(inplace=True, drop=True)
training_data_do_t.reset_index(inplace=True, drop=True)

In [None]:
training_data_30_t

In [None]:
write_bio_dataset(training_data_10_t,'train_10_t.txt')
write_bio_dataset(training_data_30_t,'train_30_t.txt')
write_bio_dataset(training_data_50_t,'train_50_t.txt')
write_bio_dataset(training_data_do_t,'train_or_t.txt')

In [None]:
training_data_10_t.to_csv('training_10_t.tsv', sep="\t",index=False)
training_data_30_t.to_csv('training_30_t.tsv', sep="\t",index=False)
training_data_50_t.to_csv('training_50_t.tsv', sep="\t",index=False)
training_data_do_t.to_csv('training_do_t.tsv', sep="\t",index=False)

# Complete but Sentence Creation is part of the Mention Replacement

In [None]:
training_data_10_sc = read_bio_dataset('train_10_sc.txt')
training_data_30_sc = read_bio_dataset('train_30_sc.txt')
training_data_50_sc = read_bio_dataset('train_50_sc.txt')
training_data_or_sc = read_bio_dataset('train_or_sc.txt')

training_data = read_bio_dataset('train_clean.txt')

In [None]:
total_candidates= read_entities('wikipedia_entities.txt')

In [None]:
lis_total_entities= total_candidates

In [None]:
print(len(training_data))
print(len(training_data_10_sc))
print(len(training_data_30_sc))
print(len(training_data_50_sc))
print(len(training_data_or_sc))

11050
1962
4172
6382
11907


In [None]:
len( training_data_or_sc)

11907

In [None]:
augment_sc_mr_10 = mention_replacement(training_data_10_sc, len(training_data)-len(training_data_10_sc))
augment_sc_mr_30 = mention_replacement(training_data_30_sc, len(training_data)-len(training_data_30_sc))
augment_sc_mr_50 = mention_replacement(training_data_50_sc, len(training_data)-len(training_data_50_sc))
augment_sc_mr_or = mention_replacement(training_data_or_sc, len(training_data)/2)

In [None]:
print(len(augment_sc_mr_10))
print(len(augment_sc_mr_30))
print(len(augment_sc_mr_50))
print(len(augment_sc_mr_or))

9088
6878
4668
5525


In [None]:
training_data_10_mr_sc = pd.concat( [training_data_10_sc, augment_sc_mr_10])
training_data_30_mr_sc = pd.concat( [training_data_30_sc, augment_sc_mr_30])
training_data_50_mr_sc = pd.concat( [training_data_50_sc, augment_sc_mr_50])
training_data_or_mr_sc = pd.concat( [training_data_or_sc, augment_sc_mr_or])

training_data_10_mr_sc.reset_index(inplace=True, drop=True)
training_data_30_mr_sc.reset_index(inplace=True, drop=True)
training_data_50_mr_sc.reset_index(inplace=True, drop=True)
training_data_or_mr_sc.reset_index(inplace=True, drop=True)

In [None]:
write_bio_dataset(training_data_10_mr_sc,'train_10_sc_mr.txt')
write_bio_dataset(training_data_30_mr_sc,'train_30_sc_mr.txt')
write_bio_dataset(training_data_50_mr_sc,'train_50_sc_mr.txt')
write_bio_dataset(training_data_or_mr_sc,'train_or_sc_mr.txt')

# Augmenting more information

In [None]:
training_data_10_sc = read_bio_dataset('train_10_sc.txt')
training_data_30_sc = read_bio_dataset('train_30_sc.txt')
training_data_50_sc = read_bio_dataset('train_50_sc.txt')
training_data_or_sc = read_bio_dataset('train_or_sc.txt')


In [None]:
total_candidates= read_entities('wikipedia_entities.txt')
lis_total_entities= total_candidates

['[director de finanzas] es ejecutivo a cargo de la gestión financiera de una organización',
 '[director de finanzas] es un tipo de [ejecutivo] y de [director]']

In [None]:
term= 'director de finanzas'
from nltk.tokenize.toktok import ToktokTokenizer
def create_bio_sentences_of_term(term):
  """Build BIO-annotated sentences for a single term.

  Every sentence returned by ``produce_nlp_sentences(term)`` is tokenized
  with Toktok and handed to ``annotate_sentence_bio`` with the
  ``'PROFESION'`` tag, which labels the tokens found between ``[`` and ``]``.

  Args:
    term: value forwarded unchanged to ``produce_nlp_sentences``.

  Returns:
    A list holding one ``[tokens, labels]`` pair per produced sentence.
  """
  sentences = produce_nlp_sentences(term)
  toktok = ToktokTokenizer()
  annotated = []
  for sentence in sentences:
    tok, lab = annotate_sentence_bio(toktok.tokenize(sentence), 'PROFESION')
    annotated.append([tok, lab])
  return annotated

import time
def create_wikidata_dataset(total_candidates, delay=2, batch_size=20, batch_pause=5):
  """Create a BIO dataset from the sentences generated for each candidate term.

  Args:
    total_candidates: iterable of terms; each one is passed to
      ``create_bio_sentences_of_term``.
    delay: seconds to sleep before processing every candidate.
    batch_size: after every ``batch_size`` candidates an extra pause is
      taken; a falsy value disables the extra pause.
    batch_pause: length in seconds of that extra pause.

  Returns:
    A DataFrame with the columns ``tokens`` and ``ner_tags``, one row per
    generated sentence.
  """
  rows = []
  for count, candidate in enumerate(total_candidates, start=1):
    # Pause before each lookup (presumably to throttle the remote service
    # queried by create_bio_sentences_of_term — TODO confirm).
    time.sleep(delay)
    rows.extend(create_bio_sentences_of_term(candidate))
    if batch_size and count % batch_size == 0:
      time.sleep(batch_pause)
  return pd.DataFrame(rows, columns=['tokens','ner_tags'])





In [None]:
augment_wikidata = create_wikidata_dataset(total_candidates)

In [None]:
len(total_candidates)

852

In [None]:
augment_wikidata

In [None]:
write_bio_dataset(augment_wikidata,'drive/MyDrive/CorpusProfner/wikidata_verb.txt')


In [None]:
# Add the Wikidata-derived sentences to every *_sc subset; ignore_index
# renumbers the rows of each combined frame from 0.
training_data_10_sc_t = pd.concat([training_data_10_sc, augment_wikidata], ignore_index=True)
training_data_30_sc_t = pd.concat([training_data_30_sc, augment_wikidata], ignore_index=True)
training_data_50_sc_t = pd.concat([training_data_50_sc, augment_wikidata], ignore_index=True)
training_data_or_sc_t = pd.concat([training_data_or_sc, augment_wikidata], ignore_index=True)

In [None]:
write_bio_dataset(training_data_10_sc_t,'drive/MyDrive/CorpusProfner/train_10_sc_t.txt')
write_bio_dataset(training_data_30_sc_t,'drive/MyDrive/CorpusProfner/train_30_sc_t.txt')
write_bio_dataset(training_data_50_sc_t,'drive/MyDrive/CorpusProfner/train_50_sc_t.txt')
write_bio_dataset(training_data_or_sc_t,'drive/MyDrive/CorpusProfner/train_or_sc_t.txt')

In [None]:
!zip -r sg.zip ./sg

  adding: sg/ (stored 0%)
  adding: sg/training_30_sg.tsv (deflated 80%)
  adding: sg/training_do_sg.tsv (deflated 79%)
  adding: sg/training_50_sg.tsv (deflated 79%)
  adding: sg/training_10_sg.tsv (deflated 80%)


In [None]:
!zip -r mr.zip ./mr

  adding: mr/ (stored 0%)
  adding: mr/training_do_mr.tsv (deflated 80%)
  adding: mr/training_10_mr.tsv (deflated 80%)
  adding: mr/training_30_mr.tsv (deflated 80%)
  adding: mr/training_50_mr.tsv (deflated 79%)


In [None]:
!pip install pyocclient -q
import owncloud
oc = owncloud.Client('https://delicias.dia.fi.upm.es/nextcloud/')

oc.login('pcalleja', 'oWn.ser.5')

  Building wheel for pyocclient (setup.py) ... [?25l[?25hdone


In [None]:
oc.put_file('sg.zip', 'sg.zip')


True

In [None]:
oc.put_file('t.zip', 't.zip')


True

In [None]:
aug_sr = pd.DataFrame(columns = ['tokens', 'ner_tags'])


In [None]:
aug_sr

In [None]:

# Index to resume from: rows of `augment1` with a smaller index are skipped.
val = 0

# DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call, so the new rows are collected in a list and concatenated once.
new_rows = []
try:
    for index, row in augment1.iterrows():
        if index < val:
            continue
        print(index)
        n_t, n_l = lm_sentence_augmentation(row['tokens'], row['ner_tags'])
        new_rows.append({'tokens': n_t, 'ner_tags': n_l})
finally:
    # Flush inside `finally` so rows produced before an error or a manual
    # interrupt are still stored in `aug_sr`, as with the per-row append.
    if new_rows:
        aug_sr = pd.concat(
            [aug_sr, pd.DataFrame(new_rows, columns=['tokens', 'ner_tags'])],
            ignore_index=True)

# Back Translation

In [None]:
!pip install BackTranslation -q


[K     |████████████████████████████████| 55 kB 1.7 MB/s 
[K     |████████████████████████████████| 42 kB 613 kB/s 
[K     |████████████████████████████████| 1.5 MB 17.4 MB/s 
[K     |████████████████████████████████| 53 kB 1.2 MB/s 
[K     |████████████████████████████████| 65 kB 1.7 MB/s 
[?25h  Building wheel for googletrans (setup.py) ... [?25l[?25hdone


In [None]:
from BackTranslation import BackTranslation
trans = BackTranslation(url=[
      'translate.google.com',
      'translate.google.co.kr',
    ], proxies={'http': '127.0.0.1:1234', 'http://host.name': '127.0.0.1:4012'})
result = trans.translate('hola mundo que tal', src='es', tmp = 'en')
print(result.result_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Hola mundo como


In [None]:
training_data_10 = read_bio_dataset('train_10.txt')
training_data_30 = read_bio_dataset('train_30.txt')
training_data_50 = read_bio_dataset('train_50.txt')
training_data_or = read_bio_dataset('train_clean.txt')

In [None]:
from BackTranslation import BackTranslation

def get_entities_of_bio(tokens,labels):
  """Collect the entity mentions of a BIO-tagged sentence as strings.

  A label containing ``'B-'`` opens a new entity, a label containing ``'I-'``
  extends the open one, and the label ``'O'`` closes it. The tokens of each
  entity are joined with single spaces.

  An entity that is still open when the sentence ends is also returned;
  previously such trailing entities were silently dropped. ``I-`` labels
  with no open entity are ignored.

  Args:
    tokens: sequence of token strings.
    labels: sequence of BIO labels, parallel to ``tokens``.

  Returns:
    List of entity strings in order of appearance.
  """
  lis_entities = []
  current = []  # tokens of the entity being built; empty while outside one
  for t, l in zip(tokens, labels):
    if 'B-' in l:
      # A new entity starts; close the previous one if it is adjacent.
      if current:
        lis_entities.append(' '.join(current))
      current = [t]
    elif 'I-' in l:
      if current:
        current.append(t)
    elif 'O' == l and current:
      lis_entities.append(' '.join(current))
      current = []
  # Flush an entity that runs up to the last token.
  if current:
    lis_entities.append(' '.join(current))
  return lis_entities

from nltk.tokenize.toktok import ToktokTokenizer


def annotate_sentence_bio(sentence,tag):
  """Turn a bracket-marked token sequence into parallel token/BIO-label lists.

  The tokens ``'['`` and ``']'`` are markers and are not emitted. The first
  token after ``'['`` is labelled ``'B-' + tag``, the following ones
  ``'I-' + tag`` until ``']'`` is seen; every other token is labelled ``'O'``.

  Args:
    sentence: iterable of tokens, with ``'['`` / ``']'`` as separate tokens.
    tag: entity type appended to the ``B-`` / ``I-`` prefixes.

  Returns:
    A ``(tokens, labels)`` tuple of two lists of equal length.
  """
  OUTSIDE, AT_START, INSIDE = range(3)
  state = OUTSIDE
  tokens = []
  labels = []
  for piece in sentence:
    if piece == '[':
      state = AT_START
    elif piece == ']':
      state = OUTSIDE
    else:
      tokens.append(piece)
      if state == OUTSIDE:
        labels.append('O')
      elif state == AT_START:
        labels.append('B-'+tag)
        state = INSIDE
      else:
        labels.append('I-'+tag)
  return tokens, labels
    

def _bracket_entities(tokens, labels):
  """Join tokens with spaces, wrapping each BIO entity span in '[' and ']'."""
  pieces = []
  inside = False
  for tok, lab in zip(tokens, labels):
    if 'B-' in lab:
      if inside:
        pieces[-1] = pieces[-1] + ']'  # close an adjacent previous entity
      pieces.append('[' + tok)
      inside = True
    elif 'I-' in lab and inside:
      pieces.append(tok)
    else:
      if inside:
        pieces[-1] = pieces[-1] + ']'
        inside = False
      pieces.append(tok)
  if inside:
    pieces[-1] = pieces[-1] + ']'  # entity reaching the end of the sentence
  return ' '.join(pieces)


def backTranslate_sentence(tokens,labels,trans, toktok, tag='PROFESION'):
  """Back-translate one BIO sentence (es -> en -> es) keeping entity marks.

  The entity spans are wrapped in square brackets according to the position
  of their labels, so only the annotated mentions are marked. The earlier
  string replacement also bracketed identical text elsewhere in the
  sentence, e.g. inside a longer word. The bracketed text is translated
  with ``trans``, tokenized with ``toktok`` and re-annotated with
  ``annotate_sentence_bio``.

  Args:
    tokens: token strings of the sentence.
    labels: BIO labels parallel to ``tokens``.
    trans: object whose ``translate(text, src=..., tmp=...)`` result exposes
      ``result_text``.
    toktok: tokenizer exposing ``tokenize(text)``.
    tag: entity type used for the regenerated labels.

  Returns:
    ``[tokens, labels]`` for the back-translated sentence, or ``None`` when
    any step raises (the exception is printed).
  """
  try:
    text = _bracket_entities(tokens, labels)
    result = trans.translate(text, src='es', tmp = 'en').result_text
    res = toktok.tokenize(result)
    tok, lab = annotate_sentence_bio(res, tag)
    return [tok, lab]
  except Exception as e:
    # Best effort: a failed translation must not stop the whole dataset run.
    print(e)
    return None



def bt_dataset(dataset):
  """Back-translate every sentence of ``dataset`` that contains a profession.

  Rows whose ``ner_tags`` do not include ``'B-PROFESION'`` are skipped. The
  index of each processed row is printed, and rows for which
  ``backTranslate_sentence`` returns ``None`` are reported as
  ``'bad translation'`` and left out.

  Args:
    dataset: DataFrame with the columns ``tokens`` and ``ner_tags``.

  Returns:
    A new DataFrame (columns ``tokens`` and ``ner_tags``) holding only the
    successfully back-translated sentences.
  """
  # NOTE(review): the proxies point at local ports — confirm they match the
  # environment the notebook runs in.
  trans = BackTranslation(url=[
      'translate.google.com',
      'translate.google.co.kr',
    ], proxies={'http': '127.0.0.1:1234', 'http://host.name': '127.0.0.1:4012'})
  toktok = ToktokTokenizer()

  total = []
  for index, row in dataset.iterrows():
    toks = row['tokens']
    tags = row['ner_tags']
    if 'B-PROFESION' not in tags:
      continue
    print(index)
    res = backTranslate_sentence(toks,tags,trans,toktok)
    if res is None:
      print('bad translation')
      continue
    total.append(res)

  return pd.DataFrame(total, columns=['tokens','ner_tags'])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from BackTranslation import BackTranslation
trans = BackTranslation(url=[
      'translate.google.com',
      'translate.google.co.kr',
    ], proxies={'http': '127.0.0.1:1234', 'http://host.name': '127.0.0.1:4012'})
result = trans.translate("están trabajando codo a codo con los [técnicos del departament de salut], completamente fuera del ruido político", src='es', tmp = 'en')
print(result.result_text)

Están trabajando junto con los [técnicos del departamento de Salut], completamente fuera del ruido político


In [None]:
training_data_10_bt_only = bt_dataset(training_data_10)

In [None]:
training_data_30_bt_only = bt_dataset(training_data_30)
training_data_50_bt_only = bt_dataset(training_data_50)


In [None]:
training_data_50_bt_only = bt_dataset(training_data_50)


In [None]:
write_bio_dataset(training_data_10_bt_only,'drive/MyDrive/CorpusProfner/train_10_bt_only.txt')
write_bio_dataset(training_data_30_bt_only,'drive/MyDrive/CorpusProfner/train_30_bt_only.txt')
write_bio_dataset(training_data_50_bt_only,'drive/MyDrive/CorpusProfner/train_50_bt_only.txt')

In [None]:
write_bio_dataset(training_data_50_bt_only,'drive/MyDrive/CorpusProfner/train_50_bt_only.txt')

In [None]:
training_data_or_bt_only = bt_dataset(training_data_or)

In [None]:
write_bio_dataset(training_data_or_bt_only,'drive/MyDrive/CorpusProfner/train_or_bt_only.txt')

In [None]:
# NOTE(review): in this part of the notebook `training_data_or_bt` is only
# assigned in later cells, so this cell appears to rely on out-of-order
# execution — confirm the intended cell order.
write_bio_dataset(training_data_or_bt,'drive/MyDrive/CorpusProfner/train_or_bt.txt')

In [None]:
training_data_10_bt = bt_dataset(training_data_10)

In [None]:
training_data_30_bt = bt_dataset(training_data_30)
training_data_50_bt = bt_dataset(training_data_50)


In [None]:
training_data_or_bt = bt_dataset(training_data_or)

In [None]:
# Draw reproducible 10% / 30% / 50% random samples (fixed seed) of the
# back-translated data and renumber their rows from 0.
training_data_10_bt = training_data_or_bt.sample(frac=0.1, random_state=8).reset_index(drop=True)
training_data_30_bt = training_data_or_bt.sample(frac=0.3, random_state=8).reset_index(drop=True)
training_data_50_bt = training_data_or_bt.sample(frac=0.5, random_state=8).reset_index(drop=True)

In [None]:
training_data_10

In [None]:
training_data_30_bt

Unnamed: 0,tokens,ner_tags
0,"[Nadie, puede, estar, a, salvo, solo, ,, todos...","[O, O, O, O, O, O, O, O, O, O, O, O]"
1,"[La, pandemia, no, evita, los, dividendos, de,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[Fabricado, en, Vitoria, Gasteiz, ,, y, efecti...","[O, O, O, O, O, O, O, O, O, O, O]"
3,"[Los, valencianos, que, no, tienen, que, comer...","[O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[Tendría, que, callarse, porque, no, entiende,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
3310,"[¿, Realmente, un, puto, bar, de, bar, ?]","[O, O, O, O, O, O, O, O]"
3311,"[Nuestro, patrocinador, de, impresión, Noval, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3312,"[Te, decimos, en]","[O, O, O]"
3313,"[Fue, de, Barcelona, a, Vilassar, de, Mar, par...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [None]:
write_bio_dataset(training_data_10_bt,'drive/MyDrive/CorpusProfner/train_10_bt.txt')
write_bio_dataset(training_data_30_bt,'drive/MyDrive/CorpusProfner/train_30_bt.txt')
write_bio_dataset(training_data_50_bt,'drive/MyDrive/CorpusProfner/train_50_bt.txt')

In [None]:
training_data_TOTAL_bt = training_data_or_bt

In [None]:
def clean_empty(dataset):
  """Remove, in place, the rows whose ``tokens`` entry has length zero.

  Prints ``'s'`` once per removed row. The frame passed in is modified
  directly (rows dropped, index renumbered from 0) and is also returned.

  Args:
    dataset: DataFrame with a ``tokens`` column of sized values.

  Returns:
    The same ``dataset`` object after the cleanup.
  """
  empty_rows = []
  for idx, toks in dataset['tokens'].items():
    if not len(toks):
      print('s')
      empty_rows.append(idx)

  dataset.drop(index=empty_rows, inplace=True)
  dataset.reset_index(drop=True, inplace=True)
  return dataset

In [None]:
training_data_10_bt_c= clean_empty(training_data_10_bt)
training_data_30_bt_c= clean_empty(training_data_30_bt)
training_data_50_bt_c= clean_empty(training_data_50_bt)
training_data_or_bt_c= clean_empty(training_data_or_bt)

In [None]:
training_data_10_bt_c.head(20)

In [None]:
print(count_entities(training_data_10_bt_c,'B-PROFESION'))
print(count_entities(training_data_30_bt_c,'B-PROFESION'))
print(count_entities(training_data_50_bt_c,'B-PROFESION'))

In [None]:
training_data_10_bt_only = read_bio_dataset('drive/MyDrive/CorpusProfner/train_10_bt_only.txt')
training_data_30_bt_only = read_bio_dataset('drive/MyDrive/CorpusProfner/train_30_bt_only.txt')
training_data_50_bt_only = read_bio_dataset('drive/MyDrive/CorpusProfner/train_50_bt_only.txt')
training_data_or_bt_only = read_bio_dataset('drive/MyDrive/CorpusProfner/train_or_bt_only.txt')

In [None]:
# Combine each original subset with its back-translated sentences;
# ignore_index renumbers the rows of every combined frame from 0.
training_data_10_bt = pd.concat([training_data_10, training_data_10_bt_only], ignore_index=True)
training_data_30_bt = pd.concat([training_data_30, training_data_30_bt_only], ignore_index=True)
training_data_50_bt = pd.concat([training_data_50, training_data_50_bt_only], ignore_index=True)
training_data_or_bt = pd.concat([training_data_or, training_data_or_bt_only], ignore_index=True)

In [None]:
training_data_or_bt

In [None]:
write_bio_dataset(training_data_10_bt,'drive/MyDrive/CorpusProfner/train_10_bt.txt')
write_bio_dataset(training_data_30_bt,'drive/MyDrive/CorpusProfner/train_30_bt.txt')
write_bio_dataset(training_data_50_bt,'drive/MyDrive/CorpusProfner/train_50_bt.txt')
write_bio_dataset(training_data_or_bt,'drive/MyDrive/CorpusProfner/train_or_bt.txt')


# BT + SG

In [None]:
training_data_10_bt_only = read_bio_dataset('drive/MyDrive/CorpusProfner/train_10_bt_only.txt')
training_data_30_bt_only = read_bio_dataset('drive/MyDrive/CorpusProfner/train_30_bt_only.txt')
training_data_50_bt_only = read_bio_dataset('drive/MyDrive/CorpusProfner/train_50_bt_only.txt')
training_data_or_bt_only = read_bio_dataset('drive/MyDrive/CorpusProfner/train_or_bt_only.txt')

training_data_10_sc = read_bio_dataset('drive/MyDrive/CorpusProfner/train_10_sc_t.txt')
training_data_30_sc = read_bio_dataset('drive/MyDrive/CorpusProfner/train_30_sc_t.txt')
training_data_50_sc = read_bio_dataset('drive/MyDrive/CorpusProfner/train_50_sc_t.txt')
training_data_or_sc = read_bio_dataset('drive/MyDrive/CorpusProfner/train_or_sc_t.txt')

In [None]:
# Combine each *_sc subset with the back-translated sentences of the same
# size; ignore_index renumbers the rows of every combined frame from 0.
training_data_10_sc_bt = pd.concat([training_data_10_sc, training_data_10_bt_only], ignore_index=True)
training_data_30_sc_bt = pd.concat([training_data_30_sc, training_data_30_bt_only], ignore_index=True)
training_data_50_sc_bt = pd.concat([training_data_50_sc, training_data_50_bt_only], ignore_index=True)
training_data_or_sc_bt = pd.concat([training_data_or_sc, training_data_or_bt_only], ignore_index=True)

In [None]:
write_bio_dataset(training_data_10_sc_bt,'drive/MyDrive/CorpusProfner/train_10_sc_bt.txt')
write_bio_dataset(training_data_30_sc_bt,'drive/MyDrive/CorpusProfner/train_30_sc_bt.txt')
write_bio_dataset(training_data_50_sc_bt,'drive/MyDrive/CorpusProfner/train_50_sc_bt.txt')
write_bio_dataset(training_data_or_sc_bt,'drive/MyDrive/CorpusProfner/train_or_sc_bt.txt')

# Count


In [None]:
t1 = read_bio_dataset('drive/MyDrive/CorpusProfner/train_30_sc_t.txt')
t2 = read_bio_dataset('drive/MyDrive/CorpusProfner/train_30_sc.txt')
t3 = read_bio_dataset('drive/MyDrive/CorpusProfner/train_30.txt')


In [None]:
print(len(t1)-len(t2))
print(len(t2)-len(t3))


532
855


In [None]:
t2

In [None]:
t1