In [2]:
import json
import pandas as pd
import re 
%env CUDA_VISIBLE_DEVICES=4

env: CUDA_VISIBLE_DEVICES=4


In [3]:
# Read the raw reviews and their entity annotations.
# JSON text is UTF-8 by specification; the encoding is passed explicitly so the
# accented Spanish text decodes the same way whatever the platform default is.
with open("reviews.json", "r", encoding="utf-8") as f:
    reviews = json.load(f)

with open("entities.json", "r", encoding="utf-8") as f:
    entities = json.load(f)

In [4]:
df_reviews = pd.DataFrame(reviews)
df_entities = pd.DataFrame(entities)

# Left-join every review with its entities (one row per review/entity pair),
# drop the now-redundant "uid" key, discard rows containing any missing value
# (this removes reviews that matched no entity), and lowercase the review text.
df_anno = (
    df_reviews
    .merge(df_entities, how="left", left_on="uid", right_on="review_uid")
    .drop(columns="uid")
    .dropna()
    .assign(body=lambda frame: frame["body"].apply(lambda text: text.lower()))
)
print(len(df_anno))

18003


In [5]:
# Display the joined annotation frame: one row per (review, entity) pair.
df_anno

Unnamed: 0,body,review_uid,start,end,type,term
0,buena selección de ostras.,00000f8808a9789cfe57be5884ff1ad5c3b96580,0.0,5.0,modifier,buena
1,buena selección de ostras.,00000f8808a9789cfe57be5884ff1ad5c3b96580,19.0,25.0,concept,recetas ostras
2,"tiene gran variedad de tapas a 2,50 de gran ca...",000010f29b5d65ad7c073acc31e327dc3ff9af54,6.0,10.0,modifier,gran
3,"tiene gran variedad de tapas a 2,50 de gran ca...",000010f29b5d65ad7c073acc31e327dc3ff9af54,23.0,28.0,concept,tapas
4,"tiene gran variedad de tapas a 2,50 de gran ca...",000010f29b5d65ad7c073acc31e327dc3ff9af54,39.0,43.0,modifier,gran
...,...,...,...,...,...,...
18003,"después casi tres horas, con la pizza en estóm...",dde2e67c6ecce7be8fd521600fbd844a7d62007c,246.0,252.0,concept,música
18004,"después casi tres horas, con la pizza en estóm...",dde2e67c6ecce7be8fd521600fbd844a7d62007c,319.0,325.0,concept,pizzas
18005,"después casi tres horas, con la pizza en estóm...",dde2e67c6ecce7be8fd521600fbd844a7d62007c,332.0,337.0,concept,tapas
18006,"después casi tres horas, con la pizza en estóm...",dde2e67c6ecce7be8fd521600fbd844a7d62007c,404.0,412.0,concept,clientes


In [6]:
def convert_to_tokens(text):
    """Split ``text`` into word and punctuation tokens.

    A token is either a run of word characters/apostrophes or a single one of
    the punctuation marks ``. , ! ? ;``. Any other character (whitespace,
    colons, brackets, ...) acts as a separator and is not returned.

    Returns:
        list[str]: the tokens in order of appearance.
    """
    token_pattern = r"[\w']+|[.,!?;]"
    return [match.group(0) for match in re.finditer(token_pattern, text)]

def recompose_stc(stc_tokens):
    """Join tokens back into a sentence string.

    The first token, and every token that is not purely alphanumeric (for
    example punctuation), is appended directly; every other token is preceded
    by a single space.

    Returns:
        str: the rebuilt sentence ("" for an empty token list).
    """
    pieces = []
    for position, tok in enumerate(stc_tokens):
        needs_space = position != 0 and tok.isalnum()
        pieces.append(" " + tok if needs_space else tok)
    return "".join(pieces)

# Keep only the rows whose text survives a tokenize -> recompose round trip
# unchanged, i.e. the token sequence rebuilds exactly the original body.
body_recomposed = df_anno.body.apply(lambda text: recompose_stc(convert_to_tokens(text)))
df_anno = df_anno[df_anno.body == body_recomposed]
print(len(df_anno))

12767


In [7]:
# Distinct review ids, in order of first appearance in df_anno.
uids = df_anno.review_uid.unique()

In [8]:
# Build one record per review: {"document": <text>, "entities": [<entity dicts>]}.
# The frame is grouped once up front instead of being re-filtered for every uid,
# and the entity dicts are built in a comprehension so that no module-level
# variable named `type` is left behind shadowing the builtin.
rows_by_uid = df_anno.groupby("review_uid", sort=False)

revs = []
for uid in uids:
    df_review = rows_by_uid.get_group(uid)
    # every row of a review carries the same body text; take the first one
    body = df_review.body.unique()[0]
    list_ents = [
        {"text": term, "type": ent_type, "start": start, "end": end}
        for term, ent_type, start, end in zip(
            df_review["term"], df_review["type"], df_review["start"], df_review["end"]
        )
    ]
    revs.append({"document": body, "entities": list_ents})

In [9]:
# Rebind the list of review records under its final name and drop the old name.
# NOTE(review): re-running this cell on its own raises NameError, because `revs`
# no longer exists after the `del`.
reviews_json = revs
del revs

In [10]:
# Peek at the first two review records.
reviews_json[:2]

[{'document': 'buena selección de ostras.',
  'entities': [{'text': 'buena', 'type': 'modifier', 'start': 0.0, 'end': 5.0},
   {'text': 'recetas ostras', 'type': 'concept', 'start': 19.0, 'end': 25.0}]},
 {'document': 'buen ambiente, trato excelente y jamón exquisito.',
  'entities': [{'text': 'buen', 'type': 'modifier', 'start': 0.0, 'end': 4.0},
   {'text': 'ambiente', 'type': 'concept', 'start': 5.0, 'end': 13.0},
   {'text': 'excelente', 'type': 'modifier', 'start': 21.0, 'end': 30.0},
   {'text': 'jamón', 'type': 'concept', 'start': 33.0, 'end': 38.0},
   {'text': 'exquisito', 'type': 'modifier', 'start': 39.0, 'end': 48.0}]}]

In [11]:
# add column with text split into tokens
for i in range(len(reviews_json)) :
    tokens = convert_to_tokens(reviews_json[i]["document"])
    reviews_json[i]["tokens"] = tokens

In [12]:
# Spot-check a single review record to see the added "tokens" list.
reviews_json[200]

{'document': 'un restaurante de 5 estrellas, uno de los pocos donde se disfruta de la auténtica gastronomía española.',
 'entities': [{'text': 'restaurante',
   'type': 'concept',
   'start': 3.0,
   'end': 14.0},
  {'text': 'pocos', 'type': 'modifier', 'start': 42.0, 'end': 47.0},
  {'text': 'auténtica', 'type': 'modifier', 'start': 72.0, 'end': 81.0},
  {'text': 'gastronomía', 'type': 'concept', 'start': 82.0, 'end': 93.0}],
 'tokens': ['un',
  'restaurante',
  'de',
  '5',
  'estrellas',
  ',',
  'uno',
  'de',
  'los',
  'pocos',
  'donde',
  'se',
  'disfruta',
  'de',
  'la',
  'auténtica',
  'gastronomía',
  'española',
  '.']}

In [13]:
def find_sub_list(sl,l):
    """Find every occurrence of the list ``sl`` as a contiguous slice of ``l``.

    Args:
        sl: the sub-list to look for.
        l: the list to search in.

    Returns:
        list[tuple[int, int]]: one ``(first_index, last_index)`` pair per
        occurrence, with ``last_index`` inclusive, in left-to-right order.
        An empty ``sl`` has no meaningful position and yields ``[]`` (it
        previously raised IndexError whenever ``l`` was non-empty).
    """
    sll = len(sl)
    if sll == 0:
        return []
    return [
        (ind, ind + sll - 1)
        for ind in range(len(l) - sll + 1)
        if l[ind:ind + sll] == sl
    ]

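Given a review whose entities may span several words, assign each token one of the tags "B-MODIFIER", "I-MODIFIER", "B-CONCEPT", "I-CONCEPT" or "O": "B-" marks the first token of an entity, "I-" marks each following token of the same entity, and "O" marks tokens outside any entity.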

In [14]:
#function to write NER tags in required format
def write_ner_tags(rev):
    """Return one BIO tag per token of a review.

    Each entity's text is tokenized with ``convert_to_tokens`` and located in
    ``rev["tokens"]`` with ``find_sub_list``. The first token of a located
    entity is tagged ``B-<TYPE>`` and its remaining tokens ``I-<TYPE>``, where
    the type is "modifier" or "concept"; all other tokens stay ``"O"``.

    Entities are processed in list order. When an entity's text occurs several
    times in the review, each entity takes the first occurrence that an earlier
    entity has not already claimed, so repeated mentions are tagged at
    successive positions rather than all collapsing onto the first match. If
    every occurrence is already claimed, the first one is relabelled.

    Entities whose text yields no tokens, or cannot be found in the token list,
    are skipped. Entities of any other type leave the labels unchanged.

    Args:
        rev: dict with a "tokens" list and an "entities" list of dicts, each
            having "text" and "type" keys.

    Returns:
        list[str]: tags, same length as ``rev["tokens"]``.
    """
    tokens = rev["tokens"]
    # entity type -> tag suffix; types not listed here are never written
    tag_suffix = {"modifier": "MODIFIER", "concept": "CONCEPT"}

    # initialize all labels to "O"
    ner_labels = ["O"] * len(tokens)
    claimed_spans = set()
    for entity in rev["entities"]:
        entity_type = entity["type"]
        sub_list = convert_to_tokens(entity["text"])
        if not sub_list:
            # nothing to search for (empty or symbol-only entity text)
            continue
        matches = find_sub_list(sub_list, tokens)
        if not matches:
            # entity text does not appear verbatim in the tokenized review
            continue
        # prefer an occurrence no earlier entity has used yet
        span = next((m for m in matches if m not in claimed_spans), matches[0])
        claimed_spans.add(span)

        suffix = tag_suffix.get(entity_type)
        if suffix is None:
            continue
        token_start, token_end = span
        ner_labels[token_start] = "B-" + suffix
        for j in range(token_start + 1, token_end + 1):
            ner_labels[j] = "I-" + suffix
    return ner_labels

In [15]:
# Tag one review as a spot check before processing the whole list.
id_doc = 1995
reviews_json[id_doc]["ner_tags"] = write_ner_tags(reviews_json[id_doc])

In [16]:
# print each token of the selected review next to its tag
for token, tag in zip(reviews_json[id_doc]["tokens"], reviews_json[id_doc]["ner_tags"]):
    print(token, "----------->", tag)


la -----------> O
paella -----------> B-CONCEPT
de -----------> I-CONCEPT
verduras -----------> I-CONCEPT
del -----------> O
andaluz -----------> O
para -----------> O
nosotros -----------> O
es -----------> O
de -----------> O
las -----------> B-MODIFIER
mejores -----------> I-MODIFIER
que -----------> O
hemos -----------> O
probado -----------> O
sin -----------> O
duda -----------> O
. -----------> O


In [17]:
# compute the BIO tags for every review record and store them under "ner_tags"
for review in reviews_json:
    review["ner_tags"] = write_ner_tags(review)

In [18]:
# print each token of another review next to its tag
id_doc = 442
for token, tag in zip(reviews_json[id_doc]["tokens"], reviews_json[id_doc]["ner_tags"]):
    print(token, "----------->", tag)

nos -----------> O
lo -----------> O
recomendaron -----------> O
en -----------> O
marqués -----------> B-CONCEPT
de -----------> I-CONCEPT
riscal -----------> I-CONCEPT
. -----------> O


In [19]:
# Peek at the first two records, which now also carry "tokens" and "ner_tags".
reviews_json[:2]

[{'document': 'buena selección de ostras.',
  'entities': [{'text': 'buena', 'type': 'modifier', 'start': 0.0, 'end': 5.0},
   {'text': 'recetas ostras', 'type': 'concept', 'start': 19.0, 'end': 25.0}],
  'tokens': ['buena', 'selección', 'de', 'ostras', '.'],
  'ner_tags': ['B-MODIFIER', 'O', 'O', 'O', 'O']},
 {'document': 'buen ambiente, trato excelente y jamón exquisito.',
  'entities': [{'text': 'buen', 'type': 'modifier', 'start': 0.0, 'end': 4.0},
   {'text': 'ambiente', 'type': 'concept', 'start': 5.0, 'end': 13.0},
   {'text': 'excelente', 'type': 'modifier', 'start': 21.0, 'end': 30.0},
   {'text': 'jamón', 'type': 'concept', 'start': 33.0, 'end': 38.0},
   {'text': 'exquisito', 'type': 'modifier', 'start': 39.0, 'end': 48.0}],
  'tokens': ['buen',
   'ambiente',
   ',',
   'trato',
   'excelente',
   'y',
   'jamón',
   'exquisito',
   '.'],
  'ner_tags': ['B-MODIFIER',
   'B-CONCEPT',
   'O',
   'O',
   'B-MODIFIER',
   'O',
   'B-CONCEPT',
   'B-MODIFIER',
   'O']}]

In [20]:
# Tabulate the tagged records; the raw entity dicts are not needed any more.
# Note that this rebinds `df_reviews`, which earlier held the raw reviews frame.
df_reviews = pd.DataFrame(reviews_json).drop(columns=["entities"])

In [21]:
# Show three random rows (unseeded, so the selection differs between runs).
df_reviews.sample(3)

Unnamed: 0,document,tokens,ner_tags
1554,"al menos la tabla de ibéricos salvará el día, ...","[al, menos, la, tabla, de, ibéricos, salvará, ...","[O, O, O, B-CONCEPT, I-CONCEPT, I-CONCEPT, O, ..."
3414,"buena atención, comida rica y muy abundante.","[buena, atención, ,, comida, rica, y, muy, abu...","[B-MODIFIER, O, O, B-CONCEPT, B-MODIFIER, O, B..."
2536,"pedimos el pulpo, brocheta de rape y el secret...","[pedimos, el, pulpo, ,, brocheta, de, rape, y,...","[O, O, O, O, B-CONCEPT, O, B-CONCEPT, O, O, B-..."


In [22]:
def convert_tag_to_label(x):
    """Map a BIO tag string to its integer id.

    "O" -> 0, "B-MODIFIER" -> 1, "I-MODIFIER" -> 2, "B-CONCEPT" -> 3,
    "I-CONCEPT" -> 4. Any other value returns None.
    """
    tag_ids = {
        "O": 0,
        "B-MODIFIER": 1,
        "I-MODIFIER": 2,
        "B-CONCEPT": 3,
        "I-CONCEPT": 4,
    }
    return tag_ids.get(x)

def convert_tags_to_labels(list_tokens):
    """Apply ``convert_tag_to_label`` to every element and return the results as a list."""
    return [convert_tag_to_label(tag) for tag in list_tokens]



In [23]:
# Encode each review's list of tag strings as a list of integer ids.
df_reviews["labels"] = df_reviews.ner_tags.apply(convert_tags_to_labels)

In [24]:
# NOTE(review): this is a positional rename. Before this line the columns are
# ['document', 'tokens', 'ner_tags', 'labels'], so it swaps the last two names:
# the string BIO tags end up under "labels" and the integer ids under "ner_tags".
# Confirm that this swap is what the downstream consumer of the pickles expects.
df_reviews.columns = ['document', 'tokens', 'labels', 'ner_tags']

In [25]:
# Display the final frame (document, tokens, tag strings, integer ids).
df_reviews

Unnamed: 0,document,tokens,labels,ner_tags
0,buena selección de ostras.,"[buena, selección, de, ostras, .]","[B-MODIFIER, O, O, O, O]","[1, 0, 0, 0, 0]"
1,"buen ambiente, trato excelente y jamón exquisito.","[buen, ambiente, ,, trato, excelente, y, jamón...","[B-MODIFIER, B-CONCEPT, O, O, B-MODIFIER, O, B...","[1, 3, 0, 0, 1, 0, 3, 1, 0]"
2,las pasta correcta pero es cara para las racio...,"[las, pasta, correcta, pero, es, cara, para, l...","[O, B-CONCEPT, B-MODIFIER, O, O, B-MODIFIER, O...","[0, 3, 1, 0, 0, 1, 0, 0, 0, 0]"
3,"si he estado unas 50 veces, nunca, he salido m...","[si, he, estado, unas, 50, veces, ,, nunca, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,un trato excelente y profesional.,"[un, trato, excelente, y, profesional, .]","[O, O, B-MODIFIER, O, B-MODIFIER, O]","[0, 0, 1, 0, 1, 0]"
...,...,...,...,...
3958,ya éramos clientes de más pi antes que se conv...,"[ya, éramos, clientes, de, más, pi, antes, que...","[O, O, B-CONCEPT, O, O, O, O, O, O, O, O, O, O...","[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3959,veo por muchos de los comentarios que coincidi...,"[veo, por, muchos, de, los, comentarios, que, ...","[O, O, O, O, O, O, O, O, O, O, O, O, B-MODIFIE...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3960,vine al restaurante animado por las opiniones ...,"[vine, al, restaurante, animado, por, las, opi...","[O, O, B-CONCEPT, B-MODIFIER, O, O, O, O, O, B...","[0, 0, 3, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, ..."
3961,"los desayunos y dulces son estupendos, el siti...","[los, desayunos, y, dulces, son, estupendos, ,...","[O, B-CONCEPT, O, B-CONCEPT, O, B-MODIFIER, O,...","[0, 3, 0, 3, 0, 1, 0, 0, 3, 0, 1, 2, 0, 0, 0, ..."


In [26]:
from sklearn.utils import shuffle
# Shuffle the rows before splitting. A fixed random_state makes the
# train/test/validation split reproducible across kernel restarts.
df_reviews = shuffle(df_reviews, random_state=42)

In [27]:
# Positional split of the shuffled frame:
# first 2600 rows -> train, next 600 -> test, remainder -> validation.
df_train = df_reviews.iloc[:2600]
df_test = df_reviews.iloc[2600:3200]
df_val = df_reviews.iloc[3200:]

In [28]:
# persist each split as a pickle file in the working directory
for split_frame, split_path in (
    (df_train, "df_train.pkl"),
    (df_test, "df_test.pkl"),
    (df_val, "df_val.pkl"),
):
    split_frame.to_pickle(split_path)

In [29]:
# Look up the row(s) whose text starts with "un conjunto" to read off the index label.
df_reviews[df_reviews.document.str.startswith("un conjunto")]

Unnamed: 0,document,tokens,labels,ner_tags
2762,"un conjunto de edificio, sala, atención, varie...","[un, conjunto, de, edificio, ,, sala, ,, atenc...","[O, O, O, B-CONCEPT, O, B-CONCEPT, O, O, O, O,...","[0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [30]:
# Show the source record for that row: the frame was built from reviews_json with
# the default integer index, so the index label is also the position in the list.
reviews_json[2762]

{'document': 'un conjunto de edificio, sala, atención, variedad, calidad y precio muy recomendables.',
 'entities': [{'text': 'edificio',
   'type': 'concept',
   'start': 15.0,
   'end': 23.0},
  {'text': 'sala', 'type': 'concept', 'start': 25.0, 'end': 29.0},
  {'text': 'muy recomendables',
   'type': 'modifier',
   'start': 68.0,
   'end': 85.0}],
 'tokens': ['un',
  'conjunto',
  'de',
  'edificio',
  ',',
  'sala',
  ',',
  'atención',
  ',',
  'variedad',
  ',',
  'calidad',
  'y',
  'precio',
  'muy',
  'recomendables',
  '.'],
 'ner_tags': ['O',
  'O',
  'O',
  'B-CONCEPT',
  'O',
  'B-CONCEPT',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MODIFIER',
  'I-MODIFIER',
  'O']}