# Aspect-based sentiment analysis
## Preprocessing

In [40]:
from transformers import BertTokenizer, BertModel
import torch
import re

# Maps the polarity names found in the aspect dicts to the short suffix used
# in the emitted tags (e.g. 'positive' -> 'B-pos' / 'I-pos').
polarity_dict = {'positive':'pos', 'negative':'neg', 'neutral':'neu', 'conflict':'con'}

def get_labels(sentence, tokens, aspect_list):
  """Return one BIO-style sentiment tag per token.

  Each inner token is located in the lower-cased sentence by a left-to-right
  substring search; a token whose first character falls inside an aspect's
  [start, end) character span is tagged 'B-<pol>' (first token of that
  aspect) or 'I-<pol>' (following tokens of the same aspect). Everything
  else is tagged 'O'.

  Args:
    sentence: The raw sentence text the character offsets refer to.
    tokens: Token strings for the sentence. The first and last entries are
      treated as special tokens and are always tagged 'O'. A leading '##'
      on a token is stripped before searching.
    aspect_list: Dicts with keys 'polarity', 'start' and 'end' (character
      offsets, end exclusive). The list is walked once front to back, so it
      is expected in ascending order of 'start'.

  Returns:
    A list of tag strings with the same length as `tokens`.

  Raises:
    KeyError: If an aspect's polarity is not a key of `polarity_dict`.
  """
  # Nothing to tag, or no room for the two special tokens: all 'O'.
  if not aspect_list or len(tokens) < 2:
    return ['O'] * len(tokens)

  sentence = sentence.lower()
  n_aspects = len(aspect_list)
  cur_pos = 0
  cur_asp_idx = 0
  # Index of the aspect the previous token was tagged with (None if it was 'O').
  # Comparing against this, rather than against the previous tag, makes
  # back-to-back aspects start with 'B-' instead of continuing with 'I-'.
  prev_asp_idx = None
  ans = ['O']
  for token in tokens[1:-1]:
    piece = token[2:] if token.startswith('##') else token
    s = sentence.find(piece, cur_pos)
    if s == -1:
      # The token text is not present in the sentence (e.g. '[UNK]').
      # Tag it 'O' and keep the cursor where it is so later tokens are
      # still searched for from the right place.
      ans.append('O')
      prev_asp_idx = None
      continue

    # Move past every aspect that ends at or before this token *before*
    # labelling, so a token that opens the next aspect is not missed.
    while cur_asp_idx < n_aspects and s >= aspect_list[cur_asp_idx]['end']:
      cur_asp_idx += 1

    if cur_asp_idx < n_aspects and s >= aspect_list[cur_asp_idx]['start']:
      prefix = 'I' if prev_asp_idx == cur_asp_idx else 'B'
      ans.append(f'{prefix}-{polarity_dict[aspect_list[cur_asp_idx]["polarity"]]}')
      prev_asp_idx = cur_asp_idx
    else:
      ans.append('O')
      prev_asp_idx = None
    cur_pos = s + len(piece)
  ans.append('O')
  return ans


# Smoke test: tag one hand-annotated sentence and check that the number of
# labels matches the number of token ids.
my_sentence = "I charge it at night and skip taking the cord with me because of the good battery life."

# Annotation for the two aspect terms, kept as parallel lists ...
polarity = ['neutral', 'positive']
start_of_aspect = [41, 74]
end_of_aspect = [45, 86]
# ... and as the list-of-dicts form that get_labels() consumes.
aspect_list = [
  {'polarity': pol, 'start': begin, 'end': finish}
  for pol, begin, finish in zip(polarity, start_of_aspect, end_of_aspect)
]

print(len(my_sentence))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

input_ids = tokenizer.encode(my_sentence)
subwords = tokenizer.convert_ids_to_tokens(input_ids)
print(input_ids, subwords, sep='\n')

labels = get_labels(my_sentence, subwords, aspect_list)
print(labels)
assert len(labels) == len(input_ids)

# Disabled experiment: run the encoder and inspect the final hidden layer.
# model = BertModel.from_pretrained('bert-base-uncased')

# a = 1/0
# output = model(input_ids, type_ids, att_m)

# final_layer = output.last_hidden_state
# final_layer.shape

87
[101, 1045, 3715, 2009, 2012, 2305, 1998, 13558, 2635, 1996, 11601, 2007, 2033, 2138, 1997, 1996, 2204, 6046, 2166, 1012, 102]
['[CLS]', 'i', 'charge', 'it', 'at', 'night', 'and', 'skip', 'taking', 'the', 'cord', 'with', 'me', 'because', 'of', 'the', 'good', 'battery', 'life', '.', '[SEP]']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-neu', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pos', 'I-pos', 'O', 'O']


In [46]:
import xmltodict
import pandas as pd
from tqdm import tqdm

# Convert the annotated XML corpus into a CSV with, per sentence, its id, the
# tokenizer's token ids and one tag per token (as produced by get_labels).

# Read raw bytes so the XML parser applies the encoding declared in the file
# itself instead of the platform's default text encoding.
with open("Laptop_Train_v2.xml", "rb") as f:
  obj = xmltodict.parse(f.read())

obj = obj['sentences']['sentence']
# A lone child element is parsed as a single mapping rather than a list
# (the same situation handled for 'aspectTerm' below); normalise it so the
# loop always iterates over sentences.
if not isinstance(obj, list):
  obj = [obj]

out_sid = []
out_tokens = []
out_labels = []
for x in tqdm(obj, total=len(obj)):
  sentence = x['text']
  sent_id = x['@id']

  # Collect the aspect terms of this sentence, ordered by start offset,
  # which is the order get_labels() walks them in.
  aspect_list = []
  if 'aspectTerms' in x:
    aspects = x['aspectTerms']['aspectTerm']
    if not isinstance(aspects, list):
      aspects = [aspects]
    for a in aspects:
      aspect_list.append({'polarity':a['@polarity'], 'start':int(a['@from']), 'end':int(a['@to'])})
    aspect_list.sort(key=lambda asp: asp['start'])

  input_ids = tokenizer.encode(sentence)
  subwords = tokenizer.convert_ids_to_tokens(input_ids)

  labels = get_labels(sentence, subwords, aspect_list)
  # Sanity check: exactly one label per token.
  assert len(labels) == len(subwords)
  out_sid.append(sent_id)
  out_tokens.append(input_ids)
  out_labels.append(labels)

print('Writing to file...')
# Note: the list-valued columns are written to the CSV as their string form.
df = pd.DataFrame({'sid': out_sid, 'token_ids': out_tokens, 'labels': out_labels})
df.to_csv('preproc.csv', index=False)
print('DONE')

100%|██████████| 3045/3045 [00:01<00:00, 1602.51it/s]


Writing to file...
DONE
