In [None]:
# List the GPUs attached to this runtime.
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-35d64320-d33a-929a-2b0b-16885d22268f)


In [None]:
# Mount Google Drive at /content/drive; every data and model path used in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install spaCy with the transformers extra, then download the en_core_web_trf pipeline.
# NOTE(review): no versions are pinned here, so a later run may resolve to different
# releases than the ones this notebook was originally executed with.
!pip install spacy[transformers]
!python -m spacy download en_core_web_trf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.8-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.1 MB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 17.7 MB/s 
[?25hCollecting transformers<4.22.0,>=3.4.0
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 33.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 50.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 7

In [None]:
import json
import os
import random
import logging
import spacy
print(spacy.__version__)
import re

from collections import Counter
from spacy.tokens import DocBin
from tqdm import tqdm

3.4.1


In [None]:
# Print the installed spaCy version, platform, Python version and available pipelines.
!python -m spacy info

[1m

spaCy version    3.4.1                         
Location         /usr/local/lib/python3.7/dist-packages/spacy
Platform         Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic
Python version   3.7.13                        
Pipelines        en_core_web_trf (3.4.0), en_core_web_sm (3.4.0)



In [None]:
def train_dev_split(doccano_jsonl_filepath, output_train_path, output_dev_path, train_proportion=0.75, seed=None):
  """Shuffle a Doccano JSONL export and split it into a train and a dev file.

  Args:
      doccano_jsonl_filepath (str): JSONL file to split (one record per line).
      output_train_path (str): File that receives the training portion.
      output_dev_path (str): File that receives the remaining (dev) portion.
      train_proportion (float): Fraction of the records written to the train file.
      seed (int, optional): Seed of a private random generator, making the split
          reproducible. When None the module-level `random` state is used, so a
          prior `random.seed(...)` call still controls the shuffle.

  Returns:
      None. Any error is logged with its traceback and swallowed (best effort).
  """
  try:
    # Read everything first and close the input before the outputs are opened.
    with open(doccano_jsonl_filepath, 'r', encoding='utf-8') as source_file:
      # Blank lines carry no record. Every kept line is normalised to exactly one
      # trailing newline: a last line without '\n' would otherwise be glued to the
      # following record once the shuffle moves it away from the end of the file.
      lines = [line.rstrip('\r\n') + '\n' for line in source_file if line.strip()]

    shuffler = random.Random(seed) if seed is not None else random
    shuffler.shuffle(lines)
    train_size = round(len(lines)*train_proportion)

    # Train file
    with open(output_train_path, 'w', encoding='utf-8') as train_file:
      train_file.writelines(lines[:train_size])

    # Dev file
    with open(output_dev_path, 'w', encoding='utf-8') as dev_file:
      dev_file.writelines(lines[train_size:])

    return None
  except Exception as e:
    logging.exception("Unable to process " + doccano_jsonl_filepath + "\n" + "error = " + str(e))

    return None


def trim_entity_spans(data: list) -> list:
  """Removes leading and trailing white spaces from entity spans.

  Args:
      data (list): Rows of `[text, {'entities': [...]}]` where every entity is a
          mapping with `start_offset`, `end_offset` and `label` keys.

  Returns:
      list: Rows of `[text, {'entities': [[start, end, label], ...]}]`. There is
          one output entity per input entity and `start <= end` always holds; a
          span made only of white space collapses to an empty span (start == end).
  """
  invalid_span_tokens = re.compile(r'\s')

  cleaned_data = []
  for text, annotations in data:
    text_length = len(text)
    valid_entities = []
    for entity in annotations['entities']:
      # Clamp both offsets to the text so a bad annotation cannot index out of range.
      valid_start = max(0, min(entity['start_offset'], text_length))
      valid_end = max(valid_start, min(entity['end_offset'], text_length))
      # Both loops stop where the two offsets meet, so they can never cross.
      while valid_start < valid_end and invalid_span_tokens.match(text[valid_start]):
        valid_start += 1
      while valid_end > valid_start and invalid_span_tokens.match(text[valid_end - 1]):
        valid_end -= 1
      valid_entities.append([valid_start, valid_end, entity['label']])
    cleaned_data.append([text, {'entities': valid_entities}])

  return cleaned_data


def make_spans(entities):
  """Return the (start, end) offset pair of every entity.

  Each entity must be indexable with the start offset at position 0 and the end
  offset at position 1; whatever follows (e.g. the label) is ignored.
  """
  return [(entity[0], entity[1]) for entity in entities]


def doccano_to_spacy(doccano_jsonl_filepath):
  """Load a Doccano JSONL export as `[text, {"entities": [...]}]` rows.

  Every non-blank line must be a JSON object with a `text` and an `entities`
  key. Records whose `entities` list is empty are skipped (a notice is printed).

  Args:
      doccano_jsonl_filepath (str): Path of the JSONL file to read.

  Returns:
      list | None: The rows of the tagged records, or None when the file could
          not be read or parsed (the error is logged with its traceback).
  """
  try:
    training_data = []
    with open(doccano_jsonl_filepath, 'r', encoding='utf-8') as f:
      for line in f:
        # A blank line (typically at the end of the file) is not a record; without
        # this guard json.loads would fail and the whole file would be discarded.
        if not line.strip():
          continue

        data = json.loads(line)
        text = data['text']

        entities = data['entities']
        if len(entities)>0:
          training_data.append([text, {"entities" : entities}])
        else:
          print('Skipping document without tags')

    return training_data
  except Exception as e:
    logging.exception("Unable to process " + doccano_jsonl_filepath + "\n" + "error = " + str(e))

    return None


def create_docbin(SPACY_DATA):
  """Create a DocBin from SPACY_DATA.

  SPACY_DATA is an iterable of `[text, {"entities": [[start, end, label], ...]}]`
  rows. For each row:
    - build a `Doc` from `text` with the module-level `nlp` pipeline
    - create one `Span` per entity via `doc.char_span`, using that entity's OWN
      label (the last element of the entity)
    - store the resulting spans in `doc.spans["sc"]`
    - add the doc to the `DocBin`

  A row without entities yields a doc with an empty span group. Entities for
  which `char_span` returns None (spaCy could not map the character offsets to
  a span) are left out; their total is reported once through `logging.warning`.

  Returns:
      DocBin: The container holding one annotated doc per input row.
  """
  span_key = 'sc'
  skipped_entities = 0
  doc_bin = DocBin()
  for text, annotations in tqdm(SPACY_DATA):
    doc = nlp(text)
    span_lst = []
    for entity in annotations["entities"]:
      start, end, label = entity[0], entity[1], entity[-1]
      span = doc.char_span(start, end, label=label)
      if span is None:
        skipped_entities += 1
      else:
        span_lst.append(span)
    # span_lst is now a list of spaCy `Span` objects
    doc.spans[span_key] = span_lst
    doc_bin.add(doc)

  if skipped_entities:
    logging.warning("create_docbin: %d entity span(s) could not be aligned and were skipped", skipped_entities)

  return doc_bin

In [None]:
import random

# train_dev_split shuffles with the module-level `random` state. Seeding it here
# makes a re-run reproduce the same partition instead of silently overwriting the
# train/dev files with a different one.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

source_path = '/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/NER_COMP_Doccano_Tags_All.jsonl'
train_path = '/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/NER_COMP_Doccano_Tags_Train.jsonl'
dev_path = '/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/NER_COMP_Doccano_Tags_Dev.jsonl'

# 80% of the records go to the train file, the remaining 20% to the dev file.
train_dev_split(doccano_jsonl_filepath=source_path,
                output_train_path=train_path,
                output_dev_path=dev_path,
                train_proportion=0.80)

In [None]:
# Blank English pipeline; create_docbin() reads this global `nlp` to build its docs.
nlp = spacy.blank('en')

# Train data: load the Doccano export, trim white space off the span boundaries,
# convert the rows to a DocBin and save it next to the source file.
DATA_FILE_PATH_TRAIN = '/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/NER_COMP_Doccano_Tags_Train.jsonl'
# NOTE(review): doccano_to_spacy returns None on failure, which would make the next line raise.
SPACY_DATA_TRAIN = doccano_to_spacy(DATA_FILE_PATH_TRAIN)
TRAIN_DATA=trim_entity_spans(SPACY_DATA_TRAIN)
train_data = create_docbin(TRAIN_DATA)
train_data.to_disk("/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/NER_COMP_Doccano_Tags_Train.spacy")

Skipping document without tags
Skipping document without tags
Skipping document without tags


100%|██████████| 464/464 [00:01<00:00, 282.62it/s]


In [None]:
# Validation (dev) data: same steps as for the train file — load the Doccano export,
# trim the span boundaries, convert to a DocBin and save it to disk.
DATA_FILE_PATH_VAL = '/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/NER_COMP_Doccano_Tags_Dev.jsonl'
# NOTE(review): doccano_to_spacy returns None on failure, which would make the next line raise.
SPACY_DATA_VAL = doccano_to_spacy(DATA_FILE_PATH_VAL)
VAL_DATA=trim_entity_spans(SPACY_DATA_VAL)
val_data = create_docbin(VAL_DATA)
val_data.to_disk("/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/NER_COMP_Doccano_Tags_Dev.spacy")

100%|██████████| 117/117 [00:00<00:00, 324.71it/s]


In [None]:
# Ask spaCy to run on the GPU when one is available and show whether that succeeded.
gpu = spacy.prefer_gpu()
print(gpu)

True


In [None]:
!pip install -qqq wandb

import os
import wandb
os.environ["WANDB_API_KEY"]="fdd7781abf964d3e05353120f73a5c93e10dc616"
os.environ["WANDB_ENTITY"]="mia3_esg_tfm"
os.environ["WANDB_PROJECT"]="SpanCat-NER COMP spaCy"
os.environ["WANDB_START_METHOD"] = "thread"

wandb.init(project="SpanCat-NER COMP spaCy", entity="mia3_esg_tfm")

[K     |████████████████████████████████| 1.8 MB 7.6 MB/s 
[K     |████████████████████████████████| 158 kB 69.8 MB/s 
[K     |████████████████████████████████| 181 kB 69.9 MB/s 
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
[K     |████████████████████████████████| 157 kB 65.1 MB/s 
[K     |████████████████████████████████| 157 kB 74.1 MB/s 
[K     |████████████████████████████████| 157 kB 78.9 MB/s 
[K     |████████████████████████████████| 157 kB 70.9 MB/s 
[K     |████████████████████████████████| 157 kB 77.4 MB/s 
[K     |████████████████████████████████| 157 kB 73.8 MB/s 
[K     |████████████████████████████████| 157 kB 80.4 MB/s 
[K     |████████████████████████████████| 156 kB 76.4 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mosmalo[0m ([33mmia3_esg_tfm[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Complete the project's base config (base_config_m1.cfg) with values for every
# setting it leaves out and write the result to /content/base_config.cfg.
!python -m spacy init fill-config /content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/transformers+spancat/Roberta/model_1/base_config_m1.cfg /content/base_config.cfg 

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
#!python -m spacy debug config /content/base_config.cfg

In [None]:
# Train from the filled config on GPU 0 with the train/dev .spacy files; the
# results are written to the model_1 output directory on Drive.
!python -m spacy train /content/base_config.cfg --output /content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/transformers+spancat/Roberta/model_1 --gpu-id 0 --paths.train /content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/NER_COMP_Doccano_Tags_Train.spacy --paths.dev /content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/NER_COMP_Doccano_Tags_Dev.spacy

[38;5;4mℹ Saving to output directory:
/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/transformers+spancat/Roberta/model_1[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-09-07 18:41:09,980] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2022-09-07 18:41:10,711] [INFO] Pipeline: ['transformer', 'spancat']
INFO:spacy:Pipeline: ['transformer', 'spancat']
[2022-09-07 18:41:10,716] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2022-09-07 18:41:10,718] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
Downloading config.json: 100% 481/481 [00:00<00:00, 462kB/s]
Downloading vocab.json: 100% 878k/878k [00:00<00:00, 2.42MB/s]
Downloading merges.txt: 100% 446k/446k [00:00<00:00, 1.83MB/s]
Downloading tokenizer.json: 100% 1.29M/1.29M [00:00<00:00, 3.58MB/s]
Downloading pytorch_model.bin: 100% 478M/478M [00:06<00:00, 72.8MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing

In [14]:
# Build an installable Python package from the model-best checkpoint inside model_1/.
!python -m spacy package --force /content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/transformers+spancat/Roberta/model_1/model-best /content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/transformers+spancat/Roberta/model_1/

[38;5;4mℹ Building package artifacts: sdist[0m
[38;5;2m✔ Including 1 package requirement(s) from meta and config[0m
spacy-transformers>=1.1.8,<1.2.0
[38;5;2m✔ Loaded meta.json from file[0m
/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/transformers+spancat/Roberta/model_1/model-best/meta.json
[38;5;2m✔ Generated README.md from meta.json[0m
[38;5;2m✔ Successfully created package directory 'en_pipeline-0.0.0'[0m
/content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/transformers+spancat/Roberta/model_1/en_pipeline-0.0.0
running sdist
running egg_info
creating en_pipeline.egg-info
writing en_pipeline.egg-info/PKG-INFO
writing dependency_links to en_pipeline.egg-info/dependency_links.txt
writing entry points to en_pipeline.egg-info/entry_points.txt
writing requirements to en_pipeline.egg-info/requires.txt
writing top-level names to en_pipeline.egg-info/top_level.txt
writing manifest file 'en_pipeline.egg-info/SOURCES.txt'
reading manifest file 'en_pipeline

In [15]:
# Install the packaged pipeline so that it can be loaded by name ("en_pipeline").
!pip install /content/drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/transformers+spancat/Roberta/model_1/en_pipeline-0.0.0/dist/en_pipeline-0.0.0.tar.gz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing ./drive/Shareddrives/UCM_SHARED/TFM_ESG/SpanCat-NER_CMP/transformers+spancat/Roberta/model_1/en_pipeline-0.0.0/dist/en_pipeline-0.0.0.tar.gz
Building wheels for collected packages: en-pipeline
  Building wheel for en-pipeline (setup.py) ... [?25l[?25hdone
  Created wheel for en-pipeline: filename=en_pipeline-0.0.0-py3-none-any.whl size=444273184 sha256=3b24e04febd5fdceb6ffc702e92956635d116587e33d7f3269cf3c4486992fef
  Stored in directory: /root/.cache/pip/wheels/9b/dc/54/430adb38886ae4cb9f78610208fd2bf3d3a6d231396e6fa716
Successfully built en-pipeline
Installing collected packages: en-pipeline
Successfully installed en-pipeline-0.0.0


## Some Tests

In [16]:
piece_of_news = '''TORONTO — Starbucks says its plan to close up to 300 coffee shops across Canada will be complete by the end of March.

The Seattle-based coffeehouse and roastery chain announced the acceleration of its five-year "transformation strategy" last year as it responded to changes in consumer habits during the COVID-19 pandemic.

A wonderful woman celebrating her 100th birthday When board members go rogue In a statement on Tuesday, the company says some of its locations closed last fall and it expects to complete its planned store closures by the end of its second quarter.

The restructuring includes adding new drive thru locations, the expansion of delivery and a pilot of curbside pick-up only coffee shops.

The company began experimenting with pick-up only locations before the pandemic. The first Canadian Starbucks store using the new format, which measured 93 square metres or 1,000 square feet, launched in Toronto's financial district last January.

Starbucks says the changes will help the coffee chain "best meet our customers where they are now." The company had previously said it would close up to 200 of its locations in Canada over two years.

This report by The Canadian Press was first published Jan. 12, 2021.'''

# Load the installed pipeline.
# NOTE(review): this rebinds the global `nlp` that create_docbin() relies on, so
# re-running the data-preparation cells after this one would build the docs with
# the trained pipeline instead of the blank English one.
nlp = spacy.load("en_pipeline")

doc = nlp(piece_of_news)
# The predicted spans are read from the "sc" span group; the group's "scores"
# attribute is paired with the spans one-to-one below.
spans = doc.spans["sc"]
for span, confidence in zip(spans, spans.attrs["scores"]):
    print(span.label_, confidence)


spans

COMP 0.99753004
COMP 0.9988331
COMP 0.9988273


[Starbucks, Starbucks, Starbucks]

In [17]:
from spacy.tokens.span import Span

# Short single-sentence check of the trained pipeline.
text = "Welcome to the Bank of China."
doc = nlp(text)
spans = doc.spans["sc"]

# Print every predicted span with its label and score.
for span, confidence in zip(spans, spans.attrs["scores"]):
    print(span, span.label_, confidence)

In [18]:
from spacy.tokens.span import Span

text = '''The Nintendo Switch OLED Splatoon 3 Edition is available now at multiple major retailers, including Best Buy, Walmart, and Target. The console just launched today, and it's the first special-edition Switch OLED Nintendo has made. If you're interested in picking it up, you may want to order soon. It's certainly possible that this will sell out quickly like most other special-edition Switch consoles.

The console does not come with a copy of the upcoming game, which releases on September 9, but it's only $10 more than the cost of the regular Switch OLED. It's not the only Splatoon 3-themed product Nintendo is releasing to celebrate the launch of one of the biggest upcoming Nintendo Switch exclusives. On September 9, you'll also be able to purchase a Splatoon 3-themed Pro controller and carrying case.

If you still need to preorder a copy of the game, you can order a physical edition for just $49 with our exclusive promo code. Also, make sure to take a peek at our Splatoon 3 preorder guide. Multiple retailers have unique preorder bonuses. Sadly, the free plush (arguably the best bonus) that Walmart was offering is currently out of stock, and it's unclear if more will be available at launch. But you can still get a free keychain at Best Buy or a sticker sheet at GameStop. Meanwhile, Amazon is offering free release-day delivery for Prime members.'''
# Run the trained pipeline on the article above and read the predicted span group.
doc = nlp(text)
spans = doc.spans["sc"]

# Print every predicted span with its label and score.
for span, confidence in zip(spans, spans.attrs["scores"]):
    print(span, span.label_, confidence)

Nintendo COMP 0.86839545
Walmart COMP 0.9984285
Target COMP 0.99935395
Nintendo COMP 0.796654
Nintendo COMP 0.9556791
Nintendo COMP 0.9972511
Walmart COMP 0.9989831
GameStop COMP 0.997889
Amazon COMP 0.9992539
Best Buy COMP 0.99967456
Best Buy COMP 0.99949586


In [19]:
# Render the spans of the most recently processed `doc` inline in the notebook.
spacy.displacy.render(doc, style="span", jupyter=True)

In [20]:
from spacy.tokens.span import Span

text = '''Earlier today, a report suggested that Amazon was set to announce an acquisition of games mega publisher Electronic Arts. However, a subsequent report has swiftly refuted that notion. All of this comes amid previous reports that EA had been exploring the possibility of an acquisition or merger. Following the news of the rumored deal, Amazon's stock briefly rose before settling down again in morning trade.

According to sources that spoke to GLHF, Amazon's purchase of EA Games was set to be announced later today, but that claim has been disputed by CNBC. "I have talked to some people who would actually know if there was something going on, and they say there's nothing going on," CNBC's David Farber said. Farber also mentioned that Comcast-NBC Universal had been approached by EA for a potential deal, but it eventually fell through.

EA has been the subject of several rumors while other high-profile acquisitions in the gaming industry made headlines. A recent report claimed that EA had held talks with Apple and Disney over a potential sale, with EA executives aggressively pursuing a deal according to rumors.

During a quarterly earnings call this month, EA CEO Andrew Wilson was asked if any of the acquisition rumors were true. Wilson responded by saying that EA is in a strong position to be "the largest standalone independent developer and publisher of interactive entertainment" in the world, but hinted that the company would be "open" to doing business differently.

2022 has been a big year for acquisitions in the gaming space, Microsoft's proposed purchase of Activision-Blizzard is slowly being approved by regulatory bodies around the world, Sony purchased Bungie for $3.6 billion, and Embracer Group has been on a buying spree lately, acquiring Crystal Dynamics, Eidos Montreal, and several other studios.'''

# Run the trained pipeline on the article above and read the predicted span group.
doc = nlp(text)
spans = doc.spans["sc"]

# Print every predicted span with its label and score.
for span, confidence in zip(spans, spans.attrs["scores"]):
    print(span, span.label_, confidence)

Amazon COMP 0.9985514
EA COMP 0.9979023
EA COMP 0.9984573
CNBC COMP 0.9986179
Comcast COMP 0.9930126
EA COMP 0.9896701
EA COMP 0.9934772
EA COMP 0.9984805
Apple COMP 0.9800061
Disney COMP 0.99918765
EA COMP 0.9981565
EA COMP 0.98399484
EA COMP 0.99836236
Activision COMP 0.9637708
Blizzard COMP 0.6000454
Sony COMP 0.99891007
Bungie COMP 0.99878937
Eidos COMP 0.98873526
Electronic Arts COMP 0.9999082
Amazon's COMP 0.9995067
Amazon's COMP 0.999526
EA Games COMP 0.99776316
CNBC's COMP 0.9964813
NBC Universal COMP 0.98993194
Microsoft's COMP 0.9998198
Embracer Group COMP 0.9995023
Crystal Dynamics COMP 0.9997836
Eidos Montreal COMP 0.9990453
Activision-Blizzard COMP 0.91139686


In [21]:
# Render the spans of the most recently processed `doc` inline in the notebook.
spacy.displacy.render(doc, style="span", jupyter=True)