# Stages

1.   Load data
2.   Check data
3.   Implement the model
4.   Evaluate the results

In [1]:
import pandas as pd
import spacy
import re
import requests, json 
import plac
import random
from spacy.util import minibatch, compounding

In [None]:
sheet_id = "1r6sqFGnPqPYsXL0BYVH60DAsXfHuT2w0"
sheet_name = "Sheet1"
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
data = pd.read_csv(url)
data.head()

Unnamed: 0.1,Unnamed: 0,Headline,Startup,Start_id,End_id,my_index,ne,org_by_index,test,spacy_anao
0,0,accedo raises us$17 million in funding,accedo,1,6,"[(0, 6)]",ORG,accedo,0,"('""accedo raises us$17 million in funding""', {..."
1,1,uniti scores £1 million funding target through...,uniti,1,5,"[(0, 5)]",ORG,uniti,0,"('""uniti scores £1 million funding target thro..."
2,2,"cannabis inhaler producer, syqe medical, raise...",syqe medical,28,12,"[(27, 39)]",ORG,syqe medical,0,"('""cannabis inhaler producer, syqe medical, ra..."
3,3,"alphonse's talents raises € 600,000",alphonse's talents,1,18,"[(0, 18)]",ORG,alphonse's talents,0,"('""alphonse\'s talents raises € 600,000 ""', {'..."
4,4,libon raises €1.8 million,libon,1,5,"[(0, 5)]",ORG,libon,0,"('""libon raises €1.8 million""', {'entities': [..."


In [None]:
data.Headline.iloc[2789] = 'a cloud guru raised $257 million round'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
data.Headline.iloc[2789]

'a cloud guru raised $257 million round'

In [None]:
def get_num(sentence):
  return len(sentence.split())

In [None]:
data["num"] = data.Startup.apply(get_num)

In [None]:
data[data.num > 2]

Unnamed: 0.1,Unnamed: 0,Headline,Startup,Start_id,End_id,my_index,ne,org_by_index,test,spacy_anao,num
23,23,blue yard capital raises 2nd fund of €105 mill...,blue yard capital,1,17,"[(0, 17)]",ORG,blue yard capital,0,"('""blue yard capital raises 2nd fund of €105 m...",3
132,132,draper & dash raises £3 million in funding,draper & dash,1,13,"[(0, 13)]",ORG,draper & dash,0,"('""draper & dash raises £3 million in funding""...",3
169,169,teach on mars closes €7 million third funding,teach on mars,1,13,"[(0, 13)]",ORG,teach on mars,0,"('""teach on mars closes €7 million third fundi...",3
241,241,"click & boat closes a round of € 4 million, co...",click & boat,1,12,"[(0, 12)]",ORG,click & boat,0,"('""click & boat closes a round of € 4 million,...",3
279,279,bergen carbon solutions has raised € 1.65 million,bergen carbon solutions,1,23,"[(0, 23)]",ORG,bergen carbon solutions,0,"('""bergen carbon solutions has raised € 1.65 m...",3
...,...,...,...,...,...,...,...,...,...,...,...
2617,2618,€12 million for the munich-based start-up glob...,global savings group,43,20,"[(42, 62)]",ORG,global savings group,0,"('""€12 million for the munich-based start-up g...",3
2636,2637,blue ocean robotics raises nearly $12 million ...,blue ocean robotics,1,19,"[(0, 19)]",ORG,blue ocean robotics,0,"('""blue ocean robotics raises nearly $12 milli...",3
2663,2664,secure code warrior snags $47.6 million to gro...,secure code warrior,1,19,"[(0, 19)]",ORG,secure code warrior,0,"('""secure code warrior snags $47.6 million to ...",3
2683,2684,virtual psychology clinic my online therapy ra...,my online therapy,27,17,"[(26, 43)]",ORG,my online therapy,0,"('""virtual psychology clinic my online therapy...",3


In [None]:
sample_text = data.Headline.iloc[2793]
sample_text

'longtime bootstrapped fiscal technologies raises £3.6 million with help of scaleup group'

In [None]:
def create_bert_data():
  bert_data = pd.DataFrame()
  tok = []
  words = []
  sentence_num = []
  for ind in data.index:
    for word in data['Headline'].iloc[ind].split():
      startup_name = data['Startup'].iloc[ind].split()
      if len(startup_name) > 1:
        if word in startup_name:
            if word == startup_name[0] :
              tok.append("B-ORG")
            elif word in startup_name:
              tok.append("I-ORG")
        else:
          tok.append("O")   
      elif word == startup_name[0]:
          tok.append("B-ORG")
      else:
          tok.append("O")
      words.append(word)
      sentence_num.append(ind)
  bert_data['words'] = words
  bert_data['sentence_id'] = sentence_num
  bert_data['labels'] = tok
  return bert_data

In [None]:
bert_data = create_bert_data()
bert_data.head()

Unnamed: 0,words,sentence_id,labels
0,accedo,0,B-ORG
1,raises,0,O
2,us$17,0,O
3,million,0,O
4,in,0,O


In [None]:
sample = data[['Headline','Startup']].iloc[2789]
sample

Headline    a cloud guru raised $257 million round
Startup                               a cloud guru
Name: 2789, dtype: object

In [None]:
bert_data[bert_data.sentence_id == 2789]

Unnamed: 0,words,sentence_id,labels
28961,a,2789,B-ORG
28962,cloud,2789,I-ORG
28963,guru,2789,I-ORG
28964,raised,2789,O
28965,$257,2789,O
28966,million,2789,O
28967,round,2789,O


In [None]:
index = bert_data.sentence_id[(bert_data.words == 'a') & (bert_data.labels == 'B-ORG')].to_list()

In [None]:
data.iloc[index]

Unnamed: 0.1,Unnamed: 0,Headline,Startup,Start_id,End_id,my_index,ne,org_by_index,test,spacy_anao,num
905,905,austin tech company a cloud guru raises $33 mi...,a cloud guru,21,12,"[(20, 32)]",ORG,a cloud guru,0,"('""austin tech company a cloud guru raises $33...",3
2789,2790,a cloud guru raised $257 million round,a cloud guru,1,12,"[(0, 12)]",ORG,a cloud guru,0,"('""a cloud guru raised $257 million round""', {...",3


In [None]:
bert_data.head()

Unnamed: 0,words,sentence_id,labels
0,accedo,0,B-ORG
1,raises,0,O
2,us$17,0,O
3,million,0,O
4,in,0,O


In [None]:
bert_data.to_csv("bert_data.csv", index=False)

In [None]:
bert_data = pd.read_csv("bert_data.csv")
bert_data.head()

Unnamed: 0,words,sentence_id,labels
0,accedo,0,B-ORG
1,raises,0,O
2,us$17,0,O
3,million,0,O
4,in,0,O


Here we are spliting in the data in in train, validation and test set

In [None]:
train_data = bert_data[bert_data.sentence_id < 1677]
validation = bert_data[(bert_data.sentence_id > 1676) & (bert_data.sentence_id < 2237) ]
test_data = bert_data[bert_data.sentence_id <= 2237]

In [None]:
# !pip install simpletransformers

In [None]:
import logging

import pandas as pd
from simpletransformers.ner import NERModel, NERArgs


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


# Configure the model
model_args = NERArgs()
model_args.train_batch_size = 16
model_args.evaluate_during_training = True

model = NERModel(
    "roberta", "roberta-base", args=model_args
)

# Train the model
model.train_model(train_data, eval_data=validation)

# Evaluate the model
result, model_outputs, preds_list = model.eval_model(test_data)

# Make predictions with the model
predictions, raw_outputs = model.predict([sample_text])

INFO:filelock:Lock 140167396032080 acquired on /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

INFO:filelock:Lock 140167396032080 released on /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock
INFO:filelock:Lock 140167396899472 acquired on /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock


Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

INFO:filelock:Lock 140167396899472 released on /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaFor

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

INFO:filelock:Lock 140167285579856 released on /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO:filelock:Lock 140167285714128 acquired on /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

INFO:filelock:Lock 140167285714128 released on /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO:filelock:Lock 140167285685904 acquired on /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock


Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

INFO:filelock:Lock 140167285685904 released on /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/105 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/70 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/280 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.0582046538025939, 'precision': 0.848314606741573, 'recall': 0.871824480369515, 'f1_score': 0.8599088838268792}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
print(predictions, raw_outputs)

[[{'longtime': 'O'}, {'bootstrapped': 'O'}, {'fiscal': 'O'}, {'technologies': 'O'}, {'raises': 'O'}, {'£3.6': 'O'}, {'million': 'O'}, {'with': 'O'}, {'help': 'O'}, {'of': 'O'}, {'scaleup': 'O'}, {'group': 'O'}]] [[{'longtime': [[7.734, -1.951, -2.121, -1.947, -1.898, 2.348, -1.889, -1.649, -1.661], [8.06, -1.89, -1.923, -1.861, -1.88, 1.607, -1.768, -1.616, -1.623]]}, {'bootstrapped': [[8.16, -1.927, -2.008, -1.778, -1.9, 1.641, -1.805, -1.751, -1.546], [8.19, -1.838, -1.911, -1.908, -1.77, 1.351, -1.859, -1.684, -1.638], [8.195, -1.916, -1.913, -1.786, -1.75, 1.3, -1.533, -1.752, -1.461]]}, {'fiscal': [[7.17, -2.252, -2.098, -2.105, -1.799, 3.557, -2.238, -1.729, -1.621], [7.55, -2.021, -2.066, -1.792, -2.006, 2.496, -1.974, -1.796, -1.677]]}, {'technologies': [[8.02, -2.092, -1.935, -1.845, -1.825, 1.782, -1.762, -1.89, -1.457], [7.35, -2.201, -1.777, -1.931, -1.905, 3.322, -1.874, -1.651, -1.732]]}, {'raises': [[8.21, -1.982, -1.71, -1.944, -1.795, 1.119, -1.744, -1.494, -1.479], [8

In [None]:
# Make predictions with the model
predictions, raw_outputs = model.predict(data.Headline.iloc[2790:2795])

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
predictions

[[{'wefox': 'O'},
  {'group': 'O'},
  {'closes': 'O'},
  {'$235': 'O'},
  {'million': 'O'},
  {'series': 'O'},
  {'b': 'O'},
  {'to': 'O'},
  {'continue': 'O'},
  {'insurtech’s': 'O'},
  {'global': 'O'},
  {'expansion': 'O'}],
 [{'gloat': 'B-ORG'},
  {'gathers': 'O'},
  {'$25': 'O'},
  {'million': 'O'},
  {'to': 'O'},
  {'help': 'O'},
  {'enterprise': 'O'},
  {'develop': 'O'},
  {'internal': 'O'},
  {'talent': 'O'},
  {'and': 'O'},
  {'build': 'O'},
  {'“future-proof': 'O'},
  {'workforces”': 'O'}],
 [{'abaka': 'B-ORG'},
  {'raises': 'O'},
  {'$6.5': 'O'},
  {'million': 'O'},
  {'in': 'O'},
  {'series': 'O'},
  {'a': 'O'},
  {'funding': 'O'},
  {'round': 'O'}],
 [{'longtime': 'O'},
  {'bootstrapped': 'O'},
  {'fiscal': 'O'},
  {'technologies': 'O'},
  {'raises': 'O'},
  {'£3.6': 'O'},
  {'million': 'O'},
  {'with': 'O'},
  {'help': 'O'},
  {'of': 'O'},
  {'scaleup': 'O'},
  {'group': 'O'}],
 [{'€3.01': 'O'}, {'million': 'O'}, {'for': 'O'}, {'resistell': 'O'}]]