# Part 1 - NER Model

### Download libraries

In [None]:
import pandas as pd
INPUT_FILE_PATH = 'teachers_db_practice.parquet'
PREDS_FILE_PATH = 'ner_predictions_v1.csv'

### Read data

In [None]:
#df_csv = pd.read_csv('teachers_db_practice.csv')
df_parquet = pd.read_parquet(INPUT_FILE_PATH, engine='pyarrow')

In [16]:
import pprint
pprint.pprint(df_parquet.iloc[0, 3])

('<p>  has worked as a designer for the last decade in roles spanning a '
 'variety of disciplines from graphics to product to interiors. In 2017, , '
 'became studio director at Vidivixi, a furniture and interiors design '
 'practice based in Mexico City. After leaving in 2023 and relocating to Spain '
 'he opened a new design studio with a focus on bespoke, contemporary '
 'design-led furniture.\xa0</p><h4>Corporate Experience</h4><p>• Studio '
 'Director, A&amp;M Studio, Spain, 2023 – Present</p><p>• Studio Director, '
 'Vidivixi, Mexico, 2017 – 2023</p><p>• Associate, Becquerel Capital, Mexico, '
 '2014 – 2017</p><p>• Design Partner, The Hub, Hong Kong, 2013 – '
 '2014</p><h4>Academic Background</h4><p>• Bachelor in Graphic Design, '
 'Camberwell College of Arts UAL, U.K., 2013</p>')


### Import libraries and model from huggingface: dslim/bert-base-NER

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import torch

### Define Pipeline, Model and Tokenizer

In [6]:
#tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')
#model = AutoModelForTokenClassification.from_pretrained('dslim/bert-base-NER')
ner_model = pipeline(task='ner', model='dslim/bert-base-NER', aggregation_strategy='simple', device=1)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


### Preprocess

In [None]:
from bs4 import BeautifulSoup
import re
import html

def preprocess_profile(text: str) -> str:
    text = html.unescape(text).replace("\xa0", " ")

    soup = BeautifulSoup(text, "html.parser")
    clean_text = " ".join(soup.stripped_strings)

    clean_text = re.sub(r"[•·•●♦▪▶]", "-", clean_text)

    clean_text = re.sub(r"\s+", " ", clean_text)
    clean_text = re.sub(r"\s*([,.;:])\s*", r"\1 ", clean_text)
    clean_text = re.sub(r"–", "-", clean_text)  # normalize dash

    clean_text = re.sub(r"(?<=\d)\s*-\s*(?=\d)", " to ", clean_text)  # year ranges
    clean_text = re.sub(r"([.!?])\s+(?=[A-Z])", r"\1\n", clean_text)

    return clean_text.strip()

## Trying out on short dataset (df_copy)

In [14]:
df_copy = df_parquet.iloc[0:5]

In [30]:
df_copy['full_info'] = df_copy['full_info'].apply(preprocess_profile)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['full_info'] = df_copy['full_info'].apply(preprocess_profile)


In [31]:
df_copy['entities'] = df_copy['full_info'].apply(ner_model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['entities'] = df_copy['full_info'].apply(ner_model)


In [33]:
df_copy.iloc[0, -1]

[{'entity_group': 'ORG',
  'score': np.float32(0.97697926),
  'word': 'Vidivixi',
  'start': 164,
  'end': 172},
 {'entity_group': 'LOC',
  'score': np.float32(0.9996344),
  'word': 'Mexico City',
  'start': 225,
  'end': 236},
 {'entity_group': 'LOC',
  'score': np.float32(0.99977463),
  'word': 'Spain',
  'start': 278,
  'end': 283},
 {'entity_group': 'ORG',
  'score': np.float32(0.9648816),
  'word': 'A & M Studio',
  'start': 414,
  'end': 424},
 {'entity_group': 'LOC',
  'score': np.float32(0.99974984),
  'word': 'Spain',
  'start': 426,
  'end': 431},
 {'entity_group': 'ORG',
  'score': np.float32(0.89931095),
  'word': 'V',
  'start': 467,
  'end': 468},
 {'entity_group': 'LOC',
  'score': np.float32(0.45341945),
  'word': '##id',
  'start': 468,
  'end': 470},
 {'entity_group': 'ORG',
  'score': np.float32(0.7412178),
  'word': '##ivixi',
  'start': 470,
  'end': 475},
 {'entity_group': 'LOC',
  'score': np.float32(0.9997321),
  'word': 'Mexico',
  'start': 477,
  'end': 483},


## Apply to df_parquet

In [38]:
import tqdm
def run_pipeline(data, preprocessor, model):
    data['full_info'] = data['full_info'].apply(preprocessor)
    data['entities'] = data['full_info'].apply(model)
    return data

df_pred = run_pipeline(df_parquet, preprocess_profile, ner_model)

In [None]:
df_pred.to_csv(PREDS_FILE_PATH)