# Part 1 - NER Model

### Download libraries

In [1]:
import pandas as pd

### Read data

In [2]:
#df_csv = pd.read_csv('teachers_db_practice.csv')
df_parquet = pd.read_parquet('teachers_db_practice.parquet', engine='pyarrow')

In [3]:
df_parquet

Unnamed: 0,area,area_code,position,full_info,id,description,tags,published,size,width,height,orientation,site,gender,alias
0,Architecture & Design,40358,,<p> has worked as a designer for the last dec...,184670.0,,[],2025-06-16T11:48:18.952Z,22834.0,500.0,500.0,S,global,0,Appius Aemilius Agricola
1,Economics,17166,,<p>Mr. Madgar has been teaching economics par...,16106.0,Close-up portrait of a smiling man with a bear...,"[man, portrait, smiling, beard, close-up]",2022-11-18T07:21:20.407Z,7180.0,170.0,170.0,S,global,0,Appius Aemilius Cicero
2,Private & Business Law,40353,,<p>Lawyer with broad experience in Market Regu...,25635.0,,[],2023-03-17T12:44:46.932Z,14699.0,128.0,128.0,S,global,0,Appius Aemilius Crassus
3,Economics,17166,Adjunct professor,<p> is a seasoned leader with a proven track r...,37534.0,A professional portrait of a smiling woman wit...,"[woman, portrait, smiling, blond hair, blue bl...",2024-02-15T12:47:08.017Z,79371.0,500.0,500.0,S,global,1,Flavia Prisca
4,Science & Technology,40359,,"<p> Carrio is a seasoned technology leader, re...",182500.0,,[],2025-04-07T10:15:21.908Z,15906.0,500.0,500.0,S,global,0,Appius Aemilius Scipio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1223,Information Systems & Technology,29474,,<p>Industrial engineer with economical backgro...,25326.0,,[],2023-03-09T13:26:33.827Z,30182.0,200.0,200.0,S,global,0,Tiberius Aemilius Lepidus
1224,Economics+Strategy+International Relations,17166,,<p><strong>Dr. Angel - </strong>is Adjunct Pro...,179524.0,A man in a suit with a tie speaking at a podium.,"[man, suit, podium, speaking, professional]",2025-01-14T13:18:10.523Z,22668.0,500.0,500.0,S,global,0,Tiberius Aemilius Paullus
1225,Finance,29472,,"<p>Angel (Madrid, 1965) has a degree in Econo...",20574.0,A man in a suit smiling while leaning on a rai...,"[man, suit, smiling, railing, building, outdoors]",2022-12-28T08:37:23.289Z,7893.0,170.0,170.0,S,global,0,Tiberius Aemilius Severus
1226,Architecture & Design,40358,,<p> is a BuiltTech specialist architect worki...,20565.0,A professional man in a grey suit posing for a...,"[man, professional, suit, portrait, grey suit,...",2022-12-27T13:40:40.080Z,5290.0,170.0,170.0,S,global,0,Tiberius Antonius Caesar


### Import libraries and model from huggingface: dslim/bert-base-NER

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import torch

### Define Pipeline, Model and Tokenizer

In [6]:
#tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')
#model = AutoModelForTokenClassification.from_pretrained('dslim/bert-base-NER')
ner_model = pipeline(task='ner', model='dslim/bert-base-NER', aggregation_strategy='simple', device=1)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


### Preprocess

In [29]:
from bs4 import BeautifulSoup
import re
import html

def preprocess_profile(text: str) -> str:
    # 1️⃣ Decode HTML entities (&amp; → &, \xa0 → space)
    text = html.unescape(text).replace("\xa0", " ")

    # 2️⃣ Strip HTML tags but keep section headers as separators
    soup = BeautifulSoup(text, "html.parser")
    clean_text = " ".join(soup.stripped_strings)

    # 3️⃣ Remove bullets or weird punctuation
    clean_text = re.sub(r"[•·•●♦▪▶]", "-", clean_text)

    # 4️⃣ Normalize whitespace and punctuation spacing
    clean_text = re.sub(r"\s+", " ", clean_text)
    clean_text = re.sub(r"\s*([,.;:])\s*", r"\1 ", clean_text)
    clean_text = re.sub(r"–", "-", clean_text)  # normalize dash

    # 5️⃣ Optional: Add sentence separators for NER readability
    clean_text = re.sub(r"(?<=\d)\s*-\s*(?=\d)", " to ", clean_text)  # year ranges
    clean_text = re.sub(r"([.!?])\s+(?=[A-Z])", r"\1\n", clean_text)

    return clean_text.strip()


## Trying out on short dataset (df_copy)

In [14]:
df_copy = df_parquet.iloc[0:5]

In [30]:
df_copy['full_info'] = df_copy['full_info'].apply(preprocess_profile)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['full_info'] = df_copy['full_info'].apply(preprocess_profile)


In [31]:
df_copy['entities'] = df_copy['full_info'].apply(ner_model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['entities'] = df_copy['full_info'].apply(ner_model)


In [33]:
df_copy.iloc[0, -1]

[{'entity_group': 'ORG',
  'score': np.float32(0.97697926),
  'word': 'Vidivixi',
  'start': 164,
  'end': 172},
 {'entity_group': 'LOC',
  'score': np.float32(0.9996344),
  'word': 'Mexico City',
  'start': 225,
  'end': 236},
 {'entity_group': 'LOC',
  'score': np.float32(0.99977463),
  'word': 'Spain',
  'start': 278,
  'end': 283},
 {'entity_group': 'ORG',
  'score': np.float32(0.9648816),
  'word': 'A & M Studio',
  'start': 414,
  'end': 424},
 {'entity_group': 'LOC',
  'score': np.float32(0.99974984),
  'word': 'Spain',
  'start': 426,
  'end': 431},
 {'entity_group': 'ORG',
  'score': np.float32(0.89931095),
  'word': 'V',
  'start': 467,
  'end': 468},
 {'entity_group': 'LOC',
  'score': np.float32(0.45341945),
  'word': '##id',
  'start': 468,
  'end': 470},
 {'entity_group': 'ORG',
  'score': np.float32(0.7412178),
  'word': '##ivixi',
  'start': 470,
  'end': 475},
 {'entity_group': 'LOC',
  'score': np.float32(0.9997321),
  'word': 'Mexico',
  'start': 477,
  'end': 483},


## Apply to df_parquet

In [38]:
import tqdm
def run_pipeline(data, preprocessor, model):
    data['full_info'] = data['full_info'].apply(preprocessor)
    data['entities'] = data['full_info'].apply(model)
    return data

df_pred = run_pipeline(df_parquet, preprocess_profile, ner_model)

In [39]:
df_pred.to_csv('ner_predictions.csv')