# Part 1 - NER Model

### Download libraries

In [58]:
import pandas as pd
INPUT_FILE_PATH = 'teachers_db_practice.parquet'
PREDS_FILE_PATH = 'ner_predictions_v2.csv'

### Read data

In [None]:
#df_csv = pd.read_csv('teachers_db_practice.csv')
df_parquet = pd.read_parquet(INPUT_FILE_PATH, engine='pyarrow')

In [40]:
import pprint
pprint.pprint(df_parquet.iloc[0, 3])

('<p>  has worked as a designer for the last decade in roles spanning a '
 'variety of disciplines from graphics to product to interiors. In 2017, , '
 'became studio director at Vidivixi, a furniture and interiors design '
 'practice based in Mexico City. After leaving in 2023 and relocating to Spain '
 'he opened a new design studio with a focus on bespoke, contemporary '
 'design-led furniture.\xa0</p><h4>Corporate Experience</h4><p>• Studio '
 'Director, A&amp;M Studio, Spain, 2023 – Present</p><p>• Studio Director, '
 'Vidivixi, Mexico, 2017 – 2023</p><p>• Associate, Becquerel Capital, Mexico, '
 '2014 – 2017</p><p>• Design Partner, The Hub, Hong Kong, 2013 – '
 '2014</p><h4>Academic Background</h4><p>• Bachelor in Graphic Design, '
 'Camberwell College of Arts UAL, U.K., 2013</p>')


### Import libraries and model from huggingface: dslim/bert-base-NER

In [26]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import torch

### Define Pipeline, Model and Tokenizer

In [27]:
#tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')
#model = AutoModelForTokenClassification.from_pretrained('dslim/bert-base-NER')
ner_model = pipeline(task='ner', model='dslim/bert-base-NER', aggregation_strategy='simple', device=1)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


### Preprocess

In [None]:
from bs4 import BeautifulSoup
import re
import html

# V2

def parse_full_info(html_text):
    # 1️⃣ Decode HTML entities
    text = html.unescape(html_text.replace("\xa0", " "))
    soup = BeautifulSoup(text, "html.parser")

    # 2️⃣ Initialize containers
    summary = ""
    corporate_experience = []
    academic_background = []

    # 3️⃣ Split into sections
    sections = soup.find_all(["p", "h4"])
    current_section = "summary"

    for tag in sections:
        if tag.name == "h4":
            title = tag.get_text(strip=True).lower()
            if "corporate" in title.lower():
                current_section = "corporate"
            elif "academic" in title.lower():
                current_section = "academic"
            else:
                current_section = "summary"
        elif tag.name == "p":
            text_block = tag.get_text(strip=True)
            if current_section == "summary":
                summary += " " + text_block
            elif current_section == "corporate":
                corporate_experience.append(text_block)
            elif current_section == "academic":
                academic_background.append(text_block)

    return {
        'summary': summary.strip(),
        'corporate_experience': corporate_experience,
        'academic_background': academic_background
    }


In [48]:
def extract_info_from_professor(full_info):
    parsed = parse_full_info(full_info)

    results = {
        "companies": [],
        "locations": [],
        "universities": [],
        "degrees": [],
        "years": []
    }

    # NER on corporate experience
    for exp in parsed["corporate_experience"]:
        ents = ner_model(exp)
        for e in ents:
            if e["entity_group"] == "ORG":
                results["companies"].append(e["word"])
            elif e["entity_group"] == "LOC":
                results["locations"].append(e["word"])

    # NER on academic background
    for edu in parsed["academic_background"]:
        ents = ner_model(edu)
        for e in ents:
            if e["entity_group"] == "ORG":
                results["universities"].append(e["word"])
            elif e["entity_group"] == "MISC":
                results["degrees"].append(e["word"])
            elif e["entity_group"] == "LOC":
                results["locations"].append(e["word"])

    return results


## Trying out on short dataset (df_copy)

In [49]:
df_copy = df_parquet.iloc[0:5]

In [56]:
df_copy["entities"] = df_copy["full_info"].apply(extract_info_from_professor)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy["entities"] = df_copy["full_info"].apply(extract_info_from_professor)


In [57]:
df_copy.loc[0, 'entities']

{'companies': ['A & M Studio', 'Becquerel Capital'],
 'locations': ['Spain',
  'V',
  '##idivixi',
  'Mexico',
  'Mexico',
  'The Hub',
  'Hong Kong',
  'U. K.'],
 'universities': ['Camberwell College of Arts UAL'],
 'degrees': ['Design'],
 'years': []}

## Apply to df_parquet

In [62]:
df_pred = df_parquet.copy()
df_pred['entities'] = df_parquet['full_info'].apply(extract_info_from_professor)

In [63]:
df_pred.to_csv(PREDS_FILE_PATH)

In [64]:
df_pred

Unnamed: 0,area,area_code,position,full_info,id,description,tags,published,size,width,height,orientation,site,gender,alias,entities
0,Architecture & Design,40358,,<p> has worked as a designer for the last dec...,184670.0,,[],2025-06-16T11:48:18.952Z,22834.0,500.0,500.0,S,global,0,Appius Aemilius Agricola,"{'companies': ['A & M Studio', 'Becquerel Capi..."
1,Economics,17166,,<p>Mr. Madgar has been teaching economics par...,16106.0,Close-up portrait of a smiling man with a bear...,"[man, portrait, smiling, beard, close-up]",2022-11-18T07:21:20.407Z,7180.0,170.0,170.0,S,global,0,Appius Aemilius Cicero,"{'companies': ['Millwood Inc', 'Jet Research C..."
2,Private & Business Law,40353,,<p>Lawyer with broad experience in Market Regu...,25635.0,,[],2023-03-17T12:44:46.932Z,14699.0,128.0,128.0,S,global,0,Appius Aemilius Crassus,"{'companies': ['Creddia Advisors –', 'Issues',..."
3,Economics,17166,Adjunct professor,<p> is a seasoned leader with a proven track r...,37534.0,A professional portrait of a smiling woman wit...,"[woman, portrait, smiling, blond hair, blue bl...",2024-02-15T12:47:08.017Z,79371.0,500.0,500.0,S,global,1,Flavia Prisca,"{'companies': ['Harvard Strategy Consulting', ..."
4,Science & Technology,40359,,"<p> Carrio is a seasoned technology leader, re...",182500.0,,[],2025-04-07T10:15:21.908Z,15906.0,500.0,500.0,S,global,0,Appius Aemilius Scipio,"{'companies': ['A', '##urate', '##uant', '##rm..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1223,Information Systems & Technology,29474,,<p>Industrial engineer with economical backgro...,25326.0,,[],2023-03-09T13:26:33.827Z,30182.0,200.0,200.0,S,global,0,Tiberius Aemilius Lepidus,"{'companies': ['IE University', 'Information S..."
1224,Economics+Strategy+International Relations,17166,,<p><strong>Dr. Angel - </strong>is Adjunct Pro...,179524.0,A man in a suit with a tie speaking at a podium.,"[man, suit, podium, speaking, professional]",2025-01-14T13:18:10.523Z,22668.0,500.0,500.0,S,global,0,Tiberius Aemilius Paullus,"{'companies': ['Policy and Strategy Unit', 'Pu..."
1225,Finance,29472,,"<p>Angel (Madrid, 1965) has a degree in Econo...",20574.0,A man in a suit smiling while leaning on a rai...,"[man, suit, smiling, railing, building, outdoors]",2022-12-28T08:37:23.289Z,7893.0,170.0,170.0,S,global,0,Tiberius Aemilius Severus,"{'companies': ['BBVA Spain', 'B', '##BVA Group..."
1226,Architecture & Design,40358,,<p> is a BuiltTech specialist architect worki...,20565.0,A professional man in a grey suit posing for a...,"[man, professional, suit, portrait, grey suit,...",2022-12-27T13:40:40.080Z,5290.0,170.0,170.0,S,global,0,Tiberius Antonius Caesar,"{'companies': ['##ical', 'Architect'], 'locati..."
