# Part 1 - NER Model

### Download libraries

In [1]:
import pandas as pd
INPUT_FILE_PATH = 'teachers_db_practice.parquet'
PREDS_FILE_PATH = 'ner_predictions_v2.csv'

### Read data

In [2]:
#df_csv = pd.read_csv('teachers_db_practice.csv')
df_parquet = pd.read_parquet(INPUT_FILE_PATH, engine='pyarrow')

In [3]:
import pprint
pprint.pprint(df_parquet.iloc[0, 3])

('<p>  has worked as a designer for the last decade in roles spanning a '
 'variety of disciplines from graphics to product to interiors. In 2017, , '
 'became studio director at Vidivixi, a furniture and interiors design '
 'practice based in Mexico City. After leaving in 2023 and relocating to Spain '
 'he opened a new design studio with a focus on bespoke, contemporary '
 'design-led furniture.\xa0</p><h4>Corporate Experience</h4><p>• Studio '
 'Director, A&amp;M Studio, Spain, 2023 – Present</p><p>• Studio Director, '
 'Vidivixi, Mexico, 2017 – 2023</p><p>• Associate, Becquerel Capital, Mexico, '
 '2014 – 2017</p><p>• Design Partner, The Hub, Hong Kong, 2013 – '
 '2014</p><h4>Academic Background</h4><p>• Bachelor in Graphic Design, '
 'Camberwell College of Arts UAL, U.K., 2013</p>')


### Import libraries and model from huggingface: dslim/bert-base-NER

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import torch

### Define Pipeline, Model and Tokenizer

In [5]:
#tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')
#model = AutoModelForTokenClassification.from_pretrained('dslim/bert-base-NER')
ner_model = pipeline(task='ner', model='dslim/bert-base-NER', aggregation_strategy='simple', device=1)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:1


### Preprocess

In [6]:
from bs4 import BeautifulSoup
import re
import html

# V2

def parse_full_info(html_text):
    # parse html
    text = html.unescape(html_text.replace("\xa0", " "))
    soup = BeautifulSoup(text, "html.parser")

    # create lists
    summary = ""
    corporate_experience = []
    academic_experience = []
    academic_background = []

    # all is summary unless stated
    sections = soup.find_all(["p", "h4"])
    current_section = "summary"

    for tag in sections:
        if tag.name == "h4":
            title = tag.get_text(strip=True).lower()
            if "corporate" in title.lower():
                current_section = "corporate"
            elif "academic experience" in title.lower():
                current_section = "academic"
            elif "academic background" in title.lower():
                current_section = "studies"
            else:
                current_section = "summary"
        elif tag.name == "p":
            text_block = tag.get_text(strip=True)
            if current_section == "summary":
                summary += " " + text_block
            elif current_section == "corporate":
                corporate_experience.append(text_block)
            elif current_section == "academic":
                academic_experience.append(text_block)
            elif current_section == "studies":
                academic_background.append(text_block)

    return {
        'summary': summary.strip(),
        'corporate_experience': corporate_experience,
        'academic_experience': academic_experience,
        'academic_background': academic_background
    }


In [7]:
def extract_info_from_professor(full_info):
    parsed = parse_full_info(full_info)

    results = {
        'corporate_experience': {'companies': [], 'locations': []},
        'academic_experience': {'universities': [], 'degrees': [], 'locations': []},
        'academic_background': {'universities': [], 'degrees': [], 'locations': []}
    }

    # NER on corporate experience
    for exp in parsed['corporate_experience']:
        ents = ner_model(exp)
        for e in ents:
            if e['entity_group'] == 'ORG':
                results['corporate_experience']['companies'].append(e['word'])
            elif e['entity_group'] == 'LOC':
                results['corporate_experience']['locations'].append(e['word'])

    # NER on academic experience
    for aca in parsed['academic_experience']:
        ents = ner_model(aca)
        for e in ents:
            if e['entity_group'] == 'ORG':
                results['academic_experience']['universities'].append(e['word'])
            elif e['entity_group'] == 'MISC':
                results['academic_experience']['degrees'].append(e['word'])
            elif e['entity_group'] == 'LOC':
                results['academic_experience']['locations'].append(e['word'])
    
    # NER on academic background
    for edu in parsed['academic_background']:
        ents = ner_model(edu)
        for e in ents:
            if e['entity_group'] == 'ORG':
                results['academic_background']['universities'].append(e['word'])
            elif e['entity_group'] == 'MISC':
                results['academic_background']['degrees'].append(e['word'])
            elif e['entity_group'] == 'LOC':
                results['academic_background']['locations'].append(e['word'])

    return results


## Trying out on short dataset (df_copy)

In [8]:
df_copy = df_parquet.iloc[0:5]

In [9]:
df_copy["entities"] = df_copy["full_info"].apply(extract_info_from_professor)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy["entities"] = df_copy["full_info"].apply(extract_info_from_professor)


In [10]:
df_copy.loc[0, 'entities']

{'corporate_experience': {'companies': ['A & M Studio', 'Becquerel Capital'],
  'locations': ['Spain',
   'V',
   '##idivixi',
   'Mexico',
   'Mexico',
   'The Hub',
   'Hong Kong']},
 'academic_experience': {'universities': [], 'degrees': [], 'locations': []},
 'academic_background': {'universities': ['Camberwell College of Arts UAL'],
  'degrees': ['Design'],
  'locations': ['U. K.']}}

## Apply to df_parquet

In [11]:
df_pred = df_parquet.copy()
df_pred['entities'] = df_parquet['full_info'].apply(extract_info_from_professor)

In [18]:
df_pred.to_csv(PREDS_FILE_PATH)

Error: need to escape, but no escapechar set

In [19]:
df_pred.to_parquet("ner_predictions_v2.parquet")