Welcome to my notebook.  This is my first attempt at a Kaggle competition.  In order to get a better sense of the data, I took a decidedly basic approach for this first attempt at this competition, where I used the training data just for my own edification, studying familiar patterns.  This employs little to no contextual analysis, and instead simply evaluates each word/set of words and attempts to spot names or familiar patterns.  I decided to ignore usernames and addresses as there simply was not enough training data to help me pick up on specific patterns there.  

From here, I plan to examine some of the great work others have put together and hopefully, combined with my now intimate knowledge of the training data, I will be able to add some real value on the predictions side of things.

I nevertheless feel this can be helpful for certain users, as it demonstrates a simple approach to this problem in a very digestible manner.  This performance (0.66) also establishes a useful baseline, as this is achieved without using the training data (notice the training data is never referenced throughout the code), leveraging instead only patterns I spot within the data.

Feedback welcome!

In [None]:
import json
import pandas as pd

# load the test data; the context manager closes the file handle as soon as parsing is done
with open("/kaggle/input/pii-detection-removal-from-educational-data/test.json", encoding="utf-8") as test_file:
    data = json.load(test_file)

# transformed_data will convert the full data set into one broken down by token
transformed_data = []

# each entry in the data has an associated doc_id and set of tokens
# here we create an array where each row consists of a separate token with its associated doc_id, and
# we add an associated row_id that refers to the position of each token within a given entry
for entry in data:
    doc_id = entry['document']
    # enumerate supplies the token's position, replacing the hand-maintained counter
    transformed_data.extend(
        {'token': tok, 'document': doc_id, 'row_id': position}
        for position, tok in enumerate(entry['tokens'])
    )

# convert transformed_data into a dataframe (one row per token, in file order)
df = pd.DataFrame(transformed_data)

In [None]:
from nltk.corpus import words
import re

# words.words() returns the nltk.corpus word list; it is what lets us tell ordinary
# words apart from names.
# The entries are unpacked into a set so that each membership test is O(1).
word_set = {*words.words()}

# in this simple approach, we use regex for identifying email, url_personal, id_num, and phone_num
# for phone num, we classify begin (B) and inner (I) later; for the others, we assume always (B), as to
# an extent demonstrated in the training data.
#
# The table is compiled once here instead of being rebuilt for every token.
# Order matters: the first pattern that matches wins, so a bare run of 7+ digits is
# reported as an ID number before the phone pattern is ever tried.
_STRING_TYPE_PATTERNS = (
    ('B-EMAIL', re.compile(r'[\w\.-]+@[\w\.-]+')),
    ('B-URL_PERSONAL', re.compile(r'http[s]?://[^\s]+')),
    ('B-ID_NUM', re.compile(r'\b\d{7,}[A-Za-z]{0,3}\b|\b[A-Za-z]{0,3}\d{7,}\b')),
    ('PHONE_NUM', re.compile(r'\b(?:\+?\d{1,3}[\s.-]?)?(?:\(\d{3}\)|\d{3})[\s.-]?\d{3}[\s.-]?\d{4}(?:\s*(?:ext|x)\s*\d{2,6})?\b')),
)


def find_string_type(string):
    """Classify ``string`` by the first pattern found anywhere inside it.

    Args:
        string: the token text to inspect.

    Returns:
        'B-EMAIL', 'B-URL_PERSONAL', 'B-ID_NUM' or 'PHONE_NUM' for the first
        pattern (in table order) that matches, or 'O' when none does.
        'PHONE_NUM' carries no B-/I- prefix; the caller adds it.
    """
    for key, pattern in _STRING_TYPE_PATTERNS:
        # search (not match): the pattern may occur anywhere in the token
        if pattern.search(string):
            return key

    return 'O'

# in the training data, certain titles get flagged as names, so we use this limited list to help strip this out
# (all entries are lowercase)
TITLES = [
    # everyday honorifics
    'dr', 'mr', 'mrs', 'ms', 'mx', 'sir', 'madam', 'miss',
    # academic and honorary styles
    'prof', 'professor', 'rev', 'hon',
    # rank abbreviations
    'lt', 'capt', 'cdr', 'col', 'gen',
    # court titles
    'judge', 'justice',
]


# determines whether a given word is common, i.e. not a name
def is_common_word(word):
    """Return True when the lower-cased ``word`` is in ``word_set`` or in ``TITLES``."""
    lowered = word.lower()
    if lowered in word_set:
        return True
    return lowered in TITLES

# checks whether a given token is a name.  if so, add the corresponding label.
# in reviewing the training data, first names always come before last names,
# which is handled in this method
def check_names(first_name, last_name, document, cur_document, token, cur_token):
    """Label the current token (and possibly the next one) as a student name.

    Besides its parameters this function uses module-level state: it looks names up
    in ``first_names`` / ``last_names``, appends to ``label``, and, when it labels a
    first/last pair, reads ``next_document`` and ``next_token`` for the second row.

    Args:
        first_name: text of the current token (must be a string).
        last_name: text of the following token; anything that is not a string is
            treated as "no following name".
        document: list collecting the document id of every labelled token.
        cur_document: document id of the current token.
        token: list collecting the position of every labelled token.
        cur_token: position of the current token within its document.

    Returns:
        A ``(continue_next, found_name)`` tuple:
        ``(True, True)`` when both tokens were labelled (B- then I-NAME_STUDENT),
        ``(False, True)`` when only the current token was labelled,
        ``(False, False)`` when nothing was labelled.
    """
    # every positive outcome needs a title-case token that is a known first name
    if not first_name.istitle() or first_name not in first_names:
        return False, False

    if isinstance(last_name, str) and last_name.istitle() and last_name in last_names:
        label.append('B-NAME_STUDENT')
        label.append('I-NAME_STUDENT')
        append_others(document, cur_document, token, cur_token)
        append_others(document, next_document, token, next_token)
        return True, True

    # Previously a title-case follower that was not a known last name suppressed the
    # first-name-only check entirely; the known first name is now still labelled.
    label.append('B-NAME_STUDENT')
    append_others(document, cur_document, token, cur_token)
    return False, True

# for appending things other than the label (document, token) to the submission file
def append_others(document, cur_document, token, cur_token):
    """Append ``cur_document`` to ``document`` and ``cur_token`` to ``token`` in place."""
    for column, value in ((document, cur_document), (token, cur_token)):
        column.append(value)
    
# to hold the information that goes into the submission file
# (three parallel lists: entry k of each describes the k-th labelled token)
document, token, label = [], [], []

# load the list of first and last names from JSON files
# these files can be found here: https://github.com/philipperemy/name-dataset
# the context managers close each file handle once it has been parsed
with open("/kaggle/input/first-names-json/first_names.json", encoding="utf-8") as names_file:
    first_names = json.load(names_file)

# NOTE(review): this reads first_names.json a second time, so last_names holds the same
# data as first_names.  Presumably a last-names file was intended -- confirm what the
# dataset actually ships before changing the path.
with open("/kaggle/input/first-names-json/first_names.json", encoding="utf-8") as names_file:
    last_names = json.load(names_file)

# flags for helping drive the code logic
found_start = False    # not read anywhere in the visible code
continue_next = False  # True when the next row was already labelled and must be skipped
found_name = False     # True when the current row was labelled as a name
found_phone = False    # True while inside a phone number that started with "("

# for each row in our dataframe
# Every token is examined together with the token that follows it.  Labelled tokens are
# recorded by appending to the parallel lists `label`, `document` and `token`.
last_idx = len(df) - 1
for idx, row in df.iterrows():
    # the final row has no successor; it is still examined, just without a look-ahead
    has_next = idx < last_idx

    # we retrieve this info to be submitted to the submission csv when applicable
    # (next_document / next_token are also read by check_names as module-level names)
    first_name = row['token']
    last_name = df.at[idx + 1, 'token'] if has_next else None
    cur_document = row['document']
    next_document = df.at[idx + 1, 'document'] if has_next else None
    cur_token = row['row_id']
    next_token = df.at[idx + 1, 'row_id'] if has_next else None

    # if on the last row we found this row's label, skip to the next row
    if continue_next:
        continue_next = False
        continue

    # only proceed if this token is a string that is not an ordinary word;
    # such a token also ends any phone number that was in progress
    if not isinstance(first_name, str) or is_common_word(first_name):
        found_phone = False
        continue

    continue_next, found_name = check_names(first_name, last_name, document, cur_document, token, cur_token)
    if found_name:
        found_phone = False
        continue

    new_label = find_string_type(first_name)

    # in examining the training data, I found certain patterns for phone numbers, which I have accounted for here
    if found_phone:
        # inside a "(" phone number: dashes and phone-like pieces continue it
        if first_name == '-' or new_label == 'PHONE_NUM':
            label.append('I-PHONE_NUM')
            append_others(document, cur_document, token, cur_token)
            continue
        # anything else ends the phone number; the flag is cleared so later tokens are
        # no longer forced into the phone branch (and no 'O' rows are emitted), then the
        # token is handled by the ordinary rules below
        found_phone = False

    # the look-ahead may be missing (final row) or not a string
    next_label = find_string_type(last_name) if isinstance(last_name, str) else 'O'

    if first_name == '(' and next_label == 'PHONE_NUM':
        label.append('B-PHONE_NUM')
        label.append('I-PHONE_NUM')
        append_others(document, cur_document, token, cur_token)
        append_others(document, next_document, token, next_token)

        found_phone = True
        # the following row has just been labelled; skip it so it is not added twice
        continue_next = True

    elif new_label != 'O':
        # only the phone label comes back without a prefix
        starter = 'B-' if new_label == 'PHONE_NUM' else ''
        label.append(starter + new_label)
        append_others(document, cur_document, token, cur_token)

# gather the three parallel result lists into the table that will be submitted
submission_columns = {"document": document, "token": token, "label": label}
df = pd.DataFrame(submission_columns)

# number the rows sequentially starting at 0
df["row_id"] = [*range(len(df))]

# submit: write the four columns, row_id first, without the dataframe index
df.to_csv("submission.csv", columns=["row_id", "document", "token", "label"], index=False)