In [1]:
import os
import json
import pandas as pd
import numpy as np

with open('/kaggle/input/pii-detection-removal-from-educational-data/train.json', 'r') as file:
    data_train = json.load(file)

with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json', 'r') as file:
    data_test = json.load(file)

common_words_filter = pd.read_csv('/kaggle/input/for-pii-detection/common_words_except_names.csv')
words_to_filter = 50000
common_filt = common_words_filter['word'].values.tolist()[:words_to_filter]

In [2]:
strings = []
test_strings = []

for i in range(len(data_train)):
    strings.append(' '.join(data_train[i]['tokens']))

for i in range(len(data_test)):
    test_strings.append(' '.join(data_test[i]['tokens']))

combined_strings = strings + test_strings

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words=common_filt)
tfidf_vectors = tfidf_vectorizer.fit_transform(combined_strings)
tfidf_array = tfidf_vectors.toarray()

tfidf_array_train = tfidf_array[:len(data_train)]
tfidf_array_test = tfidf_array[len(data_train):]

term_indices = []
for i in range(len(data_train)):
    tfidf_values = tfidf_array_train[i]
    indices = np.argwhere(tfidf_values > 0.05).flatten()
    term_indices.append(indices)

In [4]:
data_train_lower = [np.char.lower(np.array(doc['tokens'])) for doc in data_train]
inspect_list = []

for doc_index, doc in enumerate(data_train_lower):
    terms_to_inspect = set(tfidf_vectorizer.get_feature_names_out()[term_indices[doc_index]])
    indices_to_inspect = [i for i, term in enumerate(doc) if term in terms_to_inspect]
    inspect_list.append(indices_to_inspect)

In [5]:
num_inspect = []
word_inspect = []

for i in range(len(inspect_list)):
    num_inspect.append([x for x in (inspect_list[i]) if data_train[i]['tokens'][x].isdigit()])
    word_inspect.append([x for x in (inspect_list[i]) if not data_train[i]['tokens'][x].isdigit()])

In [6]:
NN_inputs = []
for docnum in range(len(word_inspect)):
    for index in word_inspect[docnum]:
        info = {}
        tokens = data_train[docnum]['tokens']
        info['Doc Index'] = docnum
        info['Token Index'] = index
        info['Token'] = tokens[index]
        doclen = len(tokens)
        info['Word Score'] = float(tfidf_array_train[docnum, tfidf_vectorizer.vocabulary_.get(tokens[index].lower())])
        info['Doc Distance'] = float(max((doclen-index)/doclen/2, index/doclen/2))
        try:
            words = [tokens[index-2].lower(), tokens[index-1].lower(), tokens[index+1].lower(), tokens[index+2].lower()]
            word_indices = [tfidf_vectorizer.vocabulary_.get(word) for word in words]
            scores = []
            for word_index in word_indices:
                if word_index == None:
                    scores.append(0)
                else:
                    scores.append(tfidf_array_train[docnum, word_index])
            info['Surrounding Scores'] = scores
        except:
            info['Surrounding Scores'] = [0, 0, 0, 0]
        correct = data_train[docnum]['labels'][index]
        if correct == 'O':
             info['Correct'] = 0
        else:
             info['Correct'] = 1
        NN_inputs.append(info)

In [7]:
from sklearn.model_selection import train_test_split
import keras

X = np.array([[sample['Word Score']] + [sample['Doc Distance']] + sample['Surrounding Scores'] for sample in NN_inputs])
Y = np.array([sample['Correct'] for sample in NN_inputs])

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=0)

model_NN = keras.Sequential()
model_NN.add(keras.layers.Input(shape=(6,)))
model_NN.add(keras.layers.Dense(24, activation='sigmoid'))
model_NN.add(keras.layers.Dense(12, activation='sigmoid'))
model_NN.add(keras.layers.Dense(1, activation='sigmoid'))
model_NN.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model_NN.fit(X_train, Y_train, epochs=15, validation_data=(X_val, Y_val))

2024-02-24 03:49:57.675287: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-24 03:49:57.675410: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-24 03:49:57.834871: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7ff18ab1aaa0>

In [8]:
TREE_inputs = []
for docnum in range(len(num_inspect)):
    for index in num_inspect[docnum]:
        info = {}
        tokens = data_train[docnum]['tokens']
        info['Doc Index'] = docnum
        info['Token Index'] = index
        info['Token'] = int(tokens[index])
        info['Number Score'] = float(tfidf_array_train[docnum, tfidf_vectorizer.vocabulary_.get(tokens[index])])
        info['Class'] = data_train[docnum]['labels'][index]
        TREE_inputs.append(info)

tree_df = pd.DataFrame(TREE_inputs)
train_tree_df = tree_df[['Token', 'Number Score', 'Class']]

class_map = {'B-ID_NUM': 0, 'B-STREET_ADDRESS': 1, 'I-ID_NUM': 0, 'I-PHONE_NUM': 3, 'I-STREET_ADDRESS': 4, 'O': 5}
train_tree_df['Class'] = train_tree_df['Class'].map(class_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_tree_df['Class'] = train_tree_df['Class'].map(class_map)


In [9]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf

input_features = [tfdf.keras.FeatureUsage(name='Number'), tfdf.keras.FeatureUsage(name='tfidf')]

model_TREE = tfdf.keras.CartModel()
model_TREE.compile(metrics=['accuracy'])

dataset = tfdf.keras.pd_dataframe_to_tf_dataset(train_tree_df, label='Class')
model_TREE.fit(dataset)

Use /tmp/tmpey3h4_e7 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:06.885861. Found 7336 examples.
Training model...
Model trained in 0:00:00.035089
Compiling model...


[INFO 24-02-24 03:52:42.2742 UTC kernel.cc:1233] Loading model from path /tmp/tmpey3h4_e7/model/ with prefix 8d13e4ae3f154937
[INFO 24-02-24 03:52:42.2764 UTC decision_forest.cc:660] Model loaded with 1 root(s), 3 node(s), and 1 input feature(s).
[INFO 24-02-24 03:52:42.2766 UTC abstract_model.cc:1344] Engine "RandomForestGeneric" built
[INFO 24-02-24 03:52:42.2768 UTC kernel.cc:1061] Use fast generic engine


Model compiled.


<keras.src.callbacks.History at 0x7ff1795c31f0>

In [10]:
test_term_indices = []
for i in range(len(data_test)):
    tfidf_values = tfidf_array_test[i]
    indices = np.argwhere(tfidf_values > 0.05).flatten()
    test_term_indices.append(indices)

In [11]:
data_test_lower = [np.char.lower(np.array(doc['tokens'])) for doc in data_test]
test_inspect_list = []

for doc_index, doc_tokens in enumerate(data_test_lower):
    terms_to_inspect = set(tfidf_vectorizer.get_feature_names_out()[test_term_indices[doc_index]])
    indices_to_inspect = [i for i, term in enumerate(doc_tokens) if term in terms_to_inspect]
    test_inspect_list.append(indices_to_inspect)

In [12]:
test_num_inspect = []
test_word_inspect = []

for i in range(len(test_inspect_list)):
    test_num_inspect.append([x for x in (test_inspect_list[i]) if data_test[i]['tokens'][x].isdigit()])
    test_word_inspect.append([x for x in (test_inspect_list[i]) if not data_test[i]['tokens'][x].isdigit()])

In [13]:
NN_test_inputs = []
for docnum in range(len(test_word_inspect)):
    for index in test_word_inspect[docnum]:
        info = {}
        tokens = data_test[docnum]['tokens']
        info['Doc Index'] = data_test[docnum]['document']
        info['Token Index'] = index
        info['Token'] = tokens[index]
        doclen = len(tokens)
        info['Word Score'] = float(tfidf_array_test[docnum, tfidf_vectorizer.vocabulary_.get(tokens[index].lower())])
        info['Doc Distance'] = float(max((doclen-index)/doclen/2, index/doclen/2))
        try:
            words = [tokens[index-2].lower(), tokens[index-1].lower(), tokens[index+1].lower(), tokens[index+2].lower()]
            word_indices = [tfidf_vectorizer.vocabulary_.get(word) for word in words]
            scores = []
            for word_index in word_indices:
                if word_index == None:
                    scores.append(0)
                else:
                    scores.append(tfidf_array_test[docnum, word_index])
            info['Surrounding Scores'] = scores
        except:
            info['Surrounding Scores'] = [0, 0, 0, 0]
        NN_test_inputs.append(info)

In [14]:
TREE_test_inputs = []
for docnum in range(len(test_num_inspect)):
    for index in test_num_inspect[docnum]:
        info = {}
        tokens = data_test[docnum]['tokens']
        info['Doc Index'] = data_test[docnum]['document']
        info['Token Index'] = index
        info['Token'] = int(tokens[index])
        info['Number Score'] = float(tfidf_array_test[docnum, tfidf_vectorizer.vocabulary_.get(tokens[index])])
        TREE_test_inputs.append(info)

tree_test_df = pd.DataFrame(TREE_test_inputs)
tree_predict_df = tree_test_df[['Token', 'Number Score']]

In [15]:
X_test = np.array([[sample['Word Score']] + [sample['Doc Distance']] + sample['Surrounding Scores'] for sample in NN_test_inputs])

NN_predictions = model_NN.predict(X_test)
NN_test_df = pd.DataFrame(NN_test_inputs)
NN_test_df['Pred'] = NN_predictions
NN_test_df['Correct'] = NN_test_df['Pred'] > 0.55

NN_test_df['Correct'] = NN_test_df.groupby(['Doc Index', 'Token'])['Correct'].transform(lambda x: x.mode().iloc[0])



In [16]:
label_map = {True: 'B-NAME_STUDENT', False: 'O'}
NN_test_df['Label'] = NN_test_df['Correct'].map(label_map)

mask = NN_test_df['Label'] != 'O'
mask2 = NN_test_df['Label'].shift(fill_value='O') == 'B-NAME_STUDENT'
mask3 = NN_test_df['Doc Index'] == NN_test_df['Doc Index'].shift(fill_value='')
mask4 = NN_test_df['Token Index'].shift(fill_value=0) == NN_test_df['Token Index'] - 1

NN_test_df.loc[mask & mask2 & mask3 & mask4, 'Label'] = 'I-NAME_STUDENT'

In [17]:
predict_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(tree_predict_df)
predictions_tree = model_TREE.predict(predict_dataset)
predictions_tree = predictions_tree.argmax(axis=1)

tree_test_df['Class'] = predictions_tree
inv_class_map = {num: clas for clas, num in class_map.items()}
tree_test_df['Class'] = tree_test_df['Class'].map(inv_class_map)



In [18]:
columns = ['document', 'token', 'label']
NN_test_df = NN_test_df.rename(columns={'Doc Index': 'document', 'Token Index': 'token', 'Label': 'label'})
NN_submission_df = NN_test_df[columns]
tree_test_df = tree_test_df.rename(columns={'Doc Index': 'document', 'Token Index': 'token', 'Class': 'label'})
tree_submission_df = tree_test_df[columns]

In [19]:
submission_df = pd.concat([NN_submission_df, tree_submission_df])
submission_df.reset_index(drop=True, inplace=True)
submission_df.sort_values(by=['document', 'token'], inplace=True)
submission_df = submission_df[submission_df['label'] != 'O']
submission_df.reset_index(drop=True, inplace=True)
submission_df['row_id'] = submission_df.index
submission_df = submission_df[['row_id', 'document', 'token', 'label']]

submission_df.to_csv('/kaggle/working/submission.csv', index=False)