# CRF Classifier

To extract NER candidates

In [1]:
import pandas as pd
import numpy as np
import spacy 
import string

from sklearn.metrics import f1_score

import os
import joblib 

from typing import List

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV


In [2]:
nlp = spacy.load('fr_core_news_lg')

In [3]:
df_train = pd.read_csv('../data/train_2.csv')
df_test = pd.read_csv('../data/test.csv')
# df_sample = pd.read_csv('../data/sample_submission_2.csv')

In [4]:
def cleanData(data:pd.DataFrame) -> None:
    """Simple function to clean Tokens

    :param data: DataFrame containing the data
    :type data: pd.DataFrame
    """

    data['Token'] = data['Token'].str.replace('"', '')

# need to clean Token column because each token is surrounded by quotes, which will cause issues later
cleanData(df_train)


In [5]:
def getStringShape(string:str) -> str:
    """Get the shape of the string as either X for capital letters and x for normal letters

    :param string: String to get the shape of
    :type string: str
    :return: Shape of the string (e.g. Port: Xxxx)
    :rtype: str
    """
    shape = ['X' if x.isupper() else 'x' for x in string]
    return ''.join(shape)
    
def extractFeatures(data:pd.DataFrame)->None:
    """
    Extract features from Token column of the dataframe

    :param data: DataFrame contianing the data
    :type data: pd.DataFrame
    """
    data['lower'] = [x.lower() for x in data['Token']]
    data['isdigit'] = [x.isdigit() for x in data['Token']]
    data['isupper'] = [x.isupper() for x in data['Token']]
    data['ispunct'] = [x in string.punctuation for x in data['Token']]
    data['isstop'] = [x in nlp.Defaults.stop_words for x in data['Token']]
    data['len'] = [len(x) for x in data['Token']]
    data['shape'] = [getStringShape(x) for x in data['Token']]

extractFeatures(df_train)
df_train

Unnamed: 0,Id,Token,Label,lower,isdigit,isupper,ispunct,isstop,len,shape
0,0,01,aucun,01,True,False,False,False,2,xx
1,1,4.5.3,aucun,4.5.3,False,False,False,False,5,xxxxx
2,2,.,aucun,.,False,False,True,False,1,x
3,3,Port,geogFeat geogName,port,False,False,False,False,4,Xxxx
4,4,du,geogName,du,False,False,False,True,2,xx
...,...,...,...,...,...,...,...,...,...,...
39852,39852,portée,aucun,portée,False,False,False,False,6,xxxxxx
39853,39853,sur,aucun,sur,False,False,False,True,3,xxx
39854,39854,les,aucun,les,False,False,False,True,3,xxx
39855,39855,cartes,aucun,cartes,False,False,False,False,6,xxxxxx


In [17]:
def prepareDatasetFeatures(data:pd.DataFrame, i:int, window:int) -> dict:
    """Process the features of a token extracted with extractFeatures and
    prepares them in the format required by sklearn_crfsuite
    Note: this functions takes a lot of time somehow, so it probably needs to be
    refactored.

    :param data: DataFrame containing the token and their features
    :type data: pd.DataFrame
    :param i: index of the token
    :type i: int
    :return: Dictionary where keys are the name of the feature, and values are the features
    :rtype: dict
    """
    # columns to ignore in DataFrame
    ignore_col = ['Id', 'Label']

    data = data[[x for x in data.columns if x not in ignore_col]]
    # get row associated with this token / index
    word_data = data.loc[i]
    feature_dict = word_data.to_dict()

    # extract features of preceding token (except if the current token is the first)
    # if i != 0:
    #     for j in range(i - window, i):
    #         if j >= 0:
    #             prev_word_data = data.loc[j]
    #             for k, v in prev_word_data.to_dict().items():
    #                 feature_dict[f"prev_{j}_{k}"] = v

    # extract features of next tokens (except if the current token is the last)
    # if i != len(data) - 1:
    #     # print(i)
    #     for j in range(i + 1, i + window + 1):
    #         # print('j', j)
    #         if j <= len(data) -  1:
    #             next_word_data = data.loc[j]
    #             for k, v in next_word_data.to_dict().items():
    #                 feature_dict[f"next_{j}_{k}"] = v


    # extract features of preceding token (except if the current token is the first)
    if i != 0:
        prev_word_data = data.loc[i - 1]
        for k, v in prev_word_data.to_dict().items():
            feature_dict[f"prev_{k}"] = v

    # extract features of next token (except if the current token is the last)
    if i != len(data) - 1:
        next_word_data = data.loc[i + 1]
        for k, v in next_word_data.to_dict().items():
            feature_dict[f"next_{k}"] = v
            
    return feature_dict

In [18]:
window = 1
X_train = np.array([prepareDatasetFeatures(data=df_train, i=i, window = window) for i in df_train.index])
y_train = df_train['Label'].values
X_train.shape, y_train.shape

((39857,), (39857,))

In [19]:
X_train[0]

{'Token': '01',
 'lower': '01',
 'isdigit': True,
 'isupper': False,
 'ispunct': False,
 'isstop': False,
 'len': 2,
 'shape': 'xx',
 'next_Token': '4.5.3',
 'next_lower': '4.5.3',
 'next_isdigit': False,
 'next_isupper': False,
 'next_ispunct': False,
 'next_isstop': False,
 'next_len': 5,
 'next_shape': 'xxxxx'}

# CRF Grid Search

In [20]:
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, train_size=.7, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, train_size=.5, random_state=42)

X_train.shape, X_dev.shape, y_train.shape, y_dev.shape, X_test.shape, y_test.shape

((27899,), (5979,), (27899,), (5979,), (5979,), (5979,))

In [21]:

# I had errors while trying RandomizedSearchCV, so I had to implement something a bit similar

c1 = [0.5, 0.1, 0.01, 0.001, 0.0001]
c2 = [0.5, 0.1, 0.01, 0.001, 0.0001]

param_space = []

# generates parameters combinations
for i in range(len(c1)):
    for j in range(len(c2)):
        param_space.append((c1[i], c2[j]))

# trains models for each possible combination of paramaters
crf_models = []
for param in param_space:
    tmp_crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=param[0],
        c2=param[1],
        # max_iterations=500,
        all_possible_transitions=True
    )
    tmp_crf.fit([X_train], [y_train])
    crf_models.append(tmp_crf)




In [23]:

crf_scores = []
for x in crf_models:
    y_pred = x.predict([X_dev])
    # score = f1_score(y_dev, y_pred[0], average='micro')
    score = sklearn_crfsuite.metrics.flat_accuracy_score([y_dev], y_pred)
    # print(x.score(, [y_dev]))
    crf_scores.append(score)

print('Best param:', param_space[np.argmax(crf_scores)])

Best param: (0.001, 0.1)


# Training CRF model with best parameters on whole dataset

In [25]:
c1 = 0.001
c2 = 0.1

X_train = np.array([prepareDatasetFeatures(data=df_train, i=i, window=window) for i in df_train.index])
y_train = df_train['Label'].values

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=c1,
    c2=c2,
    all_possible_transitions=True
)
crf.fit([X_train], [y_train])

### Save model

In [26]:
os.makedirs('../models', exist_ok=True)
joblib.dump(crf, f'../models/crf_{c1}_{c2}_{window}.joblib')

['../models/crf_0.001_0.1_1.joblib']

### Evaluation

### Load model

In [27]:
copy_test = df_test.copy(deep=True)
cleanData(copy_test)
extractFeatures(copy_test)
copy_test

Unnamed: 0,Id,Token,lower,isdigit,isupper,ispunct,isstop,len,shape
0,0,01,01,True,False,False,False,2,xx
1,1,2.1.4.8,2.1.4.8,False,False,False,False,7,xxxxxxx
2,2,.,.,False,False,True,False,1,x
3,3,Conduites,conduites,False,False,False,False,9,Xxxxxxxxx
4,4,sous-marines,sous-marines,False,False,False,False,12,xxxxxxxxxxxx
...,...,...,...,...,...,...,...,...,...
26168,26168,103,103,True,False,False,False,3,xxx
26169,26169,/,/,False,False,True,False,1,x
26170,26170,0,0,True,False,False,False,1,x
26171,26171,103,103,True,False,False,False,3,xxx


In [28]:
X = np.array([prepareDatasetFeatures(data=copy_test, i=i, window=window) for i in copy_test.index])

In [29]:
pred = crf.predict([X])
copy_test['Label'] = pred[0]
copy_test

Unnamed: 0,Id,Token,lower,isdigit,isupper,ispunct,isstop,len,shape,Label
0,0,01,01,True,False,False,False,2,xx,aucun
1,1,2.1.4.8,2.1.4.8,False,False,False,False,7,xxxxxxx,aucun
2,2,.,.,False,False,True,False,1,x,aucun
3,3,Conduites,conduites,False,False,False,False,9,Xxxxxxxxx,aucun
4,4,sous-marines,sous-marines,False,False,False,False,12,xxxxxxxxxxxx,aucun
...,...,...,...,...,...,...,...,...,...,...
26168,26168,103,103,True,False,False,False,3,xxx,aucun
26169,26169,/,/,False,False,True,False,1,x,aucun
26170,26170,0,0,True,False,False,False,1,x,aucun
26171,26171,103,103,True,False,False,False,3,xxx,aucun


In [30]:
os.makedirs('../submissions', exist_ok=True)
submission = copy_test[['Id', 'Label']]
submission.to_csv(f'../submissions/submission_crf_only_{c1}_{c2}_{window}.csv', index=False)

# Binary CRF classifier

In [None]:
#

In [12]:
def binarizeLabels(labels:List[str]) -> np.array:
    """Binarizes labels as either "aucun" or "NER"

    :param labels: List of labels to binarize
    :type labels: List[str]
    :return: Binarized list of labels
    :rtype: np.array
    """
    return np.array(['NER' if x != 'aucun' else 'aucun' for x in labels])
y_train = binarizeLabels(y_train)

[(0.5, 0.5), (0.5, 0.1), (0.5, 0.01), (0.5, 0.001), (0.5, 0.0001), (0.1, 0.5), (0.1, 0.1), (0.1, 0.01), (0.1, 0.001), (0.1, 0.0001), (0.01, 0.5), (0.01, 0.1), (0.01, 0.01), (0.01, 0.001), (0.01, 0.0001), (0.001, 0.5), (0.001, 0.1), (0.001, 0.01), (0.001, 0.001), (0.001, 0.0001), (0.0001, 0.5), (0.0001, 0.1), (0.0001, 0.01), (0.0001, 0.001), (0.0001, 0.0001)]


# CRF Grid Search

In [15]:
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, train_size=.7, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, train_size=.5, random_state=42)

X_train.shape, X_dev.shape, y_train.shape, y_dev.shape, X_test.shape, y_test.shape

((27899,), (5979,), (27899,), (5979,), (5979,), (5979,))

In [14]:

# I had errors while trying RandomizedSearchCV, so I had to implement something a bit similar

c1 = [0.5, 0.1, 0.01, 0.001, 0.0001]
c2 = [0.5, 0.1, 0.01, 0.001, 0.0001]

param_space = []

# generates parameters combinations
for i in range(len(c1)):
    for j in range(len(c2)):
        param_space.append((c1[i], c2[j]))

# trains models for each possible combination of paramaters
crf_models = []
for param in param_space:
    tmp_crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=param[0],
        c2=param[1],
        # max_iterations=500,
        all_possible_transitions=True
    )
    tmp_crf.fit([X_train], [y_train])
    crf_models.append(tmp_crf)

crf_scores = []
for x in crf_models:
    y_pred = x.predict([X_dev])
    score = sklearn_crfsuite.metrics.flat_accuracy_score([y_dev], y_pred)
    # print(x.score(, [y_dev]))
    crf_scores.append(score)

print('Best param:', param_space[np.argmax(crf_scores)])

NameError: name 'X_dev' is not defined

# Training CRF model with best parameters on whole dataset

In [17]:
# Best param: (0.0001, 0.1)

X_train = np.array([prepareDatasetFeatures(data=df_train, i=i) for i in df_train.index])
y_train = binarizeLabels(df_train['Label'].values)

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.0001,
    c2=0.1,
    # max_iterations=500,
    all_possible_transitions=True
)
crf.fit([X_train], [y_train])

In [21]:
os.makedirs('../models', exist_ok=True)
joblib.dump(crf, '../models/crf_binary.joblib')

['../models/crf.joblib']