# CRF Classifier

To extract Extended NER candidates

In [17]:
import json
import os
import string
from typing import List

import joblib
import numpy as np
import pandas as pd
import sklearn_crfsuite
import spacy
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn_crfsuite import metrics
from spacy.tokens import Doc


In [39]:
def cleanData(data:pd.DataFrame) -> None:
    """Strip every double-quote character from the ``Token`` column, in place.

    :param data: DataFrame holding a string ``Token`` column; it is modified directly
    :type data: pd.DataFrame
    """
    # literal (non-regex) removal of each '"' character found in a token
    unquoted = data['Token'].str.replace('"', '', regex=False)
    data['Token'] = unquoted

def getLinguisticFeatures(list_tokens:List[str], nlp) -> dict:
    """Run the spaCy pipeline over pre-tokenized text and collect POS and dependency tags.

    The tokens are wrapped in one single ``Doc`` built directly from the given
    words, then every component listed in ``nlp.pipeline`` is applied to it.

    :param list_tokens: Tokens, in document order
    :type list_tokens: List[str]
    :param nlp: Loaded spaCy pipeline (provides ``vocab`` and ``pipeline``)
    :return: Dictionary with keys ``"POS"`` and ``"DEP"``, each a list holding one tag per token
    :rtype: dict
    """
    # materialize as a plain list: callers hand over a pandas Series
    doc = Doc(nlp.vocab, words=list(list_tokens))
    # nlp.pipeline yields (name, component) pairs; keep the Doc each component returns
    for _, component in nlp.pipeline:
        doc = component(doc)
    return {
        "POS": [token.pos_ for token in doc],
        'DEP': [token.dep_ for token in doc]
    }

def getStringShape(word:str) -> str:
    """Map every character of a word to a one-letter class and return the resulting pattern.

    ``X`` stands for an uppercase character, ``d`` for a digit, ``.`` for a
    character listed in ``string.punctuation`` and ``x`` for anything else.

    :param word: String to get the shape of
    :type word: str
    :return: Shape of the string (e.g. Port: Xxxx)
    :rtype: str
    """
    def classify(char:str) -> str:
        # checks are ordered: uppercase first, then digit, then punctuation
        if char.isupper():
            return 'X'
        if char.isdigit():
            return 'd'
        if char in string.punctuation:
            return '.'
        return 'x'

    return ''.join(map(classify, word))
    
def extractFeatures(data:pd.DataFrame, nlp)->None:
    """
    Extract features from Token column of the dataframe

    The following columns are added to ``data`` in place: ``lower``,
    ``isdigit``, ``isupper``, ``ispunct``, ``isstop``, ``len``, ``shape``,
    ``pos`` and ``dep``.

    :param data: DataFrame containing the data (needs a string ``Token`` column)
    :type data: pd.DataFrame
    :param nlp: Loaded spaCy pipeline, used for its stop-word list and for POS / dependency tagging
    """
    # read the column once instead of re-iterating the Series for every feature
    tokens = data['Token'].tolist()
    punctuation = set(string.punctuation)
    stop_words = nlp.Defaults.stop_words

    data['lower'] = [tok.lower() for tok in tokens]
    data['isdigit'] = [tok.isdigit() for tok in tokens]
    data['isupper'] = [tok.isupper() for tok in tokens]
    # a token is punctuation when it is non-empty and made only of punctuation
    # characters; a plain `tok in string.punctuation` is a substring test, which
    # flags '' as punctuation and misses tokens such as '...'
    data['ispunct'] = [bool(tok) and all(ch in punctuation for ch in tok) for tok in tokens]
    # NOTE(review): membership is case-sensitive — confirm whether capitalized stop words should match
    data['isstop'] = [tok in stop_words for tok in tokens]
    data['len'] = [len(tok) for tok in tokens]
    data['shape'] = [getStringShape(tok) for tok in tokens]

    # the whole column is tagged as one document, so tags come back in row order
    linguistic_features = getLinguisticFeatures(tokens, nlp)
    data['pos'] = linguistic_features['POS']
    data['dep'] = linguistic_features['DEP']

def prepareDatasetFeatures(data:pd.DataFrame, i:int) -> dict:
    """Process the features of a token extracted with extractFeatures and
    prepares them in the format required by sklearn_crfsuite

    The result holds the token's own features, the features of the preceding
    token under ``prev_<name>`` keys and those of the following token under
    ``next_<name>`` keys. The ``Id`` and ``Label`` columns are never used as
    features.

    Only the rows that are needed are read; the frame itself is not copied, so
    calling this once per token stays linear in the number of tokens.

    :param data: DataFrame containing the token and their features
    :type data: pd.DataFrame
    :param i: index of the token (a label of ``data.index``; neighbours are looked up at ``i - 1`` and ``i + 1``)
    :type i: int
    :return: Dictionary where keys are the name of the feature, and values are the features
    :rtype: dict
    """
    # columns to ignore in DataFrame
    ignore_col = {'Id', 'Label'}

    def row_features(label:int, prefix:str = '') -> dict:
        # read a single row and drop the ignored columns from it, instead of
        # building a column-filtered copy of the whole frame on every call
        return {
            f"{prefix}{name}": value
            for name, value in data.loc[label].to_dict().items()
            if name not in ignore_col
        }

    feature_dict = row_features(i)

    # extract features of preceding token (except if the current token is the first)
    if i != 0:
        feature_dict.update(row_features(i - 1, 'prev_'))

    # extract features of next token (except if the current token is the last)
    if i != len(data) - 1:
        feature_dict.update(row_features(i + 1, 'next_'))

    return feature_dict

def gridSearch(param_space:List[tuple]) -> tuple:
    """Pick the (c1, c2) pair that scores best on the dev split.

    Hand-rolled stand-in for RandomizedSearchCV, which raised errors here.
    One CRF is fitted per candidate on the module-level ``X_train`` /
    ``y_train`` and scored on the module-level ``X_dev`` / ``y_dev``; those
    four names must be defined before this function is called.

    :param param_space: List of tuple containing the combination of parameters to test
    :type param_space: List[tuple]
    :return: Tuple with the set of parameters reaching the best micro f1 score
    :rtype: tuple
    """

    def dev_score(c1, c2):
        # each split is handed to the CRF as one single sequence, hence the extra list level
        model = sklearn_crfsuite.CRF(c1=c1, c2=c2)
        model.fit([X_train], [y_train])
        predicted = model.predict([X_dev])
        return f1_score(y_dev, predicted[0], average='micro')

    crf_scores = [dev_score(candidate[0], candidate[1]) for candidate in param_space]

    # np.argmax returns the first index of the maximum, so ties go to the earliest candidate
    return param_space[np.argmax(crf_scores)]

def trainModel(df_train:pd.DataFrame, features_col:List[str], best_param:tuple, savepath:str):
    """Fit a CRF on the whole training frame and save it together with its settings.

    Two files are written into ``savepath`` (created if missing):
    ``crf.joblib`` with the fitted model and ``params.json`` with ``c1``,
    ``c2`` and the list of feature columns.

    :param df_train: Training data; must contain a ``Label`` column and the columns in ``features_col``
    :type df_train: pd.DataFrame
    :param features_col: Columns handed to prepareDatasetFeatures
    :type features_col: List[str]
    :param best_param: ``(c1, c2)`` regularization values for the CRF
    :type best_param: tuple
    :param savepath: Directory receiving the model and the parameter file
    :type savepath: str
    :return: The fitted ``sklearn_crfsuite.CRF`` model
    """

    c1 = best_param[0]
    c2 = best_param[1]

    print('Preparing features...')
    # slice the feature columns once; doing it inside the comprehension
    # rebuilds the same sub-frame for every single token
    train_features = df_train[features_col]
    X_train = np.array([prepareDatasetFeatures(data=train_features, i=i) for i in train_features.index])
    y_train = df_train['Label'].values

    print('Training Model...')
    crf = sklearn_crfsuite.CRF(
        c1=c1,
        c2=c2,
    )
    # the whole frame is passed as one single sequence, hence the extra list level
    crf.fit([X_train], [y_train])

    print('Saving Model...')
    os.makedirs(savepath, exist_ok=True)
    joblib.dump(crf, f'{savepath}/crf.joblib')

    # saving set of features used
    with open(f"{savepath}/params.json", 'w', encoding='utf-8') as f:
        json.dump(
            {
                "c1": c1,
                "c2": c2,
                "features": list(features_col)
            }, f, indent=4
        )
    print('Done')

    return crf

def evaluate(df_test:pd.DataFrame, features_col:List[str], crf, savepath:str) -> None:
    """Predict a label for every token of ``df_test`` and write a submission file.

    No score is computed: the predictions are stored in a ``Label`` column of
    a copy of ``df_test`` and the ``Id`` / ``Label`` columns are written to
    ``<savepath>/submission.csv``.

    :param df_test: Test data; must contain ``Id`` and the feature columns
    :type df_test: pd.DataFrame
    :param features_col: Columns handed to prepareDatasetFeatures (``Label`` is dropped from the list)
    :type features_col: List[str]
    :param crf: Fitted model exposing ``predict``
    :param savepath: Directory receiving ``submission.csv`` (created if missing)
    :type savepath: str
    """

    features_col = [x for x in features_col if x != 'Label']
    # makes a deep copy of df_test so as not to modify it
    copy_test = df_test.copy(deep=True)
    print('Preparing features...')
    # slice the feature columns once rather than once per token
    test_features = copy_test[features_col]
    X = np.array([prepareDatasetFeatures(data=test_features, i=i) for i in test_features.index])

    print('Predicting...')
    # one single sequence in, one list of labels out
    pred = crf.predict([X])
    copy_test['Label'] = pred[0]

    print('Saving results...')
    # do not rely on trainModel having created the directory beforehand
    os.makedirs(savepath, exist_ok=True)
    submission = copy_test[['Id', 'Label']]
    submission.to_csv(f'{savepath}/submission.csv', index=False)

    print('Done')

In [7]:
# Load the spaCy pipeline: extractFeatures uses its stop-word list, and
# getLinguisticFeatures runs its components to get each token's POS and dependency tags
nlp = spacy.load('fr_core_news_lg')

In [30]:

# Load both splits. Each token in the CSV files is surrounded by quotes, which
# would cause issues later, so the Token column is cleaned first; the feature
# columns are then added to each frame in place.
df_train = pd.read_csv('../data/train_2.csv')
df_test = pd.read_csv('../data/test.csv')

for frame in (df_train, df_test):
    cleanData(frame)
    extractFeatures(frame, nlp=nlp)


# Test with every feature except POS and DEP

In [12]:
# drop the last two columns (pos and dep, which extractFeatures adds last)
features_col = df_train.columns[:-2]
# slice the feature columns once: selecting them inside the comprehension
# rebuilds the same sub-frame for every token
train_features = df_train[features_col]
X_train = np.array([prepareDatasetFeatures(data=train_features, i=i) for i in train_features.index])
y_train = df_train['Label'].values
X_train.shape, y_train.shape

((39857,), (39857,))

In [14]:
# inspect the feature dictionary built for the token at position 1
X_train[1]

{'Token': '4.5.3',
 'lower': '4.5.3',
 'isdigit': False,
 'isupper': False,
 'ispunct': False,
 'isstop': False,
 'len': 5,
 'shape': 'd.d.d',
 'prev_Token': '01',
 'prev_lower': '01',
 'prev_isdigit': True,
 'prev_isupper': False,
 'prev_ispunct': False,
 'prev_isstop': False,
 'prev_len': 2,
 'prev_shape': 'dd',
 'next_Token': '.',
 'next_lower': '.',
 'next_isdigit': False,
 'next_isupper': False,
 'next_ispunct': True,
 'next_isstop': False,
 'next_len': 1,
 'next_shape': '.'}

## CRF Grid Search

In [15]:
# 70% of the tokens go to train; the remaining 30% is halved into dev and test
# NOTE(review): train_test_split shuffles by default, while gridSearch feeds each
# split to the CRF as one single sequence — confirm that losing token order is intended
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, train_size=.7, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, train_size=.5, random_state=42)

X_train.shape, X_dev.shape, y_train.shape, y_dev.shape, X_test.shape, y_test.shape

((27899,), (5979,), (27899,), (5979,), (5979,), (5979,))

In [16]:
# candidate (c1, c2) pairs: the full cross product of both value lists

c1 = [0.5, 0.1, 0.01, 0.001, 0.0001]
c2 = [0.5, 0.1, 0.01, 0.001, 0.0001]

param_space = [(c1_value, c2_value) for c1_value in c1 for c2_value in c2]


# fit one CRF per candidate pair and keep the pair with the best micro F1 on the dev split
best_param = gridSearch(param_space)
print('Best param:', best_param)



Best param: (0.01, 0.1)


### Training CRF model with best parameters on whole dataset and saving model

In [34]:
# retrain on all of df_train with the selected parameters; trainModel writes
# crf.joblib and params.json into savepath
savepath = '../models/base_features'
base_features_crf = trainModel(df_train, features_col, best_param, savepath)


Preparing features...
Training Model...
Saving Model...
Done


### Predictions on test set and generating submission file

In [37]:
# display the test frame together with its extracted feature columns
df_test

Unnamed: 0,Id,Token,lower,isdigit,isupper,ispunct,isstop,len,shape,pos,dep
0,0,01,01,True,False,False,False,2,dd,NOUN,ROOT
1,1,2.1.4.8,2.1.4.8,False,False,False,False,7,d.d.d.d,NUM,nmod
2,2,.,.,False,False,True,False,1,.,PUNCT,punct
3,3,Conduites,conduites,False,False,False,False,9,Xxxxxxxxx,NOUN,ROOT
4,4,sous-marines,sous-marines,False,False,False,False,12,xxxx.xxxxxxx,ADJ,amod
...,...,...,...,...,...,...,...,...,...,...,...
26168,26168,103,103,True,False,False,False,3,ddd,NUM,nummod
26169,26169,/,/,False,False,True,False,1,.,SYM,case
26170,26170,0,0,True,False,False,False,1,d,NUM,nummod
26171,26171,103,103,True,False,False,False,3,ddd,NUM,nmod


In [38]:


# predict a label for every test token and write <savepath>/submission.csv
evaluate(df_test, features_col, base_features_crf, savepath)

Preparing features...
Predicting...
Saving results...
Done


----------------

# Test with every feature plus POS, without DEP

In [41]:
# drop only the last column (dep, which extractFeatures adds last), so pos is kept
features_col = df_train.columns[:-1]
# slice the feature columns once: selecting them inside the comprehension
# rebuilds the same sub-frame for every token
train_features = df_train[features_col]
X_train = np.array([prepareDatasetFeatures(data=train_features, i=i) for i in train_features.index])
y_train = df_train['Label'].values
X_train.shape, y_train.shape

((39857,), (39857,))

### CRF Grid Search

In [42]:
# 70% of the tokens go to train; the remaining 30% is halved into dev and test
# NOTE(review): train_test_split shuffles by default, while gridSearch feeds each
# split to the CRF as one single sequence — confirm that losing token order is intended
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, train_size=.7, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, train_size=.5, random_state=42)

X_train.shape, X_dev.shape, y_train.shape, y_dev.shape, X_test.shape, y_test.shape

((27899,), (5979,), (27899,), (5979,), (5979,), (5979,))

In [44]:
# candidate (c1, c2) pairs: the full cross product of both value lists

c1 = [0.5, 0.1, 0.01, 0.001, 0.0001]
c2 = [0.5, 0.1, 0.01, 0.001, 0.0001]

param_space = [(c1_value, c2_value) for c1_value in c1 for c2_value in c2]


# fit one CRF per candidate pair and keep the pair with the best micro F1 on the dev split
best_param = gridSearch(param_space)
print('Best param:', best_param)



Best param: (0.1, 0.01)


### Training CRF model with best parameters on whole dataset and saving model

In [45]:
# retrain on all of df_train with the selected parameters; trainModel writes
# crf.joblib and params.json into savepath
savepath = '../models/features_pos'
features_pos_crf = trainModel(df_train, features_col, best_param, savepath)


Preparing features...
Training Model...
Saving Model...
Done


### Predictions on test set and generating submission file

In [46]:
# predict a label for every test token and write <savepath>/submission.csv
evaluate(df_test, features_col, features_pos_crf, savepath)

Preparing features...
Predicting...
Saving results...
Done


----------------

# Test with every feature, including POS and DEP

In [47]:
# keep every column, pos and dep included (prepareDatasetFeatures ignores Id and Label)
features_col = df_train.columns
# slice the feature columns once: selecting them inside the comprehension
# rebuilds the same sub-frame for every token
train_features = df_train[features_col]
X_train = np.array([prepareDatasetFeatures(data=train_features, i=i) for i in train_features.index])
y_train = df_train['Label'].values
X_train.shape, y_train.shape

### CRF Grid Search

In [None]:
# 70% of the tokens go to train; the remaining 30% is halved into dev and test
# NOTE(review): train_test_split shuffles by default, while gridSearch feeds each
# split to the CRF as one single sequence — confirm that losing token order is intended
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, train_size=.7, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, train_size=.5, random_state=42)

X_train.shape, X_dev.shape, y_train.shape, y_dev.shape, X_test.shape, y_test.shape

((27899,), (5979,), (27899,), (5979,), (5979,), (5979,))

In [None]:
# candidate (c1, c2) pairs: the full cross product of both value lists

c1 = [0.5, 0.1, 0.01, 0.001, 0.0001]
c2 = [0.5, 0.1, 0.01, 0.001, 0.0001]

param_space = [(c1_value, c2_value) for c1_value in c1 for c2_value in c2]


# fit one CRF per candidate pair and keep the pair with the best micro F1 on the dev split
best_param = gridSearch(param_space)
print('Best param:', best_param)



Best param: (0.1, 0.01)


### Training CRF model with best parameters on whole dataset and saving model

In [None]:
# retrain on all of df_train with the selected parameters; trainModel writes
# crf.joblib and params.json into savepath
savepath = '../models/features_pos_dep'
features_pos_dep_crf = trainModel(df_train, features_col, best_param, savepath)


Preparing features...
Training Model...
Saving Model...
Done


### Predictions on test set and generating submission file

In [None]:
# predict a label for every test token and write <savepath>/submission.csv
evaluate(df_test, features_col, features_pos_dep_crf, savepath)

Preparing features...
Predicting...
Saving results...
Done
