# Pipeline

Pipeline combining the CRF and classifier (KNN, XGBoost) to extract and classify ENE candidates

In [11]:
import gensim 
import pandas as pd
import numpy as np
import spacy 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, make_scorer

from gensim.models import KeyedVectors


import scipy

import joblib
import os

import string

import joblib

from typing import List


In [30]:
class Model:

    def __init__(self, extractor, classifier, embeddings, class_labels):
        self.extractor = extractor
        self.classifier = classifier
        self.embeddings = embeddings
        self.class_labels = class_labels

    def cleanData(self, data:pd.DataFrame) -> None:
        """Simple function to clean Tokens

        :param data: DataFrame containing the data
        :type data: pd.DataFrame
        """
        data['Token'] = data['Token'].str.replace('"', '')


    def getStringShape(self, string:str) -> str:
        """Get the shape of the string as either X for capital letters and x for normal letters

        :param string: String to get the shape of
        :type string: str
        :return: Shape of the string (e.g. Port: Xxxx)
        :rtype: str
        """
        shape = ['X' if x.isupper() else 'x' for x in string]
        return ''.join(shape)
        
    def extractCRFFeatures(self, data:pd.DataFrame):
        """
        Extract features from Token column of the dataframe

        :param data: DataFrame contianing the data
        :type data: pd.DataFrame
        """
        data['lower'] = [x.lower() for x in data['Token']]
        data['isdigit'] = [x.isdigit() for x in data['Token']]
        data['isupper'] = [x.isupper() for x in data['Token']]
        data['ispunct'] = [x in string.punctuation for x in data['Token']]
        data['isstop'] = [x in nlp.Defaults.stop_words for x in data['Token']]
        data['len'] = [len(x) for x in data['Token']]
        data['shape'] = [self.getStringShape(x) for x in data['Token']]
        # data['istitle'] = [x.istitle() for x in data['Token']]


    def prepareDatasetFeatures(self, data:pd.DataFrame, i:int) -> dict:
        """Process the features of a token extracted with extractFeatures and
        prepares them in the format required by sklearn_crfsuite
        Note: this functions takes a lot of time somehow, so it probably needs to be
        refactored.

        :param data: DataFrame containing the token and their features
        :type data: pd.DataFrame
        :param i: index of the token
        :type i: int
        :return: Dictionary where keys are the name of the feature, and values are the features
        :rtype: dict
        """
    #     # word_vector = vectorize(we=we, word=word['Token'])
        ignore_col = ['Id', 'Label']

        data = data[[x for x in data.columns if x not in ignore_col]]
        word_data = data.loc[i]
        feature_dict = word_data.to_dict()

        if i != 0:
            prev_word_data = data.loc[i - 1]
            for k, v in prev_word_data.to_dict().items():
                feature_dict[f"prev_{k}"] = v
        
        if i != len(data) - 1:
            next_word_data = data.loc[i + 1]
            for k, v in next_word_data.to_dict().items():
                feature_dict[f"next_{k}"] = v
                
        return feature_dict

    def binarizeLabels(self, labels:List[str]) -> np.array:
        """Binarizes labels as either "aucun" or "NER"

        :param labels: List of labels to binarize
        :type labels: List[str]
        :return: Binarized list of labels
        :rtype: np.array
        """
        return np.array(['NER' if x != 'aucun' else 'aucun' for x in labels])

    def vectorize(self, word:str) -> np.memmap:
        """Vectorize a word into a word embedding vector

        :param we: Word embedding to use
        :type we: KeyedVectors
        :param word: Word to convert
        :type word: str
        :return: Embeddings corresponding to this word
        :rtype: np.memmap
        """
        try:
            vector = self.embeddings.word_vec(word)
        except:
            try:
                vector = self.embeddings.word_vec(word.lower())
            except:
                vector = np.zeros(300)
        return vector

    def extractCandidates(self, X:pd.DataFrame) -> pd.DataFrame :
        """Use the CRF extractor to extract NER candidate from dataset
        First extract the required features, then predict and update the dataset

        :param X: Dataset to classify
        :type X: pd.DataFrame
        :return: Updated dataset with predictions
        :rtype: pd.DataFrame
        """
        # first clean tokens
        self.cleanData(X)
        # extract required features
        self.extractCRFFeatures(X)
        X_features = np.array([self.prepareDatasetFeatures(data=X, i=i) for i in X.index])
        pred = self.extractor.predict([X_features])
        X['crf_pred'] = pred[0]
        return X

    def classifyCandidates(self, X:pd.DataFrame) -> pd.DataFrame:
        """Use the classifier to label the candidates extracted by the CRF extractor

        :param X: Dataset to classify
        :type X: pd.DataFrame
        :return: Updated dataset with predictions
        :rtype: pd.DataFrame
        """
        

        # selects token labelled "NER" by CRF
        sub_X = X[X['crf_pred'] == 'NER']

        # converts tokens into word embeddings
        X_features = np.array([self.vectorize(word=x) for x in sub_X['Token']])

        # output matrix of prediction for each token labelled NER
        pred = pd.DataFrame(self.classifier.predict(X_features), columns = self.class_labels)

        # joins predictions to original dataset
        pred['Id'] = sub_X['Id'].values
        pred = pred.set_index('Id')
        X = X.join(pred)
        
        # makes sure every label is set to 0 when the CRF classifier (extractor)
        # has labelled "aucun" a token
        for col in self.class_labels:
                X.loc[X['crf_pred'] == 'aucun', col] = 0

        return X
    
    def getPrediction(self, X:pd.DataFrame) -> pd.DataFrame:
        """Reads the output of the classifier and add a 'Label' column
        to the dataset which contains the prediction in the required format
        (ie in one Label and not multiple)

        :param X: Dataset to classify
        :type X: pd.DataFrame
        :return: Updated dataset with predictions
        :rtype: pd.DataFrame
        """

        X.loc[
            (X['geogFeat'] == 1) &
            (X['geogName'] == 0) &
            (X['name'] == 0), 'Label'] = 'geogFeat'

        X.loc[
            (X['geogFeat'] == 0) &
            (X['geogName'] == 1) &
            (X['name'] == 0), 'Label'] = 'geogName'


        X.loc[
            (X['geogFeat'] == 1) &
            (X['geogName'] == 1) &
            (X['name'] == 0), 'Label'] = 'geogFeat geogName'
        

        X.loc[
            (X['geogFeat'] == 0) &
            (X['geogName'] == 1) &
            (X['name'] == 1), 'Label'] = 'geogName name'


        X['Label'].fillna('aucun', inplace=True)
        return X
    #     array(['aucun', 'geogFeat geogName', 'geogName', 'geogName name',
    #    'geogFeat'

    def predict(self, X:pd.DataFrame) -> pd.DataFrame:
        """Main function to apply the pipeline: candidate extraction then classification

        :param X: _description_
        :type X: pd.DataFrame
        :return: _description_
        :rtype: pd.DataFrame
        """
        copy_X = X.copy(deep=True)

        copy_X = self.extractCandidates(copy_X)
        copy_X = self.classifyCandidates(copy_X)
        copy_X = self.getPrediction(copy_X)
        return copy_X 



# Loading models

In [8]:
knn = joblib.load('../models/knn.joblib')

In [23]:
xgboost = joblib.load('../models/xgboost.joblib')

In [9]:
crf = joblib.load('../models/crf.joblib')

In [13]:
embeddings_path = '../embeddings/conceptnet_fr-19.08-clean_simpler'
we = KeyedVectors.load(embeddings_path, mmap='r')

In [16]:
df_train = pd.read_csv('../data/train_2.csv')
df_test = pd.read_csv('../data/test.csv')
df_sample = pd.read_csv('../data/sample_submission_2.csv')

In [26]:
nlp = spacy.load('fr_core_news_lg')

In [18]:
class_labels = ['geogFeat', 'geogName', 'name']

# CRF + KNN Pipeline

In [31]:
knn_pipeline = Model(extractor=crf, classifier=knn, embeddings=we, class_labels = class_labels)

In [32]:
predictions = knn_pipeline.predict(df_test)


  vector = self.embeddings.word_vec(word)
  vector = self.embeddings.word_vec(word.lower())


In [34]:
for tok, label in zip(predictions['Token'], predictions['Label']):
    print(tok, label)

01 aucun
2.1.4.8 aucun
. aucun
Conduites aucun
sous-marines aucun
07 aucun
Arrêtés aucun
1974 aucun
- aucun
961 aucun
ADM aucun
/ aucun
SA aucun
du aucun
6 aucun
décembre aucun
1974 aucun
et aucun
1998 aucun
- aucun
002 aucun
du aucun
15 aucun
janvier aucun
1998 aucun
du aucun
préfet aucun
maritime aucun
de aucun
la aucun
Méditerranée geogName
( aucun
www.premar-mediterranee.gouv.fr/arretes.html aucun
) aucun
, aucun
courrier aucun
des aucun
Phares geogName
et geogName
Balises geogName
du aucun
9 aucun
septembre aucun
1968 aucun
. aucun
2107 aucun
08 aucun
Émissaires aucun
de aucun
rejet aucun
protégés aucun
par aucun
des aucun
zones geogName
réglementées geogName
. aucun
13 aucun
Situation aucun
: aucun
zones geogName
portées geogName
sur aucun
les aucun
cartes aucun
, aucun
devant aucun
les aucun
ports geogName
de aucun
: aucun
– aucun
Cerbère geogName
( aucun
42 aucun
° aucun
26,52 aucun
' aucun
N aucun
— aucun
3 aucun
° aucun
10,26 aucun
' aucun
E aucun
) aucun
; aucun
– aucun
Bany

In [37]:
os.makedirs('../submissions', exist_ok=True)
submission = predictions[['Id', 'Label']]
submission.to_csv('../submissions/submission_knn_1.csv', index=False)

In [38]:
submission

Unnamed: 0,Id,Label
0,0,aucun
1,1,aucun
2,2,aucun
3,3,aucun
4,4,aucun
...,...,...
26168,26168,aucun
26169,26169,aucun
26170,26170,aucun
26171,26171,aucun


# CRF + XGBoost Pipeline

In [40]:
xgb_pipeline = Model(extractor=crf, classifier=xgboost, embeddings=we, class_labels = class_labels)

In [41]:
predictions = xgb_pipeline.predict(df_test)


  vector = self.embeddings.word_vec(word)
  vector = self.embeddings.word_vec(word.lower())


In [42]:
for tok, label in zip(predictions['Token'], predictions['Label']):
    print(tok, label)

01 aucun
2.1.4.8 aucun
. aucun
Conduites aucun
sous-marines aucun
07 aucun
Arrêtés aucun
1974 aucun
- aucun
961 aucun
ADM aucun
/ aucun
SA aucun
du aucun
6 aucun
décembre aucun
1974 aucun
et aucun
1998 aucun
- aucun
002 aucun
du aucun
15 aucun
janvier aucun
1998 aucun
du aucun
préfet aucun
maritime aucun
de aucun
la aucun
Méditerranée geogFeat geogName
( aucun
www.premar-mediterranee.gouv.fr/arretes.html aucun
) aucun
, aucun
courrier aucun
des aucun
Phares geogFeat geogName
et geogFeat geogName
Balises geogFeat geogName
du aucun
9 aucun
septembre aucun
1968 aucun
. aucun
2107 aucun
08 aucun
Émissaires aucun
de aucun
rejet aucun
protégés aucun
par aucun
des aucun
zones geogFeat geogName
réglementées geogFeat geogName
. aucun
13 aucun
Situation aucun
: aucun
zones geogFeat geogName
portées geogFeat geogName
sur aucun
les aucun
cartes aucun
, aucun
devant aucun
les aucun
ports geogFeat geogName
de aucun
: aucun
– aucun
Cerbère geogFeat geogName
( aucun
42 aucun
° aucun
26,52 aucun
' aucu

In [45]:
os.makedirs('../submissions', exist_ok=True)
submission = predictions[['Id', 'Label']]
submission.to_csv('../submissions/submission_xgb_1.csv', index=False)

In [46]:
submission

Unnamed: 0,Id,Label
0,0,aucun
1,1,aucun
2,2,aucun
3,3,aucun
4,4,aucun
...,...,...
26168,26168,aucun
26169,26169,aucun
26170,26170,aucun
26171,26171,aucun


### Load model

In [None]:
copy_test = final_model.extractCandidates(df_test)

In [None]:
copy_test = final_model.classifyCandidates(copy_test)


  vector = self.embeddings.word_vec(word)
  vector = self.embeddings.word_vec(word.lower())


In [None]:
copy_test = final_model.getPrediction(copy_test)

In [None]:
copy_test

Unnamed: 0,Id,Token,lower,isdigit,isupper,ispunct,isstop,len,shape,crf_pred,geogFeat,geogName,name,Label
0,0,01,01,True,False,False,False,2,xx,aucun,0.0,0.0,0.0,aucun
1,1,2.1.4.8,2.1.4.8,False,False,False,False,7,xxxxxxx,aucun,0.0,0.0,0.0,aucun
2,2,.,.,False,False,True,False,1,x,aucun,0.0,0.0,0.0,aucun
3,3,Conduites,conduites,False,False,False,False,9,Xxxxxxxxx,aucun,0.0,0.0,0.0,aucun
4,4,sous-marines,sous-marines,False,False,False,False,12,xxxxxxxxxxxx,aucun,0.0,0.0,0.0,aucun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26168,26168,103,103,True,False,False,False,3,xxx,aucun,0.0,0.0,0.0,aucun
26169,26169,/,/,False,False,True,False,1,x,aucun,0.0,0.0,0.0,aucun
26170,26170,0,0,True,False,False,False,1,x,aucun,0.0,0.0,0.0,aucun
26171,26171,103,103,True,False,False,False,3,xxx,aucun,0.0,0.0,0.0,aucun


In [None]:
copy_test = final_model.predict(df_test)


  vector = self.embeddings.word_vec(word)
  vector = self.embeddings.word_vec(word.lower())


Unnamed: 0,Id,Label
0,0,aucun
1,1,aucun
2,2,aucun
3,3,aucun
4,4,aucun
...,...,...
26168,26168,aucun
26169,26169,aucun
26170,26170,aucun
26171,26171,aucun


In [None]:
for tok, label in zip(copy_test['Token'], copy_test['Label']):
    print(tok, label)

01 aucun
2.1.4.8 aucun
. aucun
Conduites aucun
sous-marines aucun
07 aucun
Arrêtés aucun
1974 aucun
- aucun
961 aucun
ADM aucun
/ aucun
SA aucun
du aucun
6 aucun
décembre aucun
1974 aucun
et aucun
1998 aucun
- aucun
002 aucun
du aucun
15 aucun
janvier aucun
1998 aucun
du aucun
préfet aucun
maritime aucun
de aucun
la aucun
Méditerranée geogFeat geogName
( aucun
www.premar-mediterranee.gouv.fr/arretes.html aucun
) aucun
, aucun
courrier aucun
des aucun
Phares geogFeat geogName
et geogFeat geogName
Balises geogFeat geogName
du aucun
9 aucun
septembre aucun
1968 aucun
. aucun
2107 aucun
08 aucun
Émissaires aucun
de aucun
rejet aucun
protégés aucun
par aucun
des aucun
zones geogFeat geogName
réglementées geogFeat geogName
. aucun
13 aucun
Situation aucun
: aucun
zones geogFeat geogName
portées geogFeat geogName
sur aucun
les aucun
cartes aucun
, aucun
devant aucun
les aucun
ports geogFeat geogName
de aucun
: aucun
– aucun
Cerbère geogFeat geogName
( aucun
42 aucun
° aucun
26,52 aucun
' aucu

In [None]:
os.makedirs('submissions', exist_ok=True)
submission = copy_test[['Id', 'Label']]
submission.to_csv('submissions/submission1.csv', index=False)

In [None]:
submission

Unnamed: 0,Id,Label
0,0,aucun
1,1,aucun
2,2,aucun
3,3,aucun
4,4,aucun
...,...,...
26168,26168,aucun
26169,26169,aucun
26170,26170,aucun
26171,26171,aucun
