# Classification

To classify candidates extracted by the CRF classifier

In [11]:
import gensim 
import pandas as pd
import numpy as np
import spacy 
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV


import scipy

import joblib
import os

import string

from sklearn.multiclass import OneVsRestClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestCentroid

In [None]:
# nlp = spacy.load('fr_core_news_lg')

In [3]:
df_train = pd.read_csv('../data/train_2.csv')
df_test = pd.read_csv('data/test.csv')
df_sample = pd.read_csv('data/sample_submission_2.csv')

In [5]:
from gensim.models import KeyedVectors
embeddings_path = '../embeddings/conceptnet_fr-19.08-clean_simpler'
we = KeyedVectors.load(embeddings_path, mmap='r')

In [6]:
def vectorize(we:KeyedVectors, word:str) -> np.memmap:
    """Vectorize a word into a word embedding vector

    :param we: Word embedding to use
    :type we: KeyedVectors
    :param word: Word to convert
    :type word: str
    :return: Embeddings corresponding to this word
    :rtype: np.memmap
    """
    try:
        vector = we.word_vec(word)
    except:
        try:
            vector = we.word_vec(word.lower())
        except:
            vector = np.zeros(300)
    return vector

In [7]:
def getMultiLabel(data:pd.DataFrame) -> dict:
    """Converts labels from the train set into multi label
    Ignores tokens labelled 'aucun' since those are identified by the CRF classifier

    :param data: Data containing the training data
    :type data: pd.DataFrame
    :return: dictionary containing the label / labels for each token
    :rtype: dict
    """
    y_train_multilabel = {
        # "aucun": [],
        "geogFeat": [],
        'geogName': [],
        'name': []
    }
    for x in data['Label']:
        # if x == 'aucun':
        #     y_train_multilabel['aucun'].append(1)
        #     y_train_multilabel['name'].append(0)
        #     y_train_multilabel['geogFeat'].append(0)
        #     y_train_multilabel['geogName'].append(0)

        # elif x == 'geogFeat geogName':
        if x == 'geogFeat geogName':

            y_train_multilabel['geogFeat'].append(1)
            y_train_multilabel['geogName'].append(1)
            # y_train_multilabel['aucun'].append(0)
            y_train_multilabel['name'].append(0)


        elif x == 'geogName name':
            y_train_multilabel['geogName'].append(1)
            y_train_multilabel['name'].append(1)
            # y_train_multilabel['aucun'].append(0)
            y_train_multilabel['geogFeat'].append(0)

        elif x == 'geogFeat':
            y_train_multilabel['geogFeat'].append(1)
            # y_train_multilabel['aucun'].append(0)
            y_train_multilabel['name'].append(0)
            y_train_multilabel['geogName'].append(0)

        elif x == 'geogName':
            y_train_multilabel['geogName'].append(1)
            # y_train_multilabel['aucun'].append(0)
            y_train_multilabel['name'].append(0)
            y_train_multilabel['geogFeat'].append(0)


    return y_train_multilabel


In [8]:
# ignore tokens labelled 'aucun' as this classifier must classify the output of the CRF classifier
# thus, we will evaluate the models in terms of micro precision, as we want a very precise classifier

cls_X_train = np.array([vectorize(we=we, word=x)for x in df_train[df_train['Label'] != 'aucun']['Token'] ])
y_train_multilabel = pd.DataFrame.from_dict(getMultiLabel(df_train[df_train['Label'] != 'aucun']))



  vector = we.word_vec(word)
  vector = we.word_vec(word.lower())


In [35]:
y_train_multilabel.columns

Index(['geogFeat', 'geogName', 'name'], dtype='object')

In [None]:
# cls_X_train, cls_X_dev, cls_y_train, cls_y_dev = train_test_split(cls_X_train, y_train_multilabel, train_size=.7, random_state=42)
# cls_X_dev, cls_X_test, cls_y_dev, cls_y_test = train_test_split(cls_X_dev, cls_y_dev, train_size=.5, random_state=42)

# cls_X_train.shape, cls_X_dev.shape, cls_y_train.shape, cls_y_dev.shape, cls_X_test.shape, cls_y_test.shape

((27899, 300), (5979, 300), (27899, 4), (5979, 4), (5979, 300), (5979, 4))

In [19]:

model = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5))
scores = cross_val_score(model, cls_X_train, y_train_multilabel, cv=5, scoring='precision_micro')
scores

array([0.60806115, 0.62195969, 0.69075747, 0.58512856, 0.6147427 ])

In [20]:
model = OneVsRestClassifier(SVC())
scores = cross_val_score(model, cls_X_train, y_train_multilabel, cv=5, scoring='precision_micro')
scores

array([0.60910354, 0.60076442, 0.61257818, 0.59277276, 0.59318498])

In [21]:
model = OneVsRestClassifier(RandomForestClassifier())
scores = cross_val_score(model, cls_X_train, y_train_multilabel, cv=5, scoring='precision_micro')
scores

array([0.60910354, 0.60076442, 0.61257818, 0.59277276, 0.59318498])

In [22]:
model = OneVsRestClassifier(xgb.XGBClassifier(random_state=42))
scores = cross_val_score(model, cls_X_train, y_train_multilabel, cv=5, scoring='precision_micro')
scores



array([0.60910354, 0.60076442, 0.61257818, 0.59277276, 0.59318498])

# KNN GridSearch

In [24]:


knn_params = {
    'estimator__n_neighbors' : [5,7,9,11,13,15],
    'estimator__weights' : ['uniform','distance'],
    'estimator__metric' : ['minkowski','euclidean','manhattan']}



knn_grid = RandomizedSearchCV(estimator=OneVsRestClassifier(KNeighborsClassifier()), param_distributions=knn_params, n_jobs=-1, verbose=3, scoring='precision_micro')
knn_grid.fit(X=cls_X_train, y=y_train_multilabel)
knn_grid.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 2/5] END estimator__metric=minkowski, estimator__n_neighbors=7, estimator__weights=uniform;, score=0.622 total time=   1.5s
[CV 1/5] END estimator__metric=minkowski, estimator__n_neighbors=7, estimator__weights=uniform;, score=0.609 total time=   1.5s
[CV 3/5] END estimator__metric=minkowski, estimator__n_neighbors=7, estimator__weights=uniform;, score=0.691 total time=   1.6s
[CV 4/5] END estimator__metric=minkowski, estimator__n_neighbors=7, estimator__weights=uniform;, score=0.585 total time=   1.3s
[CV 5/5] END estimator__metric=minkowski, estimator__n_neighbors=7, estimator__weights=uniform;, score=0.615 total time=   1.8s
[CV 5/5] END estimator__metric=manhattan, estimator__n_neighbors=11, estimator__weights=uniform;, score=0.615 total time=   9.1s
[CV 1/5] END estimator__metric=manhattan, estimator__n_neighbors=11, estimator__weights=uniform;, score=0.609 total time=   9.1s
[CV 3/5] END estimator__metric=manhattan,

{'estimator__weights': 'uniform',
 'estimator__n_neighbors': 11,
 'estimator__metric': 'manhattan'}

# Training KNN with best hyperparameter set

In [30]:
knn_best_param = {
        'weights': 'uniform',
        'n_neighbors': 11,
        'metric': 'manhattan'}

cls_X_train = np.array([vectorize(we=we, word=x)for x in df_train[df_train['Label'] != 'aucun']['Token'] ])
cls_y_train = pd.DataFrame.from_dict(getMultiLabel(df_train[df_train['Label'] != 'aucun']))

knn = OneVsRestClassifier(KNeighborsClassifier(**knn_best_param))
knn.fit(cls_X_train, cls_y_train)

  vector = we.word_vec(word)
  vector = we.word_vec(word.lower())


### Save model

In [31]:
os.makedirs('../models', exist_ok=True)
joblib.dump(knn, '../models/knn.joblib')

['../models/knn.joblib']

# XGBoost GridSearch

In [32]:
xgb_params = {
    "estimator__n_estimators": [50, 100, 150, 200, 350, 500],
    "estimator__max_depth": [i for i in range(1, 11)],
    "estimator__learning_rate": [0.0001, 0.001, 0.01, 0.1, 1.0],
    "estimator__subsample": [.25, .5, .75, 1],
    "estimator__random_state": [42]

}

xgb_grid = RandomizedSearchCV(estimator=OneVsRestClassifier(xgb.XGBClassifier(random_state=42)), param_distributions=xgb_params, n_jobs=-1, verbose=3, scoring='precision_micro')
xgb_grid.fit(X=cls_X_train, y=y_train_multilabel)
xgb_grid.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 2/5] END estimator__learning_rate=0.1, estimator__max_depth=4, estimator__n_estimators=50, estimator__random_state=42, estimator__subsample=0.5;, score=0.601 total time=   3.1s
[CV 1/5] END estimator__learning_rate=0.1, estimator__max_depth=4, estimator__n_estimators=50, estimator__random_state=42, estimator__subsample=0.5;, score=0.609 total time=   3.3s
[CV 3/5] END estimator__learning_rate=0.1, estimator__max_depth=4, estimator__n_estimators=50, estimator__random_state=42, estimator__subsample=0.5;, score=0.613 total time=   3.4s
[CV 4/5] END estimator__learning_rate=0.1, estimator__max_depth=4, estimator__n_estimators=50, estimator__random_state=42, estimator__subsample=0.5;, score=0.593 total time=   3.6s
[CV 5/5] END estimator__learning_rate=0.1, estimator__max_depth=4, estimator__n_estimators=50, estimator__random_state=42, estimator__subsample=0.5;, score=0.593 total time=   3.6s
[CV 1/5] END estimator__learning_r

{'estimator__subsample': 1,
 'estimator__random_state': 42,
 'estimator__n_estimators': 150,
 'estimator__max_depth': 9,
 'estimator__learning_rate': 1.0}

# Training XGBoost with best hyperparameter set on whole dataset

In [33]:
xgb_boost_best_param = {
        'subsample': 1,
        'random_state': 42,
        'n_estimators': 150,
        'max_depth': 9,
        'learning_rate': 1.0,
   }

cls_X_train = np.array([vectorize(we=we, word=x)for x in df_train[df_train['Label'] != 'aucun']['Token'] ])
cls_y_train = pd.DataFrame.from_dict(getMultiLabel(df_train[df_train['Label'] != 'aucun']))

xgboost = OneVsRestClassifier(xgb.XGBClassifier(**xgb_boost_best_param))
xgboost.fit(cls_X_train, cls_y_train)

  vector = we.word_vec(word)
  vector = we.word_vec(word.lower())


### Save model

In [34]:
os.makedirs('../models', exist_ok=True)
joblib.dump(xgboost, '../models/xgboost.joblib')

['../models/xgboost.joblib']