# Libraries

In [3]:
!python -m spacy download fr_core_news_sm

Collecting fr_core_news_sm==2.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz (14.7 MB)
[K     |████████████████████████████████| 14.7 MB 2.3 MB/s eta 0:00:01
Building wheels for collected packages: fr-core-news-sm
  Building wheel for fr-core-news-sm (setup.py) ... [?25ldone
[?25h  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.3.0-py3-none-any.whl size=14718367 sha256=b391ab18912d8519e1ee81b23142d25484375dcb0551c26b0e8c5fe858e91572
  Stored in directory: /private/var/folders/6n/4rrpn39918n57t4rcsptw5nc0000gn/T/pip-ephem-wheel-cache-3aogq127/wheels/48/ca/2e/2a3756cab2ba8745ce853319ba0d44b1efb8892a86320e9633
Successfully built fr-core-news-sm
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-2.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')


In [2]:
import pandas as pd
import numpy as np
import string
from collections import Counter
import spacy
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
# from pycaret.classification import *

nlp = spacy.load("fr_core_news_sm")
punctuation = string.punctuation
stopwords = set(stopwords.words('french'))

# Data

In [3]:
# import data from the respository
data = pd.read_csv('../data/max-dataset.csv')
data_google = pd.read_csv('../data/google-full-dataset.csv')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9174 entries, 0 to 9173
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Text        9174 non-null   object
 1   Difficulty  9174 non-null   object
dtypes: object(2)
memory usage: 143.5+ KB


In [5]:
data.Difficulty.value_counts()

A2    1779
C1    1594
B2    1563
B1    1518
A1    1419
C2    1301
Name: Difficulty, dtype: int64

# Feature Engineering

In [39]:
# Entity Recognition
def return_NER(sentence):
    # Tokenize the sentence
    doc = nlp(sentence)
    # Return text and label for each sentence
    return [(X.text, X.label_) for X in doc.ents]

# Part-Of-Speech
def return_POS(sentence):
    # Tokenize the sentence
    doc = nlp(sentence)
    # Return tag of each token
    return [(X, X.pos_) for X in doc]

def NER_counter(sentence: string):
    # Take a sentence with its text & label and couint each elements
    ner = return_NER(sentence)
    counter = Counter([t[1] for t in ner])
    return counter

def POS_counter(sentence: string):
    # Take a token with its tags and count each elements
    pos = return_POS(sentence)
    counter = Counter([t[1] for t in pos])
    return counter

In [13]:
# extraction of features to better understand the texts
def features_extraction(dataframe: pd.DataFrame):
    dataframe["num_chars"] = dataframe["Text"].apply(len)
    dataframe["num_words"] = dataframe["Text"].apply(lambda x: len(x.split()))
    dataframe["avg_word_length"] = dataframe['Text'].apply(lambda x: np.sum([len(w) for w in x.split()]) / len(x.split()))
    dataframe['num_stopwords'] = dataframe['Text'].apply(lambda x: np.sum([1 for word in x.split(' ') if word in stopwords]))
    dataframe['ratio_num_words_over_stopwords'] = dataframe['num_words'] / dataframe['num_stopwords']
    
    # Iterate over each row in the dataframe and get some specific features
    for index, row in data.iterrows():
        # Part-Of-Speech
        counter_pos = POS_counter(row['Text'])
        for x in counter_pos:
            dataframe.loc[index, x] = counter_pos[x]
        
        # Entity Recognizer
        counter_ner = NER_counter(row['Text'])
        for x in counter_ner:
            dataframe.loc[index, x] = counter_ner[x]
        
        # Number of words before the first verb in each sentence
        current_pos = return_POS(row.Text)
        iter_current_pos = [str(y) for t in current_pos for y in t]
        if 'VERB' in iter_current_pos:
            dataframe.loc[index, 'num_words_before_first_verb'] = (iter_current_pos.index('VERB') + 1) // 2
        else:
            dataframe.loc[index, 'num_words_before_first_verb'] = 0
            
    return dataframe.fillna(0)

In [17]:
%%time
# Create a dataset containing the basic data and the extracted features
dataset = features_extraction(data)

CPU times: user 3min 3s, sys: 317 ms, total: 3min 4s
Wall time: 3min 4s


In [18]:
dataset

Unnamed: 0,Text,Difficulty,num_chars,num_words,avg_word_length,num_stopwords,ratio_num_words_over_stopwords,PRON,VERB,DET,...,PROPN,ORG,ADV,MISC,PER,X,SYM,PART,INTJ,SPACE
0,Je recherche une personne pour garder mes enfa...,A1,64,13,4.000000,5.0,2.600000,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Il faut être disponible pour travailler les je...,A1,95,15,5.400000,3.0,5.000000,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Vous devez habiter dans le centre de Limoges o...,A1,66,12,4.583333,5.0,2.400000,1.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Expérience avec les enfants souhaitée.,A1,38,5,6.800000,2.0,2.500000,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Si vous êtes intéressé, appelez-moi.",A1,36,5,6.400000,2.0,2.500000,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9169,"La vieille acquiesça de la tête, sous sa grand...",C2,168,25,5.720000,6.0,4.166667,1.0,3.0,5.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9170,Il fallait qu’il débitât les nouvelles apprise...,C2,70,11,5.454545,2.0,5.500000,2.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9171,"Vous eussiez dit deux éclairs bleuâtres, parei...",C2,158,24,5.583333,6.0,4.000000,2.0,6.0,1.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9172,Elle dénoua le ruban qui attachait le manche d...,C2,102,19,4.421053,10.0,1.900000,3.0,3.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Save the dataset on the repository to avoid having to recompute all the features when running the notebook again
dataset.to_csv('../data/max-dataset-with-features.csv', index = False)

# BOW

Creation of a BOW with the dataset to later help with CamemBERT 

In [None]:
dataset = pd.read_csv('../data/max-dataset-with-features.csv')

In [34]:
%%time
# Use of spacy tokenizer to create and preprocess tokens
def spacy_tokenizer(text: string):    
    # create spacy object
    mytokens = [token.text for token in nlp(text)]
    
    # remove stop words and punctuation
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuation ]

    # return preprocessed list of tokens
    return mytokens

# tokenizer 
count = CountVectorizer(ngram_range = (1,2), tokenizer=spacy_tokenizer)
bow = count.fit_transform(dataset.Text)

# create the dataframe
dataset_bow = pd.DataFrame(
    bow.todense(), 
    columns = count.get_feature_names()
    )

CPU times: user 48.5 s, sys: 341 ms, total: 48.8 s
Wall time: 49 s


In [29]:
dataset_bow

Unnamed: 0,\t,\n,\n\n,\n.1,\n avouions,\n cloîtrées,\n donnais,\n furtifs,\n l’,\n où,...,−,− allégorie,− beautés,− d',− enchantent,− exhibait,− premier,− qu',− quand,− tel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9169,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9170,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9171,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9172,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
dataset_bow_augmented = pd.concat([dataset.loc[ : , dataset.columns != 'Text'], dataset_bow], axis = 1)

In [36]:
dataset_bow_augmented.to_csv('../data/bow-dataset-1-2ngrams.csv', index = False)