# Feature Engineering

In [None]:
# Download the French spaCy pipeline `fr_core_news_sm`, which is loaded as `nlp` in the next cell.
!python -m spacy download fr_core_news_sm

In [None]:
import pandas as pd
import numpy as np
import string
from collections import Counter
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# French spaCy pipeline shared by the POS / NER helper functions in this notebook.
nlp = spacy.load('fr_core_news_sm')
# ASCII punctuation characters from the standard `string` module.
punctuation = string.punctuation
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus reader to a
# plain set of French stopwords; `stopwords.words(...)` can no longer be called
# afterwards unless the import above is executed again.
stopwords = set(stopwords.words('french'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the raw train / test splits into DataFrames.
# NOTE(review): the absolute `/content/...` paths look Colab-specific — adjust when running elsewhere.
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

In [None]:
# Entity Recognition
def return_NER(sentence):
    """Run the spaCy pipeline on ``sentence`` and list its named entities.

    Returns a list of ``(entity_text, entity_label)`` tuples, one per entity
    span in ``doc.ents`` and in the same order.
    """
    entities = []
    for entity in nlp(sentence).ents:
        entities.append((entity.text, entity.label_))
    return entities

# Part-Of-Speech
def return_POS(sentence):
    """Run the spaCy pipeline on ``sentence`` and pair every token with its POS tag.

    Returns a list of ``(token, pos_tag)`` tuples, where ``token`` is the spaCy
    token object itself (not its text) and ``pos_tag`` is ``token.pos_``.
    """
    tagged_tokens = []
    for token in nlp(sentence):
        tagged_tokens.append((token, token.pos_))
    return tagged_tokens

def NER_counter(sentence: str) -> Counter:
    """Count how many named entities of each label occur in ``sentence``.

    Args:
        sentence: Text to analyse; it is passed unchanged to ``return_NER``.

    Returns:
        A ``Counter`` mapping each entity label to its number of occurrences.
    """
    # return_NER yields (text, label) pairs; only the label is counted.
    return Counter(label for _, label in return_NER(sentence))

def POS_counter(sentence: str) -> Counter:
    """Count how many tokens of each part-of-speech tag occur in ``sentence``.

    Args:
        sentence: Text to analyse; it is passed unchanged to ``return_POS``.

    Returns:
        A ``Counter`` mapping each POS tag to its number of occurrences.
    """
    # return_POS yields (token, tag) pairs; only the tag is counted.
    return Counter(tag for _, tag in return_POS(sentence))

In [None]:
# Extraction of features to better understand the texts
def features_extraction(dataframe: pd.DataFrame):
    """Add surface, part-of-speech and named-entity features for the ``sentence`` column.

    The new columns are written onto ``dataframe`` itself, and a copy in which
    every missing value is replaced by 0 is returned.

    Columns added:
        num_chars: number of characters in the sentence.
        num_words: number of whitespace-separated words.
        avg_word_length: mean length of those words.
        num_stopwords: number of space-separated words found in ``stopwords``.
        ratio_num_words_over_stopwords: ``num_words / num_stopwords``; 0 in the
            returned frame when the sentence contains no stopword.
        <POS tag> / <entity label>: one column per tag or label met in the data,
            holding its count in the sentence.
        num_words_before_first_verb: 1-based position of the first token tagged
            ``VERB`` (i.e. preceding tokens + 1), or 0 when there is no verb.

    Args:
        dataframe: Frame with a ``sentence`` column of strings.

    Returns:
        ``dataframe.fillna(0)`` after the feature columns have been added.
    """
    sentences = dataframe["sentence"]

    dataframe["num_chars"] = sentences.apply(len)
    dataframe["num_words"] = sentences.apply(lambda x: len(x.split()))
    dataframe["avg_word_length"] = sentences.apply(lambda x: np.sum([len(w) for w in x.split()]) / len(x.split()))
    # NOTE(review): membership is case-sensitive, so a capitalised stopword at the
    # start of a sentence is presumably not counted — confirm whether that is intended.
    dataframe['num_stopwords'] = sentences.apply(lambda x: np.sum([1 for word in x.split(' ') if word in stopwords]))
    # A zero denominator is turned into NaN first: dividing by 0 would produce
    # +inf, which survives the final fillna(0) and ends up in the feature table.
    dataframe['ratio_num_words_over_stopwords'] = dataframe['num_words'] / dataframe['num_stopwords'].replace(0, np.nan)

    # Parse every sentence exactly once and collect one record of features per row.
    records = []
    for text in sentences:
        doc = nlp(text)
        pos_tags = [token.pos_ for token in doc]

        # POS counts first, entity counts second: same column order as before, and
        # an entity label overrides a POS tag of the same name as it did before.
        record = dict(Counter(pos_tags))
        record.update(Counter(entity.label_ for entity in doc.ents))

        # Search the tags only, so a token whose *text* is "VERB" cannot be
        # mistaken for a verb tag.
        record['num_words_before_first_verb'] = pos_tags.index('VERB') + 1 if 'VERB' in pos_tags else 0
        records.append(record)

    # Tags missing from a sentence become NaN here and 0 after fillna below.
    extracted = pd.DataFrame(records, index=dataframe.index).astype(float)
    for column in extracted.columns:
        # Positional assignment keeps this correct even if the index has duplicates.
        dataframe[column] = extracted[column].to_numpy()

    return dataframe.fillna(0)

In [None]:
# Build the augmented feature tables: each call runs the spaCy-based
# `features_extraction` over every row of the given split.
train_dataset = features_extraction(train)
test_dataset = features_extraction(test)

# Cognates
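Cognate (of a word): having the same linguistic derivation as another (e.g. English *father*, German *Vater*, Latin *pater*). (Source: Oxford Languages)

The cells below approximate cognate overlap by translating each French sentence into English and measuring how similar the two strings are.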

In [None]:
# NOTE(review): the translation cell imports `googleapiclient` (presumably provided by the
# `google-api-python-client` package), not `google.cloud.translate` — confirm this install is needed.
!pip install google-cloud-translate==2.0.1

In [None]:
# textdistance supplies the string-similarity measures used for the cognate features.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
!pip install textdistance

In [None]:
import pandas as pd
import textdistance

# Translate API
# The key is read from the GOOGLE_API_KEY environment variable when it is set, so
# it does not have to be pasted into (and saved with) the notebook. The placeholder
# literal is kept as a fallback for the edit-in-place workflow.
import os
from googleapiclient.discovery import build
service = build('translate', 'v2', developerKey=os.environ.get('GOOGLE_API_KEY', 'YOUR-API-KEY'))

In [None]:
def _translate_fr_to_en(sentence):
    """Translate one French sentence to English with the Translate v2 ``service``.

    ``format='text'`` requests plain text. Without it the translations came back
    HTML-escaped (e.g. ``It&#39;s``), and those entities distort the
    string-distance features computed from the translation.

    Args:
        sentence: French text to translate.

    Returns:
        The ``translatedText`` of the first translation in the API response.
    """
    response = service.translations().list(source='fr', target='en', format='text', q=sentence).execute()
    return response['translations'][0]['translatedText']


# One request per row, written back immediately so that rows already translated
# are kept on the frame if a later request fails.
for dataset in (train_dataset, test_dataset):
    for index, row in dataset.iterrows():
        dataset.loc[index, 'Text_english_translation'] = _translate_fr_to_en(row.loc['sentence'])

In [None]:
def _add_distance_features(dataset):
    """Write string-distance columns comparing each sentence with its English translation.

    Hamming, Levenshtein and Jaro-Winkler are computed on the raw strings;
    Jaccard and Sorensen are computed on the whitespace-split token lists.
    The columns are written onto ``dataset`` in place, row by row.
    """
    for idx, record in dataset.iterrows():
        french = record['sentence']
        english = record['Text_english_translation']

        # Character-level measures
        dataset.loc[idx, 'hamming'] = textdistance.hamming(french, english)
        dataset.loc[idx, 'hamming_normalized_similarity'] = textdistance.hamming.normalized_similarity(french, english)
        dataset.loc[idx, 'levenshtein'] = textdistance.levenshtein(french, english)
        dataset.loc[idx, 'levenshtein_normalized_similarity'] = textdistance.levenshtein.normalized_similarity(french, english)
        dataset.loc[idx, 'jaro_winkler'] = textdistance.jaro_winkler(french, english)

        # Token-level measures
        french_tokens = french.split()
        english_tokens = english.split()
        dataset.loc[idx, 'jaccard'] = textdistance.jaccard(french_tokens, english_tokens)
        dataset.loc[idx, 'sorensen'] = textdistance.sorensen(french_tokens, english_tokens)


# Same processing order as before: the training split first, then the test split.
_add_distance_features(train_dataset)
_add_distance_features(test_dataset)


In [None]:
# Preview the first five rows of the augmented training table.
train_dataset.head(5)

Unnamed: 0,sentence,difficulty,num_chars,num_words,avg_word_length,num_stopwords,ratio_num_words_over_stopwords,PRON,AUX,ADP,SCONJ,PUNCT,num_words_before_first_verb,VERB,ADJ,ADV,Unnamed: 17,NOUN,DET,CCONJ,PER,PART,INTJ,MISC,PROPN,LOC,ORG,NUM,X,Text_english_translation,hamming,hamming_normalized_similarity,levenshtein,levenshtein_normalized_similarity,jaro_winkler,jaccard,sorensen
0,C'est pour quand ?,A1,18,4,3.75,1.0,4.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,It&#39;s for when ?,19.0,0.0,14.0,0.263158,0.620858,0.142857,0.25
1,Je pense que c'est bon.,A1,23,5,3.8,1.0,5.0,2.0,1.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,I think this is good.,21.0,0.086957,18.0,0.217391,0.570324,0.0,0.0
2,C'est pas mal.,A1,14,3,4.0,1.0,3.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Not bad.,14.0,0.0,10.0,0.285714,0.595238,0.0,0.0
3,Qu'est-ce que vous faites ?,A1,27,5,4.6,2.0,2.5,2.0,0.0,0.0,1.0,1.0,6.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,What are you doing ?,25.0,0.074074,17.0,0.37037,0.55679,0.111111,0.2
4,C'est bien !,A1,12,3,3.333333,0.0,inf,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,It&#39;s good !,15.0,0.0,12.0,0.2,0.516667,0.2,0.333333


In [None]:
# Preview the first five rows of the augmented test table.
test_dataset.head(5)

Unnamed: 0,sentence,num_chars,num_words,avg_word_length,num_stopwords,ratio_num_words_over_stopwords,PRON,AUX,VERB,NOUN,ADP,SCONJ,DET,PROPN,ADJ,PUNCT,CCONJ,ADV,NUM,LOC,MISC,num_words_before_first_verb,ORG,PER,X,Unnamed: 26,PART,INTJ,Text_english_translation,hamming,hamming_normalized_similarity,levenshtein,levenshtein_normalized_similarity,jaro_winkler,jaccard,sorensen
0,Il est devenu courant de déplorer que la Franc...,380,65,4.861538,24.0,2.708333,2.0,4.0,6.0,17.0,7.0,3.0,12.0,4.0,5.0,10.0,1.0,2.0,4.0,4.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,It has become common to deplore the fact that ...,336.0,0.115789,234.0,0.384211,0.76548,0.050847,0.096774
1,"Sur les flancs et les derrières de la noce, to...",97,18,4.444444,8.0,2.25,0.0,0.0,1.0,6.0,4.0,0.0,5.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,"On the flanks and behind the wedding party, th...",87.0,0.103093,65.0,0.329897,0.647544,0.03125,0.060606
2,"J'aime aussi beaucoup sa voix, une voix un peu...",99,18,4.555556,5.0,3.6,2.0,0.0,2.0,3.0,1.0,1.0,4.0,0.0,2.0,2.0,0.0,6.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,"I also really like her voice, a slightly deep ...",94.0,0.069307,77.0,0.237624,0.633959,0.0,0.0
3,En partenariat avec l'INPES (Institut National...,224,35,5.428571,16.0,2.1875,2.0,1.0,2.0,12.0,8.0,0.0,8.0,2.0,2.0,6.0,1.0,0.0,0.0,0.0,0.0,21.0,2.0,0.0,0.0,0.0,0.0,0.0,In partnership with INPES (National Institute ...,197.0,0.120536,121.0,0.459821,0.741265,0.015385,0.030303
4,Il se réveilla seulement quand il entendit un ...,137,24,4.75,10.0,2.4,5.0,1.0,5.0,3.0,0.0,3.0,3.0,0.0,2.0,3.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,"He only woke up when he heard a strange noise,...",132.0,0.036496,102.0,0.255474,0.651604,0.0,0.0


In [None]:
# Save both augmented tables as CSV files (relative paths, row index not written).
train_dataset.to_csv('train-augmented.csv', index = False)
test_dataset.to_csv('test-augmented.csv', index = False)