#Model ensembling/Data augmentation TP (Nicolas & Richard)

* *Loading the Data*

In [2]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 4.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 89.0 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 88.6 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 69.5 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 87.1 MB/s 
Installing coll

In [3]:
# https://huggingface.co/datasets/allocine

from datasets import load_dataset_builder
ds_builder = load_dataset_builder("allocine")

# Inspect dataset description
print(ds_builder.info.description)

# Inspect dataset features
print(ds_builder.info.features)

# get_dataset_split_names
from datasets import get_dataset_split_names
get_dataset_split_names("allocine")

# load_dataset
from datasets import load_dataset
allocine_dataset = load_dataset("allocine")
allocine_dataset
#train_dataset = load_dataset("allocine", split="train")

Downloading builder script:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.11k [00:00<?, ?B/s]

 Allocine Dataset: A Large-Scale French Movie Reviews Dataset.
 This is a dataset for binary sentiment classification, made of user reviews scraped from Allocine.fr.
 It contains 100k positive and 100k negative reviews divided into 3 balanced splits: train (160k reviews), val (20k) and test (20k).

{'review': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
Downloading and preparing dataset allocine/allocine to /root/.cache/huggingface/datasets/allocine/allocine/1.0.0/ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0...


Downloading data:   0%|          | 0.00/66.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset allocine downloaded and prepared to /root/.cache/huggingface/datasets/allocine/allocine/1.0.0/ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 160000
    })
    validation: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
})

In [4]:
import numpy as np

def get_dataset_split_ratio(dataset_split, seed=42, ratio=100):
  if ratio > 100 or ratio < 1: ratio = 100
  reviews = dataset_split['review']
  labels = dataset_split['label']
  
  data = [pair for pair in zip(reviews, labels)]
  
  np.random.seed(seed)
  np.random.shuffle(data)
  
  ratio_len = int(len(data)*ratio/100)
  
  data = list(zip(*data))

  return {'review':data[0][:ratio_len], 'label':data[1][:ratio_len]}


In [5]:
import pandas as pd

# From huggingface dataset split to DataFrame
def hf_dataset_split_to_df (huggingface_dataset_split):
  df = pd.DataFrame(huggingface_dataset_split)
  # https://github.com/amaiya/ktrain/blob/master/examples/text/ArabicHotelReviews-nbsvm.ipynb
  df['label'] = df['label'].apply(lambda x: 'negative' if x == 0 else 'positive')
  df = pd.concat([df, df.label.astype('str').str.get_dummies()], axis=1, sort=False)
  df = df[['review', 'negative', 'positive']]
  return df

In [6]:
ratio = 50 # percent of the data

train_df = hf_dataset_split_to_df(get_dataset_split_ratio(allocine_dataset['train'], 42, ratio))
val_df = hf_dataset_split_to_df(get_dataset_split_ratio(allocine_dataset['validation'], 42, ratio))
test_df = hf_dataset_split_to_df(get_dataset_split_ratio(allocine_dataset['test'], 42, ratio))

print(train_df.head())
print(train_df.describe())
print('len(train_df) :', len(train_df))
print('len(val_df) :', len(val_df))
print('len(test_df) :', len(test_df))

                                              review  negative  positive
0  Un excellent thriller d'action où les scènes d...         0         1
1  Si le scénariste, qui aurait pu faire un minim...         1         0
2  Référence dans la filmographie de Bogart, "Le ...         1         0
3  Un bon scénario, un bon film, une histoire lou...         0         1
4  Un scenario vide et une mise en scene trés sop...         1         0
           negative      positive
count  80000.000000  80000.000000
mean       0.495000      0.505000
std        0.499978      0.499978
min        0.000000      0.000000
25%        0.000000      0.000000
50%        0.000000      1.000000
75%        1.000000      1.000000
max        1.000000      1.000000
len(train_df) : 80000
len(val_df) : 10000
len(test_df) : 10000


#Ktrain configuration

In [7]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0"; 


In [8]:
!pip install ktrain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ktrain
  Downloading ktrain-0.31.10.tar.gz (25.3 MB)
[K     |████████████████████████████████| 25.3 MB 4.7 MB/s 
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 59.3 MB/s 
Collecting cchardet
  Downloading cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263 kB)
[K     |████████████████████████████████| 263 kB 82.4 MB/s 
Collecting syntok>1.3.3
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting transformers==4.17.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 82.0 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 49.6 MB/s 
[?25hCollecting keras_bert>=0.86.0
  Downloading keras-bert-0.89.0.

In [9]:
# Execution time 30 s
import ktrain
from ktrain import text

*Preprocessing the data*

In [10]:

(x_train_preproc, y_train_preproc), (x_val_preproc, y_val_preproc), preproc = text.texts_from_df (train_df, 
                                                                   'review', # name of column containing review text
                                                                   label_columns=['negative', 'positive'],
                                                                   val_df=val_df, # if None, 10% of data will be used for validation
                                          #max_features=NUM_WORDS, 
                                          #maxlen=MAXLEN,
                                          #ngram_range=NGRAMS_SIZE,
                                          preprocess_mode='standard' # default
                                          )



['negative', 'positive']
   negative  positive
0         0         1
1         1         0
2         1         0
3         0         1
4         1         0
['negative', 'positive']
   negative  positive
0         0         1
1         0         1
2         1         0
3         1         0
4         1         0
language: fr
Word Counts: 135007
Nrows: 80000
80000 train sequences
train sequence lengths:
	mean : 87
	95percentile : 240
	99percentile : 303
x_train shape: (80000,400)
y_train shape: (80000, 2)
Is Multi-Label? False
10000 test sequences
test sequence lengths:
	mean : 88
	95percentile : 246
	99percentile : 306
x_test shape: (10000,400)
y_test shape: (10000, 2)


#FASTTEXT MODEL

In [11]:
# Build and return a text classification model https://amaiya.github.io/ktrain/text/index.html#ktrain.text.text_classifier
fasttext_model = text.text_classifier('fasttext', (x_train_preproc, y_train_preproc), preproc=preproc)

# Returns a Learner instance that can be used to tune and train Keras models https://amaiya.github.io/ktrain/index.html#ktrain.get_learner
fasttext_learner = ktrain.get_learner(fasttext_model, train_data=(x_train_preproc, y_train_preproc), val_data=(x_val_preproc, y_val_preproc))

Is Multi-Label? False
compiling word ID features...
maxlen is 400
done.


In [12]:
LEARNING_RATE = 0.01

fasttext_learner.autofit(LEARNING_RATE)

early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 0.01...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024
Epoch 00006: Reducing Max LR on Plateau: new max lr will be 0.005 (if not early_stopping).
Epoch 7/1024
Epoch 8/1024
Epoch 9/1024
Epoch 00009: Reducing Max LR on Plateau: new max lr will be 0.0025 (if not early_stopping).
Epoch 10/1024
Epoch 11/1024
Epoch 00011: Reducing Max LR on Plateau: new max lr will be 0.00125 (if not early_stopping).
Epoch 12/1024
Epoch 12: early stopping
Weights from best epoch have been loaded into model.


<keras.callbacks.History at 0x7f4c92cafe90>

In [13]:
# save Predictor (i.e., model and Preprocessor instance) after partially training
ktrain.get_predictor(fasttext_learner.model, preproc).save('fasttext_allocine.model+preproc')
fasttext_predictor = ktrain.get_predictor(fasttext_learner.model, preproc)

*Evaluation*

In [14]:
# reviews 
x_test = list(test_df['review'])

# labels (gold) 
y_test = list(test_df['positive'])

In [15]:
# hypothèse
y_hyp = [0 if h == 'negative' else 1 for h in fasttext_predictor.predict(x_test) ]
y_hyp_fastext= [0 if h == 'negative' else 1 for h in fasttext_predictor.predict(x_test) ]#we are going to use this variable later on for the ensembling method



In [16]:
from sklearn.metrics import f1_score, precision_score, recall_score

f1_positive_fasttext = f1_score(y_test, y_hyp)

In [17]:
f1_score(y_test, y_hyp, average = "binary", pos_label = 1)

0.9159420289855073

#NBSVM

In [18]:
# load an NBSVM model
nbsvm_model = text.text_classifier('nbsvm', (x_train_preproc, y_train_preproc), preproc=preproc)
nbsvm_learner = ktrain.get_learner(nbsvm_model, train_data=(x_train_preproc, y_train_preproc), val_data=(x_val_preproc, y_val_preproc))

# fine tune
LEARNING_RATE = 0.01
nbsvm_learner.autofit(LEARNING_RATE)



Is Multi-Label? False
compiling word ID features...
maxlen is 400
building document-term matrix... this may take a few moments...
rows: 1-10000
rows: 10001-20000
rows: 20001-30000
rows: 30001-40000
rows: 40001-50000
rows: 50001-60000
rows: 60001-70000
rows: 70001-80000
computing log-count ratios...
done.
early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 0.01...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 00003: Reducing Max LR on Plateau: new max lr will be 0.005 (if not early_stopping).
Epoch 4/1024
Epoch 5/1024
Epoch 00005: Reducing Max LR on Plateau: new max lr will be 0.0025 (if not early_stopping).
Epoch 6/1024
Epoch 6: early stopping
Weights from best epoch have been loaded into model.


<keras.callbacks.History at 0x7f4c913c5d10>

In [19]:
# données
x_test = list(test_df['review']) # reviews 
y_test = list(test_df['positive']) # labels (gold) 

# prédiction
nbsvm_predictor = ktrain.get_predictor(nbsvm_learner.model, preproc)
y_hyp = [0 if h == 'negative' else 1 for h in nbsvm_predictor.predict(x_test) ]
y_hyp_NBSVM = [0 if h == 'negative' else 1 for h in nbsvm_predictor.predict(x_test) ]

f1_score_nbsvm=f1_score(y_test, y_hyp)
# évaluation
print(f1_score(y_test, y_hyp))



0.9195568899471995


In [20]:
# save Predictor (i.e., model and Preprocessor instance) after partially training
ktrain.get_predictor(nbsvm_learner.model, preproc).save('nbsvm_allocine.model+preproc')


#BERT

In [21]:
# ETAPE 1 
(x_train_preproc, y_train_preproc), (x_val_preproc, y_val_preproc), preproc = text.texts_from_df (train_df, 
                      'review',
                      label_columns = ["negative", "positive"],
                      val_df= val_df, # if None, 10% of data will be used for validation
                      ##max_features=NUM_WORDS, 
                      #maxlen=MAXLEN,
                      preprocess_mode='bert' 
                      )

['negative', 'positive']
   negative  positive
0         0         1
1         1         0
2         1         0
3         0         1
4         1         0
['negative', 'positive']
   negative  positive
0         0         1
1         0         1
2         1         0
3         1         0
4         1         0
downloading pretrained BERT model (multi_cased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: fr


Is Multi-Label? False
preprocessing test...
language: fr


In [22]:
# ETAPE 2 et 3
bert_model = text.text_classifier('bert', (x_train_preproc, y_train_preproc) , preproc=preproc)
bert_learner = ktrain.get_learner(bert_model, 
                             train_data=(x_train_preproc, y_train_preproc), 
                             val_data=(x_val_preproc, y_val_preproc), 
                             batch_size=6)
# ETAPE 5
bert_learner.fit_onecycle(2e-5, 1)

Is Multi-Label? False
maxlen is 400
done.


begin training using onecycle policy with max lr of 2e-05...


<keras.callbacks.History at 0x7f4c80333490>

*Evaluation*

In [23]:
# données
x_test = list(test_df['review'])   # reviews 
y_test = list(test_df['positive']) # labels (gold) 

# prédiction
bert_predictor = ktrain.get_predictor(bert_learner.model, preproc)
y_hyp = [0 if h == 'negative' else 1 for h in bert_predictor.predict(x_test) ]
y_hyp_bert = [0 if h == 'negative' else 1 for h in bert_predictor.predict(x_test) ]

# évaluation
f1_score_bert=f1_score(y_test, y_hyp)
from sklearn.metrics import f1_score, precision_score, recall_score
print(f1_score(y_test, y_hyp))

0.9518248175182482


In [24]:
# save Predictor (i.e., model and Preprocessor instance) after partially training
ktrain.get_predictor(bert_learner.model, preproc).save('bert_allocine.model+preproc')

# reload Predictor and extract model
#model = ktrain.load_predictor('/tmp/my_predictor').model

#LOGREG

In [25]:

(x_train_preproc, y_train_preproc), (x_val_preproc, y_val_preproc), preproc = text.texts_from_df (train_df, 
                                                                   'review', # name of column containing review text
                                                                   label_columns=['negative', 'positive'],
                                                                   val_df=val_df, # if None, 10% of data will be used for validation
                                          #max_features=NUM_WORDS, 
                                          #maxlen=MAXLEN,
                                          #ngram_range=NGRAMS_SIZE,
                                          preprocess_mode='standard' # default
                                          )



['negative', 'positive']
   negative  positive
0         0         1
1         1         0
2         1         0
3         0         1
4         1         0
['negative', 'positive']
   negative  positive
0         0         1
1         0         1
2         1         0
3         1         0
4         1         0
language: fr
Word Counts: 135007
Nrows: 80000
80000 train sequences
train sequence lengths:
	mean : 87
	95percentile : 240
	99percentile : 303
x_train shape: (80000,400)
y_train shape: (80000, 2)
Is Multi-Label? False
10000 test sequences
test sequence lengths:
	mean : 88
	95percentile : 246
	99percentile : 306
x_test shape: (10000,400)
y_test shape: (10000, 2)


In [26]:
# Build and return a text classification model https://amaiya.github.io/ktrain/text/index.html#ktrain.text.text_classifier
logreg_model = text.text_classifier('logreg', (x_train_preproc, y_train_preproc), preproc=preproc)

# Returns a Learner instance that can be used to tune and train Keras models https://amaiya.github.io/ktrain/index.html#ktrain.get_learner
logreg_learner = ktrain.get_learner(logreg_model, train_data=(x_train_preproc, y_train_preproc), val_data=(x_val_preproc, y_val_preproc))

Is Multi-Label? False
compiling word ID features...
maxlen is 400
done.


In [27]:
LEARNING_RATE = 0.01
logreg_learner.autofit(LEARNING_RATE)

early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 0.01...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 00003: Reducing Max LR on Plateau: new max lr will be 0.005 (if not early_stopping).
Epoch 4/1024
Epoch 5/1024
Epoch 00005: Reducing Max LR on Plateau: new max lr will be 0.0025 (if not early_stopping).
Epoch 6/1024
Epoch 6: early stopping
Weights from best epoch have been loaded into model.


<keras.callbacks.History at 0x7f4bf230f9d0>

In [28]:
# données
x_test = list(test_df['review']) # reviews 
y_test = list(test_df['positive']) # labels (gold) 

# prédiction
logreg_predictor = ktrain.get_predictor(logreg_learner.model, preproc)
y_hyp = [0 if h == 'negative' else 1 for h in logreg_predictor.predict(x_test) ]


f1_score_logreg=f1_score(y_test, y_hyp)
# évaluation
print(f1_score(y_test, y_hyp))


0.9106197446278417


#CAMEMBERT

In [29]:
x_train = list(train_df['review'])
y_train = list(train_df['positive'])

x_val = list(val_df['review'])
y_val = list(val_df['positive'])

x_test = list(test_df['review'])
y_test = list(test_df['positive'])

In [30]:
import ktrain
from ktrain import text
MODEL_NAME = 'camembert-base'
#MODEL_NAME = 'albert-base-v2'
#MODEL_NAME = 'camembert-base'

CLASS_NAMES = ["negative", "positive"]

camembert_preproc = text.Transformer(MODEL_NAME, maxlen=500, class_names=CLASS_NAMES)
train_preproc = camembert_preproc.preprocess_train(x_train, y_train)
val_preproc = camembert_preproc.preprocess_test(x_val, y_val)
 
camembert_model = camembert_preproc.get_classifier()
camembert_learner = ktrain.get_learner(camembert_model, train_data=train_preproc, val_data=val_preproc, batch_size=12)

Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]

preprocessing train...
language: fr
train sequence lengths:
	mean : 91
	95percentile : 253
	99percentile : 320


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: fr
test sequence lengths:
	mean : 93
	95percentile : 260
	99percentile : 323


Downloading:   0%|          | 0.00/518M [00:00<?, ?B/s]

In [31]:
camembert_learner.fit_onecycle(0.01, 1)




begin training using onecycle policy with max lr of 0.01...


<keras.callbacks.History at 0x7f4bec974790>

In [32]:
# données
x_test = list(test_df['review'])   # reviews 
y_test = list(test_df['positive']) # labels (gold) 

# prédiction
camembert_predictor = ktrain.get_predictor(camembert_learner.model, camembert_preproc)
y_hyp = [0 if h == 'negative' else 1 for h in camembert_predictor.predict(x_test) ]


f1_score_camembert=f1_score(y_test, y_hyp)
# évaluation
print(f1_score(y_test, y_hyp))


0.64810058131675


#MODEL ENSEMBLING BY SAMPLE AVERAGING METHOD

In [33]:
f1_score_averaged = (f1_score_bert + f1_score_logreg + f1_score_nbsvm) / 3

print(f1_score_averaged)

0.9273338173644298


#MODEL ENSEMBLING BY STACKING

*J'AI TROUVÉ LE CODE POUR CETTE PARTIE "BYSTACKING" MAIS JE L'AI PAS PARFAITEMENT COMPRIS ET DONC JE L'AI PAS RAJOUTÉ*

#MODEL ENSEMBLING BY HARD VOTING

In [34]:
def get_hard_vote(predictors, data):
  predictions = []
  for predictor in predictors:
    predictions.append(predictor.predict(data)[0])
  return max(set(predictions), key=predictions.count)

In [36]:
predictors = [fasttext_predictor, nbsvm_predictor, bert_predictor]

In [37]:


test_df_eval = hf_dataset_split_to_df(get_dataset_split_ratio(allocine_dataset['test'], 42, 5))

x_test = list(test_df_eval['review'])  
y_test = list(test_df_eval['positive']) 

hard_predictions = []
for data in x_test:
  hard_predictions.append(get_hard_vote(predictors, data))

y_hyp_hardvoting = [0 if h == 'negative' else 1 for h in hard_predictions]

print(f1_score(y_test, y_hyp_hardvoting))

0.6577181208053691


#DATA AUGMENTATION

In [None]:
!pip3 install textattack[tensorflow]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting textattack[tensorflow]
  Downloading textattack-0.3.8-py3-none-any.whl (418 kB)
[K     |████████████████████████████████| 418 kB 6.7 MB/s 
[?25hCollecting language-tool-python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Collecting lru-dict
  Downloading lru_dict-1.1.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26 kB)
Collecting transformers>=4.21.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 80.3 MB/s 
Collecting pycld2
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[K     |████████████████████████████████| 41.4 MB 1.3 MB/s 
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 75.4 MB/s 
Collecting datasets==2.4.0
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K 

In [None]:
from textattack.attack_recipes import PWWSRen2019
from textattack.datasets import HuggingFaceDataset
from textattack.models.wrappers import ModelWrapper
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline
from textattack import Attacker

import numpy as np

# Quiet TensorFlow.
import os
if "TF_CPP_MIN_LOG_LEVEL" not in os.environ:
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"


class HuggingFaceSentimentAnalysisPipelineWrapper(ModelWrapper):
    """ Transformers sentiment analysis pipeline returns a list of responses
        like

            [{'label': 'POSITIVE', 'score': 0.7817379832267761}]

        We need to convert that to a format TextAttack understands, like

            [[0.218262017, 0.7817379832267761]
    """
    def __init__(self, model):
        self.model = model#pipeline = pipeline
    def __call__(self, text_inputs):
        raw_outputs = self.model(text_inputs)
        outputs = []
        for output in raw_outputs:
            score = output['score']
            if output['label'] == 'POSITIVE':
                outputs.append([1-score, score])
            else:
                outputs.append([score, 1-score])
        return np.array(outputs)


textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Create the model: a French sentiment analysis model.
# see https://github.com/TheophileBlard/french-sentiment-analysis-with-bert
model = TFAutoModelForSequenceClassification.from_pretrained("camembert-base")
tokenizer = AutoTokenizer.from_pretrained("camembert-base")
pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

model_wrapper = HuggingFaceSentimentAnalysisPipelineWrapper(pipeline)

# Create the recipe: PWWS uses a WordNet transformation.
recipe = PWWSRen2019.build(model_wrapper)
#
# WordNet defaults to english. Set the default language to French ('fra')
#
# See "Building a free French wordnet from multilingual resources",
# E. L. R. A. (ELRA) (ed.),
# Proceedings of the Sixth International Language Resources and Evaluation (LREC’08).
recipe.transformation.language = 'fra'


# dataset = HuggingFaceDataset('allocine', split='test[:10%]')

# # datalist = ["Ce film était horrible ! L'intrigue était ennuyeuse. Le jeu d'acteur était correct, cependant.",
# #          "Le film est vraiment nul. Je veux qu'on me rende mon argent.",
# #         "Quelle belle comédie romantique. 10/10 à revoir !"]


# attacker = Attacker(recipe, dataset)
# attack_output = attacker.attack_dataset()
# print(attack_output)


All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

Some layers of TFCamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
textattack: Unknown if model of class <class 'transformers.pipelines.text_classification.TextClassificationPipeline'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


In [None]:

# Create the model: a French sentiment analysis model.
# see https://github.com/TheophileBlard/french-sentiment-analysis-with-bert
model = TFAutoModelForSequenceClassification.from_pretrained("Jean-Baptiste/camembert-ner",)
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
pipeline = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

model_wrapper = HuggingFaceSentimentAnalysisPipelineWrapper(pipeline)

# Create the recipe: PWWS uses a WordNet transformation.
recipe = PWWSRen2019.build(model_wrapper)
#
# WordNet defaults to english. Set the default language to French ('fra')
#
# See "Building a free French wordnet from multilingual resources",
# E. L. R. A. (ELRA) (ed.),
# Proceedings of the Sixth International Language Resources and Evaluation (LREC’08).
recipe.transformation.language = 'fra'



In [None]:
dataset = HuggingFaceDataset('allocine', split='test')

attacker = Attacker(recipe, dataset)
attacker.attack_dataset()



  0%|          | 0/3 [00:00<?, ?it/s]

textattack: Loading [94mdatasets[0m dataset [94mallocine[0m, split [94mtest[0m.


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  weighted-saliency
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapWordNet
  (constraints): 
    (0): RepeatModification
    (1): StopwordModification
  (is_black_box):  True
) 




  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:00<00:02,  3.09it/s][A
[Succeeded / Failed / Skipped / Total] 0 / 0 / 1 / 1:  10%|█         | 1/10 [00:00<00:02,  3.05it/s][A

--------------------------------------------- Result 1 ---------------------------------------------

Magnifique épopée, une belle histoire, touchante avec des acteurs qui interprètent très bien leur rôles (Mel Gibson, Heath Ledger, Jason Isaacs...), le genre de film qui se savoure en famille! :)





[Succeeded / Failed / Skipped / Total] 0 / 0 / 1 / 1:  20%|██        | 2/10 [07:03<28:15, 211.95s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 1 / 1 / 2:  20%|██        | 2/10 [07:03<28:15, 211.95s/it][A

--------------------------------------------- Result 2 ---------------------------------------------

Je n'ai pas aimé mais pourtant je lui mets 2 étoiles car l'expérience est louable. Rien de conventionnel ici. Une visite E.T. mais jonchée d'idées /- originales. Le soucis, tout ceci avait-il vraiment sa place dans un film de S.F. tirant sur l'horreur ? Voici un film qui, à l'inverse de tant d'autres qui y ont droit, mériterait peut-être un remake.





[Succeeded / Failed / Skipped / Total] 0 / 1 / 1 / 2:  30%|███       | 3/10 [07:04<16:29, 141.40s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 1 / 2 / 3:  30%|███       | 3/10 [07:04<16:29, 141.40s/it][A

--------------------------------------------- Result 3 ---------------------------------------------

Un dessin animé qui brille par sa féerie et ses chansons.





[Succeeded / Failed / Skipped / Total] 0 / 1 / 2 / 3:  40%|████      | 4/10 [09:08<13:42, 137.06s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 2 / 2 / 4:  40%|████      | 4/10 [09:08<13:42, 137.06s/it][A

--------------------------------------------- Result 4 ---------------------------------------------

Si c'est là le renouveau du cinéma français, c'est tout de même foutrement chiant. Si l'objet est très stylisé et la tension palpable, le film paraît plutôt creux.





[Succeeded / Failed / Skipped / Total] 0 / 2 / 2 / 4:  50%|█████     | 5/10 [10:10<10:10, 122.06s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 3 / 2 / 5:  50%|█████     | 5/10 [10:10<10:10, 122.06s/it][A

--------------------------------------------- Result 5 ---------------------------------------------

Et pourtant on s’en Doutait !Second volet très mauvais, sans fraîcheur et particulièrement lourdingue. Quel dommage.





[Succeeded / Failed / Skipped / Total] 0 / 3 / 2 / 5:  60%|██████    | 6/10 [10:10<06:47, 101.82s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 3 / 3 / 6:  60%|██████    | 6/10 [10:10<06:47, 101.82s/it][A

--------------------------------------------- Result 6 ---------------------------------------------

Vous reprendrez bien un peu d'été ? Ce film je le voyais comme un mélange de Rohmer et de Rozier, un film de vacances, j'adore ça, un truc beau et pur qui dit des choses sur la vie, l'amour, les filles, les vacances. Un film qui se regarde en sirotant une boisson fraîche en écoutant les grillons ! Sauf qu'en fait non ! On a un film foutraque au possible qui reprend les codes justement de Rohmer voir Godard, enfin la Nouvelle Vague en général dans sa première partie (jusqu'à même finir sur une partie qui ressemblerait à du Kusturica), mais en beaucoup plus léger et décalé. Le film n'en a rien à foutre de rien, il ose tout, n'a peur de rien et ça c'est bon. C'est sans doute le film le plus drôle de 2013, mais tout simplement l'un des meilleurs tout court. Le film qui nous sort des dialogues qui pourraient sortir d'un mauvais Godard (oxymore) sur un ton what the fuckesque… raconte des ane


[Succeeded / Failed / Skipped / Total] 0 / 3 / 3 / 6:  70%|███████   | 7/10 [10:11<04:21, 87.32s/it] [A
[Succeeded / Failed / Skipped / Total] 0 / 3 / 4 / 7:  70%|███████   | 7/10 [10:11<04:21, 87.32s/it][A

--------------------------------------------- Result 7 ---------------------------------------------

Bon c'est pas un grand film mais on passe un bon moment avec ses ado à la recherche de l'orgasme. Y'a que les Allemands pour faire des films aussi barge ! :-)





[Succeeded / Failed / Skipped / Total] 0 / 3 / 4 / 7:  80%|████████  | 8/10 [10:11<02:32, 76.45s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 3 / 5 / 8:  80%|████████  | 8/10 [10:11<02:32, 76.45s/it][A

--------------------------------------------- Result 8 ---------------------------------------------

Terrible histoire que ces êtres sans amour, ces êtres lisses et frustres qui passent à côté de leur vie. Quelle leçon Monsieur Brizé! Vous avez tout dit, tout filmé jusqu'au moindre détail. tout est beau et terrifiant jusqu'à la scène finale qui nous liquéfie, un Vincent Lindon regardant la vie fixement sans oser la toucher ni la prendre dans ses bras, une Hélène Vincent qui attend, qui attend... Mon Dieu Monsieur Brizé, continuez....





[Succeeded / Failed / Skipped / Total] 0 / 3 / 5 / 8:  90%|█████████ | 9/10 [10:11<01:07, 67.99s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 3 / 6 / 9:  90%|█████████ | 9/10 [10:11<01:07, 67.99s/it][A

--------------------------------------------- Result 9 ---------------------------------------------

Un très joli film, qui ressemble à un téléfilm mais qui a le mérite d'être émouvant et proche de ses personnages. Magimel est vraiment très bon et l'histoire est touchante





[Succeeded / Failed / Skipped / Total] 0 / 3 / 6 / 9: 100%|██████████| 10/10 [33:09<00:00, 198.93s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 4 / 6 / 10: 100%|██████████| 10/10 [33:09<00:00, 198.94s/it]

--------------------------------------------- Result 10 ---------------------------------------------

Mais comment certaines personnes ont pus lui mettre 5/5 et donc dire indirectement que c'est un chef-d'œuvre ??? Et comment a-t-il fait pour sortir au cinéma et non en DTV ??? C'est pas un film que l'on regarde dans une salle obscur ça, pour moi ça ressemble plus à un téléfilm que l'on visionne un dimanche pluvieux pour que les enfants arrête de nous casser les pieds ! Et puis, le scénario avec le chien que devient le meilleur ami du gosse, c'est du vu et revu (un cliché) ! L'acteur principal est quant à lui aussi agaçant que son personnage ! Les suites ont l'air aussi mauvaises que Buddy Star des Paniers étant donné que l'histoire est quasiment la même (pour moi ça c'est pas des suites, c'est plutôt une succession de petits reboots inutiles). Reste regardable pour les moins de 10 ans (et encore, même moi à 6 ans, je n'aurais pas aimé).



+-------------------------------+--------+
| 


  average_perc_words_perturbed = self.perturbed_word_percentages.mean()
  ret = ret.dtype.type(ret / rcount)


[<textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fcce397ac90>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7fcc9e7a6c90>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fcc9e7a6c10>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7fcce18ec210>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7fcce3677710>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fcce3793310>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fccea31bc90>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fcce3690b50>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fcce3677ed0>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x7fcc9a3cd590>]

20000

#PARTIE 3 - TP RICHARD - NER DETECTION & CLASSIFICATION USING NER FEATURES

In [13]:
# Then we create a list of all the entities that can be retrieved in the text 
# using hugging face model for french NER (PER, ORG, LOC)

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model_nlp = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")

nlp = pipeline('ner', model=model_nlp, tokenizer=tokenizer, aggregation_strategy="simple")

Downloading:   0%|          | 0.00/269 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/210 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [13]:
df_train = pd.DataFrame(train_df, columns=['review', 'negative','positive','per','misc','loc','org'])
df_test = pd.DataFrame(test_df, columns=['review', 'negative','positive','per','misc','loc','org'])
df_val = pd.DataFrame(val_df, columns=['review', 'negative','positive','per','misc','loc','org'])

In [None]:
df_train=df_train.fillna(0)
df_test=df_test.fillna(0)

In [14]:
results=[]
test_df1=train_df
for i in range(0,len(test_df1)):
  results.append(nlp(test_df1['review'][i]))

In [15]:
results_test=[]
test_df2=test_df
for i in range(0,len(test_df2)):
  results_test.append(nlp(test_df2['review'][i]))

In [13]:
for i in range(len(results)):
  for j in range(len(results[i])):
    if results[i][j]['entity_group']=='PER':
      df_train['per'][i]+=1
    if results[i][j]['entity_group']=='MISC':
      df_train['misc'][i]+=1
    if results[i][j]['entity_group']=='LOC':
      df_train['loc'][i]+=1
    if results[i][j]['entity_group']=='ORG':
      df_train['org'][i]+=1



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [14]:

for i in range(len(results_test)):
  for j in range(len(results_test[i])):
    if results_test[i][j]['entity_group']=='PER':
      df_test['per'][i]+=1
    if results_test[i][j]['entity_group']=='MISC':
      df_test['misc'][i]+=1
    if results_test[i][j]['entity_group']=='LOC':
      df_test['loc'][i]+=1
    if results_test[i][j]['entity_group']=='ORG':
      df_test['org'][i]+=1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
df_val

Unnamed: 0,review,negative,positive,per,misc,loc,org
0,"Un film extraordinaire par sa légèreté, sa poé...",0,1,0.0,0.0,0.0,0.0
1,"Le Dernier Exorcisme, s'il n'est pas le métrag...",0,1,0.0,1.0,0.0,0.0
2,"Attention, ce film est noté sur l'echelle du C...",1,0,1.0,2.0,0.0,0.0
3,"Tout simplement ennuyant , une perte de temps,...",1,0,0.0,0.0,0.0,0.0
4,Très difficile de regarder un film d'horreur a...,1,0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
1995,Un peu un fourre-tout de ce qui s'est déjà fai...,1,0,0.0,4.0,0.0,0.0
1996,Ce film a le mérite de nous assurer que Serena...,1,0,3.0,3.0,0.0,0.0
1997,Un classique du film social britannique la gra...,0,1,1.0,0.0,0.0,0.0
1998,Un excellent remake à la sauce coréenne. L'his...,0,1,1.0,0.0,0.0,0.0


In [12]:
df_train=df_train.fillna(0)
df_test=df_test.fillna(0)
df_val=df_val.fillna(0)

In [None]:

(x_train_preproc, y_train_preproc), (x_val_preproc, y_val_preproc), preproc = text.texts_from_df (train_df, 
                                                                   ['review'], # name of column containing review text
                                                                   label_columns=['negative', 'positive'],
                                                                   val_df=val_df, # if None, 10% of data will be used for validation
                                          #max_features=NUM_WORDS, 
                                          #maxlen=MAXLEN,
                                          #ngram_range=NGRAMS_SIZE,
                                          preprocess_mode='standard' # default
                                          )



In [None]:
lll=df_train.get(['per','misc'])
lll_test=df_test.get(['per','misc'])

ss=np.hstack([x_train_preproc,lll])
ss_2=np.hstack([x_val_preproc,lll_val])


In [65]:
# load an NBSVM model
nbsvm_model = text.text_classifier('nbsvm', (ss.astype(int), y_train_preproc), preproc=preproc)
nbsvm_learner = ktrain.get_learner(nbsvm_model, train_data=(ss, y_train_preproc), val_data=(ss_2.astype(int), y_val_preproc))

# fine tune
LEARNING_RATE = 0.01
nbsvm_learner.autofit(LEARNING_RATE)



Is Multi-Label? False
compiling word ID features...
maxlen is 402
building document-term matrix... this may take a few moments...
rows: 1-10000
rows: 10001-16000
computing log-count ratios...
done.
early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 0.01...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 00003: Reducing Max LR on Plateau: new max lr will be 0.005 (if not early_stopping).
Epoch 4/1024
Epoch 5/1024
Epoch 00005: Reducing Max LR on Plateau: new max lr will be 0.0025 (if not early_stopping).
Epoch 6/1024
Epoch 6: early stopping
Weights from best epoch have been loaded into model.


<keras.callbacks.History at 0x7ff2a1bbeed0>

In [8]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 32.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 78.0 MB/s 
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.2 transformers-4.24.0
