In [8]:
import torch
from fast_bert.data_cls import BertDataBunch
from fast_bert.learner_cls import BertLearner
from fast_bert.data_lm import BertLMDataBunch
from fast_bert.learner_lm import BertLMLearner
from fast_bert.metrics import fbeta, roc_auc
from fast_bert.prediction import BertClassificationPredictor
from pathlib import Path
import pandas as pd
import logging
import tensorflow as tf


logger = logging.getLogger()
device_cuda = torch.device("cuda")
device = torch.device('cuda')

# check if multiple GPUs are available
if torch.cuda.device_count() > 1:
    multi_gpu = True
else:
    multi_gpu = False
    


I0625 15:06:06.939834 10444 file_utils.py:39] PyTorch version 1.4.0 available.
I0625 15:06:09.682842 10444 file_utils.py:55] TensorFlow version 2.2.0 available.


In [9]:
DATA_PATH = Path('./data/')
LOG_PATH = Path('./logs/')
MODEL_PATH = Path('./model/')
LABEL_PATH = Path('./labels/')

In [None]:
df = pd.read_csv('./data/labeled_data.csv')

In [12]:

msg = tf.constant('Hello, TensorFlow!')
tf.print(msg)


Hello, TensorFlow!


In [7]:
import torch

a = torch.cuda.FloatTensor(10000)
print("Allocated:", round(torch.cuda.memory_allocated(0)/10243,1), "GB")

b = torch.cuda.FloatTensor(20000)
print("Allocated:", round(torch.cuda.memory_allocated(0)/10243,1), "GB")
torch.rand(20000,20000).cuda()
print("Allocated:", round(torch.cuda.memory_allocated(0)/1024**3,1), "GB")

Allocated: 11.8 GB
Allocated: 11.8 GB
Allocated: 0.0 GB


In [None]:
val_set = df.sample(frac=0.2, replace=False, random_state=42)
train_set = df.drop(index = val_set.index)
print('Nombre de commentaires dans le val_set:',len(val_set))
print('Nombre de commentaires dans le train_set:', len(train_set))
val_set.to_csv('./data/val_set.csv')
train_set.to_csv('.data/train_set.csv')

In [None]:
labels = df.columns[2:].to_list()
with open('./labels/labels.txt', 'w') as f:
    for i in labels:
        f.write(i + "\n")

In [None]:
#df_texts = pd.read_csv('./data/raw_data.csv')
df_texts = pd.read_csv('./data/autres/df_mg36_2020.csv', sep='|', encoding ='utf-8')
#all_texts = df_texts[''].to_list()
#print('Nombre de commentaires:', len(all_texts))

In [None]:
df_texts['fst_txt']=df_texts['fst_txt'].str[13:]
del df_texts['texte_ss_stp']
df_texts.head(5)

In [None]:
df_texts.head(5)

In [None]:
df_texts.to_csv('./data/df_mg_37.csv', sep='|', encoding ='utf-8',index=False)

In [None]:
bvt_val_set = df_texts.sample(frac=0.2, replace=False, random_state=42)
bvt_train_set = df_texts.drop(index = bvt_val_set.index)
print('Nombre de commentaires dans le val_set:',len(bvt_val_set))
print('Nombre de commentaires dans le train_set:', len(bvt_train_set))
bvt_val_set.to_csv('./data/bvt_val_set.csv',sep='|', encoding ='utf-8',index=False)
bvt_train_set.to_csv('./data/bvt_train_set.csv',sep='|', encoding ='utf-8',index=False)

### a partir d'ICI

In [None]:
#lecture direct des données preparées
df_texts=pd.read_csv('./data/df_mg_37.csv', sep='|', encoding ='utf-8',index_col=None)
df_texts.head(5)

In [None]:
df_texts.fst_txt.str.replace("[<>%\$]", '')
df_texts.fst_txt.str.replace("[u'\u2260']", '<>')
df_texts.fst_txt.str.replace(u'œ', u'oe')
df_texts.fst_txt.str.replace(u'μ', u'mu')
df_texts.fst_txt.str.replace(u'≠', u'<>')


In [None]:
all_texts=df_texts['fst_txt'].to_list()


In [None]:
def removePunctuation(sentence):
    return ' '.join([i for i in sentence.split() if i.isalpha()])
all_texts=[removePunctuation(x) for x in all_texts]


In [None]:
#correction des codes non pris en compte dans unidecode
#retire le Œ mu"
all_texts = [sub.replace(u'œ', u'oe') for sub in all_texts] 
all_texts = [sub.replace(u'μ', u'mu') for sub in all_texts]
all_texts = [sub.replace(u'≠', u'<>') for sub in all_texts]
all_texts = [sub.replace(u'\u2260', u'<>') for sub in all_texts]


In [None]:
#corrige une bonne partie des codes à problemes mais pas tous
all_texts=[unidecode(x) for x in all_texts]

In [None]:
import ftfy
#all_texts=[ftfy.fix_text(x) for x in all_texts]

In [None]:
#all_texts=[ftfy.fix_encoding(x) for x in all_texts]

In [None]:
import unicodedata
#all_texts=[unicodedata.normalize('NFD',x).encode('utf-8', errors='ignore') for x in all_texts]
#[unicodedata.is_normalized('NFD', x) for x in all_texts]

In [None]:
from unidecode import unidecode

s = "Héllô œ μ Càèùverâêt Jîôûç ïîäüë ≠"
s = unidecode(s)
print(s) 


In [None]:
stre=["ma chaine de œ pose probleme"," ici c ≠ est μ le soucis"]
print(stre)
stre = [sub.replace(u'œ', u'oe') for sub in stre]  
stre = [sub.replace(u'μ', u'mu') for sub in stre]
stre = [sub.replace(u'\u2260', u'<>') for sub in stre]

print (stre)

In [None]:
(all_texts[0]) # en utf-8


In [None]:
(all_texts[2])  #cp1252

In [None]:
import pickle

with open('./alltexts.pkl', 'wb') as f:
    pickle.dump(all_texts, f)

In [None]:
import pickle
with open('./alltexts.pkl', 'rb') as f:
    all_texts = pickle.load(f)

### Création de LMDataBunch

In [None]:
# la première fois
databunch_lm = BertLMDataBunch.from_raw_corpus(
                    data_dir=DATA_PATH,
                    text_list=all_texts,
                    tokenizer='camembert-base',
                    batch_size_per_gpu=16,
                    max_seq_length=512,
                    multi_gpu=False,
                    model_type='camembert-base',
                    logger=logger)

In [None]:
#les fois suivantes repartir d'ici , il utilise les fichiers lm_train et lm_test générés depuis la cellule précédente
databunch_lm=BertLMDataBunch(data_dir=DATA_PATH, 
                    tokenizer='camembert-base',
                    batch_size_per_gpu=32,
                    max_seq_length= 512,
                    multi_gpu=False,
                    model_type='camembert-base',
                    logger=None, 
                    clear_cache=False,
                    no_cache=False)

I0625 15:06:22.948308 10444 tokenization_utils.py:1022] loading file https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model from cache at C:\Users\odissaux/.cache\torch\transformers\3715e3a4a2de48834619b2a6f48979e13ddff5cabfb1f3409db689f9ce3bb98f.28d30f926f545047fc59da64289371eef0fbdc0764ce9ec56f808a646fcfec59
I0625 15:06:23.044310 10444 data_lm.py:129] Creating features from dataset file data\lm_train.txt


### Création de LMLearner

In [None]:
lm_learner = BertLMLearner.from_pretrained_model(
                            dataBunch=databunch_lm,
                            pretrained_path='camembert-base',
                            output_dir=MODEL_PATH,
                            metrics=[],
                            device=device_cuda,
                            logger=logger,
                            multi_gpu=False,
                            logging_steps=50,
                            fp16_opt_level="O2")

In [None]:
lm_learner.fit(epochs=30,
            lr=1e-4,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")

In [None]:
lm_learner.validate()

In [None]:
learner.save_model()

### Création de databunch pour la classification

In [None]:
databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='camembert-base',
                          train_file='bvt_train_set.csv',
                          val_file='bvt_val_set.csv',
                          label_file='labels_poles.txt',
                          text_col='fst_txt',
                          label_col=['pole'],
                          batch_size_per_gpu=16,
                          max_seq_length=512,
                          multi_gpu=False,
                          multi_label=True,
                          model_type='camembert-base')

### Création de Learner

In [None]:
metrics = [{'name': 'fbeta', 'function': fbeta}, {'name': 'roc_auc', 'function': roc_auc}]
OUTPUT_DIR = Path('./finetuned_model')
WGTS_PATH = Path('model/model_out/pytorch_model.bin')

In [None]:
cl_learner = BertLearner.from_pretrained_model(
                        databunch,
                        pretrained_path='model/model_out',
                        metrics=metrics,
                        device=device_cuda,
                        logger=logger,
                        output_dir=OUTPUT_DIR,
                        finetuned_wgts_path=WGTS_PATH,
                        warmup_steps=300,
                        multi_gpu=False,
                        multi_label=True,
                        is_fp16=True,
                        logging_steps=50)

In [None]:
cl_learner.fit(epochs=30,
            lr=9e-5,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")

In [None]:
cl_learner.validate()

In [None]:
class_learner.save_model()

### Prédictions

In [None]:
predictor = BertClassificationPredictor(
                model_path='finetuned_model/model_out',
                label_path='labels/',
                multi_label=True,
                model_type='camembert-base',
                do_lower_case=False)

In [None]:
predictor.predict("Texte à classer")

In [None]:
import gc
gc.collect()

In [None]:
gc.get_objects()