## Requirements
On importe toutes les librairies nécéssaires

In [1]:
import numpy as np
import re
import pandas as pd
#!pip install unidecode
from unidecode import unidecode
import torch

In [2]:
from PyPDF2 import PdfReader

In [3]:
import os
pd.set_option('mode.chained_assignment', None)
import s3fs

## Charger les modèles
On charge le modèle de question réponse d'etalab pour appliquer les pré-filtres, puis le modèle T5 pour récupérer les valeurs d'indicateurs.

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto")

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/50.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
from transformers import pipeline
nlp = pipeline('question-answering', model='etalab-ia/camembert-base-squadFR-fquad-piaf', tokenizer='etalab-ia/camembert-base-squadFR-fquad-piaf')

## Charger les questions
On charge le documents contenant toutes les questions: associées à un mot clé et à un indicateur

In [6]:
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
BUCKET = "/mberthe/narval"
FILE_KEY_Q = "camemBERT/question_all.csv"#charger le fichier de question
FILE_PATH_Q = BUCKET + "/" + FILE_KEY_Q

with fs.open(FILE_PATH_Q,  mode="r") as file_in:
    question=pd.read_csv(file_in)

In [7]:
question

Unnamed: 0,question,mot,indic
0,Quel est le prix de service au m3 pour l'eau p...,prix,D102.0
1,Quel est le prix du service au m3 pour l'assin...,prix,D204.0
2,Quelle est la connaissance du réseau en eau po...,connaissance,P103.2
3,Quel est le taux de connaissance du réseau en ...,connaissance,P202.2
4,Quel est le taux de renouvellement des réseaux...,renouvellement,P107.2
...,...,...,...
97,Quelle est la valeur de l'endettement du servi...,dette,P256.2
98,Quelle est la valeur de l'endettement du servi...,endettement,P256.2
99,Quelle est la valeur de l'endettement du servi...,dette,P153.2
100,Quel est le total de points de connaissance d...,connaissance,P202.2


## Fonctions
On déclare les fonctions que nous utiliserons par la suite

In [8]:
def normalize_text(s):
    """removing caractères, /, accents, m3, /n"""
    import string, re
    
    def no_unit(text):
        regex="m3"
        return re.sub(regex, " ", text)
    def no_exp(text):
        regex="m³"
        return re.sub(regex, " ", text)
    def no_punct(text):
        regex="\\.(?!\d)"
        return re.sub(regex," ",text)
   
    def white_space_fix(text):
        return " ".join(text.split())
    def no_date(text):
        date_extract_pattern = "[0-9]{1,2}\\/[0-9]{1,2}\\/[0-9]{4}"
        return re.sub(date_extract_pattern,"",text)
    def special_char(text):
        regex=regex = "[^\w\s,.]"
        return re.sub(regex, " ", text)
   #on retire tout les caractère des réponses 
    def no_char(text):
        regex=r"([a-zA-Z])"
        return re.sub(regex, " ", text)
    def comma(text):
        regex=",(?!\d)"
        return re.sub(regex," ",text)
    def point(text):
        return re.sub("\,",".",text)
    def indic(text):
        regex="(P\d\d\d\\.\d\s*)|(D\d\d\d\\.\d\s*)|(VP\\.\d\d\d)"
        return re.sub(regex," ", text)

    return white_space_fix(point(no_punct(no_char(unidecode(comma(special_char(indic(no_exp(no_unit(no_date(s)))))))))))

In [9]:
def normalize(s):
    """removing caractères, /, accents, m3, /n"""
    import string, re
    def no_punct(text):
        regex="\\.(?!\d)"
        return re.sub(regex," ",text)
    def white_space_fix(text):
        return " ".join(text.split())
    def special_char(text):
        regex=regex = "[^\w\s,.]"
        return re.sub(regex, " ", text)
    def comma(text):
        regex=",(?!\d)"
        return re.sub(regex," ",text)
    def lower(text):
        return text.lower()

    return white_space_fix(no_punct(unidecode(lower(special_char(s)))))

In [10]:
def saut(text):
        regex="\n"
        return re.sub(regex,"",text)

## Pré-filtres
On récupère les premieres pages du rapport, desquelles on extrait l'année du rapport aisni que le type de service proposé 

In [None]:
nom_fichier='PDF/pancrasse_2016.pdf'  #Renseigner le nom du PDF que l'on veut tester

In [11]:
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
BUCKET = "mberthe/narval"
fs.ls(BUCKET)
BUCKET = "mberthe/narval"
FILE_KEY_S3 = nom_fichier #entrer le nom du PDF
FILE_PATH_S3 = BUCKET + "/" + FILE_KEY_S3

with fs.open(FILE_PATH_S3, mode="rb") as file_in:
    reader = PdfReader(file_in)
    page=reader.pages
    text=[]
    for i in range(0,5):
        page = reader.pages[i]
        output = page.extract_text()
        text.append(output)
    

In [12]:
text1=''
for x in text:
    text1 +=' '+ x

In [13]:
#text1

In [14]:
reau= "eau potable"
rass="assainissement"
d=nlp({'question': "Quels types de services ? ",
         'context': text1})
print(d)
prediction=d["answer"]
if ((re.findall(reau,prediction)==[])&(re.findall(rass,prediction)==[])):
    d=nlp({'question': "eau potable ou assainissement collectif ? ",
         'context': text1})
    print(d)
    prediction=d["answer"]
    if ((re.findall(reau,prediction)==[])&(re.findall(rass,prediction)==[])):
         d=nlp({'question': "Quelle est l'entité de gestion ? ",
         'context': text1})
    print(d)
    prediction=d["answer"]
    if ((re.findall(reau,prediction)==[])&(re.findall(rass,prediction)==[])):
        print("type de service non retrouvé dans le texte")

{'score': 0.5983580946922302, 'start': 7305, 'end': 7350, 'answer': ' Collecte  \uf0fc   \n Transport  \uf0fc   \n Dépollution'}
{'score': 0.057350821793079376, 'start': 224, 'end': 251, 'answer': ' l’assainissement collectif'}
{'score': 0.057350821793079376, 'start': 224, 'end': 251, 'answer': ' l’assainissement collectif'}


In [15]:
année=nlp({'question': "Quelle est l'année d'exercice du rapport ?",
         'context': text1})
print(année)

{'score': 0.9285984039306641, 'start': 131, 'end': 136, 'answer': ' 2016'}


In [16]:
an=normalize_text(année['answer']) # on normalise la réponse afin de n'avoir que l'année 

In [17]:
def normalize(s):
    """removing caractères, /, accents, m3, /n"""
    import string, re
    def no_punct(text):
        regex="\\.(?!\d)"
        return re.sub(regex," ",text)
    def white_space_fix(text):
        return " ".join(text.split())
    def special_char(text):
        regex=regex = "[^\w\s,.]"
        return re.sub(regex, " ", text)
    def comma(text):
        regex=",(?!\d)"
        return re.sub(regex," ",text)
    def lower(text):
        return text.lower()

    return white_space_fix(no_punct(unidecode(lower(special_char(s)))))

In [18]:
def compute_f1(prediction, truth):
    pred_tokens =normalize(prediction).split()
    truth_tokens = normalize(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    #print(len(common_tokens))
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

Grâce à notre réponse ur le service, et la métrique f1, on choisit la liste d'indicateur appropriée

In [19]:
truth="""eau potable"""
prediction=d['answer']
print(prediction)
f1_ep=compute_f1(prediction,truth)
print('f1 eau potable:',f1_ep)

truth="""assainissement"""
prediction=d['answer'] 
f1_ac=compute_f1(prediction,truth)
print('f1 assainissement:',f1_ac)

reau= "eau potable"
rass="assainissement"
if ((re.findall(reau,prediction)!=[])&(re.findall(rass,prediction)!=[])):
    print(re.findall(reau,prediction),re.findall(rass,prediction))
    indic=["D102.0","D204.0","P103.2","P202.2","P107.2","P253.2","P109.0","P207.0","P153.2","P256.2","P154.0","P257.0","P155.1","P258.1","P101.1","P102.1","P104.3","P105.3","P106.3","P108.3","P151.1","D203.0","P201.1","P204.3","P205.3","P206.3","P251.1","P252.2","P254.3","P255.3","D302.0","P301.3"]
else:
    if (f1_ep > f1_ac ):
        indic=["D102.0","P103.2","P107.2","P109.0","P153.2","P154.0","P155.1","P101.1","P102.1","P104.3","P105.3","P106.3","P108.3","P151.1"]
    if (f1_ep < f1_ac):
        indic=["D204.0","P202.2","P253.2","P207.0","P256.2","P257.0","P258.1","D203.0","P201.1","P204.3","P205.3","P206.3","P251.1","P252.2","P254.3","P255.3","D302.0","P301.3"]
print(indic)

 l’assainissement collectif
f1 eau potable: 0
f1 assainissement: 0.5
['D204.0', 'P202.2', 'P253.2', 'P207.0', 'P256.2', 'P257.0', 'P258.1', 'D203.0', 'P201.1', 'P204.3', 'P205.3', 'P206.3', 'P251.1', 'P252.2', 'P254.3', 'P255.3', 'D302.0', 'P301.3']


## On charge tout le texte du PDF
On va maintenant extraire les valeurs des RPQS

In [23]:
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
BUCKET = "mberthe/narval"
fs.ls(BUCKET)
BUCKET = "mberthe/narval"
FILE_KEY_S3 = nom_fichier 
FILE_PATH_S3 = BUCKET + "/" + FILE_KEY_S3
with fs.open(FILE_PATH_S3, mode="rb") as file_in:
    reader = PdfReader(file_in)
    page=reader.pages
    count = len(reader.pages)
    print(count)
    text=[]
    for i in range(count):
        page = reader.pages[i]
        output = page.extract_text()
        text.append(output)

20


In [24]:
pages =[]
for i in question.index:
    my_list=[]
    if question["indic"][i] in indic:
        for j in text:
            #print(re.findall(f1["mot"][i],j))
                if (re.findall(question["mot"][i],j)!=[]):
                    my_list.append(j)
                    print("question:",i,"page",text.index(j))
            #print(my_list)
    pages.append(my_list)
#len(pages[2])  

question: 1 page 0
question: 3 page 1
question: 3 page 11
question: 3 page 12
question: 3 page 19
question: 5 page 1
question: 5 page 12
question: 9 page 2
question: 9 page 13
question: 9 page 14
question: 11 page 1
question: 11 page 9
question: 11 page 19
question: 13 page 11
question: 13 page 12
question: 13 page 19
question: 15 page 1
question: 21 page 1
question: 21 page 9
question: 21 page 19
question: 23 page 11
question: 23 page 12
question: 23 page 19
question: 25 page 1
question: 38 page 0
question: 42 page 1
question: 42 page 12
question: 49 page 1
question: 49 page 18
question: 49 page 19
question: 53 page 1
question: 57 page 1
question: 59 page 1
question: 61 page 1
question: 63 page 1
question: 65 page 1
question: 72 page 1
question: 72 page 2
question: 72 page 7
question: 72 page 10
question: 72 page 14
question: 72 page 15
question: 72 page 19
question: 73 page 1
question: 73 page 7
question: 73 page 19
question: 74 page 1
question: 74 page 3
question: 74 page 4
question

## Prédictions:

On pose maintenant toutes les questions

In [25]:
pred=[]
for i in question.index:
    print("question :",i)
    list_pred=[]
    for j in pages[i]:
        print("page:",text.index(j))
        input_text = 'contexte: '+ j + 'question: ' + question['question'][i] + "en "+ an + " ?"
        input_ids = tokenizer(input_text, max_length=510,
            truncation="only_second",
            padding="max_length",return_tensors="pt").input_ids.to('cuda')
        outputs = model.generate(input_ids)
        list_pred.append(tokenizer.decode(outputs[0]))
        #print(tokenizer.decode(outputs[0]))
    pred.append(list_pred)

question : 0
question : 1
page: 0
question : 2
question : 3
page: 1
page: 11
page: 12
page: 19
question : 4
question : 5
page: 1
page: 12
question : 6
question : 7
question : 8
question : 9
page: 2
page: 13
page: 14
question : 10
question : 11
page: 1
page: 9
page: 19
question : 12
question : 13
page: 11
page: 12
page: 19
question : 14
question : 15
page: 1
question : 16
question : 17
question : 18
question : 19
question : 20
question : 21
page: 1
page: 9
page: 19
question : 22
question : 23
page: 11
page: 12
page: 19
question : 24
question : 25
page: 1
question : 26
question : 27
question : 28
question : 29
question : 30
question : 31
question : 32
question : 33
question : 34
question : 35
question : 36
question : 37
question : 38
page: 0
question : 39
question : 40
question : 41
question : 42
page: 1
page: 12
question : 43
question : 44
question : 45
question : 46
question : 47
question : 48
question : 49
page: 1
page: 18
page: 19
question : 50
question : 51
question : 52
question : 

In [26]:
pred1=question
pred1["pred"]=pred

In [27]:
for i in pred1.index :
     pred1["pred"][i]=list(map(normalize_text, pred1["pred"][i]))
pred1.head(20)

Unnamed: 0,question,mot,indic,pred
0,Quel est le prix de service au m3 pour l'eau p...,prix,D102.0,[]
1,Quel est le prix du service au m3 pour l'assin...,prix,D204.0,[]
2,Quelle est la connaissance du réseau en eau po...,connaissance,P103.2,[]
3,Quel est le taux de connaissance du réseau en ...,connaissance,P202.2,"[3.2, 100 142, 90, 90 90]"
4,Quel est le taux de renouvellement des réseaux...,renouvellement,P107.2,[]
5,Quel est le taux de renouvellement des réseaux...,renouvellement,P253.2,"[3.9, 0]"
6,Quelle est la conformité microbiologique de l'...,conformité,P101.1,[]
7,Quelle est la conformité physico-chimique de l...,conformité,P102.1,[]
8,Quel est le rendement du réseau de distribution,rendement,P104.3,[]
9,Quel est la conformité dispositifs assinisseme...,conformité,P301.3,"[, 100, 100]"


In [28]:
for i in pred1.index :
     pred1["pred"][i]=list(map(normalize_text, pred1["pred"][i]))
pred1.head(20)

Unnamed: 0,question,mot,indic,pred
0,Quel est le prix de service au m3 pour l'eau p...,prix,D102.0,[]
1,Quel est le prix du service au m3 pour l'assin...,prix,D204.0,[]
2,Quelle est la connaissance du réseau en eau po...,connaissance,P103.2,[]
3,Quel est le taux de connaissance du réseau en ...,connaissance,P202.2,"[3.2, 100 142, 90, 90 90]"
4,Quel est le taux de renouvellement des réseaux...,renouvellement,P107.2,[]
5,Quel est le taux de renouvellement des réseaux...,renouvellement,P253.2,"[3.9, 0]"
6,Quelle est la conformité microbiologique de l'...,conformité,P101.1,[]
7,Quelle est la conformité physico-chimique de l...,conformité,P102.1,[]
8,Quel est le rendement du réseau de distribution,rendement,P104.3,[]
9,Quel est la conformité dispositifs assinisseme...,conformité,P301.3,"[, 100, 100]"


In [29]:
#on affiche que les réponses prédites et pas les vides
for i in pred1.index:
    if not (pred1["pred"][i]):
        pred1=pred1.drop([i])

In [30]:
pred1.head(15)

Unnamed: 0,question,mot,indic,pred
1,Quel est le prix du service au m3 pour l'assin...,prix,D204.0,[]
3,Quel est le taux de connaissance du réseau en ...,connaissance,P202.2,"[3.2, 100 142, 90, 90 90]"
5,Quel est le taux de renouvellement des réseaux...,renouvellement,P253.2,"[3.9, 0]"
9,Quel est la conformité dispositifs assinisseme...,conformité,P301.3,"[, 100, 100]"
11,Quelle est la valeur de D204.0,D204.0,D204.0,"[0, 146.40 146.40, 1.84 183]"
13,Quelle est la valeur de P202.2,P202.2,P202.2,"[100 142, 90, 90 90]"
15,Quelle est la valeur de P253.2,P253.2,P253.2,[3.9]
21,Quelle est la valeur de l'indicateur D204.0,D204.0,D204.0,"[0, 0, 1.84 183]"
23,Quelle est la valeur de l'indicateur P202.2,P202.2,P202.2,"[100 142, 90, 90 90]"
25,Quelle est la valeur de l'indicateur P253.2,P253.2,P253.2,[0]


In [31]:
indic

['D204.0',
 'P202.2',
 'P253.2',
 'P207.0',
 'P256.2',
 'P257.0',
 'P258.1',
 'D203.0',
 'P201.1',
 'P204.3',
 'P205.3',
 'P206.3',
 'P251.1',
 'P252.2',
 'P254.3',
 'P255.3',
 'D302.0',
 'P301.3']

In [32]:
table=pd.DataFrame()
for i in indic:
    my_list=[]
    for j in pred1.index:
        if (pred1["indic"][j]==i):
            my_list=my_list + pred1["pred"][j]
    #print(i,my_list)
    d={"indic":i,"valeur":my_list}
    #print(d)
    df_dictionary = pd.DataFrame([d])
    table = pd.concat([table, df_dictionary], ignore_index=True)
print(table.head())
table.head()    

    indic                                             valeur
0  D204.0   [, 0, 146.40 146.40, 1.84 183, 0, 0, 1.84 183, ]
1  P202.2  [3.2, 100 142, 90, 90 90, 100 142, 90, 90 90, ...
2  P253.2                           [3.9, 0, 3.9, 0, 3.9, 0]
3  P207.0                          [, 0, 0, 5.1, 0, 0 _0___]
4  P256.2                                  [0, , 155 332.28]


Unnamed: 0,indic,valeur
0,D204.0,"[, 0, 146.40 146.40, 1.84 183, 0, 0, 1.84 183, ]"
1,P202.2,"[3.2, 100 142, 90, 90 90, 100 142, 90, 90 90, ..."
2,P253.2,"[3.9, 0, 3.9, 0, 3.9, 0]"
3,P207.0,"[, 0, 0, 5.1, 0, 0 _0___]"
4,P256.2,"[0, , 155 332.28]"


In [33]:
for l in table.index:
    for i in table['valeur'][l]:
        if (re.findall('\d+',i)==[]):
            table['valeur'][l].remove(i)
for l in table.index:
    for i in table['valeur'][l]:        
        if (i==''):
            table['valeur'][l].remove(i)

In [34]:
for l in table.index:
    for i in table["valeur"][l]:
        i = [item.split(' ') for item in i]
#pred

In [35]:
BUCKET_OUT = "mberthe/narval"
FILE_KEY_OUT_S3 = "camemBERT/pre_" + nom_fichier + "_T5_xl.csv" #modif non du csv
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    table.to_csv(file_out,index=False)

In [36]:
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
BUCKET = "mberthe/narval"
FILE_KEY_S3 ="camemBERT/pre_" + nom_fichier + "_T5_xl.csv" #modif nom du csv 
FILE_PATH_S3 = BUCKET + "/" + FILE_KEY_S3

with fs.open(FILE_PATH_S3, mode="rb",index= False) as file_in:
    pred = pd.read_csv(file_in)

In [37]:
import ast

In [38]:
pred

Unnamed: 0,indic,valeur
0,D204.0,"['0', '146.40 146.40', '1.84 183', '0', '0', '..."
1,P202.2,"['3.2', '100 142', '90', '90 90', '100 142', '..."
2,P253.2,"['3.9', '0', '3.9', '0', '3.9', '0']"
3,P207.0,"['0', '0', '5.1', '0', '0 _0___']"
4,P256.2,"['0', '155 332.28']"
5,P257.0,['0']
6,P258.1,['0']
7,D203.0,"['1.10.2', '40 0', '1024', '0', '40 0', '0', '..."
8,P201.1,"['1.4', '8.4', '62', '0', '0', '100', '0', '10..."
9,P204.3,"['0', '93', '100', '100', '0', '100', '100']"


In [39]:
type(pred["valeur"][0])

str

In [40]:
#Nous avons un string donc, on le met sous forme de liste
for i in pred.index:
   pred['valeur'][i] =res = ast.literal_eval(pred['valeur'][i])

In [41]:
for i in pred.index:
    pred['valeur'][i]= [item.split(' ') for item in pred['valeur'][i]]
#pred

In [42]:
for l in pred.index:
    temp=[]
    for i in pred['valeur'][l]:
        for j in i:
            print(j)
            temp.append(j)
    pred['valeur'][l]=temp
#pred

0
146.40
146.40
1.84
183
0
0
1.84
183
3.2
100
142
90
90
90
100
142
90
90
90
100
142
90
90
90
3.2
203
0
90
90
90
3.9
0
3.9
0
3.9
0
0
0
5.1
0
0
_0___
0
155
332.28
0
0
1.10.2
40
0
1024
0
40
0
0
40
0
40
0
1.4
8.4
62
0
0
100
0
100
142
100
0
93
100
100
0
100
100
100
100
__0______
100
0
100
100
3.6
96
0
91.91
100
0
0
0
100
0
3.7
58
0
0
0
0
100
100


In [43]:
import re
for l in pred.index:
    my_list=[]
    for i in pred['valeur'][l]:
       if (re.findall('\d+',i)==[]):
            my_list.append(i)
    for i in my_list:
        pred['valeur'][l].remove(i)

In [44]:
reg=r"\d\.(?!0)"

for i in pred.index:
    if pred["indic"][i] in ["P103.2","P202.2","P254.3","P255.3"]:
        print(pred['valeur'][i])
        temp=[]
        for x in pred['valeur'][i]:
            print(x)
            if (re.findall(reg,x)!=[]):
                temp.append(x)
                print('ajoute x à temp:', x)
        for x in temp:
            pred['valeur'][i].remove(x)
            print('retire :',x)
        print(pred['valeur'][i])

['3.2', '100', '142', '90', '90', '90', '100', '142', '90', '90', '90', '100', '142', '90', '90', '90', '3.2', '203', '0', '90', '90', '90']
3.2
ajoute x à temp: 3.2
100
142
90
90
90
100
142
90
90
90
100
142
90
90
90
3.2
ajoute x à temp: 3.2
203
0
90
90
90
retire : 3.2
retire : 3.2
['100', '142', '90', '90', '90', '100', '142', '90', '90', '90', '100', '142', '90', '90', '90', '203', '0', '90', '90', '90']
['0', '0']
0
0
['0', '0']
['0']
0
['0']


In [53]:
nofloat=[]
for i in pred.index:
    mylist = []
    print(i, pred['valeur'][i])
    for item in pred['valeur'][i]:
        try:
            mylist.append(float(item))
        except ValueError:
            nofloat.append(item)
    for item in nofloat:
        pred["valeur"][i].remove(item)
    pred['valeur'][i]=mylist

0 [1.84]
1 [100.0, 90.0]
2 []
3 []
4 [155.0, 0.0, 332.28]
5 [0.0]
6 [0.0]
7 [1024.0, 40.0, 0.0]
8 [8.4, 100.0, 142.0, 1.4, 62.0, 0.0]
9 [93.0, 100.0, 0.0]
10 [0.0, 100.0]
11 {'100', '3.6', '96', '91.91', '0'}
12 {'3.7'}
13 {'58', '0'}
14 {'0'}
15 {'0'}
16 set()
17 {'100'}


In [54]:
for i in pred.index:
    pred["valeur"][i]=set(pred["valeur"][i])

## Ajout des bornes

In [56]:
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
BUCKET = "mberthe/narval"
FILE_KEY_S3 = "camemBERT/bornes.csv"
FILE_PATH_S3 = BUCKET + "/" + FILE_KEY_S3

with fs.open(FILE_PATH_S3, mode="rb",index=False) as file_in:
    df = pd.read_csv(file_in)

In [57]:
pred['min']=""
pred['max']=""
for i in df.index:
    for j in pred.index:
        if df['indic'][i]==pred['indic'][j]:
            pred["min"][j]=df['min'][i]
            pred['max'][j]=df['max'][i]

In [58]:
for i in pred.index:
    temp=[]
    for x in pred['valeur'][i]:
        if (x<=pred['min'][i]) or (x>pred['max'][i]):
            temp.append(x)
    for x in temp:
        pred['valeur'][i].remove(x)

In [59]:
pred

Unnamed: 0,indic,valeur,min,max
0,D204.0,{1.84},0.5,3.0
1,P202.2,"{90.0, 100.0}",0.0,120.0
2,P253.2,{},0.0,3.0
3,P207.0,{},0.0,0.1
4,P256.2,{},0.0,30.0
5,P257.0,{},0.0,5.0
6,P258.1,{},0.0,20.0
7,D203.0,"{1024.0, 40.0}",0.0,1000000000.0
8,P201.1,"{100.0, 62.0}",50.0,100.0
9,P204.3,"{100.0, 93.0}",0.0,100.0


In [60]:
BUCKET_OUT = "mberthe/narval"
FILE_KEY_OUT_S3 = "camemBERT/results.csv"#modif nom csv
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    results.to_csv(file_out,index=False,sep=';')