In [108]:
batch_size = 10

In [109]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import DistilBertTokenizer, DataCollatorWithPadding
import multiprocessing
from tqdm.notebook import tqdm

In [110]:
np.random.seed(987654321)
tf.random.set_seed(987654321)

In [111]:
docs=pd.read_csv('LinkedInPosts_20220521_2.csv', sep=',', names=["authors","content","doc_id","note"],encoding='utf-8', header=1)
docs

Unnamed: 0,authors,content,doc_id,note
0,,,2,3.0
1,"Mahabubur Rahman, Ph.D.",Some industry-relevant insights for corporate ...,3,1.0
2,Anne Gilbert,"Très heureuse de partager le GreenBook, notre ...",4,1.0
3,Alice IUFP_USMB,#Métiersdavenir Master Chimie Verte et Eco-inn...,5,1.0
4,,,6,2.0
...,...,...,...,...
1431,,,1433,
1432,Nicolas Breyton,"Notre nouvelle Premier Ministre, au cours d’un...",1434,
1433,Nicolas Breyton,"Robert Habeck, ministre allemand vert, cherche...",1435,
1434,Nicolas Breyton,Excellent dossier sur l’Etat de l’art de la fu...,1436,


In [112]:
docs.dropna(subset=['content'], inplace=True)
docs.dropna(subset=['note'], inplace=True)

In [113]:
indexNames = docs[docs['note'] == 0].index
docs.drop(indexNames , inplace=True)
docs

Unnamed: 0,authors,content,doc_id,note
1,"Mahabubur Rahman, Ph.D.",Some industry-relevant insights for corporate ...,3,1.0
2,Anne Gilbert,"Très heureuse de partager le GreenBook, notre ...",4,1.0
3,Alice IUFP_USMB,#Métiersdavenir Master Chimie Verte et Eco-inn...,5,1.0
5,Fernando Diaz Lopez,#callforpapers #preannouncement Attention all...,7,1.0
7,Jaylym Aldryne Escorpizo,"Good day, dear connections! I'm currently cond...",9,4.0
...,...,...,...,...
1357,Dale R.,“The secret of getting ahead is getting starte...,1359,4.0
1358,Maria Cuba,Today is my 11th year anniversary at Airbnb. W...,1360,4.0
1359,Bryan Most,"While my time at NYSHEX has come to an end, it...",1361,4.0
1360,Samson Tan (Dr),"After almost five years at NIE, I'm ready for ...",1362,4.0


In [114]:
docs.loc[docs["note"] == 2.0, "note"] = 1
docs.loc[docs["note"] == 3.0, "note"] = 0
docs.loc[docs["note"] == 4.0, "note"] = 0
docs

Unnamed: 0,authors,content,doc_id,note
1,"Mahabubur Rahman, Ph.D.",Some industry-relevant insights for corporate ...,3,1.0
2,Anne Gilbert,"Très heureuse de partager le GreenBook, notre ...",4,1.0
3,Alice IUFP_USMB,#Métiersdavenir Master Chimie Verte et Eco-inn...,5,1.0
5,Fernando Diaz Lopez,#callforpapers #preannouncement Attention all...,7,1.0
7,Jaylym Aldryne Escorpizo,"Good day, dear connections! I'm currently cond...",9,0.0
...,...,...,...,...
1357,Dale R.,“The secret of getting ahead is getting starte...,1359,0.0
1358,Maria Cuba,Today is my 11th year anniversary at Airbnb. W...,1360,0.0
1359,Bryan Most,"While my time at NYSHEX has come to an end, it...",1361,0.0
1360,Samson Tan (Dr),"After almost five years at NIE, I'm ready for ...",1362,0.0


In [115]:
print("nombre de documents pertinents : "+str(np.count_nonzero(docs['note'] == 1)))
print("nombre de documents non pertinents : "+str(np.count_nonzero(docs['note'] == 0)))

nombre de documents pertinents : 132
nombre de documents non pertinents : 775


In [116]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

In [124]:
def tokenize_sample(sample):
    return tokenizer(sample, truncation=True)

def distributed_tokenize_dataset(dataset):
    ds = list(dataset)
    print(len(ds))
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        tokenized_ds = list(tqdm(
            pool.imap(tokenize_sample, ds),
            total=len(ds)
        ))
    docs_list = list(docs['doc_id'])
    #print(docs_list)
    for idx, doc in enumerate(tokenized_ds):
        doc['doc_id'] = docs_list[idx]
    return tokenized_ds

#tokenized_x_train = distributed_tokenize_dataset(x_train)
#tokenized_x_test = distributed_tokenize_dataset(x_test)

tokenized_ds =  distributed_tokenize_dataset(docs['content'])
tokenized_ds

907


  0%|          | 0/907 [00:00<?, ?it/s]

[{'input_ids': [101, 13885, 17425, 118, 44622, 15498, 103437, 10142, 46666, 23467, 98514, 10108, 18138, 84459, 10146, 10114, 10105, 21316, 10108, 32704, 41011, 113, 173, 10812, 118, 64502, 117, 10266, 58334, 51608, 111, 64416, 10106, 13382, 13409, 25744, 114, 10135, 10105, 26545, 10108, 20570, 19211, 10108, 10105, 46666, 23467, 113, 10169, 150, 119, 44777, 19016, 118, 62814, 117, 10404, 17199, 10104, 46495, 13315, 117, 16149, 10111, 90796, 66679, 155, 119, 91182, 11189, 117, 149, 65729, 10404, 117, 15497, 114, 119, 13716, 10546, 10161, 119, 10160, 120, 187, 11274, 11273, 12022, 13966, 108, 23467, 10230, 108, 23467, 56923, 108, 170, 10729, 14496, 29786, 10230, 108, 32704, 29378, 11710, 108, 26069, 108, 10846, 37879, 35717, 108, 173, 10812, 24488, 23033, 11046, 108, 29465, 100856, 108, 15648, 64208, 10112, 112, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [127]:
data_collator = DataCollatorWithPadding(tokenizer, padding='longest', return_tensors="tf")
data_collator(tokenized_ds)

{'input_ids': <tf.Tensor: shape=(907, 512), dtype=int32, numpy=
array([[  101, 13885, 17425, ...,     0,     0,     0],
       [  101, 91985, 47034, ...,     0,     0,     0],
       [  101,   108,   150, ...,     0,     0,     0],
       ...,
       [  101, 14600, 15127, ...,     0,     0,     0],
       [  101, 11301, 17122, ...,     0,     0,     0],
       [  101,   138, 35858, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(907, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>, 'doc_id': <tf.Tensor: shape=(907,), dtype=int32, numpy=
array([   3,    4,    5,    7,    9,   10,   11,   13,   14,   15,   16,
         17,   18,   19,   22,   23,   24,   25,   26,   27,   28,   29,
         30,   31,   32,   33,   34,   35,   36,   37,   38,   39,   40,
         

In [136]:
def make_dataset(x, y):
    collated = data_collator(x)
    dataset = tf.data.Dataset.from_tensor_slices(
        (collated['input_ids'], collated['attention_mask'], collated['doc_id'] ,y)
    )
    transformed_dataset = (
        dataset
        .map(
            lambda x, y, i, z: ((x, y, i), z)
        )
        #.shuffle(25000)
        .batch(batch_size)
    )
    return transformed_dataset

all_ds = make_dataset(tokenized_ds, docs['note'])
print(all_ds)
for x, y in all_ds:
    print(x)
    print(y)
    break
print ("done")
for x, y in all_ds:
    print(x)
    print(y)
    break
#re répartir en ds

<_BatchDataset element_spec=((TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None)), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>
(<tf.Tensor: shape=(10, 512), dtype=int32, numpy=
array([[  101, 13885, 17425, ...,     0,     0,     0],
       [  101, 91985, 47034, ...,     0,     0,     0],
       [  101,   108,   150, ...,     0,     0,     0],
       ...,
       [  101, 85526,   118, ...,     0,     0,     0],
       [  101, 31827, 14926, ...,     0,     0,     0],
       [  101, 66717, 47264, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(10, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>, <tf.Tensor: shape=(10,), dtype=int32, numpy=array([ 3,  4, 

2023-05-19 18:25:00.521181: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype double and shape [907]
	 [[{{node Placeholder/_3}}]]


In [137]:
DATASET_SIZE = all_ds.cardinality().numpy()

print("DATASET_SIZE "+str(DATASET_SIZE))
train_size = int(0.8 * DATASET_SIZE)
print(train_size)
all_ds_shuffled = all_ds.shuffle(DATASET_SIZE, seed=2568, reshuffle_each_iteration=False)
train_ds = all_ds_shuffled.take(train_size)
test_ds = all_ds_shuffled.skip(train_size)

DATASET_SIZE 91
72


In [184]:
train_ids = []
for x, y in train_ds:
    train_ids += list(x[2].numpy())
print(len(train_ids))

717


In [185]:
test_ids = []
for x, y in test_ds:
    test_ids += list(x[2].numpy())
print(len(test_ids))


190


2023-05-19 18:47:55.720679: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype double and shape [907]
	 [[{{node Placeholder/_3}}]]


In [211]:
train_ds = docs[docs['doc_id'].isin(train_ids)]
with open('train_ds.csv', 'w') as f:
    f.write(train_ds.to_csv())


In [212]:
test_ds = docs[docs['doc_id'].isin(test_ids)]
with open('test_ds.csv', 'w') as f:
    f.write(test_ds.to_csv())


In [213]:
len(train_ds)

717

In [214]:
len(test_ds)

190