In [6]:
from typing import Any, Dict, List, Literal, Union
import numpy as np
from numpy.typing import NDArray
import pandas as pd
import glob

In [29]:
def check_empty_label(labels_dict: Dict[str, List[str]]):
    """Tell whether a labels dictionary is free of empty names and empty labels.

    Args:
        labels_dict: Mapping from a document name to the list of labels read
            from that document.

    Returns:
        bool: ``False`` when ``''`` is one of the keys or appears inside any of
        the label lists, ``True`` otherwise (including for an empty mapping).
    """
    if '' in labels_dict:
        return False
    # The values are lists of labels, so the empty string must be searched for
    # inside each list; testing it against the lists themselves never matches.
    return all('' not in labels for labels in labels_dict.values())

In [31]:
def convert_to_labels_dict(Path_data):
    """Collect the label column of every file matching a glob pattern.

    Every line of a matching file is split on tab characters and its second
    field, stripped of surrounding whitespace, is taken as the label.

    Args:
        Path_data: Glob pattern selecting the files to read.

    Returns:
        A dict mapping each file's name (last path component, up to its first
        ".") to the list of labels in file order, or ``None`` when
        ``check_empty_label`` rejects the collected dictionary.

    Raises:
        IndexError: If a line contains no tab character.
    """
    labels_dict = {}
    for file_ in glob.glob(Path_data):
        # Context manager so the handle is closed as soon as the file is read.
        with open(file_) as handle:
            labels = [sent.split("\t")[1].strip() for sent in handle]
        # Take the last path component rather than a fixed position, so the
        # pattern may point at a directory of any depth.
        fileName = file_.split("/")[-1].split(".")[0]
        labels_dict[fileName] = labels
    if check_empty_label(labels_dict):
        return labels_dict
    # Callers must be ready for None when an empty name or label was found.
    return None

In [32]:
def flatten_labels_per_patent(labels_per_patent: Dict[str, List[str]]):
    """Concatenate the labels of all patents into one flat list.

    Patents are visited in sorted order of their names, and the labels of each
    patent keep their original order.

    Args:
        labels_per_patent: Mapping from patent name to its list of labels.

    Returns:
        A new list holding every label of every patent.
    """
    return [
        label
        for patent_name in sorted(labels_per_patent)
        for label in labels_per_patent[patent_name]
    ]

In [33]:
def compute_l2i_and_i2l(labels):
    """Build the label -> index and index -> label lookup tables.

    Indices are handed out consecutively from 0, in order of first appearance
    of each distinct label.

    Args:
        labels: Iterable of hashable labels, possibly with repetitions.

    Returns:
        A ``(l2i, i2l)`` tuple of dictionaries that are inverse of each other.
    """
    label_to_index = {}
    for label in labels:
        # The default is only stored when the label has not been seen yet.
        label_to_index.setdefault(label, len(label_to_index))
    index_to_label = {index: label for label, index in label_to_index.items()}
    return label_to_index, index_to_label

In [56]:
def load_input_data(path_document,encoded_label):
    """Read a tab-separated document into a DataFrame of texts and encoded labels.

    Each line is split on tab characters: the first field is the text and the
    second field, stripped of surrounding whitespace, is the label name, which
    is translated through ``encoded_label``.

    Args:
        path_document: Path of the file to read.
        encoded_label: Mapping from label name to its encoded value.

    Returns:
        pandas.DataFrame with the columns ``text`` and ``label``, one row per
        line of the file, indexed 0..n-1.

    Raises:
        IndexError: If a line contains no tab character.
        KeyError: If a label is missing from ``encoded_label``.
    """
    texts = []
    labels = []
    with open(path_document) as file_:
        for line in file_:
            fields = line.split('\t')
            texts.append(fields[0])
            labels.append(encoded_label[fields[1].strip()])
    if not texts:
        # Same empty frame (and column dtypes) as before for a file without lines.
        return pd.DataFrame({'text': str(), 'label': int()}, index=[])
    # Build the frame once: concatenating row by row copies the whole frame on
    # every line and leaves the label column with object dtype.
    return pd.DataFrame({'text': texts, 'label': labels})

In [79]:
def check_sentence_length(Path_data,tokenizer,encoded_label,max_length=512):
    """Measure tokenized sentence lengths for every document matching a glob pattern.

    Args:
        Path_data: Glob pattern selecting the documents to inspect.
        tokenizer: Object exposing ``tokenize(text)`` and returning a sized
            sequence of tokens.
        encoded_label: Mapping from label name to encoded value, forwarded to
            ``load_input_data``.
        max_length: Token count above which a sentence is reported as too
            long. Defaults to 512, the limit that was previously hard-coded.
            NOTE(review): if ``tokenize`` does not count the special tokens the
            model adds, the effective limit may be lower — confirm.

    Returns:
        A ``(sentence_nb_length, sentence_long)`` tuple where
        ``sentence_nb_length`` maps each document name (last path component,
        up to its first ".") to ``(number of sentences, rounded mean token
        count, maximum token count)`` and ``sentence_long`` lists the
        ``(path, row index)`` of every sentence longer than ``max_length``.
        A document without any sentence is reported as ``(0, 0, 0)``.
    """
    import statistics
    sentence_nb_length = {}
    sentence_long = []
    for path_document in glob.glob(Path_data):
        df = load_input_data(path_document,encoded_label)
        sentence_length = []
        for idx, text in df['text'].items():
            nb_tokens = len(tokenizer.tokenize(text))
            if nb_tokens > max_length:
                sentence_long.append((path_document,idx))
            sentence_length.append(nb_tokens)
        document_name = path_document.split("/")[-1].split(".")[0]
        if sentence_length:
            stats = (len(df),
                     round(statistics.mean(sentence_length),0),
                     max(sentence_length))
        else:
            # statistics.mean() and max() both raise on an empty sequence.
            stats = (0, 0, 0)
        sentence_nb_length[document_name] = stats
    return sentence_nb_length,sentence_long


In [92]:
import numpy as np
from sklearn.model_selection import KFold

def split_for_cross_validation(Path_data,nb_fold):
    """
    Split the files matching a glob pattern into train and validation sets
    for k-fold cross-validation.

    Args:
        Path_data: Glob pattern selecting the files to distribute.
        nb_fold: Number of folds handed to ``KFold`` as ``n_splits``.

    Returns:
        A dict mapping the fold number (starting at 1) to a
        ``(train_docs, val_docs)`` tuple of lists of file paths.
    """
    # glob gives no ordering guarantee, so sort the paths first: otherwise the
    # fixed random_state would still yield different folds from one machine
    # (or file system) to another.
    files = sorted(glob.glob(Path_data))
    kf = KFold(n_splits=nb_fold, shuffle=True, random_state=42)
    folds = {}
    for fold_id, (train_index, val_index) in enumerate(kf.split(files), start=1):
        train_docs = [files[i] for i in train_index]
        val_docs = [files[i] for i in val_index]
        folds[fold_id] = (train_docs,val_docs)
    return folds

In [None]:
# NOTE(review): this import rebinds split_for_cross_validation, which is also
# defined earlier in this notebook — confirm the version in util is the one
# intended here.
from util import split_for_cross_validation

# Glob pattern selecting the training documents.
Path_data = "../data/train/*.txt"
# Build 4 train/validation folds over the matching files and show them.
folds = split_for_cross_validation(Path_data,4)
print(folds)


In [84]:
# Build the label <-> index mappings from the labels of every training document.
# NOTE(review): convert_to_labels_dict can return None, in which case the
# flatten_labels_per_patent call below fails — confirm this is acceptable.
Path_data = "../data/train/*.txt"
labels_dict = convert_to_labels_dict(Path_data)
l2i, i2l = compute_l2i_and_i2l(flatten_labels_per_patent(labels_dict))


from transformers import BertTokenizer
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# sentence_nb_length: per-document (sentence count, rounded mean token count,
# max token count); long: (path, row index) of every sentence whose token
# count exceeds the limit checked inside check_sentence_length.
sentence_nb_length, long = check_sentence_length(Path_data,tokenizer,l2i)
# NOTE(review): if re-enabled, the line below would rebind the built-in `sorted`.
#sorted = {k: v for k, v in sorted(sentence_nb_length.items())}
print(sentence_nb_length)
print(long)




{'G2': (284, 39.0, 121), 'G1': (224, 36.0, 172), 'F1': (109, 46.0, 133), 'F2': (245, 32.0, 94), 'A1': (393, 45.0, 385), 'A2': (228, 42.0, 221), 'C1': (349, 41.0, 242), 'C2': (681, 34.0, 299), 'B2': (101, 36.0, 138), 'B1': (217, 35.0, 154), 'E2': (102, 38.0, 85), 'E1': (364, 32.0, 121), 'D1': (307, 35.0, 138), 'H2': (221, 32.0, 89), 'D2': (106, 31.0, 100), 'H1': (193, 38.0, 99)}
[]


In [None]:
import pandas as pd
import numpy as np
import json
import torch

from datasets import Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainerCallback, TrainingArguments
from torch.utils.data import DataLoader

from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report

# Checkpoint name passed to the from_pretrained() calls that build the
# tokenizer and the classification model.
BASE_MODEL = "bert-base-cased"
# NOTE(review): the four values below look like training hyper-parameters
# (optimizer learning rate, maximum tokenized length, batch size, number of
# epochs) — confirm against the training cells that consume them.
LEARNING_RATE = 1e-4
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 50



In [None]:
# Load the tokenizer of the BASE_MODEL checkpoint; this rebinds the `tokenizer`
# name created earlier with BertTokenizer.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Load the same checkpoint with a sequence-classification head. i2l and l2i
# must already exist in the kernel (they are built from the training labels by
# compute_l2i_and_i2l); presumably they let the model config carry the label
# names — confirm against the transformers documentation.
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, id2label=i2l, label2id=l2i)
