In [1]:
import nltk
# Download the 'punkt' tokenizer models into the local nltk_data directory.
# NOTE(review): tokenization in this notebook goes through RegexpTokenizer;
# confirm that punkt is still needed before keeping this network call.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/amina/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import pandas as pd

In [4]:
def annotate(text, psr, tag):
    """Mark where the sub-sequence ``psr`` first starts inside ``text``.

    Args:
        text: Sliceable sequence of tokens (e.g. a tokenized sentence).
        psr: Sequence of tokens to look for as a contiguous run in ``text``.
        tag: Label name, one of 'O', 'det', 'appo', 'ambiguë'.

    Returns:
        A list of ``len(text)`` integers. Every entry is 0 except the index
        at which the first occurrence of ``psr`` begins, which holds the
        integer id of ``tag``. The list is all zeros when ``psr`` is empty
        or does not occur in ``text``.

    Raises:
        KeyError: if ``psr`` is found and ``tag`` is not a known label.
    """
    # Label name -> integer id stored in the annotation list.
    labels = {'O': 0, 'det': 1, 'appo': 2, 'ambiguë': 3}

    annotation = [0] * len(text)
    psr_len = len(psr)

    # An empty pattern compares equal to the empty slice at index 0; without
    # this guard the first token would be tagged (or, for an empty text,
    # the assignment below would raise IndexError).
    if psr_len == 0:
        return annotation

    # Only the first token of the first match is tagged.
    for start in range(len(text) - psr_len + 1):
        if text[start:start + psr_len] == psr:
            annotation[start] = labels[tag]
            break

    return annotation

In [6]:
from nltk import RegexpTokenizer

def tokenizer(text):
    """Split ``text`` into a list of tokens with a regex-based tokenizer.

    The pattern tries three alternatives in order: one word character
    followed by an apostrophe (an elided form such as "d'"), a run of word
    characters, or a single character that is neither a word character nor
    whitespace.
    """
    pattern = r'''\w'|\w+|[^\w\s]'''
    splitter = RegexpTokenizer(pattern)
    return splitter.tokenize(text)



In [None]:
import re

# Typographic variants mapped to their plain replacement. This table is the
# single source of truth: the regex below is built from its keys, so a
# character listed here can no longer be missing from the pattern (the
# non-breaking space used to be mapped but never matched).
_CHAR_REPLACEMENTS = {
    '’': "'", '´': "'", '`': "'", '‘': "'",
    '«': '"', '»': '"', '“': '"', '”': '"',
    '–': '-', '—': '-',
    '…': ' ',
    '\xa0': ' ',
}
_CHAR_PATTERN = re.compile(
    '[' + ''.join(re.escape(char) for char in _CHAR_REPLACEMENTS) + ']'
)


def replace_characters(match: re.Match) -> str:
    """Return the plain replacement for the single character matched.

    Args:
        match: A regex match whose whole text is one key of the
            replacement table.

    Returns:
        The replacement string for that character.

    Raises:
        KeyError: if the matched text is not a key of the table.
    """
    return _CHAR_REPLACEMENTS[match.group(0)]


def normalize_text(text: str) -> str:
    """Replace typographic quotes, dashes, ellipses and non-breaking spaces.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` with every character of the replacement table substituted,
        then stripped of leading and trailing whitespace.
    """
    return _CHAR_PATTERN.sub(replace_characters, text).strip()

In [7]:
# Location of the source workbook (relative to the notebook's working
# directory) and the sheet to read from it.
file_path = '../data/relatives.xlsx'
sheet_name = 'Feuille 1_2'
# Later cells read the 'Phrase', 'PSR' and 'Annotation' columns of this frame.
df = pd.read_excel(file_path, sheet_name=sheet_name)

In [10]:
# Apply normalize_text to every cell of the frame (quotes, apostrophes, dashes).
# NOTE(review): normalize_text runs a regex substitution on each cell, so this
# assumes every cell is a string (no NaN / numeric cells) — confirm.
df = df.map(normalize_text)

In [11]:
# Tokenize the 'Phrase' and 'PSR' columns: each cell is replaced by the list
# of tokens returned by tokenizer().
df['Phrase'] = df['Phrase'].apply(tokenizer)
df['PSR'] = df['PSR'].apply(tokenizer)

In [None]:
# Tokenize both columns in a single pass. The columns must be selected with a
# list of labels: df['Phrase', 'PSR'] looks up ONE column keyed by a tuple and
# raises KeyError. Cells that are already token lists are left untouched, so
# this cell is safe to run after the per-column tokenization cell.
df[['Phrase', 'PSR']] = df[['Phrase', 'PSR']].map(
    lambda cell: tokenizer(cell) if isinstance(cell, str) else cell
)

In [13]:
# Build the 'psr_tags' column: for each row, annotate() receives the tokenized
# Phrase, the tokenized PSR and the row's Annotation label and returns one
# integer tag per token.
df['psr_tags'] = df.apply(lambda row: annotate(row['Phrase'], row['PSR'], row['Annotation']), axis=1)

In [14]:
# Keep only what the dataset needs: drop the helper columns and expose the
# token lists under the name 'tokens'. The frame is rebound through a method
# chain rather than mutated with inplace=True, so the step reads as a single
# transformation and no other reference to the old frame is silently changed.
df = (
    df
    .drop(columns=['PSR', 'Annotation'])
    .rename(columns={'Phrase': 'tokens'})
)

In [15]:
# Preview the first rows to check that tokens and psr_tags line up.
df.head()

Unnamed: 0,tokens,psr_tags
0,"[Simone, Veil, occupe, désormais, une, place, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[une, force, qui, a, fait, d', elle, une, actr...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[Les, heures, que, nous, vivons, sont, de, cel...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"[Je, veux, aujourd, ', hui, vous, parler, du, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
4,"[des, institutions, que, je, veux, changer]","[0, 0, 1, 0, 0, 0]"


In [16]:
# Split the dataset into train, validation and test sets.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for test, then 20% of the remaining rows for
# validation (about 64% / 16% / 20% overall). The fixed random_state makes
# both splits reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [25]:
# Replace the shuffled row labels left by train_test_split with a fresh
# 0..n-1 index on each split. drop=True discards the old labels instead of
# keeping them as a column.
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [22]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,tokens,psr_tags
197,"[Mes, chers, collègues, ,, scellons, donc, auj...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
330,"[La, loi, du, 26, janvier, 2016, de, modernisa...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
321,"[Nous, avons, par, ailleurs, pris, différentes...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
430,"[Ces, travaux, débutent, aujourd, ', hui, sous...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
173,"[Ceux, qui, profitent, du, rouleau, compresseu...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [19]:
from datasets import Dataset, Features, Sequence, Value, ClassLabel

In [20]:
# Schema of the exported dataset: 'tokens' is a variable-length list of
# strings, 'psr_tags' a variable-length list of class labels. The label order
# fixes the integer ids (O=0, DET=1, APPO=2, AMBIGUE=3).
features = Features(
    {
        'tokens': Sequence(feature=Value(dtype='string')),
        'psr_tags': Sequence(
            feature=ClassLabel(names=['O', 'DET', 'APPO', 'AMBIGUE'])
        ),
    }
)

In [26]:
# Convert each split to a Hugging Face Dataset with the declared schema.
# preserve_index=False keeps the pandas row index out of the dataset, so the
# result holds only the columns listed in `features`, whether or not the
# frames' indexes were reset beforehand.
train_ds = Dataset.from_pandas(train_df, features=features, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, features=features, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, features=features, preserve_index=False)

In [27]:
from datasets import DatasetDict

# Bundle the three splits into one DatasetDict keyed by split name.
dataset = DatasetDict(
    train=train_ds,
    validation=val_ds,
    test=test_ds,
)

In [28]:
# Display the splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'psr_tags'],
        num_rows: 281
    })
    validation: Dataset({
        features: ['tokens', 'psr_tags'],
        num_rows: 71
    })
    test: Dataset({
        features: ['tokens', 'psr_tags'],
        num_rows: 88
    })
})

In [30]:
from huggingface_hub import notebook_login

# Render the interactive Hugging Face login widget; authentication is needed
# before the dataset can be pushed to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
# Upload the DatasetDict (all three splits) to the Hugging Face Hub under the
# repository name 'relatives_psr_fr'.
dataset.push_to_hub('relatives_psr_fr')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/djamina/relatives_psr_fr/commit/debf020a6cd5995a93aa765318bc492648cebd37', commit_message='Upload dataset', commit_description='', oid='debf020a6cd5995a93aa765318bc492648cebd37', pr_url=None, pr_revision=None, pr_num=None)