# 01 Load modules

In [1]:
import pandas as pd 
import json
import os

In [2]:
base_df=pd.read_csv('/home/e077926/buscode_2023/05_E2_tata/data/receipt_raw_df.csv')
base_df = base_df.drop(columns=['Unnamed: 0'])
base_df.head()

Unnamed: 0,text_lines,doc_id,line_id
0,tan woon yann,X00016469612,1
1,BOOK TA-K (TAMAN DAYA) SDN BHD,X00016469612,2
2,B94 7-W,X00016469612,3
3,"NO.5? 55,57 & 59, JALAN SAGU 18,",X00016469612,4
4,TAMAN DAYA,X00016469612,5


# 02 create id df

In [3]:
base_df['id']=base_df["doc_id"] +"-"+ base_df["line_id"].astype(str)

In [4]:
id_df=base_df[['id','text_lines']]

In [5]:
id_df.head()

Unnamed: 0,id,text_lines
0,X00016469612-1,tan woon yann
1,X00016469612-2,BOOK TA-K (TAMAN DAYA) SDN BHD
2,X00016469612-3,B94 7-W
3,X00016469612-4,"NO.5? 55,57 & 59, JALAN SAGU 18,"
4,X00016469612-5,TAMAN DAYA


# 03 create entities df

In [6]:
# Utiliser str.split pour diviser chaque ligne en mots
id_df['text_lines'] = id_df['text_lines'].str.split()
# Utiliser explode pour créer une ligne pour chaque mot tout en conservant l'ID
entities_df = id_df.explode('text_lines')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id_df['text_lines'] = id_df['text_lines'].str.split()


# 04 add entities labels

We'll use BIO (Begin, Inside, Outside)

'B-' (Begin) pour le premier mot d'une entité, 

'I-' (Inside) pour les mots suivants de la même entité

'O' (Outside) pour les mots qui ne font pas partie d'une entité. 

In [7]:
entities_df.tail()

Unnamed: 0,id,text_lines
18206,X51009453804-20,anv
18206,X51009453804-20,enauirv.
18206,X51009453804-20,please
18206,X51009453804-20,contact
18206,X51009453804-20,us:


In [8]:
entity_folder_path = "/home/e077926/buscode_2023/05_E2_tata/data/SROIE2019/train/entities/"

for entity_file in os.listdir(entity_folder_path):
    file_name, _ = os.path.splitext(entity_file)
    matching_rows = entities_df[entities_df['id'].str.startswith(file_name + '-')]

    if not matching_rows.empty:

        with open(f'{entity_folder_path+file_name}.txt', 'r') as f:
                    data_dict = json.load(f)

                    for index, row in matching_rows.iterrows():
                        # Réinitialiser la valeur à 'O' par défaut
                        entities_df.at[index, 'entity'] = 'O'

                        for k, v in data_dict.items():
                            if row['text_lines'] in v:
                                entities_df.at[index, 'entity'] = k
                        # Si une correspondance est trouvée, vous pourriez sortir de la boucle interne ici
                                break

entities_df=entities_df.reset_index()

# 05 Add BIO

In [9]:
entities_df.head(10)

Unnamed: 0,index,id,text_lines,entity
0,0,X00016469612-1,tan,O
1,0,X00016469612-1,woon,O
2,0,X00016469612-1,yann,O
3,1,X00016469612-2,BOOK,company
4,1,X00016469612-2,TA-K,company
5,1,X00016469612-2,(TAMAN,company
6,1,X00016469612-2,DAYA),company
7,1,X00016469612-2,SDN,company
8,1,X00016469612-2,BHD,company
9,2,X00016469612-3,B94,O


In [10]:
# Créer une nouvelle colonne pour les entités avec les suffixes
entities_df['entity_with_suffix'] = 'O'

# Parcourir le DataFrame
for i in range(len(entities_df)):
    entity = entities_df.at[i, 'entity']

    if entity != 'O':
        # Si c'est la première occurrence, ajouter '-B', sinon '-I'
        suffix = '-B' if entities_df.at[i-1, 'entity_with_suffix'] == 'O' else '-I'
        entities_df.at[i, 'entity_with_suffix'] = entity + suffix

# Supprimer la colonne originale 'entity' si nécessaire
entities_df.drop(columns=['entity'], inplace=True)
# Renommer la nouvelle colonne
entities_df.rename(columns={'entity_with_suffix': 'entity'}, inplace=True)

Unnamed: 0,index,id,text_lines,entity
0,0,X00016469612-1,tan,O
1,0,X00016469612-1,woon,O
2,0,X00016469612-1,yann,O
3,1,X00016469612-2,BOOK,company-B
4,1,X00016469612-2,TA-K,company-I
5,1,X00016469612-2,(TAMAN,company-I
6,1,X00016469612-2,DAYA),company-I
7,1,X00016469612-2,SDN,company-I
8,1,X00016469612-2,BHD,company-I
9,2,X00016469612-3,B94,O


In [17]:
entities_df.tail(60)

Unnamed: 0,index,id,text_lines,entity
72713,18190,X51009453804-4,SR,address-I
72714,18190,X51009453804-4,"1/9,",address-I
72715,18190,X51009453804-4,SEKSYEN,address-I
72716,18190,X51009453804-4,9,address-I
72717,18191,X51009453804-5,TAMAN,address-I
72718,18191,X51009453804-5,SERDANG,address-I
72719,18191,X51009453804-5,"RAYA,",address-I
72720,18192,X51009453804-6,43300,address-I
72721,18192,X51009453804-6,SERI,address-I
72722,18192,X51009453804-6,"KEMBANGAN,",address-I
