In [2]:
import pandas as pd
pd.set_option('display.max_rows', None)

In [12]:
pd.set_option('display.max_columns', None)

### Jewellery documents

In [35]:
import os

# Specify the directory you want to search
directory = r'C:\Users\delll\Downloads\metadata\ListData_Jewellery'

# Initialize a list to hold all the file paths
json_files = []

# Walk through directory
for dirpath, dirnames, files in os.walk(directory):
    for file in files:
        if file.endswith(".json"):
            json_files.append(os.path.join(dirpath, file))

json_files

['C:\\Users\\delll\\Downloads\\metadata\\ListData_Jewellery\\ListData_Jew03_20240203_205630.json',
 'C:\\Users\\delll\\Downloads\\metadata\\ListData_Jewellery\\ListData_Jew2_20240126_164111.json']

In [36]:
list_df_writing = []
for json_file in json_files:
    df = pd.read_json(json_file)
    df['FileLeafRef'] = df['FileLeafRef'].fillna('')
    df = df[df['FileLeafRef'].str.contains('.pdf')].drop_duplicates()
    df['FileLeafRef'] = df['FileLeafRef'].apply(lambda x: x.replace('.aspx', ''))
    list_df_writing.append(df)

In [None]:
df_concatenated = pd.concat(list_df_writing, ignore_index=True).drop_duplicates()

def change_year_format(year_str):
    if year_str:
        return int(year_str.replace(',', ''))
    else:
        return ''
    
def change_str_to_int(string):
    if string:
        return int(string)
    else:
        return 0
    
df_concatenated['AnneePublication'] = df_concatenated['AnneePublication'].apply(change_year_format)
df_concatenated['Modified'] = df_concatenated['Modified'].astype(str)
df_concatenated['Created'] = df_concatenated['Created'].astype(str)

In [None]:
remaining_columns = ['FileLeafRef', 'AnneePublication', 'BijouxType', 'QueVoyezVous',
                     'Modified', 'Depot',  'Region', 'Created', 'Author', 'Editor']


depot_region_mappings = {
    "CN": 'Chine',
    "HK": 'Hong Kong',
    "IFPI": 'Suisse',
    "INPI": 'France',
    "OHMI": 'UE',
    "OMPI": 'Monde',
    "USPTO": 'USA',
    "": ''
}
df_concatenated['Region'] = df_concatenated['Depot'].map(depot_region_mappings)

df_concatenated = df_concatenated[remaining_columns]

In [None]:
file_path = 'Downloads/original_metadata.xlsx'

# Read the Excel file
df_jewellery_metadata = pd.read_excel(file_path, sheet_name='Jewellery')
df_jewellery_metadata = df_jewellery_metadata[['Level 0', 'Level 1', 'Level 2', 'Alias Catalog']].fillna('')


In [41]:
def concatenate_strings(row, columns):
    list_strings = [row[col] for col in columns if row[col]][:-1]
    return '.'.join(list_strings).lower()

In [None]:
level_columns = ['Level 0', 'Level 1', 'Level 2']
df_jewellery_metadata['paths'] = df_jewellery_metadata.apply(lambda row: concatenate_strings(row, level_columns), axis=1)


In [43]:
metadata_mapping = df_jewellery_metadata[['Alias Catalog', 'paths']].drop_duplicates()
metadata_mapping

Unnamed: 0,Alias Catalog,paths
0,QueVoyezVous,jewellery.que voyez vous
237,BijouxType,jewellery.type


In [44]:
d = {'Alias Catalog': ['AnneePublication', 'Modified', 'Created', 'Depot', 'Region', 'Author', 'Editor', 'FileLeafRef'], 
     'paths': ['year', 'modified', 'created', 'depot', 'region', 'author', 'editor', 'filename']}

df_structu_metadata = pd.DataFrame(data=d)
df_structu_metadata

Unnamed: 0,Alias Catalog,paths
0,AnneePublication,year
1,Modified,modified
2,Created,created
3,Depot,depot
4,Region,region
5,Author,author
6,Editor,editor
7,FileLeafRef,filename


In [45]:
metadata_names_df = pd.concat([df_structu_metadata, metadata_mapping], ignore_index=True)
metadata_names_df

Unnamed: 0,Alias Catalog,paths
0,AnneePublication,year
1,Modified,modified
2,Created,created
3,Depot,depot
4,Region,region
5,Author,author
6,Editor,editor
7,FileLeafRef,filename
8,QueVoyezVous,jewellery.que voyez vous
9,BijouxType,jewellery.type


In [46]:
metadata_names_dict = dict(zip(metadata_names_df['Alias Catalog'], metadata_names_df['paths']))
metadata_names_dict

{'AnneePublication': 'year',
 'Modified': 'modified',
 'Created': 'created',
 'Depot': 'depot',
 'Region': 'region',
 'Author': 'author',
 'Editor': 'editor',
 'FileLeafRef': 'filename',
 'QueVoyezVous': 'jewellery.que voyez vous',
 'BijouxType': 'jewellery.type'}

In [47]:
def merge_dictionaries(list_of_dicts):
    merged_dicts = {}
    for d in list_of_dicts:
        if d['path'] in merged_dicts:
            # Extend the list of values if the path is already present
            merged_dicts[d['path']].extend(d['values'])
        else:
            # Otherwise, add the new path and its values
            merged_dicts[d['path']] = d['values']
    # Remove duplicates from the values lists
    for path, values in merged_dicts.items():
        merged_dicts[path] = list(set(values))
    # Convert the merged dictionary back to a list of dictionaries
    merged_list_of_dicts = [{'path': path, 'values': values} for path, values in merged_dicts.items()]
    return merged_list_of_dicts

In [48]:
def get_filter_values(row, metadata_names_dict, cols_filter_values, cols_struct_metadata):
    dict_file = {}
    list_filter_values_paths = []
    for column in cols_struct_metadata:
        dict_file[metadata_names_dict[column]] = row[column]
    for column in cols_filter_values:
        values_taken = row[column]
        if values_taken != '':
            filter_path = metadata_names_dict[column]
            list_values = values_taken.split('|')
            list_filter_values_paths.append({'path': filter_path, 
                                             'values': list_values})
    list_filter_values_paths = merge_dictionaries(list_filter_values_paths)
    dict_file['filterValues'] = list_filter_values_paths
    return dict_file
        

columns_filter_values = ['BijouxType', 'QueVoyezVous']
columns_struct_metadata = ['FileLeafRef', 'AnneePublication', 'Modified', 'Depot', 'Region', 'Created', 'Author', 'Editor']
df_concatenated['dict_filter_values'] = df_concatenated.apply(lambda row: get_filter_values(row, 
                                                                                            metadata_names_dict, 
                                                                                            columns_filter_values,
                                                                                            columns_struct_metadata), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_concatenated['dict_filter_values'] = df_concatenated.apply(lambda row: get_filter_values(row,


In [51]:
import json

# Specify the filename
filename = 'jewellery_documents_metadata.json'

# Write JSON data to a file
with open(filename, 'w', encoding='utf-8') as file:
    json.dump(list(df_concatenated['dict_filter_values']), file, ensure_ascii=False, indent=4)
