## Creating folders for each WARENGRUPPE along with the product data for that WR

This is one-time code that structures the data so that the products of each product group are in their own folder, along with the group's prompt format, allowed keys, and output CSV and JSON files. This way, each product group is treated individually based on its attributes.

In [5]:
## Importing libraries and initializing values class to import the relevant variables
import pandas as pd
from values import *
import os
from dbfread import DBF

# Configuration object from the local `values` module; the input file paths
# and the output parent directory are read from its attributes.
val = Values()

# Decoding settings passed to the CSV readers: code page 850, with
# undecodable bytes substituted instead of raising an error.
encoding_ = 'cp850'
encoding_errors = 'replace'

In [None]:
## Load the input tables: the warengruppe list, the products in the webshop,
## the marketing artikel list that connects those two, and the marketing
## artikel Bezeichnung table.
warengr = pd.read_csv(
    val.wr_filepath,
    delimiter=';',
    encoding=encoding_,
    encoding_errors=encoding_errors,
)
shop = pd.read_excel(val.shop_file_path, engine='openpyxl')
# NOTE(review): this read passes no explicit `encoding`, unlike the other two
# CSV reads, so pandas' default encoding applies - confirm this is intended.
marketing_art = pd.read_csv(
    val.marketing_artikel,
    delimiter=';',
    encoding_errors=encoding_errors,
)
mark_bez_df = pd.read_csv(
    val.mark_bez,
    delimiter=';',
    encoding=encoding_,
    encoding_errors=encoding_errors,
)

## Disabled: the first part of the StoreId number could be used to connect with the marketing artikel list
# shop['NUMMER'] = shop['StoreId'].str.split().str[0]

## Disabled: the katalog data is currently not loaded
# cat_data = pd.read_csv(val.cat_data,delimiter=';',encoding_errors=encoding_errors)

In [None]:
# cat_data.reset_index(drop=True,inplace=True)
# cat_data = cat_data[['NUMMER','BESCHREIBUNG']]
# warengr['WAREN_GRP'] = warengr['WAREN_GRP'].fillna('')
# marketing_art['WARENGR'] = marketing_art['WARENGR'].fillna('')
# marketing_art[marketing_art['NUMMER'].str.contains('554HB04')]['WARENGR']

In [None]:
## Replace missing values in the text columns with empty strings.
## The result is assigned back instead of calling fillna(inplace=True) on a
## column selection: that form is chained assignment, which pandas warns about
## and which only modifies a temporary object once copy-on-write is active.
marketing_art = marketing_art.fillna({'WM': "", 'FARBE': "", 'GROESSE': ""})

### Data Processing

In [None]:

## Normalise the warengruppe keys to plain text so that both tables can be
## compared with ==. After astype(str) some values end in ".0" (presumably
## because the column was read as float - verify), so only that trailing
## suffix is removed. The pattern is escaped and anchored on purpose: an
## unescaped '.0' removes a literal ".0" anywhere in the value or, on pandas
## versions where str.replace treats the pattern as a regex by default, any
## character followed by a zero (turning "100" into "0").
def _as_group_key(column):
    """Return *column* as stripped strings without a trailing ".0"."""
    return column.astype(str).str.strip().str.replace(r'\.0$', '', regex=True)


marketing_art['WARENGR'] = _as_group_key(marketing_art['WARENGR'])

## .copy() turns the column subset into an independent frame, so adding the
## NUMBER column below does not write into a slice of the original table.
marketing_art = marketing_art[['WM', 'NUMMER', 'GROESSE', 'FARBE', 'WARENGR']].copy()

## NUMBER is the fixed-width concatenation NUMMER (8) + GROESSE (4) + FARBE (2)
## with surrounding whitespace removed; it is the key matched against the
## shop's StoreId. GROESSE and FARBE must already be strings here (no NaN).
marketing_art['NUMBER'] = (
    marketing_art['NUMMER'].str.ljust(8)
    + marketing_art['GROESSE'].str.ljust(4)
    + marketing_art['FARBE'].str.ljust(2)
).str.strip()
mark_bez_df['NUMMER'] = mark_bez_df['NUMMER'].str.strip()

warengr['WAREN_GRP'] = _as_group_key(warengr['WAREN_GRP'])



### Data Structure Creation

In the following cell, different tables are joined to create a list of all the artikels that are in the shop and that have a description, so that we can extract their attributes from their description and VAR_TEXT. Some lines are commented out because, for example, the katalog data is no longer needed for this purpose; they can be re-enabled if needed in the future.
Also note that the folder name has been changed from Product_mining to Product_mining_2. So we have two folders; the old one is partially processed and can be used for comparisons.

In [None]:
## For every warengruppe: join its marketing artikels with the shop list,
## build one cleaned description per artikel and write the result to
## <parent_dir>/<num>_<name>/<num>_<name>.xlsx.
wg_data = []  # one [group number, group name, artikel count] entry per group
for num, vals in zip(warengr['WAREN_GRP'], warengr['WAREN_GRNA']):
    ## iterating over warengruppe
    artikels_in_wr = marketing_art[marketing_art['WARENGR'] == str(num)]
    ## connecting marketing artikels for each warengruppe with the shop list
    artikels = pd.merge(artikels_in_wr, shop, how='right', left_on="NUMBER", right_on="StoreId")
    ## selecting necessary columns
    artikels = artikels[['WM', 'NUMMER', 'GROESSE', 'FARBE', 'NUMBER', 'WARENGR', 'Name', 'Beschreibung', 'Var_Text']]
    ## The right join keeps every shop row; rows without a marketing artikel
    ## in this group have no NUMBER. They are dropped before the text cleaning
    ## so the regex passes only run on this group's rows instead of on the
    ## whole shop table for every group. The surviving rows are unchanged.
    artikels = artikels.dropna(subset=['NUMBER']).drop_duplicates('NUMBER')
    wg_data.append([num, vals, len(artikels)])

    if artikels.empty:
        continue

    ## renaming columns (rename returns a new frame, so the column
    ## assignments below do not write into a slice)
    artikels = artikels.rename(columns={'Name': 'NAME', 'Beschreibung': 'NET_BESCHREIBUNG'})
    ## data cleaning (removing html codes and unnecessary characters)
    description = artikels['NET_BESCHREIBUNG']
    ## NOTE(review): as exported, the first replacement swapped a space for a
    ## space (a no-op); presumably the pattern was a non-breaking space, which
    ## is written explicitly here - confirm against the original notebook.
    description = description.str.replace('\xa0', ' ', regex=True)
    description = description.str.replace(r'(<[^>]*>)', ' ', regex=True, case=False)
    description = description.str.replace(r'&nbsp_|&nbsp;|&Oslash', ' ', regex=True)
    artikels['NET_BESCHREIBUNG'] = description
    ## preparing the product description to have all the information by
    ## concatenating both names and description together.
    ## NOTE(review): a missing NAME, Var_Text or description makes the whole
    ## BESCHREIBUNG of that row missing - confirm that this is wanted.
    artikels['BESCHREIBUNG'] = artikels['NAME'] + ' - ' + artikels['Var_Text'] + ' - ' + artikels['NET_BESCHREIBUNG']
    artikels = artikels[['WM', 'NUMMER', 'GROESSE', 'FARBE', 'WARENGR', 'NAME', 'BESCHREIBUNG']]

    ## str() keeps a non-string (e.g. missing) group name from raising here;
    ## '/' would otherwise be read as a path separator.
    new_val = str(vals).replace('/', '_')
    paths = os.path.join(val.parent_dir, f'{num}_{new_val}')
    print(f'{num} {len(artikels)}')
    # Building folders based on the warengruppe numbers; exist_ok makes
    # re-running the cell safe and missing parent folders are created too.
    os.makedirs(paths, exist_ok=True)
    artikels.to_excel(os.path.join(paths, f'{num}_{new_val}.xlsx'), engine='xlsxwriter', index=False)

wg_data

## Code for creating the Product_mining lists based on the Marketing Artikel Bezeichnung data (BANAME1, BANAME2, and BANAME3)

In [None]:
## For every warengruppe: attach the marketing Bezeichnung columns
## (BANAME1-3) to its marketing artikels, keep those that are in the shop and
## write them to <parent_dir>/<num>_<name>/<num>_<name>_shop.xlsx.
wg_data = []  # one [group number, group name, artikel count] entry per group
for num, vals in zip(warengr['WAREN_GRP'], warengr['WAREN_GRNA']):
    ## iterating over warengruppe
    artikels_in_wr = marketing_art[marketing_art['WARENGR'] == str(num)]
    ## adding the Bezeichnung columns; both tables have a NUMMER column, so
    ## pandas suffixes them and NUMMER_x is the one from the marketing artikels
    artikels = pd.merge(artikels_in_wr, mark_bez_df, how='left', left_on='NUMBER', right_on='NUMMER')
    ## connecting marketing artikels for each warengruppe with the shop list
    artikels = pd.merge(artikels, shop, how='right', left_on="NUMBER", right_on="StoreId")

    artikels = artikels.rename(columns={'NUMMER_x': 'NUMMER'})
    ## The right join keeps every shop row; rows without a marketing artikel
    ## in this group have no NUMMER and are removed here.
    ## NOTE(review): duplicates are dropped on NUMMER, so one row per NUMMER
    ## is kept regardless of GROESSE/FARBE - confirm that this is intended.
    artikels = artikels.drop_duplicates('NUMMER').dropna(subset=['NUMMER'])
    artikels = artikels[['WM', 'NUMMER', 'GROESSE', 'FARBE', 'WARENGR', 'BANAME1', 'BANAME2', 'BANAME3']]
    wg_data.append([num, vals, len(artikels)])

    if artikels.empty:
        continue

    ## str() keeps a non-string (e.g. missing) group name from raising here;
    ## '/' would otherwise be read as a path separator.
    new_val = str(vals).replace('/', '_')
    paths = os.path.join(val.parent_dir, f'{num}_{new_val}')
    print(paths)
    # Building folders based on the warengruppe numbers; exist_ok makes
    # re-running the cell safe and missing parent folders are created too.
    os.makedirs(paths, exist_ok=True)
    # Only the Excel export is active; the earlier CSV/JSON exports, the
    # prompt placeholder file and the clean-up of empty group folders were
    # disabled in the original cell and are omitted here.
    artikels.to_excel(os.path.join(paths, f'{num}_{new_val}_shop.xlsx'), engine='xlsxwriter', index=False)

wg_data

### Saving WG_artikel_3 which is list of WGs

In [None]:
# wg_df = pd.DataFrame(wg_data,columns=['WG_ID','WG_NAME','ITEMS_IN_WG'])
# # wg_df.to_csv('/Users/maralsheikhzadeh/Documents/Codes/useful-exports/WG_artikels_3.csv',sep=';',index=False,encoding='utf-8')
# wg_df.to_excel('/Users/maralsheikhzadeh/Documents/Codes/useful-exports/WG_artikels_3.xlsx',index=False, engine='openpyxl')