In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import regex as re
from values import *

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Configuration object: later cells read `shop_file_path`, `marketing_artikel`,
# `dates` and `kuchengerate_wr_list` from it.
# NOTE(review): `Values` is not defined in this notebook; presumably it comes from
# the wildcard import of the local `values` module -- confirm.
val = Values()

In [None]:
# Shop article export (Excel) and the marketing article list (semicolon-separated,
# Latin-1 encoded CSV). Malformed CSV rows are skipped; the columns named in
# `val.dates` are parsed as day-first dates.
artikel_df = pd.read_excel(val.shop_file_path)
marketing_artikel = pd.read_csv(
    val.marketing_artikel,
    delimiter=';',
    encoding='latin-1',
    on_bad_lines='skip',
    parse_dates=val.dates,
    dayfirst=True,
)

In [None]:
### Cleaning and preparing the product-group (WARENGR) data
# Built as one method chain so that a brand-new frame is produced. The previous
# version assigned into / dropped in place on a column slice of `marketing_artikel`,
# which is a chained-assignment hazard (hidden by the global warnings filter).
warengrps = (
    marketing_artikel[['NUMMER', 'WARENGR']]
    # Non-numeric group codes become NaN ...
    .assign(WARENGR=lambda frame: pd.to_numeric(frame['WARENGR'], errors='coerce'))
    # ... and those rows are discarded before the integer cast.
    .dropna(subset=['WARENGR'])
    .astype({'WARENGR': int})
    # Keep the first row seen for every article number.
    .drop_duplicates(subset='NUMMER')
)

In [None]:
### Selecting the columns we need from the shop articles
# `.copy()` gives an independent frame, so adding the `Number` column below does not
# write into a slice of `artikel_df`.
artikels = artikel_df[['StoreId', 'Name', 'Beschreibung']].copy()

### Article number = first whitespace-separated token of `StoreId`; it is the join
### key against `NUMMER` in the product-group data.
artikels['Number'] = artikels['StoreId'].str.split().str[0]

In [None]:
### Left-join the shop articles with their product groups (Number <-> NUMMER);
### articles without a match keep NaN in NUMMER / WARENGR.
artikels_mit_wrgp = artikels.merge(
    warengrps,
    how='left',
    left_on='Number',
    right_on='NUMMER',
)

In [None]:
### Drop the articles for which the left join found no product group (NUMMER is NaN).
# `.copy()` makes the filtered result an independent frame so the column assignment
# below is not a write into a slice.
artikels_mit_wrgp = artikels_mit_wrgp[artikels_mit_wrgp['NUMMER'].notna()].copy()
### The join turned WARENGR into float (because of the NaNs); convert it back to int.
artikels_mit_wrgp['WARENGR'] = artikels_mit_wrgp['WARENGR'].astype(int)

In [None]:
### Keep only the product groups listed in `val.kuchengerate_wr_list`, restrict the
### frame to the columns used downstream and upper-case the column names.
in_selected_groups = artikels_mit_wrgp['WARENGR'].isin(val.kuchengerate_wr_list)
kuchen_gerate = (
    artikels_mit_wrgp
    .loc[in_selected_groups, ['WARENGR', 'NUMMER', 'Name', 'Beschreibung']]
    .rename(columns={'Name': 'NAME', 'Beschreibung': 'BESCHREIBUNG'})
)

In [None]:
### Removing rows without description
# A single `dropna` is enough; the previous loop repeated the same call once per row.
kuchen_gerate = kuchen_gerate.dropna(subset=['BESCHREIBUNG'])

# Keep the untouched description next to the cleaned one; later cells read
# ORIGINAL_BESCHREIBUNG (e.g. the '<li>' split for set articles).
kuchen_gerate['ORIGINAL_BESCHREIBUNG'] = kuchen_gerate['BESCHREIBUNG'].copy()

### Cleaning unwanted characters and rows

In [None]:
def replace_dot_with_comma(match):
    """Regex substitution callback: return the matched text with '.' turned into ','."""
    matched_text = match.group(0)
    return matched_text.replace('.', ',')


def replace_dot_between_numbers(input_string):
    """Return `input_string` with every dot that sits directly between two digits
    replaced by a comma (e.g. '2.5' -> '2,5'); all other dots are left alone."""
    # The look-behind / look-ahead assert the neighbouring digits without consuming
    # them, so only the dot itself is handed to the callback.
    return re.sub(r'(?<=\d)\.(?=\d)', replace_dot_with_comma, input_string)

In [None]:
### Removing unwanted characters
kuchen_gerate['BESCHREIBUNG'] = kuchen_gerate['BESCHREIBUNG'].str.replace(' ',' ',regex=True)
# Replace the HTML tags used in the shop descriptions with a space.
# The attribute-carrying tags are matched with `[^>]*` (stop at the tag's own closing
# bracket). The previous greedy `<span.*>` / `<font.*>` removed everything up to the
# LAST '>' on the line, i.e. also the text between the tags. Because the closing tags
# are no longer swallowed by accident, </span> and </font> are listed explicitly.
kuchen_gerate['BESCHREIBUNG'] = kuchen_gerate['BESCHREIBUNG'].str.replace(r'<p>|</p>|<ul>|</ul>|<li>|</li>|<br>|<b>|</b>|<span[^>]*>|</span>|<font[^>]*>|</font>|<strong>|</strong>',' ',regex=True,case=False)
# HTML entities: non-breaking space (also the '&nbsp_' variant) and the '&Oslash' prefix.
kuchen_gerate['BESCHREIBUNG'] = kuchen_gerate['BESCHREIBUNG'].str.replace(r'&nbsp_|&nbsp;',' ',regex=True)
kuchen_gerate['BESCHREIBUNG'] =kuchen_gerate['BESCHREIBUNG'].str.replace('&Oslash',' ')
# Decimal dots between digits become commas (2.5 -> 2,5); the size patterns used
# later in the notebook are written with `,` as the decimal separator.
kuchen_gerate['BESCHREIBUNG'] =kuchen_gerate['BESCHREIBUNG'].map(replace_dot_between_numbers)
# kuchen_gerate['BESCHREIBUNG'] =kuchen_gerate['BESCHREIBUNG'].str.replace('<br>',' ')

### With another dataset that has Werbemittel
# kuchen_gerate = kuchen_gerate[kuchen_gerate['WM'].isna() == True]

### To remove the Artikels for Swiss or P artikels
# kuchen_gerate = kuchen_gerate[kuchen_gerate['NUMMER'].str.match(r'^\w+S$') == False]
# kuchen_gerate = kuchen_gerate[kuchen_gerate['NUMMER'].str.match(r'^\w+P$') == False]
kuchen_gerate = kuchen_gerate.drop_duplicates()
kuchen_gerate.info()

# Run from Here

In [None]:
## Working copy for the extraction steps. BESCHREIBUNG becomes "NAME\nBESCHREIBUNG"
## (whitespace-trimmed); REMAINED_TEXT starts as the same combined text, untrimmed,
## and has extracted patterns removed from it by later cells.
mined_text = kuchen_gerate[['NUMMER','NAME','BESCHREIBUNG','ORIGINAL_BESCHREIBUNG']].copy()
# mined_text['ORIGINAL_BESCHREIBUNG'] = mined_text['BESCHREIBUNG'].copy()
name_and_description = mined_text['NAME'] + '\n' + mined_text['BESCHREIBUNG']
mined_text['REMAINED_TEXT'] = name_and_description.copy()
mined_text['BESCHREIBUNG'] = name_and_description.str.strip()

In [None]:
# ## Extracting sizes from text
# mined_text.loc[mined_text['BESCHREIBUNG'].str.contains(r' klein *',case=False),'GROESSE'] = 'S'
# mined_text.loc[mined_text['BESCHREIBUNG'].str.contains(r' *groß *|großer* *',case=False),'GROESSE'] = 'L'
# mined_text.loc[mined_text['BESCHREIBUNG'].str.contains(r' mittel *',case=False),'GROESSE'] = 'M'
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r' *klein *| groß *| mittel *|großer* *','',regex=True,case=False)

In [None]:
# ## Extracting colors
# mined_text['FARBE'] = mined_text['BESCHREIBUNG'].str.extract(r'(kirschrot|ofenrot|rot|orange|gelb| blau|schwarz|weiß|weiss|grün|silber|creme|grau|blu |dijongelb| \
#                                                          |Elfenbein|gold|aubergine|Dunkelgrün|dunkel-* *braun|braun)(?!tiefschwarz)',flags=re.IGNORECASE)
# mined_text['FARBE'] = mined_text['FARBE'].str.capitalize()
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'kirschrot|ofenrot|rot|orange|gelb| blau|schwarz|weiß|weiss|grün|silber|creme|grau|blu |dijongelb|Elfenbein|gold| \
#                                                                 |aubergine|aubergine|Dunkelgrün|dunkel- braun|braun|(?!tiefschwarz)','',regex=True, case=False)

In [None]:
# ## Extracting number of items
# mined_text['STUECK'] = mined_text['BESCHREIBUNG'].str.extract(r'( \d+ Stück)',flags=re.IGNORECASE)
# # mined_text['STUECK'] = mined_text['STUECK'].str.extract(r'( \d+)',flags=re.IGNORECASE)
# # mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r' \d+ Stück','',regex=True, case=False)

In [None]:
## Number of pieces for set articles: take the first "<n>-tlg." / "<n>-teilig" /
## "<n>er set" / "<n> ply" style mention and normalise the "tlg"/"teilig" spellings
## to " tlg.". Rows without such a mention stay NaN.
teilig_raw = mined_text['BESCHREIBUNG'].str.extract(
    r'( *\d+-tlg.|\d+-teilig|\d+tlg.*|\d+-*er set|\d+ set|\d+ ?ply|\d+er-Set)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r' *\d+-tlg.|\d+-* *teilig[er]*|\d+tlg.*|\d+-*er set|\d+ set|\d+ ?ply','',regex=True, case=False)

mined_text['TEILIG'] = (
    teilig_raw
    .str.replace(r'-tlg.|-tlg |tlg.|-teilig|tlg *',' tlg.',regex=True,case=False)
    .str.lstrip()
)

In [None]:
# Ad-hoc inspection: rows mentioning "deckel <digit>", then one specific article.
# Only the last expression of a cell is rendered, so the first line has no visible output.
mentions_deckel_number = mined_text['BESCHREIBUNG'].str.contains(r'deckel \d',regex=True,case=False)
mined_text.loc[mentions_deckel_number]
mined_text.loc[mined_text['NUMMER'] == '035B06']

In [None]:
## Weight mentions (kg / g / gram, including ranges such as "2-3 kg"); every match
## of a description is kept, so each cell holds a list.
weight_pattern = r'([\d,]*\d+ *kg|\d+ *g |\d+ *gram|\d+-\d+ kg|\d+ *g\.)'
mined_text['GEWICHT'] = mined_text['BESCHREIBUNG'].str.findall(weight_pattern, flags=re.IGNORECASE)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'[\d,]*\d+ *kg|\d+ *g |\d+ *gram|\d+-\d+ kg|\d+ *g\.','',regex=True,case=False)

In [None]:
## Height mentions ("<n> cm hoch", "hoch <n> cm", "hoch (<n> cm)", ...): collect all
## of them per description and remove the same phrases from REMAINED_TEXT.
height_alternatives = r'\d+,*\d* *cm hoch|hoch \(\d+ cm\)|\d+,\d+ *cm hoch|hoch \d+ *cm|hoch [Ø|ø] \d+,*\d* *cm'
mined_text['HOHE'] = mined_text['BESCHREIBUNG'].str.findall(
    '(' + height_alternatives + ')',
    flags=re.IGNORECASE,
)
mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(
    height_alternatives,
    '',
    case=False,
    regex=True,
)

# ## cleaning up the height formats
# mined_text['HOHE'] = mined_text['HOHE'].str.replace('cm',' cm')
# mined_text['HOHE'] = mined_text['HOHE'].str.replace('  cm',' cm')
# mined_text['HOHE'] = mined_text['HOHE'].str.replace('hoch','',case=False)


In [None]:
### Base ("Boden") sizes: collect every mention and remove it from REMAINED_TEXT.
# The pattern is assembled from adjacent raw-string pieces. The previous version
# continued the raw string with a trailing backslash; inside a raw string the
# backslash, the newline and the indentation of the next line all stay in the
# pattern, so the "Bodendurchmesser" alternative required a literal newline plus a
# run of spaces in front of it and could never match.
boden_pattern = (
    r'Boden [Ø |_]*\d+[,\d]* cm'
    r'|Boden [Ø |_]*\d+[,\d]* mm'
    r'|Boden [bis Ø]* \d+ cm'
    r'|Bodendurchmesser [Ø |_]*\d+[,\d]* cm'
    r'|Boden-Kontaktfläche [Ø |_]*\d+[,\d]* cm'
)
mined_text['BODEN'] = mined_text['BESCHREIBUNG'].str.findall(boden_pattern, flags=re.IGNORECASE)
mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(boden_pattern, '', regex=True, case=False)

## Cleaning up the boden formats
# mined_text['BODEN'] = mined_text['BODEN'].str.replace(' Ø','')
# mined_text['BODEN'] = mined_text['BODEN'].str.replace(' ø','')
# mined_text['BODEN'] = mined_text['BODEN'].str.replace('Boden','',regex=True,case=False)
# mined_text['BODEN'] = mined_text['BODEN'].str.replace('Kontaktfläche','',regex=True,case=False)
# mined_text['BODEN'] = mined_text['BODEN'].str.replace('durchmesser','',regex=True,case=False)
# mined_text['BODEN'] = mined_text['BODEN'].str.lstrip()
# mined_text['BODEN'] = mined_text['BODEN'].str.rstrip()



In [None]:
### STARKE (material thickness): first "<n> mm starke" / "stärke von <n> mm" mention.
mined_text['STARKE'] = mined_text['BESCHREIBUNG'].str.extract(r'(\d+,*\d*-*\d* mm starke|stärke von \d+,*\d* mm)',flags=re.IGNORECASE)
mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'\d+,*\d*-*\d* mm starke','',case=False,regex=True)
# Remove the trailing word " starke". `rstrip(' starke')` was used before, but rstrip
# takes a SET of characters (not a suffix) and is case-sensitive, so a match such as
# "3 mm Starke" was only partly trimmed. An anchored, case-insensitive replace removes
# exactly the suffix.
mined_text['STARKE'] = mined_text['STARKE'].str.replace(r' starke$', '', regex=True, case=False)
mined_text['STARKE'].unique()

In [None]:
## Dimension mentions (diameters, "a x b (x c) cm", "(B/T/H)" triples): collect all of
## them from REMAINED_TEXT, then remove the dimension phrases from REMAINED_TEXT.
# Both patterns are assembled from adjacent raw-string pieces, one alternative per
# line and in the original order. The previous version continued the raw strings with
# a trailing backslash; inside a raw string the backslash, the newline and the next
# line's indentation stay in the pattern, so the first alternative on every
# continuation line (the "a/b/c cm" form and the loose "a x b x c cm" form) could
# never match.
massen_pattern = (
    r'[Ø|ø]* \d+-\d+ cm'
    r'|[Ø|ø]* \d+,*\d* cm'
    r'|\d+,*\d* x \d+,*\d* x \d+,*\d* cm \(B\/T\/H\)'
    r'|Ø* *\d+,*\d*\/\d+,*\d*\/\d+,*\d* cm'
    r'|\(Ø \d+,*\d*, \d+,*\d*, \d+,*\d* cm\)'
    r'|\d+,*\d* x \d+,*\d* cm'
    r'|\d+,*\d* *x *\d+,*\d* *x *\d+,*\d* *cm *\(B\/T\/H\)'
    r'|\d+,*\d* cm x \d+,*\d* cm x \d+,*\d* cm \(B\/T\/H\)'
    r'|\d+,*\d* *x* *\d+,*\d* *x* *\d*,*\d* *cm'
    r'|\(B\/T\/H\): *\d+,*\d* *x *\d+,*\d* *x *\d+,*\d* *cm'
    r'|[Ø|ø]* \d+,*\d* *cm'
)
massen_removal_pattern = (
    r'\d+,*\d* *x *\d+,*\d* *x *\d+,*\d* *cm *\(B\/T\/H\)'
    r'|\d+,*\d* cm x \d+,*\d* cm x \d+,*\d* cm \(B\/T\/H\)'
    r'|\d+,*\d* *x* *\d+,*\d* *x* *\d*,*\d* *cm'
    r'|\(B\/T\/H\): *\d+,*\d* *x *\d+,*\d* *x *\d+,*\d* *cm'
    r'|\d+,*\d* *cm'
)
mined_text['MASSEN'] = mined_text['REMAINED_TEXT'].str.findall(massen_pattern, flags=re.IGNORECASE)
mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(massen_removal_pattern, '', regex=True, case=False)
# mined_text['MASSEN'] = mined_text['MASSEN'].str.replace(r'^, ','',regex=True)
# mined_text['MASSEN'] = mined_text['MASSEN'].str.replace(r'^ *','',regex=True)
# mined_text['MASSEN'] = mined_text['MASSEN'].str.replace('X','x',regex=True)
# mined_text['MASSEN'] = mined_text['MASSEN'].str.replace('x',' x ',regex=True)
# mined_text['MASSEN'] = mined_text['MASSEN'].str.replace('  x  ',' x ',regex=True)
# mined_text['MASSEN'] = mined_text['MASSEN'].str.replace('cm',' cm',regex=True)
# mined_text['MASSEN'] = mined_text['MASSEN'].str.replace('  cm',' cm',regex=True)
# mined_text['MASSEN'] = mined_text['MASSEN'].str.replace('mm',' mm',regex=True)

# ## Separating the measurement unit from the values
# mined_text['MASSEN_EINHEIT'] = mined_text['MASSEN'].str.extract(r'(cm|mm)')
# mined_text['MASSEN'] = mined_text['MASSEN'].str.replace(r'[cm|mm]','',regex=True,case=False)

In [None]:
# ## Separating dimensions from one another when there are more than 1
# text = mined_text['MASSEN'].str.split(r'x|, ')

# df = pd.DataFrame(text)

# df['MASSEN_1'] = ''
# df['MASSEN_2'] = ''
# df['MASSEN_3'] = ''

# for index, row in df.iterrows():
#     numbers = row['MASSEN']
#     if isinstance(numbers, list):
#         if len(numbers) == 1:
#             df.at[index, 'MASSEN_1'] = numbers[0]
#         elif len(numbers) == 2:
#             df.at[index, 'MASSEN_1'] = numbers[0]
#             df.at[index, 'MASSEN_2'] = numbers[1]
#         elif len(numbers) == 3:
#             df.at[index, 'MASSEN_1'] = numbers[0]
#             df.at[index, 'MASSEN_2'] = numbers[1]
#             df.at[index, 'MASSEN_3'] = numbers[2]

# df.drop(columns=['MASSEN'], inplace=True)

# mined_text['MASSEN_1'] = df['MASSEN_1']
# mined_text['MASSEN_2'] = df['MASSEN_2']
# mined_text['MASSEN_3'] = df['MASSEN_3']

In [None]:
## Volume mentions (ml / l / Liter, including ranges): collect all of them per
## description and remove the same phrases from REMAINED_TEXT.
volume_pattern = r' *\d+ ml| *\d+l\W| *\d*,*\d+ *l+[iter]*\W| *\d+ l\W|\d+-\d+ ml|\d+-\d+ l'
mined_text['VOLUME'] = mined_text['BESCHREIBUNG'].str.findall(volume_pattern, flags=re.IGNORECASE)
mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(
    volume_pattern,
    '',
    case=False,
    regex=True,
)

# mined_text['VOLUME'] = mined_text['VOLUME'].str.replace(r'L',' L',regex=True)
# mined_text['VOLUME'] = mined_text['VOLUME'].str.replace(r'Liter',' L',regex=True,case=False)
# mined_text['VOLUME'] = mined_text['VOLUME'].str.replace(r'l',' L',regex=True)

# mined_text['VOLUME'] = mined_text['VOLUME'].str.replace(r'  L',' L',regex=True)
# mined_text['VOLUME'] = mined_text['VOLUME'].str.replace(r'm L','mL',regex=True)

# ## Separating the measuring unit from the values
# mined_text['VOLUME_EINHEIT'] = mined_text['VOLUME'].str.extract(r'(L|mL)')
# mined_text['VOLUME'] = mined_text['VOLUME'].str.replace(r'[L|mL]','',regex=True)


In [None]:
## Lid information: the first lid phrase decides the value -- "Ja" for the "with lid"
## variants, "Nein" for "ohne deckel", NaN when no phrase is found.
deckel_phrase = mined_text['BESCHREIBUNG'].str.extract(
    r'(mit deckel|ohne deckel|mit glasdeckel|m.Deckel|\+Deckel|\+ Deckel|\(Inkl. Deckel\))',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'mit deckel|ohne deckel|mit glasdeckel|m.Deckel|\+Deckel|\+ Deckel|\(Inkl. Deckel\)','',regex=True, case=False)
# mined_text['DECKEL'] = mined_text['BESCHREIBUNG'].str.replace(r'mit deckel|ohne deckel|mit glasdeckel|m.Deckel|\+Deckel|\+ Deckel|\(Inkl. Deckel\)','Ja',regex=True,case=False)
mined_text['DECKEL'] = (
    deckel_phrase
    .str.replace(r'mit deckel|mit glasdeckel|m.Deckel|\+Deckel|\+ Deckel|\(Inkl\. Deckel\)','Ja',regex=True,case=False)
    .str.replace(r'ohne deckel','Nein',regex=True,case=False)
)

In [None]:
def capitalize_list(lst):
    """Join the distinct, capitalised entries of `lst` into one ' ,'-separated string.

    Each entry is left-stripped and capitalised; duplicates are dropped while the
    order of first appearance is kept, so the result is the same on every run (the
    previous `set` round trip produced a run-dependent order).

    Parameters
    ----------
    lst : iterable of str, or None / NaN
        Typically the list produced by `Series.str.findall` for one row.

    Returns
    -------
    str
        The joined entries; '' for an empty list and for a missing value
        (None or a float NaN, which `str.findall` yields for missing text).
    """
    if lst is None or isinstance(lst, float):
        return ''
    # dict.fromkeys de-duplicates and preserves insertion order.
    unique_items = dict.fromkeys(item.lstrip().capitalize() for item in lst)
    # The ' ,' separator is kept as is: the GRIFF clean-up regexes match on it.
    return ' ,'.join(unique_items)


In [None]:
## Shape words (rund / oval / rechteckig / eckig / quadratisch): collect all mentions
## per description and collapse them into one de-duplicated string.
shape_mentions = mined_text['BESCHREIBUNG'].str.findall(
    r'\Wrund|\Woval|rechteckig|eckig|quadratisch',
    flags=re.IGNORECASE,
)
# mined_text['FORM'] = mined_text['FORM'].str.lstrip()
# mined_text['FORM'] = mined_text['FORM'].str.capitalize()
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r' rund| oval|rechteckig|eckig|quadratisch','',regex=True, case=False)
mined_text['FORM'] = shape_mentions.apply(capitalize_list)



In [None]:
## Handle ("Griff") mentions, with an optional material prefix; all matches are kept.
# The pattern is assembled from adjacent raw-string pieces. The previous version
# continued the raw string with a trailing backslash, which left the backslash, the
# newline and the next line's indentation inside the pattern as an extra, meaningless
# alternative.
griff_pattern = (
    r'Gußeisen-* *griffe*n*|Edelstahl-* *griffe|Edelst.-Griffe*n*|Edelstahlgriffe*n*|Gusseisen-* *griffe*n*|EDEL- STAHLgriffe'
    r'|Bronze-? ?griffe?n?|Bronzegriffe*n*|Bronze Griffe*n*|mit griffe*n*|griffe*n*|mit \w+ *griffe*|Hohlgriff'
)
mined_text['GRIFF'] = mined_text['BESCHREIBUNG'].str.findall(griff_pattern, flags=re.IGNORECASE)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Gußeisen-* *griffe*n*|Edelstahl-* *griffe*n*|Edelst.-Griffe*n*|Edelstahlgriffe*n*|Gusseisen-* *griffe*n*|EDEL- STAHLgriffe| \
#                                                          |Bronze-? ?griffe?n?|Bronzegriffe*n*|Bronze Griffe*n*|mit griffe*n*|griffe*n*|mit \w+ *griffe*n*','',regex=True, case=False)
# mined_text['GRIFF'] = mined_text['GRIFF'].str.replace(r'Gußeisen-* *griffe*n*|Edelstahl-* *griffe|Edelst.-Griffe*n*|Edelstahlgriffe*n*|Gusseisen-* *griffe*n*|EDEL- STAHLgriffe| \
#                                                          |Bronze-? ?griffe?n?|Bronzegriffe*n*|Bronze Griffe*n*|mit griffe*n*|griffe*n*|mit \w+ *griffe*','Ja',regex=True,case=False)
# mined_text['GRIFF'] = mined_text['GRIFF'].str.capitalize()
# Collapse each row's list of handle mentions into one string, then reduce the generic
# "Griff(e/en)" / "Mit Griff..." variants to "Griff" and drop them when a more specific
# (material) mention is present. The three replacements are order-dependent.
griff_joined = mined_text['GRIFF'].apply(capitalize_list)

mined_text['GRIFF'] = (
    griff_joined
    .str.replace(r'^Griffen ,Griffe|^Griffe*n*|^Mit griffe*n*','Griff',regex=True,case=False)
    .str.replace(r'^Griff ,|Mit | ,Griffe*n*$| ,Edelstahlgriffe$','',regex=True,case=False)
    .str.replace(r'^Griffe','Griff',regex=True,case=False)
    .str.capitalize()
)


In [None]:
### Core material: every "<something>kern" word, removed from REMAINED_TEXT and joined
### with " / " per description.
mined_text['KERN'] = mined_text['BESCHREIBUNG'].str.findall(r'\S+kern',flags=re.IGNORECASE)
mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'\S+kern','',case=False,regex=True)
# De-duplicate with dict.fromkeys (keeps first-appearance order). Joining a `set`
# gave an order that could change from one kernel session to the next.
mined_text['KERN'] = mined_text['KERN'].apply(lambda x: " / ".join(dict.fromkeys(x)))

In [None]:
## Material mentions (many spellings; they may refer to different parts of the item).
## All matches per description are kept as a list.
# The pattern is assembled from adjacent raw-string pieces. The previous version
# continued the raw string with trailing backslashes, which left the backslash, the
# newline and the next line's indentation inside the pattern as extra, meaningless
# alternatives.
materials_pattern = (
    r'steinzeug|granit\W|edelstahl\W|ohne keramik|Guss-*Aluminium\W|Gussaluminium\W|Guß-* *eisen|titan|kupfer|hartglas'
    r'|Anodisierte Aluminium\W|Glaskeramik edelst.|keramik|Gußeisen\W|Gusseisen\W| Guss | eisen |holz|porzellan|Guß- alu\W|Edelst\.'
    r'|Kunststoff|Metall\w*|\w*stein|Edelstahlböden'
)
mined_text['MATERIALS'] = mined_text['BESCHREIBUNG'].str.findall(materials_pattern, flags=re.IGNORECASE)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'steinzeug|granit|edelstahl|ohne keramik|[Guss-]*Aluminium|Gussaluminium|Guß-* *eisen|titan|kupfer|hartglas|Anodisierte Aluminium| \
#                                                  edelst.|keramik|Gußeisen|Gusseisen|Guss|eisen|holz|porzellan|Guß- alu|Edelst.|Kunststoff|Metall\w*|\w*stein|Glaskeramik','',regex=True,case=False)

# mined_text['MATERIALS'] = mined_text['MATERIALS'].apply(capitalize_list)


In [None]:
def lower_case(x):
    """Return a new list holding the lower-cased form of every string in `x`."""
    lowered = []
    for entry in x:
        lowered.append(entry.lower())
    return lowered
def remove_char(x):
    """Return a new list with separator characters stripped from every string in `x`.

    Removed from each entry: commas, hyphens, newlines, a trailing space, a trailing
    '.' and a trailing '/'.

    The previous pattern ended in `\\\\n` inside a raw string, which matches a literal
    backslash followed by 'n' and never an actual newline; newline characters captured
    by the `\\W` parts of the material pattern were therefore left in place. Both forms
    are removed now.
    """
    return [re.sub(r',|-| $|\.$|\/$|\\n|\n', '', i) for i in x]


In [None]:
# Normalise each row's material list: lower-case, strip separator characters, then
# de-duplicate. dict.fromkeys keeps the order of first appearance; the previous
# set -> list round trip produced an order that could differ between kernel sessions
# (and therefore between exports).
mined_text['MATERIALS'] = mined_text['MATERIALS'].map(
    lambda raw_materials: list(dict.fromkeys(remove_char(lower_case(raw_materials))))
)

In [None]:
## Temperature information: the first "von/bis <n> °C" phrase or "<a>-<b> °C" range.
temperature_pattern = r'(von \d+ °C bis \d+ °C|von \d+ °C bis \d+ °C|bis \d+ °C|von \d+ °C|\d+-\d+ °C|bis \+\d+ °C|[Betriebstemperatur]* \d+-\d+ °C)'
mined_text['TEMPERATUR'] = mined_text['BESCHREIBUNG'].str.extract(
    temperature_pattern,
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'von \d+ °C bis \d+ °C|von \d+ °C bis \d+ °C|bis \d+ °C|von \d+ °C|\d+-\d+ °C','',regex=True,case=False)

In [None]:
### Scratch resistance: the first matching phrase, capitalised; NaN when absent.
scratch_phrase = mined_text['BESCHREIBUNG'].str.extract(
    r'(kratz-* und schnittresistent|Kratz Und Schnittfesten|Kratzfest|kratzresistent)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'kratz-* und schnittresistent|Kratz Und Schnittfesten|Kratzfest Und Robust|Kratzfest|kratzresistent','',case=False,regex=True)
# mined_text['KRATZ'] = mined_text['KRATZ'].str.join(' ')
mined_text['KRATZ'] = scratch_phrase.str.capitalize()


In [None]:
### Non-stick coating flag: "Ja" when any of the coating words occurs, NaN otherwise.
coating_word = mined_text['BESCHREIBUNG'].str.extract(
    r'(Antihaftbeschichtung|Anti-* *haftbeschichtung|Antihaftversiegelung|antihaftbeschichteter*)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Antihaftbeschichtung|Anti-* *haftbeschichtung|Antihaftversiegelung|antihaftbeschichteter*','',case=False,regex=True)
mined_text['BESCHICTUNG'] = coating_word
has_coating = mined_text['BESCHICTUNG'].notna()
mined_text.loc[has_coating, 'BESCHICTUNG'] = 'Ja'


In [None]:
### Oven- / grill-proof information, with the optional "bis <n> °C" limit.
# The optional limit is a real group now. The previous `[ bis \d+ °C]*` was a
# character CLASS (any run of spaces, digits and the letters b/i/s/c), so it also
# swallowed the start of the following word, and the `rstrip(r'I$| | S$| Sc$')` that
# tried to repair this treated its argument as a set of characters, not as a regex.
mined_text['OFENFEST'] = mined_text['BESCHREIBUNG'].str.extract(
    r'((?:back)?ofenfest(?: bis \+?\d+ *°C)?|grillfest(?: bis \+?\d+ *°C)?)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Ofenfest[ bis \d+ °C]*|backofenfest[ bis \d+ °C]*|grillfest[ bis \d+ °C]*','',case=False,regex=True)
mined_text['OFENFEST'] = mined_text['OFENFEST'].str.strip()
mined_text['OFENFEST'] = mined_text['OFENFEST'].str.capitalize()
mined_text['OFENFEST'].unique()

In [None]:
### Dishwasher-safe flag: "Ja" when "spülmaschinenfest" occurs (any casing), NaN otherwise.
# The previous version ran `.str.replace` on the LISTS returned by `findall`; string
# methods return NaN for non-string elements, so the column was NaN for every row.
is_dishwasher_safe = mined_text['BESCHREIBUNG'].str.contains(r'spülmaschinenfest', case=False, regex=True, na=False)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'spülmaschinenfest','',case=False,regex=True)
# Mapping with a dict that only knows True leaves NaN for the False rows.
mined_text['SPUELMACHINEFEST'] = is_dishwasher_safe.map({True: 'Ja'})

In [None]:
### Conductivity flag: "Ja" when "Superleitfähig" occurs (any casing), NaN otherwise.
# The previous version ran `.str.replace` on the LISTS returned by `findall`; string
# methods return NaN for non-string elements, so the column was NaN for every row.
is_super_conductive = mined_text['BESCHREIBUNG'].str.contains(r'Superleitfähig', case=False, regex=True, na=False)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Superleitfähig','',case=False,regex=True)
# Mapping with a dict that only knows True leaves NaN for the False rows.
mined_text['SUPERLEITFAEHIG'] = is_super_conductive.map({True: 'Ja'})

In [None]:
### "Hagen Grote Exklusiv" flag: "Ja" when the phrase occurs (any casing), NaN otherwise.
# The previous version ran `.str.replace` on the LISTS returned by `findall`; string
# methods return NaN for non-string elements, so the column was NaN for every row.
is_hg_exclusive = mined_text['BESCHREIBUNG'].str.contains(r'Hagen Grote Exklusiv', case=False, regex=True, na=False)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Hagen Grote Exklusiv','',case=False,regex=True)
# Mapping with a dict that only knows True leaves NaN for the False rows.
mined_text['HGEXKLUSIV'] = is_hg_exclusive.map({True: 'Ja'})

In [None]:
### Induction suitability: the first matching phrase, normalised to either
### "Inkl. Induktion" or "Alle Herdarten"; NaN when nothing matches.
induction_phrase = mined_text['BESCHREIBUNG'].str.extract(
    r'(alle Herdarten|inklusive Induktion|Einschließlich Induktion|Induktionsfähig|Induktion geeignet|inkl. Induktion)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'alle Herdarten|inklusive Induktion|Einschließlich Induktion|Induktionsfähig|Induktion geeignet|inkl. Induktion','',case=False,regex=True)
mined_text['INDUKTION'] = (
    induction_phrase
    .str.replace(r'inklusive Induktion|Einschließlich Induktion|Induktionsfähig|Induktion geeignet|inkl. Induktion','Inkl. Induktion',regex=True,case=False)
    .str.replace(r'alle Herdarten','Alle Herdarten',regex=True,case=False)
)
mined_text['INDUKTION'].unique()

In [None]:
### Knob ("Knauf"): the first word ending in "knauf", including any prefix attached
### to it; NaN when absent.
knauf_pattern = r'(\S*knauf)'
mined_text['KNAUF'] = mined_text['BESCHREIBUNG'].str.extract(
    knauf_pattern,
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'knauf|dicht knauf','',case=False,regex=True)


In [None]:
### Enamel information: the first word containing "Emailliert" / "Emaille", followed by
### an ordered series of spelling normalisations (each step sees the previous result).
emailliert_word = mined_text['BESCHREIBUNG'].str.extract(
    r'(\S*Emailliert\S*|\S*Emaille\S*)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Emailliert\w+|Glas-Emaille','',case=False,regex=True)
emailliert_rules = (
    (r'emailliertes|emailliertem|emailliert,*|Emaille', 'Emailliert'),
    (r'emaille-versiegelt,', 'Emaille-Versiegelt'),
    (r'GlasEmailliert', 'Glas-Emailliert'),
    (r'HartEmailliert', 'Hart-Emailliert'),
    (r'Hart-Emailliert-Oberfläche', 'Hartemailliert-Oberfläche'),
    (r',', ''),
)
for emailliert_pattern, emailliert_replacement in emailliert_rules:
    emailliert_word = emailliert_word.str.replace(
        emailliert_pattern,
        emailliert_replacement,
        regex=True,
        case=False,
    )
mined_text['EMAILLIERT'] = emailliert_word


In [None]:
### Brand: the first of the known brand names found in the text; NaN when absent.
mined_text['MARKE'] = mined_text['BESCHREIBUNG'].str.extract(r'(Le *Creuset|smeg|kitchenaid)',flags=re.IGNORECASE)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Le *Creuset|smeg|kitchenaid','',case=False,regex=True)
# Normalise every casing / spacing variant the extraction can return ("LE CREUSET",
# "LeCreuset", "le creuset", ...). The previous literal replace only handled the exact
# spelling 'Le creuset'.
mined_text['MARKE'] = mined_text['MARKE'].str.replace(r'Le *Creuset', 'Le Creuset', regex=True, case=False)


In [None]:
### Teflon flag: "Ja" when the word occurs (any casing), NaN otherwise.
teflon_word = mined_text['BESCHREIBUNG'].str.extract(
    r'(teflon)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Teflon','',case=False,regex=True)
mined_text['TEFLON'] = teflon_word.str.replace('teflon','Ja',case=False)

In [None]:
### Hand-forged flag: "Ja" when "Handgeschmiedeten" occurs (any casing), NaN otherwise.
hand_forged_word = mined_text['BESCHREIBUNG'].str.extract(
    r'(Handgeschmiedeten)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Handgeschmiedeten','',case=False,regex=True)
mined_text['HANDGESCHMIEDETEN'] = hand_forged_word.str.replace('handgeschmiedeten','Ja',case=False)
mined_text['HANDGESCHMIEDETEN'].unique()

In [None]:
### Heat-conductivity flag: "Ja" when "Wärmeleitfähigkeit" is mentioned (any casing),
### NaN otherwise.
heat_conductivity_word = mined_text['BESCHREIBUNG'].str.extract(
    '(Wärmeleitfähigkeit)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Wärmeleitfähigkeit','',case=False,regex=True)

mined_text['WAERMLEITFAEHIGKEIT'] = heat_conductivity_word.str.replace('Wärmeleitfähigkeit','Ja',case=False)

In [None]:
### Feet: the first feet phrase; the "rutschfest..." variants are unified to
### "Rutschfeste Füße", while "Aufstellfuß" is kept as found. NaN when absent.
feet_phrase = mined_text['BESCHREIBUNG'].str.extract(
    r'(rutschfeste Füßen*|rutschfester Standfuß|Aufstellfuß)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'rutschfeste Füße','',case=False,regex=True)
mined_text['FUSSE'] = feet_phrase.str.replace(
    r'rutschfeste Füßen*|rutschfester Standfuß|rutschfeste Füße|rutschfester Standfuß',
    'Rutschfeste Füße',
    regex=True,
    case=False,
)
mined_text['FUSSE'].unique()

In [None]:
### Slow-cooking flag: "Ja" when the phrase "langsame(s), sanfte(s) Schmoren" occurs,
### NaN otherwise.
slow_cooking_alternatives = r'langsames*, sanftes* Schmoren'
slow_cooking_phrase = mined_text['BESCHREIBUNG'].str.extract(
    '(' + slow_cooking_alternatives + ')',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'langsames*, sanftes* Schmoren','',case=False,regex=True)
mined_text['SLOWKOCHEN'] = slow_cooking_phrase.str.replace(
    slow_cooking_alternatives,
    'Ja',
    regex=True,
    case=False,
)

In [None]:
### Voltage: the first "<n> V" / "<n> VOLT" mention; the "230V/" and "230V " spellings
### are unified to "230 V". NaN when absent.
voltage_mention = mined_text['BESCHREIBUNG'].str.extract(
    r'(\d+ *V\W|\d+ VOLT)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r' \d+V | \d+ VOLT','',case=False,regex=True)
mined_text['VOLT'] = voltage_mention.str.replace(r'230V/|230V ','230 V',regex=True,case=False)
mined_text['VOLT'].unique()

In [None]:
### Power consumption: the first "<n> W" / "<n> WATT" mention, with a trailing comma,
### dots and spaces removed. NaN when absent.
watt_mention = mined_text['BESCHREIBUNG'].str.extract(
    r'([\d*\.]*\d+ *W\W| \d+ *WATT)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'[\d*\.]*\d+ *W\W| \d+ *WATT','',case=False,regex=True)
mined_text['WATT'] = (
    watt_mention
    .str.rstrip(',')
    .str.replace(r'\.| ','',regex=True)
)
mined_text['WATT'].unique()

In [None]:
### Cable length: find the first cable phrase, then keep only its "<n> m" / "<n> cm"
### part. NaN when absent.
cable_phrase = mined_text['BESCHREIBUNG'].str.extract(
    r'([\d,]*\d+ *c*m [Anschluss]*kabel|\d+ m langes \d+ °C hitzebeständiges Kabel)',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'[\d,]*\d+ *c*m [Anschluss]*kabel|\d+ m langes \d+ °C hitzebeständiges Kabel','',case=False,regex=True)
mined_text['KABEL'] = cable_phrase.str.extract(
    r'(\d+,*\d* c*m)',
    flags=re.IGNORECASE,
    expand=False,
)
mined_text['KABEL'].unique()

In [None]:
### Pressure: the first "<n> bar" mention, with the trailing ',' / '|' / space
### characters stripped. NaN when absent.
pressure_mention = mined_text['BESCHREIBUNG'].str.extract(
    r'(\d+ bar[,| ])',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'\d+ bar[,| ]','',case=False,regex=True)
mined_text['DRUCK'] = pressure_mention.str.rstrip(r',| ')
mined_text['DRUCK'].unique()

In [None]:
### Grid ("Gitternetz") flag: "Ja" when the word occurs (any casing), NaN otherwise.
mined_text['GITTERNETZ'] = mined_text['BESCHREIBUNG'].str.extract(r'(Gitternetz)',flags=re.IGNORECASE)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'Gitternetz','',case=False,regex=True)
# The extraction ignores case, so the normalisation has to as well. `Series.replace`
# only swaps values that are exactly 'Gitternetz' and left e.g. 'gitternetz' as is.
mined_text['GITTERNETZ'] = mined_text['GITTERNETZ'].str.replace('Gitternetz','Ja',regex=True,case=False)
mined_text['GITTERNETZ'].unique()

In [None]:
### "Hand wash recommended" flag, read from the original (uncleaned) description:
### "Ja" where the exact phrase occurs; other rows are left unset.
# mined_text[mined_text['ORIGINAL_BESCHREIBUNG'].str.contains('Handwäsche empfohlen')]
recommends_hand_wash = mined_text['ORIGINAL_BESCHREIBUNG'].str.contains('Handwäsche empfohlen')
mined_text.loc[recommends_hand_wash, 'HANDWASCHE_EMPFOHLEN'] = 'Ja'

In [None]:
### Magnetic base: the first "magnetische(n/m) Edelstahl...boden" phrase, unified to
### "Magnetischen Edelstahlboden"; NaN when absent.
magnetic_base_alternatives = r'magnetischen Edelstahl-*böden|magnetischen Edelstahl-Spezialboden|magnetischem Edelstahl-Spezialboden'
magnetic_base_phrase = mined_text['BESCHREIBUNG'].str.extract(
    '(' + magnetic_base_alternatives + ')',
    flags=re.IGNORECASE,
    expand=False,
)
# mined_text['REMAINED_TEXT'] = mined_text['REMAINED_TEXT'].str.replace(r'magnetischen Edelstahlböden|magnetischen Edelstahl-Spezialboden|magnetischem Edelstahl-Spezialboden','',case=False,regex=True)
mined_text['MAGNETICBODEN'] = magnetic_base_phrase.str.replace(
    magnetic_base_alternatives,
    'Magnetischen Edelstahlboden',
    regex=True,
    case=False,
)



In [None]:
# Set articles = rows for which a piece count (TEILIG) was found. They get their own
# frame with the subset of attribute columns listed here.
set_item_columns = ['NUMMER','NAME','ORIGINAL_BESCHREIBUNG','TEILIG','TEMPERATUR','GRIFF','HOHE','BODEN','FORM','MATERIALS','KRATZ','OFENFEST','KERN', 'INDUKTION', 'KNAUF',
                    'BESCHICTUNG','EMAILLIERT', 'TEFLON','MAGNETICBODEN','STARKE','HANDWASCHE_EMPFOHLEN','BESCHREIBUNG']
# `.copy()` makes this an independent frame, so the column assignment below does not
# write into a slice of `mined_text`.
set_items = mined_text.loc[mined_text['TEILIG'].notna(), set_item_columns].copy()
# Lower-case the original HTML description and split it at every '<li>' tag, giving
# one list of text fragments per set article.
set_items['ORIGINAL_BESCHREIBUNG'] = set_items['ORIGINAL_BESCHREIBUNG'].str.lower().str.split('<li>')
set_items['ORIGINAL_BESCHREIBUNG']

In [None]:
# Keep only the numeric piece count. `\d+` captures the whole number; the previous
# `\d` kept just the first digit, so e.g. "12 tlg." became "1".
set_items['TEILIG'] = set_items['TEILIG'].str.extract(r'(\d+)', expand=False)
# From here on `mined_text` holds only the non-set articles (no piece count found).
mined_text = mined_text[mined_text['TEILIG'].isna()]

In [None]:
## Final column selection and order for the non-set articles; columns not listed here
## (e.g. TEILIG, REMAINED_TEXT) are dropped from `mined_text`.
non_set_column_order = [
    'NUMMER','NAME', 'ORIGINAL_BESCHREIBUNG', 'VOLUME', 'MASSEN', 'TEMPERATUR',
    'GRIFF','GEWICHT', 'HOHE', 'BODEN', 'DECKEL', 'FORM', 'MATERIALS', 'KRATZ','OFENFEST','SPUELMACHINEFEST','SUPERLEITFAEHIG','HGEXKLUSIV','KERN','INDUKTION','KNAUF', 'BESCHICTUNG',
    'EMAILLIERT','MARKE','TEFLON','HANDGESCHMIEDETEN','WAERMLEITFAEHIGKEIT','FUSSE','SLOWKOCHEN','VOLT','WATT','KABEL','DRUCK','GITTERNETZ','MAGNETICBODEN','STARKE','HANDWASCHE_EMPFOHLEN','BESCHREIBUNG',
]
mined_text = mined_text[non_set_column_order]


In [None]:
# Display only: the ORIGINAL_BESCHREIBUNG column of the set articles (no data is changed).
set_items['ORIGINAL_BESCHREIBUNG']

In [None]:

# Diameter / dimension mentions of the set articles ("Ø a/b/c cm", "Ø a cm",
# "a x b (x c) cm", ...); all matches per description are kept as a list.
# The pattern is assembled from adjacent raw-string pieces, in the original order.
# The previous version continued the raw string with a trailing backslash; inside a
# raw string the backslash, the newline and the next line's indentation stay in the
# pattern, so the "Ø a/b/c cm" alternative on the continuation line could never match.
durchmesser_pattern = (
    r'[Boden]* Ø* \d+,*\d*\/\d,*\d*\/\d*,*\d* cm'
    r'|[Ø|ø]* \d+,*\d* cm'
    r'|\d+,*\d* x \d+,*\d* x \d+,*\d* cm \(B\/T\/H\)'
    r'|Ø* *\d+,*\d*\/\d+,*\d*\/\d+,*\d* cm'
    r'|\(Ø \d+,*\d*, \d+,*\d*, \d+,*\d* cm\)'
    r'|\d+,*\d* x \d+,*\d* cm'
)
set_items['DURCHMESSER'] = set_items['BESCHREIBUNG'].str.findall(durchmesser_pattern)
set_items['DURCHMESSER']

In [None]:
# ## Exporting data into Excel
# mined_text.to_excel('Exports/kuchen_gerate_mined_sample_1.xlsx')

In [None]:
# Write both result frames into one workbook: non-set articles on the "Non-Set" sheet,
# set articles on the "Set" sheet.
export_path = "Exports/kuchen_gerate_mined_sample_1.xlsx"
export_sheets = {"Non-Set": mined_text, "Set": set_items}
with pd.ExcelWriter(export_path) as writer:
    for export_sheet_name, export_frame in export_sheets.items():
        export_frame.to_excel(writer, sheet_name=export_sheet_name)