### Import libraries

In [1]:
import pandas as pd
import requests
import re

from collections import defaultdict
from bs4 import BeautifulSoup

### Read dataset

In [2]:
df = pd.read_csv("data/products_dataset.csv")

### Extract the type of furniture from English dictionary

In [3]:
url = "https://www.enchantedlearning.com/wordlist/furniture.shtml"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

prods_list = []
for tag in soup.find_all(class_='wordlist-item'):
    # Replace non-alphanumeric characters with empty string (preserving spaces)
    item = re.sub(r'[-/.*]', '', tag.text).lower().strip()
    prods_list.append(item)

### Build the dataframe in the workable format

In [4]:
# Convert list to a dictionary for faster lookup
prods_dict = defaultdict(str)
for prod in prods_list:
    prods_dict[prod.lower()] = prod.upper().replace(' ', '-')

# Initialize your new data
new_data = {'tokens': [], 'ner_tags': []}

# Iterate over your DataFrame
for _, row in df.iterrows():
    sentence = str(row['Products'])
    sentence_tokens = sentence.split()
    sentence_ner_tags = []
    
    # Iterate over tokens in the sentence
    token_index = 0
    while token_index < len(sentence_tokens):
        token = sentence_tokens[token_index]
        
        matched_prod = None
        for prod in prods_dict:
            prod_tokens = prod.split()
            if sentence_tokens[token_index:token_index+len(prod_tokens)] == prod_tokens:
                matched_prod = prod
                break

        if matched_prod:
            for _ in matched_prod.split():
                # If it's the beginning of the product name, add 'B-', else add 'I-'
                prefix = 'B-' if _ == matched_prod.split()[0] else 'I-'
                sentence_ner_tags.append(prefix + prods_dict[matched_prod])
                token_index += 1
        else:
            sentence_ner_tags.append('O')
            token_index += 1
            
    # Append the tokens and tags to your new data
    new_data['tokens'].append(sentence_tokens)
    new_data['ner_tags'].append(sentence_ner_tags)

# Convert new data to DataFrame
df_model = pd.DataFrame(new_data)

Let's check how many products from the English dictionary are matching with the ones from the extracted URL's

In [5]:
unique_ner_tags = list(set([tag for sublist in new_data['ner_tags'] for tag in sublist]))

len_dict_prods = len(prods_list)
len_extracted_prods = len(unique_ner_tags)
print(f"Looks like we have a number of {len_extracted_prods} extracted prods out of a total of {len_dict_prods} from the dictionary.")

Looks like we have a number of 84 extracted prods out of a total of 143 from the dictionary.


### Build a dataframe to do some small EDA

In [6]:
# Extract the domain from the URL in df DataFrame
df['domain'] = df['url'].apply(lambda x: x.split('/')[2])
df_eda = pd.merge(df_model, df['domain'], left_index=True, right_index=True)

Check which is the most popular product on each website

In [7]:
# Flatten the 'ner_tags' column and remove the "O" tags
df_non_o_tags = df_eda.explode('ner_tags').loc[lambda x: x['ner_tags'] != 'O']

# Check which is the most popular products on each website
popular_tags = df_non_o_tags.groupby(['domain'])['ner_tags'].value_counts().groupby(level=0).idxmax().str[1]

Let's take an example to check if we meesed something up

In [8]:
most_popular_domain_name = df_non_o_tags['domain'].value_counts().idxmax()
most_popular_domain_count = df_non_o_tags['domain'].value_counts().max()

popular_tag = popular_tags.loc[most_popular_domain_name]
occurrence = df_non_o_tags.loc[df_non_o_tags['domain'] == most_popular_domain_name, 'ner_tags'].value_counts().max()

print(f"The domain selling the highest number of products is {most_popular_domain_name}. They have around {most_popular_domain_count} furniture products on their website.\n"
      f"The most popular tag is {popular_tag} appearing {occurrence} times.")

The domain selling the highest number of products is www.saladinostyle.com. They have around 64 furniture products on their website.
The most popular tag is B-CHAIR appearing 15 times.


Let's check for the most popular product from our list of URL's

In [9]:
# Check the most popular tag from the entire DataFrame
most_popular_tag_name = df_non_o_tags['ner_tags'].value_counts().idxmax()
most_popular_tag_count = df_non_o_tags['ner_tags'].value_counts().max()

# Print the most popular tag
print(f"The most popular tag from the entire DataFrame is: {most_popular_tag_name} appearing {most_popular_tag_count} times.")

The most popular tag from the entire DataFrame is: B-CHAIR appearing 129 times.


### Export dataframe we will used in the training process

In [10]:
df_model['ner_tags'] = df_model['ner_tags'].apply(lambda tags: ['B-PRODUCT' if tag.startswith('B-') else 'I-PRODUCT' if tag.startswith('I-') else 'O' for tag in tags])

In [11]:
tag_counts = df_model['ner_tags'].apply(pd.Series).stack().value_counts()
print(tag_counts)

O            4678
B-PRODUCT     843
I-PRODUCT      70
dtype: int64


In [12]:
df_model.to_csv("data/training_data.csv", index=False)