In [65]:
import word2number as w2n
from tabulate import tabulate
import re
from nltk.corpus import stopwords
import spacy
nlp = spacy.load("en_core_web_sm")

# **function to convert numerical data to digits**

In [66]:
def word_to_num(word):
    word_dict = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
        'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
        'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
        'eighteen': 18, 'nineteen': 19, 'twenty': 20,
        'thirty': 30, 'forty': 40, 'fifty': 50, 'sixty': 60,
        'seventy': 70, 'eighty': 80, 'ninety': 90,
        'hundred': 100, 'thousand': 1000, 'million': 1000000
    }
    words = word.split()
    num = 0
    total = 0
    for word in words:
        if word in word_dict:
            num = word_dict[word]
            if num == 1000000:
                total *= 1000000
            else:
                total += num
        else:
            return None
    return total

# **defining the pattern**

In [67]:
numbers = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million']
pattern = r"((?:" + '|'.join(numbers) + r"|\d)(?:\s(?:" + '|'.join(numbers) + r"|\d|and))*)(.*?)(\d+[\.|\,]?\d*)\b\s*(\$|dollar)"

# **Use case text**

In [68]:
text = "I bought five smartphones with 1500$ each, four kilos of fresh bananas for 1.2$ each, one apple for 4.5 dollar, three pairs of sneakers at 90$ each, six bottles of soda for 1.9 dollar each, two bags of chips at 8$ each, seven cans of soup for 5$ each, eight rolls of toilet paper for 7$ each, nine bars of soap for 1.2$ each, and ten boxes of tissues for 4$ each. "


## **Tokenize the text**

In [69]:
word_tokens = nlp(text)


# **filtrer our tokens**

In [70]:
filtered_tokens = [token.text for token in word_tokens if token.pos_ != "ADJ"]


In [71]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **removing stop words**

In [72]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in filtered_tokens if token.lower() not in stop_words]
filtered_tokens = ' '.join(filtered_tokens)


# **units pattern**

In [73]:
weight_units_regex = r'\b(?:kg|kilos|[\w]*(?:gram|grams))\b'
filtered_tokens = re.sub(weight_units_regex, '', filtered_tokens)

In [74]:
matches = re.finditer(pattern, filtered_tokens)

# **generate bill**

In [75]:
products = []
quantities = []
unit_prices = []
total_prices = []
total=0
for match in matches:
    quantity = match.group(1)
    quantity = word_to_num(quantity.replace(',', '.'))
    product = match.group(2)
    price = float(match.group(3).replace(',', '.'))

    # Calculate total price
    total_price = quantity * price
    total=total+total_price
    products.append(product)
    quantities.append(quantity)
    unit_prices.append(price)
    total_prices.append(total_price)

bill_data = list(zip(products, quantities, unit_prices, total_prices))

In [76]:
print(tabulate(bill_data, headers=["Product", "Quantity", "Unit Price ($)", "Total Price ($)"], tablefmt="pretty"))


+--------------------+----------+----------------+--------------------+
|      Product       | Quantity | Unit Price ($) |  Total Price ($)   |
+--------------------+----------+----------------+--------------------+
|    smartphones     |    5     |     1500.0     |       7500.0       |
|      bananas       |    4     |      1.2       |        4.8         |
|       apple        |    1     |      4.5       |        4.5         |
|   pairs sneakers   |    3     |      90.0      |       270.0        |
|    bottles soda    |    6     |      1.9       | 11.399999999999999 |
|     bags chips     |    2     |      8.0       |        16.0        |
|     cans soup      |    7     |      5.0       |        35.0        |
| rolls toilet paper |    8     |      7.0       |        56.0        |
|     bars soap      |    9     |      1.2       | 10.799999999999999 |
|   boxes tissues    |    10    |      4.0       |        40.0        |
+--------------------+----------+----------------+--------------

In [77]:
print(total)

7948.5
