In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack


Download data from kaggle

In [None]:
#!mkdir ~/.kaggle
#!cp kaggle.json ~/.kaggle/
#!chmod 600 ~/.kaggle/kaggle.json
#!kaggle datasets download -d lakshmi25npathi/online-retail-dataset

Extract Data

In [None]:
#!unzip online-retail-dataset.zip
df=pd.read_excel("online_retail.xlsx")

In [None]:
df
# df.info()
# df.describe()

Data type casting

In [None]:
def auto_data_type(df):
    for col in df.select_dtypes(include=[np.number]).columns:
        min_value = df[col].min()
        max_value = df[col].max()

        if pd.api.types.is_float_dtype(df[col]):
            df[col] = df[col].astype(np.float32)
        elif min_value >= 0:
            if max_value <= 255:
                df[col] = df[col].astype(np.uint8)
            elif max_value <= 65535:
                df[col] = df[col].astype(np.uint16)
            elif max_value <= 4294967295:
                df[col] = df[col].astype(np.uint32)
            else:
                df[col] = df[col].astype(np.uint64)
        else:
            if -128 <= min_value and max_value <= 127:
                df[col] = df[col].astype(np.int8)
            elif -32768 <= min_value and max_value <= 32767:
                df[col] = df[col].astype(np.int16)
            elif -2147483648 <= min_value and max_value <= 2147483647:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
    return df


clean_df=auto_data_type(df)
#df.info()


drop rows with missing Customer ID

In [None]:
df = df.dropna(subset=["Customer ID"])
df["Customer ID"] = df["Customer ID"].astype(np.uint16)

Remove rows with duplicate entries, missing and negative Quantity

In [None]:
df_copy=df.copy()
df_copy = df_copy.drop_duplicates()
df_copy = df_copy[df_copy["Quantity"] >= 0]
df_copy.info()


Encode Country column to numeric labels

In [None]:
le = LabelEncoder()
df_copy["Country_Code"] = le.fit_transform(df_copy["Country"].astype(str)).astype(np.uint8)

In [None]:
df_copy = df_copy.drop(columns=["Country"])

In [91]:
main_df = df_copy.copy()

df_clean = main_df.copy()

df_clean["Description"] = df_clean["Description"].astype(str).str.lower()
df_clean["Description"] = df_clean["Description"].str.replace(r"[^a-zA-Z0-9\s]", " ", regex=True)
df_clean["Description"] = df_clean["Description"].str.replace(r"\s+", " ", regex=True)

invoice_df = df_clean.groupby("Invoice")["Description"].apply(lambda x: " ".join(x)).reset_index()

vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(invoice_df["Description"])

#counter = vectorizer.vocabulary_.get("mug")


In [172]:
user_input = input("Enter you product name:")

user_input = user_input.lower()
user_input = user_input.replace(",", " ")
user_input = re.sub(r"[^a-zA-Z0-9\s]", "", user_input)
user_input = re.sub(r"\s+", " ", user_input).strip()

user_vector = vectorizer.transform([user_input])

if user_vector.nnz == 0:
    print("No products found.")
else:
    similarity_scores = cosine_similarity(user_vector, X).flatten()

    sorted_scores = similarity_scores.argsort()[::-1]

    most_similar_invoice = []
    for i in sorted_scores:
        if similarity_scores[i] < 0.999:
            most_similar_invoice.append(i)
        if len(most_similar_invoice) == 10:
            break

    for j in most_similar_invoice:
      invoice_number = invoice_df.iloc[j]['Invoice']
      similarity_score = similarity_scores[j]
      print(f"invoice number: {invoice_number} - similarity: {similarity_score:.6f}")


Enter you product name:black hat
invoice number: 493883 - similarity: 0.447214
invoice number: 532040 - similarity: 0.416025
invoice number: 534457 - similarity: 0.408248
invoice number: 511334 - similarity: 0.408248
invoice number: 511463 - similarity: 0.408248
invoice number: 511330 - similarity: 0.408248
invoice number: 515933 - similarity: 0.408248
invoice number: 496238 - similarity: 0.392232
invoice number: 518765 - similarity: 0.392232
invoice number: 492521 - similarity: 0.375000


In [176]:
user_items = set(user_input.split())
recommended_list = []
tokens = []

invoice_items_temp = df_clean.groupby("Invoice")["Description"].apply(list).reset_index()

for i in most_similar_invoice:
    descriptions = invoice_items_temp.iloc[i]["Description"]

    for word in descriptions:
        word = word.lower()
        word = re.sub(r"[^a-zA-Z0-9\s]", "", word)
        word = re.sub(r"\s+", " ", word).strip()
        for t in word.split():
            if len(t) > 1:
                tokens.append(t)


        if len(tokens) >= 2 and tokens[0].isdigit():
            phrase = tokens[0]
            for j in range(1, min(len(tokens), 4)):
                if tokens[j].isdigit():
                    break
                phrase += " " + tokens[j]
            if not any(item in user_items for item in phrase.split()):
                recommended_list.append(phrase)

        for i in range(len(tokens) - 2):
            group = tokens[i:i+4]
            if any(item.isdigit() for item in group[1:]):
                continue
            if not any(item in user_items for item in group):
                phrase = " ".join(group)
                recommended_list.append(phrase)

recommended_list = list(set(recommended_list))
print("\n Recommended products list:")
counter = 0
for item in recommended_list[:10]:
    counter += 1
    print(f"{counter} - {item}")




 Recommended products list:
1 - wine glass retro spot
2 - white necklace tassel
3 - spot traditional teapot
4 - dots ruffled umbrella
5 - red white dots ruffled
6 - light setting red white
7 - retro spot traditional teapot
8 - white dots ruffled umbrella
9 - ruffled umbrella edwardian parasol
10 - bead necklace tassel
