In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack


Download the data from Kaggle

In [None]:
# Shell commands, left commented out, that place the Kaggle credentials file
# (kaggle.json, copied from the working directory) under ~/.kaggle, restrict
# its permissions, and download the online-retail dataset archive.
# Uncomment the lines below to run them.
#!mkdir ~/.kaggle
#!cp kaggle.json ~/.kaggle/
#!chmod 600 ~/.kaggle/kaggle.json
#!kaggle datasets download -d lakshmi25npathi/online-retail-dataset

Extract Data

In [2]:
# Unpack the downloaded archive first if needed (uncomment the shell line),
# then read the workbook into the raw DataFrame.
#!unzip online-retail-dataset.zip
df = pd.read_excel("online_retail.xlsx")

In [None]:
# Show the loaded frame as this cell's output.
df
# Alternative quick inspections, currently disabled:
# df.info()
# df.describe()

Data type casting

In [4]:
def auto_data_type(df):
    """Downcast the numeric columns of ``df`` to narrower dtypes, in place.

    Float columns are converted to ``float32``, unless a finite value is too
    large for ``float32`` to hold, in which case the column is left untouched.
    Integer columns are converted to the narrowest of uint8/uint16/uint32
    (when the column minimum is >= 0) or int8/int16/int32 (otherwise) whose
    range covers the column's min and max, falling back to the 64-bit type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are reassigned. The caller's object is
        modified; no copy is made.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    float32_max = np.finfo(np.float32).max

    for col in df.select_dtypes(include=[np.number]).columns:
        min_value = df[col].min()
        max_value = df[col].max()

        if pd.api.types.is_float_dtype(df[col]):
            # Casting a finite value beyond the float32 range would silently
            # turn it into +/-inf, so such columns keep their current dtype.
            magnitudes = df[col].abs()
            largest_finite = magnitudes[magnitudes != np.inf].max()
            if pd.notna(largest_finite) and largest_finite > float32_max:
                continue
            df[col] = df[col].astype(np.float32)
            continue

        # Unsigned types are only considered when nothing in the column is
        # negative. An empty column has a NaN minimum and ends up as int64.
        if min_value >= 0:
            candidates, fallback = (np.uint8, np.uint16, np.uint32), np.uint64
        else:
            candidates, fallback = (np.int8, np.int16, np.int32), np.int64

        target = fallback
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= min_value and max_value <= limits.max:
                target = candidate
                break

        df[col] = df[col].astype(target)
    return df


# auto_data_type reassigns the columns of `df` itself and returns that same
# object, so `clean_df` and `df` refer to one DataFrame after this call.
clean_df = auto_data_type(df)
# df.info()


Drop rows with a missing Customer ID

In [None]:
# Keep only rows that have a Customer ID.
df = df.dropna(subset=["Customer ID"])

# Guard the narrowing cast: an ID outside the uint16 range would not survive
# the conversion intact, so fail loudly instead of corrupting identifiers.
if (
    df["Customer ID"].min() < 0
    or df["Customer ID"].max() > np.iinfo(np.uint16).max
):
    raise ValueError("Customer ID values do not fit in uint16")

# `assign` builds a new frame rather than writing a column into the result of
# `dropna`, which avoids SettingWithCopyWarning on pandas versions that flag it.
df = df.assign(**{"Customer ID": df["Customer ID"].astype(np.uint16)})

Remove duplicate rows and rows with a missing or negative Quantity

In [None]:
# drop_duplicates already returns a new frame, so `df` is left untouched.
df_copy = df.drop_duplicates()

# Keep rows with Quantity >= 0 (a NaN Quantity fails the comparison and is
# dropped too). The copy is taken *after* filtering so that `df_copy` is an
# independent frame; adding columns to it later will not raise
# SettingWithCopyWarning.
df_copy = df_copy[df_copy["Quantity"] >= 0].copy()
df_copy.info()


Encode Country column to numeric labels

In [38]:
# Fit the encoder on the string form of Country and store the resulting
# integer labels as a compact uint8 column.
le = LabelEncoder()
df_copy["Country_Code"] = (
    le.fit_transform(df_copy["Country"].astype(str))
    .astype(np.uint8)
)

In [None]:
# The original Country text column is removed once Country_Code exists.
df_copy = df_copy.drop("Country", axis=1)

In [50]:
main_df = df_copy.copy()

# Lower-case the descriptions; `.str.lower()` yields NaN for any entry that
# is missing or not a string.
main_df["Description"] = main_df["Description"].str.lower()

# Concatenate every description belonging to an invoice into one string.
# NaN entries are dropped first, because `" ".join` raises TypeError on a
# float NaN.
invoice_df = (
    main_df.groupby("Invoice")["Description"]
    .apply(lambda descriptions: " ".join(descriptions.dropna()))
    .reset_index()
)

def clean_text(text):
    """Normalise a description string for bag-of-words vectorisation.

    Steps, in order: lower-case the text; remove tokens that contain digits
    (e.g. "12cm", "a4"); replace every remaining character that is not a
    lower-case ASCII letter or whitespace with a space; remove a short list
    of common function words; collapse whitespace runs and trim the ends.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        Cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    text = text.lower()
    # Digit-bearing tokens must be removed while the digits are still there.
    # Stripping non-letters first would leave fragments such as "cm" from
    # "12cm" behind and make these two patterns unreachable.
    text = re.sub(r"\b[a-z]*\d+[a-z]*\b", " ", text)
    text = re.sub(r"\b\d+\w*\b", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\b(on|the|and|at|with|to|for|in|of|by)\b", " ", text)
    # Whitespace is normalised last so the removals above leave no gaps.
    return re.sub(r"\s+", " ", text).strip()

# Clean each invoice's joined description text, then build the bag-of-words
# count matrix over the cleaned strings.
invoice_df["MixedDescription"] = invoice_df["Description"].map(clean_text)

vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(invoice_df["MixedDescription"])

# Example vocabulary lookup, currently disabled:
#counter = vectorizer.vocabulary_.get("mug")
