# [Product Recommendations for Online Retail Store](https://medium.com/@peggy1502/product-recommendations-for-online-retail-store-1d565e1607b7)
### Build and Train a Personalized Recommender Engine with Amazon Sagemaker Factorization Machines

**This is `Notebook Part 1`**

**Click [here](fm_v3_part2.ipynb) for `Notebook Part 2`**

In [1]:
import numpy as np 
import pandas as pd 
import time

import boto3
#import sagemaker
#import sagemaker.amazon.common as smac

from scipy.sparse import csr_matrix, hstack, save_npz, load_npz
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Report the library versions this notebook runs with.
for lib_name, lib in (("numpy", np), ("pandas", pd)):
    print(f"{lib_name} version: {lib.__version__}")

# Read Dataset

In [3]:
# Render up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

# Load the raw data; the two date columns are parsed as datetimes.
raw_csv_path = "../input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv"
df = pd.read_csv(
    raw_csv_path,
    parse_dates=["created_at", "Working Date"],
    low_memory=False,
)
df

In [4]:
import seaborn as sns
# Visualise where values are missing: the heatmap is drawn over the boolean
# df.isnull() mask, without a colour bar.
sns.heatmap(df.isnull(), cbar=False)

In [5]:
# Keep only the columns this notebook uses: the product, category and customer
# identifiers, the price, and the ordered quantity.
columns = ["sku", "category_name_1", "Customer ID", "price", "qty_ordered"]

# .copy() makes `df` an independent frame, so the clean-up cells that follow
# modify it directly rather than a slice of the raw data
# (avoids SettingWithCopyWarning).
df = df[columns].copy()

In [6]:
# Summary of the frame (pandas' DataFrame.info report).
df.info()

# Data Cleaning

### (i) Drop empty rows & columns, and handle missing values

In [7]:
# Remove rows, then columns, that contain no data at all.
df = (
    df.dropna(axis="index", how="all")
      .dropna(axis="columns", how="all")
)

In [8]:
# Count of missing values per column.
df.isnull().sum()

In [9]:
# Drop rows where sku or Customer ID is null.
# Re-assigning (instead of inplace=True) avoids mutating a frame that may be a
# slice of the raw data, and keeps the cell chainable.
df = df.dropna(axis=0, subset=["sku", "Customer ID"])

In [10]:
# Replace missing value for category_name_1 with empty string.
# Assign the result back: fillna(inplace=True) on a selected column is a
# chained in-place operation, which does not update `df` when pandas
# Copy-on-Write is active.
df["category_name_1"] = df["category_name_1"].fillna("")

In [11]:
# Re-count missing values per column after the clean-up.
df.isnull().sum()

In [12]:
# Display the frame after the missing-value handling.
df

### (ii) Remove leading and trailing spaces for `sku` and `category_name_1`

In [13]:
# Strip leading/trailing whitespace from sku with the vectorised string
# accessor; unlike a per-element lambda, it does not raise on a non-string value.
df["sku"] = df["sku"].str.strip()

In [14]:
# Trim surrounding whitespace in both text columns.
for text_col in ("sku", "category_name_1"):
    df[text_col] = df[text_col].str.strip()

In [15]:
# To check that the leading space has been removed.
# E.g. the original sku " Huawei Mate 8" has a leading space, so an exact match
# on the stripped name should now return rows.
df[df["sku"] == "Huawei Mate 8"].head(2)

### (iii) For `category_name_1`, replace `\N` with the correct value.

In [16]:
# Frequency of each category_name_1 value.
# There are around 7.8K records with "category_name_1" = "\N".

df["category_name_1"].value_counts()

In [17]:
# Example: this sku appears with category_name_1 equal to both "\N" and "Superstore".
df.loc[
    df["sku"].eq("RB_Dettol Germ Busting Kit-bf"),
    ["sku", "category_name_1"],
]

In [18]:
# Rows that carry a usable category: neither the "\N" placeholder nor the empty
# string that replaced missing values earlier in the notebook. If empty-string
# rows stayed in, a blank entry could become the category mapped for a sku that
# also has a real category on another row.
has_category = ~df["category_name_1"].isin(["\\N", ""])

# Keep only the two columns needed for the sku -> category_name_1 mapping.
dfmap = df.loc[has_category, ["sku", "category_name_1"]]
dfmap

In [19]:
# Number of unique sku in dfmap, i.e. sku with at least one row whose
# category_name_1 is not "\N".
dfmap["sku"].nunique()

In [20]:
# Build the sku -> category_name_1 lookup dictionary.
# A sku that appears on several rows ends up with a single entry.
dic = dfmap.set_index("sku")["category_name_1"].to_dict()
dic

In [21]:
# Spot check: the sku "RB_Dettol Germ Busting Kit-bf" should map to "Superstore".
dic["RB_Dettol Germ Busting Kit-bf"]

In [22]:
# Before the mapping: rows that still carry the "\N" placeholder.
df[df["category_name_1"].eq("\\N")]

In [23]:
# Map the category_name_1 column with the dictionary values.
# Every row takes the category looked up for its sku; if the sku has no
# dictionary entry, the category becomes an empty string.
df["category_name_1"] = df["sku"].map(dic).fillna("")

In [24]:
# After the mapping: every row of this sku should now show "Superstore".
df.loc[
    df["sku"].eq("RB_Dettol Germ Busting Kit-bf"),
    ["sku", "category_name_1"],
]

In [25]:
# After the mapping: list any rows that still carry the "\N" placeholder.
df[df["category_name_1"].eq("\\N")]

In [26]:
# Rows whose category is the empty string.
df[df["category_name_1"].eq("")]

# Create new column `sku_and_cat`
(i.e. combining `sku` with `category_name_1`)

In [27]:
# Text feature: sku and category_name_1 joined by a single space.
df["sku_and_cat"] = df["sku"] + " " + df["category_name_1"]

# Trim both columns (an empty category leaves a trailing space in sku_and_cat).
for text_col in ("sku", "sku_and_cat"):
    df[text_col] = df[text_col].str.strip()

# Group records by (`sku`, `category_name_1`, `sku_and_cat`, `Customer ID`, `price`) and sum `qty_ordered`

In [28]:
# Sum qty_ordered for every (sku, category, sku_and_cat, customer, price) combination.
# Note: after this cell `df` is a Series indexed by the group keys.
group_keys = ["sku", "category_name_1", "sku_and_cat", "Customer ID", "price"]
df = df.groupby(by=group_keys)["qty_ordered"].sum()
df

In [29]:
# Keep only the groups whose summed qty_ordered is positive, then turn the
# group keys back into ordinary columns.
positive_qty = df > 0
df = df[positive_qty].reset_index()
df

In [30]:
# Number of distinct values in each identifier / text column.
for col_name in ("sku", "category_name_1", "sku_and_cat", "Customer ID"):
    print(f"Unique {col_name}: {df[col_name].nunique()}")

# Create Sparse Matrix

### Perform one-hot encoding for categorical data

In [31]:
# One-hot encode the product, category and customer columns.
ohe_cols = ["sku", "category_name_1", "Customer ID"]
ohe = OneHotEncoder(handle_unknown="ignore")
ohe.fit(df[ohe_cols])
ohe_features = ohe.transform(df[ohe_cols])
ohe_features

In [32]:
# Names of the one-hot columns.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old method on older releases.
# (The two methods may label columns with different prefixes.)
if hasattr(ohe, "get_feature_names_out"):
    ohe_feature_names = ohe.get_feature_names_out()
else:
    ohe_feature_names = ohe.get_feature_names()

# Keep the preview sparse: ohe_features.toarray() would allocate a dense
# len(df) x n_features array, which can exhaust memory when there are many
# distinct sku / customer values.
df_ohe = pd.DataFrame.sparse.from_spmatrix(
    ohe_features,
    index=pd.RangeIndex(len(df)),
    columns=ohe_feature_names,
)
df_ohe

### Perform TF-IDF encoding for text data

In [33]:
# Replace "-" and "_" with spaces before vectorising the text.
df["sku_and_cat"] = df["sku_and_cat"].str.replace(r"[-_]", " ", regex=True)

# min_df=2: Ignore terms that appear in less than 2 documents.
# The vectorizer is fitted on the distinct texts only, then applied to every record.
vectorizer = TfidfVectorizer(min_df=2)
distinct_texts = df["sku_and_cat"].unique()
vectorizer.fit(distinct_texts)
tfidf_features = vectorizer.transform(df["sku_and_cat"])
tfidf_features

In [34]:
# Vocabulary terms, in TF-IDF column order.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old method on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf_feature_names = vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = vectorizer.get_feature_names()

# Keep the preview sparse: tfidf_features.toarray() would allocate a dense
# len(df) x vocabulary-size array.
df_tfidfvect = pd.DataFrame.sparse.from_spmatrix(
    tfidf_features,
    index=pd.RangeIndex(len(df)),
    columns=tfidf_feature_names,
)
df_tfidfvect

### Sparse matrix for Price

In [35]:
# Single-column sparse matrix holding the price of each record.
# Reference: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
n_records = len(df)
row = np.arange(n_records)               # one row index per record
col = np.zeros(n_records, dtype=int)     # every value goes into column 0
price = csr_matrix((df["price"].to_numpy(), (row, col)), dtype="float32")
price

### Sparse matrix for all features

In [36]:
# Concatenate the feature blocks column-wise into one CSR matrix:
# one-hot columns first, then TF-IDF columns, then the price column.
feature_blocks = [ohe_features, tfidf_features, price]
X = hstack(feature_blocks, format="csr", dtype="float32")
X

In [37]:
# X was already assembled from these same matrices in the preceding cell, so
# re-stacking them here was redundant work. Check the stacked shape instead:
# one row per record, and the columns of the three feature blocks side by side.
expected_cols = ohe_features.shape[1] + tfidf_features.shape[1] + price.shape[1]
assert X.shape == (len(df), expected_cols), "X does not match the stacked feature blocks"
X

In [38]:
# Target vector: qty_ordered for each record, as float32.
y = df["qty_ordered"].to_numpy(dtype="float32")
y

In [39]:
# Sparsity of X: the share of its cells that are not counted by X.nnz.
n_rows, n_cols = X.shape
total = n_rows * n_cols
non_zero = X.nnz
sparsity = (total - non_zero) / total

print(f"Total elements: {total}")
print(f"Non-zero elements: {non_zero}")
print(f"Sparsity: {round(sparsity * 100, 4)} %")

# Splitting Data into Training and Test Set

In [40]:
# Hold out 20% of the records as the test set.
# By default, shuffle=True; random_state=73 is passed to make the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=73
)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape} \n")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

### Save local files

In [41]:
# Persist the preprocessed records (grouped by sku, category_name_1, sku_and_cat,
# Customer ID and price, with the sum of qty_ordered) as csv.
df.to_csv("fm_preprocessed.csv", index=False)

# save_npz: write the sparse feature matrices X_train and X_test in .npz format.
for npz_name, matrix in (("X_train.npz", X_train), ("X_test.npz", X_test)):
    save_npz(npz_name, matrix)

# np.savez: write the target arrays y_train and y_test, one uncompressed .npz file each.
for npz_name, target in (("y_train.npz", y_train), ("y_test.npz", y_test)):
    np.savez(npz_name, target)

# Write the feature dimension (number of columns of X) to a text file.
feature_dim = X.shape[1]
with open("feature_dim.txt", "w") as f:
    f.write(str(feature_dim))

**Click [here](fm_v3_part2.ipynb) to continue on `Notebook Part 2` for**
- Creating Sparse RecordIO File
- Training Job & Hyperparameters
- Train Model
- Deploy Model
- Model Inference