##Recommendation engine using a dataset found online.

In [None]:

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'uelstoredataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2014458%2F3335493%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240603%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240603T072731Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1b9737c05719873846e22274ff201729177ba4ec9036bfc27d663516b954099ca340f8f2b28abe37aa67b79bcbbb39ec44cf768c0e04ee800f253a4cd5733651c6307babb0cbadbf8c7ae9370c1f3561ab6e2c392694787b0c2209744a425cafe8d6db97a3cd6516dc4afe7af7331033fe6aaadbffdba6c7417537d68df5d9dc5047d4ffcd76c1a4b1921e8cc03153cbaddd2d43b13183b8f3a75a6f9cd785b99ca82dffc48f90948d5a85d5c3830c967b30170e07b89f2ce74e15c8af08a778bee2bd51c9922066e45acb7427bd0764c9a3b5f2eed5869d29c6c7a3bfe538b72cd523fd6df01e1871349df505aa370cbfc873e2112e702d74d1d4aebcb41c96,traintestset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2438829%2F4127106%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240603%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240603T072731Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D23eff58934827ede5773e6a3e5131c11f69a160ade022565ccc3f1e265ccbacece188309741e84a3cd5b76107ad0c48845bc6267f5f2638c3fe1525a720fe30639495ae8eb5b29c9670e7340ac9d1b4d753774f8e601500371781a8393bd0ea8d57cf927506de0a7cb333f7e7bce43d5e95095a7d350317a2919e5dc416a176a4750152046b01ff1ee4387dbd8027a982e84a0eefae0afef9504e5db4ab41726e8a1e67f9a7eba49672a237845bbd5ace196a6e07999c840212fbb77a2aa0eb6ffe8b968036f025c9d6a0d43abd0186c18b005e6895bf35ef67e9c4c693c1d228c20a07546c2f9a9ac0fce8b0f887475b0344f835c4f799f4baa02794e8268e7'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading uelstoredataset, 1282217 bytes compressed
Downloaded and uncompressed: uelstoredataset
Downloading traintestset, 12376248 bytes compressed
Downloaded and uncompressed: traintestset
Data source import complete.


### Dataset Description

This dataset is from ecommerce website http://ecom.uelstore.com/
3 file datasets with JSon format
-678 users (id, nickname)
-732 products of all kinds (id, name)
-130754 product - reviews (rating, date)



In [None]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/uelstoredataset/customers.json
/kaggle/input/uelstoredataset/products.json
/kaggle/input/uelstoredataset/ratings.json
/kaggle/input/traintestset/Rating_0.7_91608_test.json
/kaggle/input/traintestset/Rating_0.2_26087_test.json
/kaggle/input/traintestset/Rating_0.4_52428_test.json
/kaggle/input/traintestset/Rating_0.6_52178_train.json
/kaggle/input/traintestset/Rating_0.9_12940_train.json
/kaggle/input/traintestset/Rating_0.2_104667_train.json
/kaggle/input/traintestset/Rating_0.8_104567_test.json
/kaggle/input/traintestset/Rating_0.4_78326_train.json
/kaggle/input/traintestset/Rating_0.1_117671_train.json
/kaggle/input/traintestset/Rating_0.3_39153_test.json
/kaggle/input/traintestset/Rating_0.5_65486_test.json
/kaggle/input/traintestset/Rating_0.7_39146_train.json
/kaggle/input/traintestset/Rating_0.1_13083_test.json
/kaggle/input/traintestset/Rating_0.9_117814_test.json
/kaggle/input/traintestset/Rating_0.6_78576_test.json
/kaggle/input/traintestset/Rating_0.3_91601_trai

## Customers

In [None]:
customers=pd.read_json("../input/uelstoredataset/customers.json")

In [None]:
customers.size

1356

In [None]:
customers

Unnamed: 0,Id,NickName
0,103603,1000kgthanh
1,103760,999999999ok
2,103829,ac7ive
3,1,admin
4,103839,ahkk.nguyen
...,...,...
673,103904,yenxaome
674,103610,yoneteru
675,103718,young.che
676,23251,zeatop939


## Products

In [None]:
products=pd.read_json("../input/uelstoredataset/products.json")

In [None]:
products

Unnamed: 0,Id,Name,UnitPrice
0,1,Build your own computer,1200.0
1,2,Digital Storm VANQUISH 3 Custom Performance PC,1259.0
2,3,Lenovo IdeaCentre 600 All-in-One PC,500.0
3,4,Apple MacBook Pro 13-inch,1800.0
4,5,Asus N551JK-XO076H Laptop,1500.0
...,...,...,...
686,687,Bird Box,1.0
687,688,Snowpiercer,2.0
688,689,Edge of Tomorrow,1.0
689,690,Ponyo,3.0


In [None]:
products.size

2073

##Ratings

In [None]:
ratings=pd.read_json("../input/uelstoredataset/ratings.json")

In [None]:
ratings

Unnamed: 0,CustomerID,ProductID,Rate,CreateDate
0,103416,619,1,2018/01/01 01:36:30
1,103654,411,1,2018/01/01 01:36:35
2,103954,298,3,2018/01/01 01:36:38
3,103672,361,5,2018/01/01 01:37:15
4,103960,536,5,2018/01/01 02:36:25
...,...,...,...,...
130749,103907,501,1,2022/03/16 22:25:10
130750,103907,200,1,2022/03/16 22:49:28
130751,103907,184,1,2022/03/16 22:53:35
130752,103907,211,1,2022/03/16 23:14:47


In [None]:
ratings.drop('CreateDate', inplace=True, axis=1)

In [None]:
ratings

Unnamed: 0,CustomerID,ProductID,Rate
0,103416,619,1
1,103654,411,1
2,103954,298,3
3,103672,361,5
4,103960,536,5
...,...,...,...
130749,103907,501,1
130750,103907,200,1
130751,103907,184,1
130752,103907,211,1


In [None]:
#Matrix factorisation code found online for doing collaborative filtering.

class MatrixFactorization(object):
    def __init__(self, Y,customers,products, K, lam = 0.1, Xinit = None, Winit = None, learning_rate = 0.5, max_iter = 1000, print_every = 100):
        self.Y = Y # represents the utility matrix
        self.K = K
        self.lam = lam # regularization parameter
        self.learning_rate = learning_rate # for gradient descent
        self.max_iter = max_iter # maximum number of iterations
        self.print_every = print_every # print loss after each a few iters
        self.customers=customers
        self.products=products
        self.n_users = int(np.max(Y[:, 0])) + 1
        self.n_items = int(np.max(Y[:, 1])) + 1
        #self.n_users = customers.size
        #self.n_items = products.size
        self.n_ratings = Y.shape[0] # number of known ratings
        self.X = np.random.randn(self.n_items, K) if Xinit is None\
        else Xinit
        self.W = np.random.randn(K, self.n_users) if Winit is None\
        else Winit
        self.b = np.random.randn(self.n_items) # item biases
        self.d = np.random.randn(self.n_users) # user biases
    def loss(self):
        L = 0
        for i in range(self.n_ratings):
            # user_id, item_id, rating
            n, m, rating = int(self.Y[i,0]), int(self.Y[i,1]), self.Y[i,2]
            L += 0.5*(self.X[m].dot(self.W[:, n])\
            + self.b[m] + self.d[n] - rating)**2
        L /= self.n_ratings
        # regularization, don’t ever forget this
        return L + 0.5*self.lam*(np.sum(self.X**2) + np.sum(self.W**2))
    def updateXb(self):
        products = np.array(self.products["Id"])
        for m in range(0,products.size):
            pId = products[m]
            # get all users who rated item m and corresponding ratings
            ids = np.where(self.Y[:, 1] == pId)[0] # row indices of items m
            if ids.size>0:
                user_ids, ratings=self.Y[ids, 0].astype(np.int32),self.Y[ids, 2]
                Wm, dm = self.W[:, user_ids], self.d[user_ids]
                for i in range(30): # 30 iteration for each sub problem
                    xm = self.X[m]
                    error = xm.dot(Wm) + self.b[m] + dm - ratings
                    grad_xm = error.dot(Wm.T)/self.n_ratings + self.lam*xm
                    grad_bm = np.sum(error)/self.n_ratings
                    # gradient descent
                    self.X[m] -= self.learning_rate*grad_xm.reshape(-1).astype('float64')
                    self.b[m] -= self.learning_rate*grad_bm
    def updateWd(self): # and d
        customers = np.array(self.customers["Id"])
        for n in range(0,customers.size):
            custId=customers[n]
            # get all items rated by user n, and the corresponding ratings
            ids = np.where(self.Y[:,0] == custId)[0] #indexes of items rated by n
            if ids.size>0:
                item_ids,ratings=self.Y[ids, 1].astype(np.int32), self.Y[ids, 2]
                Xn, bn = self.X[item_ids], self.b[item_ids]
                for i in range(30): # 30 iteration for each sub problem
                    wn = self.W[:, n]
                    error = Xn.dot(wn) + bn + self.d[n] - ratings
                    grad_wn = Xn.T.dot(error)/self.n_ratings + self.lam*wn
                    grad_dn = np.sum(error)/self.n_ratings
                    grad_dn=grad_dn
                    # gradient descent
                    self.W[:, n] -= self.learning_rate*grad_wn.reshape(-1).astype('float64')
                    self.d[n] -= self.learning_rate*grad_dn
    def fit(self):
        for it in range(self.max_iter):
            self.updateWd()
            self.updateXb()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y)
                print("iter = %d, loss = %.4f, RMSE train = %.4f"%(it + 1,
                self.loss(), rmse_train))
    def predict(self, u, i):
        """
        predict the rating of user u for item i
        """
        try:
            u, i = int(u), int(i)
            pred = self.X[i, :].dot(self.W[:, u]) + self.b[i] + self.d[u]
            return max(0, min(5, pred))  # 5-scale in Ecommerce
        except:
            return  0
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0] # number of test
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.predict(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2
        RMSE = np.sqrt(SE/n_tests)
        return RMSE

In [None]:
rate_train =ratings[0:129000]
rate_train = np.array(rate_train)
rate_test = ratings[129001:]
rate_test= np.array(rate_test)
print('Number of traing rates:', rate_train.shape[0])
print('Number of test rates:', rate_test.shape[0])

Number of traing rates: 129000
Number of test rates: 1753


In [None]:
# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1
mf = MatrixFactorization(rate_train,customers,products,K = 50, lam = .01, print_every = 5, learning_rate = 50,max_iter = 30)
mf.fit()
# evaluate on test data
RMSE = mf.evaluate_RMSE(rate_test)
print("\nMatrix Factorization CF, RMSE = %.4f" %RMSE)

iter = 5, loss = 25894.0147, RMSE train = 1.8817
iter = 10, loss = 25894.0147, RMSE train = 1.8817
iter = 15, loss = 25894.0147, RMSE train = 1.8817
iter = 20, loss = 25894.0147, RMSE train = 1.8817
iter = 25, loss = 25894.0147, RMSE train = 1.8817
iter = 30, loss = 25894.0147, RMSE train = 1.8817

Matrix Factorization CF, RMSE = 2.0414


In [None]:
rate_test

array([[103400,    462,      4],
       [103400,    288,      4],
       [103400,    475,      4],
       ...,
       [103906,    183,      1],
       [103906,    210,      1],
       [103906,    165,      1]])

# Test Recommendation

Test 10 Customer

In [None]:
expected_score=3.8
print("Expected Score =",expected_score)
for c in customers.values[0:5]:
    customerId=c[0]
    customerName=c[1]
    print("Customer [",customerId,customerName,"], recommendation products:")
    for p in products.values:
        productId=p[0]
        productName=p[1]
        result=mf.predict(customerId,productId)
        if result>=expected_score:
            print("\t Recommend Product [",productName, "] Score=",result)

Expected Score = 3.8
Customer [ 103603 1000kgthanh ], recommendation products:
	 Recommend Product [ Oversized Women T-Shirt ] Score= 3.960218639765726
	 Recommend Product [ Levi's 511 Jeans ] Score= 3.8221753140505736
	 Recommend Product [ Lightstick BLACKPINK ] Score= 3.824918920938888
	 Recommend Product [ TÊN SẢN PHẦM ] Score= 3.88366961335611
	 Recommend Product [ Ống hút giấy ] Score= 4.057826505841622
	 Recommend Product [ Tony Buổi Sáng - Trên Đường Băng ] Score= 3.869408221031734
	 Recommend Product [ Ống hút kim loại ] Score= 3.865159447497978
	 Recommend Product [ Vải sáp ong ] Score= 3.8278055185329007
	 Recommend Product [ Ống Hút Kim Loại Bằng Thép Không Gỉ ] Score= 3.8287510193889696
	 Recommend Product [ Ly Giấy Dùng Một Lần EDO TH7526 ] Score= 3.94825996622734
	 Recommend Product [ Ống hút giấy - copy ] Score= 4.0264780829863716
	 Recommend Product [ GIÀY CONVERSE SUNFLOWER THẤP CỔ SF ] Score= 3.9282071650859383
	 Recommend Product [ New Banlance 580 ] Score= 3.8377046

# Using surprise library for this.
from the documentation, videos found online this can be used for matrix classification to make a recommendation engine.

In [None]:
pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357252 sha256=ecabfb4cbcf9611ff9db3ded574adc65515

In [None]:
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

# Ratings Distribution

In [None]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

data = ratings['Rate'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / ratings.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of {} ratings'.format(ratings.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'),
              width=500,
              height=500,
              autosize=False,
             )
# Create plot
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
data

Rate
5    35512
4    30458
3    15838
2    21070
1    27876
Name: count, dtype: int64

In [None]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['CustomerID','ProductID','Rate']], reader)

In [None]:
data

<surprise.dataset.DatasetAutoFolds at 0x7c95dca4e950>

In [None]:
from surprise import Dataset, KNNBaseline, Reader

reader = Reader(rating_scale=(1, 5))

rate_train =ratings[0:104603]
#rate_train = np.array(rate_train)
rate_test = ratings[104604:]
#rate_test= np.array(rate_test)

train_Dataset = Dataset.load_from_df(rate_train[['CustomerID','ProductID','Rate']], reader)
valid_Dataset = Dataset.load_from_df(rate_test[['CustomerID','ProductID','Rate']], reader)

In [None]:
rate_train

Unnamed: 0,CustomerID,ProductID,Rate
0,103416,619,1
1,103654,411,1
2,103954,298,3
3,103672,361,5
4,103960,536,5
...,...,...,...
104598,103391,498,4
104599,103883,666,5
104600,103749,372,3
104601,103394,153,3


In [None]:
rate_test

Unnamed: 0,CustomerID,ProductID,Rate
104604,103415,266,3
104605,103687,419,4
104606,103730,603,2
104607,103862,372,2
104608,14902,166,1
...,...,...,...
130749,103907,501,1
130750,103907,200,1
130751,103907,184,1
130752,103907,211,1


In [None]:
train_Dataset = train_Dataset.build_full_trainset()

In [None]:
algo = KNNBaseline()
algo.fit(train_Dataset)


Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7c95dca4f8b0>

In [None]:
testset = [valid_Dataset.df.iloc[i].to_list() for i in range(len(valid_Dataset.df))]

In [None]:
predictions=algo.test(testset)

In [None]:
acrmse=accuracy.rmse(predictions)
print(acrmse)

acmse=accuracy.mse(predictions)
print(acmse)

acmae=accuracy.mae(predictions)
print(acmae)

RMSE: 1.0753
1.0753009768574298
MSE: 1.1563
1.1562721908305427
MAE:  0.7782
0.7781571143764785


In [None]:
prediction=algo.test([[103416,619,1]])

In [None]:
prediction

[Prediction(uid=103416, iid=619, r_ui=1, est=2.180316384116525, details={'actual_k': 40, 'was_impossible': False})]

In [None]:
acrmse=accuracy.rmse(prediction)
print(acrmse)

RMSE: 1.1803
1.1803163841165252


In [None]:
# Make sure these two are in the training focus , And the real value is 4
uid = 103416 # user
iid = 619 # goods
# Get forecasts for specific users and items
pred = algo.predict(uid, iid, r_ui=1, verbose=True)

user: 103416     item: 619        r_ui = 1.00   est = 2.18   {'actual_k': 40, 'was_impossible': False}


In [None]:
# Make sure these two are in the training focus , And the real value is 4
uid = 103954 # user
iid = 298 # goods
# Get forecasts for specific users and items
pred = algo.predict(uid, iid, r_ui=3, verbose=True)

user: 103954     item: 298        r_ui = 3.00   est = 3.26   {'actual_k': 40, 'was_impossible': False}


https://surprise.readthedocs.io/en/stable/accuracy.html