Run first

### System rekomendacyjny oparty o model filtrowania kolaboratywnego

Import niezbędnych bibliotek, definicja ustawień logowania i stałych

In [1]:
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset
import numpy as np
import pandas as pd

import logging as log
from os.path import dirname, join, abspath
import re
import sys
sys.path.insert(0, abspath(join(dirname('recommender_system'), '..')))

from src.utils.helper import (
    create_table_in_postgres_db,
    load_input_data_from_csv_to_postgres_table,
    get_newest_existing_model_version,
    pickle_model_results,
    read_data_from_gziped_file,
    train_lightfm_model,
    unpickle
)

In [2]:
# Root logger of the process; records at INFO level and above are let through.
logger = log.getLogger(name=None)
logger.setLevel(level=log.INFO)

In [3]:
# dirname() of a bare name is '', so this resolves to three directory levels
# above the current working directory.
_relative_base = join(dirname('recommender_system'), '../../..')
BASE_PATH = abspath(_relative_base)

# Directory that holds the results, as a sub-path of BASE_PATH.
RESULTS_PATH = BASE_PATH + '/results'

Walidacja ocen użytkowników z pliku All_Beauty.json.gz, zapis do pliku .CSV oraz załadowanie do bazy PostgreSQL

In [None]:
# Load the raw review records and wrap them in a DataFrame.
ratings = read_data_from_gziped_file(path=f'{BASE_PATH}/input_data/All_Beauty.json.gz')
ratings_df = pd.DataFrame.from_dict(ratings)
rows_before_validation = len(ratings_df)

# 'asin' (Amazon Standard Identification Number) field validation: a 10-character
# alphanumeric code given by Amazon to every product, e.g. B000URXP6E.
valid_asin = ratings_df['asin'].str.match(r'^[A-Z0-9]{10}$', na=False)

# 'reviewerID' field validation: an alphanumeric code of 10 to 20 characters,
# e.g. A1V6B6TNIC10QE.
valid_reviewer = ratings_df['reviewerID'].str.match(r'^[A-Z0-9]{10,20}$', na=False)

# The masks have to be applied and assigned back, otherwise the filtering is a
# no-op. `.copy()` detaches the result so the column assignments below do not
# write into a slice of the unfiltered frame.
ratings_df = ratings_df[valid_asin & valid_reviewer].copy()

# 'overall' field validation: the grade given by the user, on a scale from 1.0 to 5.0.
# `astype` returns a new Series, so the result must be stored to have any effect.
ratings_df['overall'] = ratings_df['overall'].astype(float)
ratings_df = ratings_df[ratings_df['overall'].between(1, 5)].copy()

# 'verified' field validation: boolean flag.
ratings_df['verified'] = ratings_df['verified'].astype(bool)

# 'reviewTime' field validation: a date, e.g. 2016-11-19.
ratings_df['reviewTime'] = ratings_df['reviewTime'].astype('datetime64[ns]')

# 'unixReviewTime' field validation: number of seconds since the Unix epoch,
# e.g. 1522627200.
ratings_df['unixReviewTime'] = ratings_df['unixReviewTime'].astype(int)

# 'vote' field validation: thousands are written as strings with a separator
# (e.g. "1,104") and missing values mean "no votes".
ratings_df['vote'] = (
    ratings_df['vote']
    .str.replace(',', '', regex=False)  # literal comma, not a regular expression
    .fillna(0)
    .astype(int)
)

log.info(f"Rows dropped by ratings validation: {rows_before_validation - len(ratings_df)}.")

ratings_df.sample(n=5, ignore_index=True)

In [None]:
# Write the ratings frame to CSV (header row, no index column, backslash as
# escape character).
ratings_csv_path = f'{BASE_PATH}/input_data/All_Beauty.csv'
ratings_df.to_csv(
    ratings_csv_path,
    index=False,
    header=True,
    escapechar='\\',
)
log.info("File saved")

In [None]:
# DDL for the `ratings` table. `IF NOT exists` makes the statement itself a no-op
# when the table is already present.
# NOTE(review): the column order presumably has to match the column order of the
# exported ratings CSV - confirm against the loader in src.utils.helper.
create_ratings = '''create table IF NOT exists ratings (
	overall float8,
	verified bool,
	reviewTime date,
	reviewerID varchar(20),
	asin varchar(10),
	reviewerName varchar,
	reviewText varchar,
	summary varchar,
	unixReviewTime int4,
	vote int4,
	style varchar,
	image varchar
);'''

# Hand the statement to the project helper (presumably executes it against the
# project's Postgres database - see src.utils.helper).
create_table_in_postgres_db(create_ratings)

In [None]:
# Load the exported ratings CSV into the `ratings` table via the project helper.
ratings_csv_source = f'{BASE_PATH}/input_data/All_Beauty.csv'
load_input_data_from_csv_to_postgres_table('ratings', ratings_csv_source)

Walidacja danych z pliku meta_All_Beauty.json.gz, zapis do pliku .CSV oraz załadowanie do bazy PostgreSQL

In [None]:
# Load the raw product metadata and wrap it in a DataFrame.
items = read_data_from_gziped_file(path=f'{BASE_PATH}/input_data/meta_All_Beauty.json.gz')
items_df = pd.DataFrame.from_dict(items)

# Drop unnecessary columns, with no business value
items_df = items_df.drop(columns=['category', 'fit', 'similar_item', 'tech1', 'tech2'])

# Drop duplicates
log.info(f"Duplicated product asins: {items_df.duplicated(subset='asin').sum()}.")
items_df = items_df.drop_duplicates(subset='asin')
log.info(f"Rows number after data cleaning: {len(items_df)}.")

# 'asin' (Amazon Standard Identification Number) field validation: a 10-character
# alphanumeric code given by Amazon to every product, e.g. B000URXP6E.
# The mask is applied and assigned back so invalid rows are really removed;
# `.copy()` avoids writing into a slice when 'price' is reassigned below.
valid_asin = items_df['asin'].str.match(r'^[A-Z0-9]{10}$', na=False)
items_df = items_df[valid_asin].copy()
log.info(f"Rows number after 'asin' validation: {len(items_df)}.")

# 'date' field validation: expects a "<Month name> <day>, <year>" date somewhere
# in the value. Non-conforming values are only counted and reported, not dropped:
# a missing date does not make the rest of the product record unusable.
re_dates = re.compile(r"(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}")
valid_date = items_df['date'].apply(
    lambda value: isinstance(value, str) and re_dates.search(value) is not None
)
log.info(f"Items with a missing or malformed 'date': {int((~valid_date).sum())}.")

# 'price' field validation, expressed in USD: only values of the exact form
# "$<digits>.<two digits>" are kept, everything else becomes '0.0'.
# The dot is escaped (otherwise any character would be accepted in its place) and
# the pattern is a raw string so that `\$` and `\d` are not invalid string escapes.
price_pattern = re.compile(r'\$\d*\.\d{2}')
items_df['price'] = items_df['price'].apply(
    lambda value: value if isinstance(value, str) and price_pattern.fullmatch(value) else '0.0'
)
# regex=False: '$' must be removed as a literal character, not read as the
# end-of-string anchor.
items_df['price'] = items_df['price'].str.replace('$', '', regex=False)

items_df.sample(n=5, ignore_index=True)

In [None]:
# Write the items metadata frame to CSV (header row, no index column, backslash
# as escape character).
items_csv_path = f'{BASE_PATH}/input_data/meta_All_Beauty.csv'
items_df.to_csv(
    items_csv_path,
    index=False,
    header=True,
    escapechar='\\',
)
log.info("File saved")

In [None]:
# DDL for the `items_metadata` table, keyed by `asin`. `IF NOT exists` makes the
# statement itself a no-op when the table is already present.
# NOTE(review): `rank_info` and `review_date` presumably receive the 'rank' and
# 'date' columns of the exported CSV by position - confirm against the loader.
create_items_metadata = '''create table IF NOT exists items_metadata (
	description varchar,
	title varchar,
	also_buy varchar,
	brand varchar(100),
	feature varchar,
	rank_info varchar,
	also_view varchar,
	details varchar,
	main_cat varchar,
	review_date varchar,
	price float8,
	asin varchar(10) PRIMARY KEY,
	imageURL varchar,
	imageURLHighRes varchar
);'''

# Hand the statement to the project helper (presumably executes it against the
# project's Postgres database - see src.utils.helper).
create_table_in_postgres_db(create_items_metadata)

In [None]:
# Load the exported metadata CSV into the `items_metadata` table via the project helper.
items_csv_source = f'{BASE_PATH}/input_data/meta_All_Beauty.csv'
load_input_data_from_csv_to_postgres_table('items_metadata', items_csv_source)

Konstrukcja zbioru danych

In [None]:
# Register every user id and item id with the LightFM dataset so it can map
# them to internal indices.
# The id columns are passed directly: `iterrows()` builds a full Series object
# for every row and was run twice over the whole frame just to read one field.
dataset = Dataset()
dataset.fit(ratings_df['reviewerID'], ratings_df['asin'])

# Control numbers
num_users, num_items = dataset.interactions_shape()
log.info(f"Dataset has been created. Interactions shape: num users {num_users}, num items {num_items}.")

Zbudowanie macierzy interakcji

In [45]:
# Build the user-item interaction matrix from (reviewerID, asin) pairs.
# zip() over the two columns yields the same pairs as the former row-by-row
# `iterrows()` generator, without building a Series per row.
# The second returned matrix (weights) is not used.
(interactions, _) = dataset.build_interactions(
    zip(ratings_df['reviewerID'], ratings_df['asin'])
)

Podział danych na zbiór testowy i treningowy

In [46]:
# A fixed seed makes the 80/20 split identical on every run, so results of
# models trained in different sessions are evaluated on the same test set.
SPLIT_SEED = 42
(train, test) = random_train_test_split(
    interactions,
    test_percentage=0.2,
    # LightFM expects a RandomState instance here, not a bare integer.
    random_state=np.random.RandomState(SPLIT_SEED),
)
logger.info("Dataset has been split into train and test")

INFO:root:Dataset has been split into train and test


Stworzenie sześciu rodzajów modeli:
1. Model z mechanizmem optymalizacji adagrad oraz funkcją straty BPR.
2. Model z mechanizmem optymalizacji adadelta oraz funkcją straty BPR.
3. Model z mechanizmem optymalizacji adagrad oraz funkcją straty WARP.
4. Model z mechanizmem optymalizacji adadelta oraz funkcją straty WARP.
5. Model z mechanizmem optymalizacji adagrad oraz funkcją straty k-OS WARP.
6. Model z mechanizmem optymalizacji adadelta oraz funkcją straty k-OS WARP.

In [47]:
# Regularisation strength applied to both user and item features, and the
# epoch count that the training cells pass to train_lightfm_model.
alpha = 1e-3
epochs = 70

# Settings shared by all six variants; only the loss function and the learning
# schedule differ between the models below.
_shared_model_kwargs = {
    'no_components': 30,
    'user_alpha': alpha,
    'item_alpha': alpha,
}

# BPR loss
adagrad_bpr_model = LightFM(loss='bpr', learning_schedule='adagrad', **_shared_model_kwargs)
adadelta_bpr_model = LightFM(loss='bpr', learning_schedule='adadelta', **_shared_model_kwargs)

# WARP loss
adagrad_warp_model = LightFM(loss='warp', learning_schedule='adagrad', **_shared_model_kwargs)
adadelta_warp_model = LightFM(loss='warp', learning_schedule='adadelta', **_shared_model_kwargs)

# k-OS WARP loss
adagrad_kos_warp_model = LightFM(loss='warp-kos', learning_schedule='adagrad', **_shared_model_kwargs)
adadelta_kos_warp_model = LightFM(loss='warp-kos', learning_schedule='adadelta', **_shared_model_kwargs)

In [None]:
# adagrad + BPR: train, then pickle the results under
# (newest existing version + 1).
model_name = 'adagrad_bpr_model'
model_results_path = '/'.join([RESULTS_PATH, model_name])

training_outcome = train_lightfm_model(epochs, adagrad_bpr_model, model_name, test, train)
adagrad_bpr_model_auc, adagrad_bpr_model_duration = training_outcome

latest_version = get_newest_existing_model_version(model_results_path)
version = latest_version + 1

pickle_model_results(
    adagrad_bpr_model_auc, dataset, adagrad_bpr_model_duration,
    adagrad_bpr_model, model_name, model_results_path, version,
)

In [None]:
# adadelta + BPR: train, then pickle the results under
# (newest existing version + 1).
model_name = 'adadelta_bpr_model'
model_results_path = f'{RESULTS_PATH}/{model_name}'

adadelta_bpr_model_auc, adadelta_bpr_model_duration = train_lightfm_model(epochs, adadelta_bpr_model, model_name, test, train)

version = get_newest_existing_model_version(model_results_path) + 1

pickle_model_results(
    adadelta_bpr_model_auc,
    dataset,
    adadelta_bpr_model_duration,
    # Persist the model that was trained in this cell. `adagrad_warp_model` was
    # passed here before, which stored a different model next to this one's
    # AUC and duration.
    adadelta_bpr_model,
    model_name,
    model_results_path,
    version
)

Trenowanie modelu z mechanizmem optymalizacji adagrad oraz funkcją straty WARP

In [None]:
# adagrad + WARP: train, then pickle the results under
# (newest existing version + 1).
model_name = "adagrad_warp_model"
model_results_path = '/'.join([RESULTS_PATH, model_name])

training_outcome = train_lightfm_model(epochs, adagrad_warp_model, model_name, test, train)
adagrad_warp_model_auc, adagrad_warp_model_duration = training_outcome

latest_version = get_newest_existing_model_version(model_results_path)
version = latest_version + 1

pickle_model_results(
    adagrad_warp_model_auc, dataset, adagrad_warp_model_duration,
    adagrad_warp_model, model_name, model_results_path, version,
)

Trenowanie modelu z mechanizmem optymalizacji adadelta oraz funkcją straty WARP

In [None]:
# adadelta + WARP: train, then pickle the results under
# (newest existing version + 1).
model_name = "adadelta_warp_model"
model_results_path = '/'.join([RESULTS_PATH, model_name])

training_outcome = train_lightfm_model(epochs, adadelta_warp_model, model_name, test, train)
adadelta_warp_model_auc, adadelta_warp_model_duration = training_outcome

latest_version = get_newest_existing_model_version(model_results_path)
version = latest_version + 1

pickle_model_results(
    adadelta_warp_model_auc, dataset, adadelta_warp_model_duration,
    adadelta_warp_model, model_name, model_results_path, version,
)

Trenowanie modelu z mechanizmem optymalizacji adagrad oraz funkcją straty k-OS WARP

In [None]:
# adagrad + k-OS WARP: train, then pickle the results under
# (newest existing version + 1).
model_name = "adagrad_kos_warp_model"
model_results_path = '/'.join([RESULTS_PATH, model_name])

training_outcome = train_lightfm_model(epochs, adagrad_kos_warp_model, model_name, test, train)
adagrad_kos_warp_model_auc, adagrad_kos_warp_model_duration = training_outcome

latest_version = get_newest_existing_model_version(model_results_path)
version = latest_version + 1

pickle_model_results(
    adagrad_kos_warp_model_auc, dataset, adagrad_kos_warp_model_duration,
    adagrad_kos_warp_model, model_name, model_results_path, version,
)

Trenowanie modelu z mechanizmem optymalizacji adadelta oraz funkcją straty k-OS WARP

In [None]:
# adadelta + k-OS WARP: train, then pickle the results under
# (newest existing version + 1).
model_name = "adadelta_kos_warp_model"
model_results_path = '/'.join([RESULTS_PATH, model_name])

training_outcome = train_lightfm_model(epochs, adadelta_kos_warp_model, model_name, test, train)
adadelta_kos_warp_model_auc, adadelta_kos_warp_model_duration = training_outcome

latest_version = get_newest_existing_model_version(model_results_path)
version = latest_version + 1

pickle_model_results(
    adadelta_kos_warp_model_auc, dataset, adadelta_kos_warp_model_duration,
    adadelta_kos_warp_model, model_name, model_results_path, version,
)