# Data Processing

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

import plotly.offline as py
import plotly.express as px
py.init_notebook_mode(connected=True)

In [None]:
# Load the business file in chunks, keeping only rows that pass the filter
# below, and concatenate the survivors into df_item.
# The file is opened explicitly as UTF-8: without an encoding argument the
# built-in open() uses the platform's locale encoding, which can fail on or
# garble non-ASCII text.
b_pandas = []

with open("../data/yelp/yelp_academic_dataset_business.json", "r", encoding="utf-8") as f:
    reader = pd.read_json(f, orient="records", lines=True,
                          chunksize=1000)

    for chunk in reader:
        # Filter chunk by chunk so the unfiltered file is never fully in memory.
        b_pandas.append(chunk.query("`is_open` >= 1 & review_count >= 20"))

df_item = pd.concat(b_pandas, ignore_index=True)
del b_pandas

In [None]:
# Ten states with the most businesses in df_item.
(
    df_item.groupby("state")[["business_id"]]
    .count()
    .sort_values(by="business_id", ascending=False)
    .head(10)
)

In [None]:
# Load the user file in chunks, drop the columns that are not needed later,
# keep users with at least 5 reviews, and concatenate into df_user.
b_pandas = []
r_dtypes = {
    "stars": np.float16,
    "useful": np.int32,
    "funny": np.int32,
    "cool": np.int32,
}
# Built once instead of on every chunk.
user_drop_columns = [
    "name", "yelping_since", "friends", "useful", "funny", "cool", "fans", "elite", "average_stars",
    "compliment_hot", "compliment_more", "compliment_profile", "compliment_cute", "compliment_list",
    "compliment_note", "compliment_plain", "compliment_cool", "compliment_funny", "compliment_writer", "compliment_photos",
]
# Explicit UTF-8: open() otherwise falls back to the platform's locale
# encoding, which can fail on or garble non-ASCII text.
with open("../data/yelp/yelp_academic_dataset_user.json", "r", encoding="utf-8") as f:
    reader = pd.read_json(f, orient="records", lines=True,
                          dtype=r_dtypes, chunksize=1000)

    for chunk in reader:
        reduced_chunk = chunk.drop(columns=user_drop_columns).query("`review_count` >= 5")
        b_pandas.append(reduced_chunk)

df_user = pd.concat(b_pandas, ignore_index=True)
del b_pandas

In [None]:
# Load the review file in chunks, drop the vote counters and the review text,
# and concatenate into df.
b_pandas = []
r_dtypes = {
    "stars": np.float16,
    "useful": np.int32,
    "funny": np.int32,
    "cool": np.int32,
}
# Explicit UTF-8: open() otherwise falls back to the platform's locale
# encoding, which can fail on or garble non-ASCII text.
with open("../data/yelp/yelp_academic_dataset_review.json", "r", encoding="utf-8") as f:
    reader = pd.read_json(f, orient="records", lines=True,
                          dtype=r_dtypes, chunksize=1000)

    for chunk in reader:
        reduced_chunk = chunk.drop(columns=["useful", "funny", "cool", "text"])
        # Optional recency filter (currently disabled):
        #     reduced_chunk = reduced_chunk.query("`date` >= '2019-06-01'")
        b_pandas.append(reduced_chunk)

df = pd.concat(b_pandas, ignore_index=True)
del b_pandas
df

In [None]:
# Inner-join the reviews with the user table on user_id; reviews without a
# matching user row are dropped.
df = df.merge(df_user, how="inner", on="user_id")

# The user table is not needed after the join.
del df_user

In [None]:
# Keep only businesses that still have at least one review in df, then attach
# the business columns to every review (inner join on business_id).
reviewed_business_ids = df["business_id"].unique()
df_item = df_item[df_item["business_id"].isin(reviewed_business_ids)]
df = df.merge(df_item, how="inner", on="business_id")

In [None]:
# Ten states with the most reviews in the merged frame.
(
    df.groupby("state")[["review_id"]]
    .count()
    .sort_values(by="review_id", ascending=False)
    .head(10)
)

In [None]:
# Ten cities with the most reviews in the merged frame.
(
    df.groupby("city")[["review_id"]]
    .count()
    .sort_values(by="review_id", ascending=False)
    .head(10)
)

In [None]:
# Order reviews by date and use the generic column name item_id in both frames.
id_rename = {"business_id": "item_id"}
df = df.sort_values("date").rename(columns=id_rename)
df_item = df_item.rename(columns=id_rename)

In [None]:
# df_ca = df[df["state"] == "CA"]
# df_tn = df[df["state"] == "TN"]
# df_fl = df[df["state"] == "FL"]
# df_pa = df[df["state"] == "PA"]
# df_pd = df[df["city"] == "Philadelphia"]

# del df

In [None]:
# df_item_ca = df_item[df_item["state"] == "CA"]
# df_item_tn = df_item[df_item["state"] == "TN"]
# df_item_fl = df_item[df_item["state"] == "FL"]
# df_item_pa = df_item[df_item["state"] == "PA"]
# df_item_pd = df_item[df_item["city"] == "Philadelphia"]

# del df_item

In [None]:
# Replace raw item ids with integer labels; the encoder is fitted on the
# review frame and reused for the item frame so both share one mapping.
item_encoder = preprocessing.LabelEncoder()
df.item_id = item_encoder.fit_transform(df.item_id.values)
df_item.item_id = item_encoder.transform(df_item.item_id.values)

# Same for user ids (only the review frame carries them at this point).
user_encoder = preprocessing.LabelEncoder()
df.user_id = user_encoder.fit_transform(df.user_id.values)


# # ----------------------------------------------------------------------------------------------------------------------



# item_encoder = preprocessing.LabelEncoder().fit(df_ca.item_id.values)
# df_ca.item_id = item_encoder.transform(df_ca.item_id.values)
# df_item_ca.item_id = item_encoder.transform(df_item_ca.item_id.values)


# user_encoder = preprocessing.LabelEncoder().fit(df_ca.user_id.values)
# df_ca.user_id = user_encoder.transform(df_ca.user_id.values)

# # ----------------------------------------------------------------------------------------------------------------------

# item_encoder = preprocessing.LabelEncoder().fit(df_tn.item_id.values)
# df_tn.item_id = item_encoder.transform(df_tn.item_id.values)
# df_item_tn.item_id = item_encoder.transform(df_item_tn.item_id.values)


# user_encoder = preprocessing.LabelEncoder().fit(df_tn.user_id.values)
# df_tn.user_id = user_encoder.transform(df_tn.user_id.values)


# # ----------------------------------------------------------------------------------------------------------------------


# item_encoder = preprocessing.LabelEncoder().fit(df_fl.item_id.values)
# df_fl.item_id = item_encoder.transform(df_fl.item_id.values)
# df_item_fl.item_id = item_encoder.transform(df_item_fl.item_id.values)


# user_encoder = preprocessing.LabelEncoder().fit(df_fl.user_id.values)
# df_fl.user_id = user_encoder.transform(df_fl.user_id.values)


# ----------------------------------------------------------------------------------------------------------------------


# item_encoder = preprocessing.LabelEncoder().fit(df_pa.item_id.values)
# df_pa.item_id = item_encoder.transform(df_pa.item_id.values)
# df_item_pa.item_id = item_encoder.transform(df_item_pa.item_id.values)


# user_encoder = preprocessing.LabelEncoder().fit(df_pa.user_id.values)
# df_pa.user_id = user_encoder.transform(df_pa.user_id.values)


# ----------------------------------------------------------------------------------------------------------------------


# item_encoder = preprocessing.LabelEncoder().fit(df_pd.item_id.values)
# df_pd.item_id = item_encoder.transform(df_pd.item_id.values)
# df_item_pd.item_id = item_encoder.transform(df_item_pd.item_id.values)


# user_encoder = preprocessing.LabelEncoder().fit(df_pd.user_id.values)
# df_pd.user_id = user_encoder.transform(df_pd.user_id.values)

In [None]:
# Write the (user, item, rating) triples; stars_x is exported under the name
# "stars" (presumably the review's own rating after the merge -- confirm).
review_export = df[["user_id", "item_id", "stars_x"]].rename(columns={"stars_x": "stars"})
review_export.to_csv("../data/yelp/review.csv", index=False)
# df_ca[["user_id", "item_id", "stars_x"]].rename(columns={"stars_x": "stars"}).to_csv("../data/yelp/review_ca.csv", index=False)
# df_tn[["user_id", "item_id", "stars_x"]].rename(columns={"stars_x": "stars"}).to_csv("../data/yelp/review_tn.csv", index=False)
# df_fl[["user_id", "item_id", "stars_x"]].rename(columns={"stars_x": "stars"}).to_csv("../data/yelp_fl/review_fl.csv", index=False)
# df_pa[["user_id", "item_id", "stars_x"]].rename(columns={"stars_x": "stars"}).to_csv("../data/yelp_ca/review_pa.csv", index=False)
# df_pd[["user_id", "item_id", "stars_x"]].rename(columns={"stars_x": "stars"}).to_csv("../data/yelp_pd/review_pd.csv", index=False)

In [None]:
# Expose the business name under the column name item_name.
df_item = df_item.rename({"name": "item_name"}, axis="columns")
# df_item_ca = df_item_ca.rename(columns={"name": "item_name"})
# df_item_tn = df_item_tn.rename(columns={"name": "item_name"})
# df_item_fl = df_item_fl.rename(columns={"name": "item_name"})
# df_item_pa = df_item_pa.rename(columns={"name": "item_name"})
# df_item_pd = df_item_pd.rename(columns={"name": "item_name"})

In [None]:
# Write the item id -> item name lookup table.
item_names = df_item[["item_id", "item_name"]]
item_names.to_csv("../data/yelp/items.csv", index=False)
# df_item_ca[["item_id", "item_name"]].to_csv("../data/yelp/items_ca.csv", index=False)
# df_item_tn[["item_id", "item_name"]].to_csv("../data/yelp/items_tn.csv", index=False)
# df_item_fl[["item_id", "item_name"]].to_csv("../data/yelp_fl/items_fl.csv", index=False)
# df_item_pa[["item_id", "item_name"]].to_csv("../data/yelp_ca/items_pa.csv", index=False)
# df_item_pd[["item_id", "item_name"]].to_csv("../data/yelp_pd/items_pd.csv", index=False)

In [None]:
def _build_item_metadata(items):
    """Return an (item_id, metadata) frame built from ``items``.

    Missing values are replaced by "other", the categories string is split on
    commas into a list, and the column is renamed to "metadata".

    Each token is stripped of surrounding whitespace so that a value such as
    "A, B" yields ["A", "B"] rather than ["A", " B"]; without this the same
    category would appear under two different spellings depending on its
    position in the string.

    Args:
        items: DataFrame with at least the columns item_id and categories.

    Returns:
        A new DataFrame with the columns item_id and metadata.
    """
    metadata = items[["item_id", "categories"]].fillna("other")
    metadata["categories"] = metadata["categories"].str.split(",").apply(
        # Non-string cells come back from str.split as NaN; pass them through.
        lambda tokens: [token.strip() for token in tokens] if isinstance(tokens, list) else tokens
    )
    return metadata.rename(columns={"categories": "metadata"})


# The regional item frames (when enabled) can be passed through the same helper.
_df_item = _build_item_metadata(df_item)
_df_item.to_csv("../data/yelp/items_metadata.csv", index=False)
# _df_item_ca.to_csv("../data/yelp/items_metadata_ca.csv", index=False)
# _df_item_tn.to_csv("../data/yelp/items_metadata_tn.csv", index=False)
# _df_item_fl.to_csv("../data/yelp_fl/items_metadata_fl.csv", index=False)
# _df_item_pa.to_csv("../data/yelp_ca/items_metadata_pa.csv", index=False)
# _df_item_pd.to_csv("../data/yelp_pd/items_metadata_pd.csv", index=False)

# TRAIN PMF

In [None]:
from __future__ import print_function

import os
import pickle

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data

import matplotlib.pyplot as plt

import sys 
sys.path.append('..')

from src.model.pmf import PMF

In [None]:
def RMSE(preds, truth):
    """Return the root-mean-square error between ``preds`` and ``truth``."""
    residual = preds - truth
    mean_squared = np.mean(residual * residual)
    return np.sqrt(mean_squared)

In [None]:
# --- Reproducibility / hardware ---
seed = 1
no_cuda = False  # set to True to force CPU

# --- Data ---
ratio = 0.8  # fraction of rows used for training
batch_size = 100000

# --- Model ---
embedding_feature_size = 100

# --- Optimisation ---
epoches = 500
lr = 0.0001
weight_decay = 0.1
momentum = 0.9  # only relevant for the SGD optimizer variant

In [None]:
# Reload the exported review triples and show a preview plus per-column counts.
review_csv_path = "../data/yelp/review.csv"
df = pd.read_csv(review_csv_path)
df.head(), df.count()

In [None]:
# Rows of (user_id, item_id, stars) as one NumPy array.
data = df[["user_id", "item_id", "stars"]].values

# Normalize rewards to [-1, 1]: 1 -> -1, 3 -> 0, 5 -> 1.
data[:, 2] = 0.5 * (data[:, 2] - 3)

# Shuffle the rows in place with a generator seeded from `seed`. The row
# order decides the train/validation/test split, so an unseeded shuffle would
# produce a different split (and different reported metrics) on every run.
np.random.default_rng(seed).shuffle(data)

In [None]:
# Table sizes for the model: largest id + 1 for each id column.
NUM_USERS = df["user_id"].max() + 1
NUM_ITEMS = df["item_id"].max() + 1

print(NUM_USERS, NUM_ITEMS)

In [None]:
# Split data
train_data = data[:int(ratio * data.shape[0])]
vali_data = data[int(ratio * data.shape[0]): int((ratio+(1-ratio)/2)*data.shape[0])]
test_data = data[int((ratio + (1 - ratio) / 2) * data.shape[0]) :]

In [None]:
no_cuda = False

# Get CUDA device if available
cuda = torch.cuda.is_available()

# CUDA is used only when it is available AND not disabled through no_cuda.
# (The previous condition `cuda and no_cuda` selected the CPU exactly when
# CUDA was wanted.)
use_cuda = cuda and not no_cuda
device = torch.device("cuda" if use_cuda else "cpu")

# Generate and apply seeds
torch.manual_seed(seed=seed)
if cuda:
    torch.cuda.empty_cache()
    torch.cuda.manual_seed(seed=seed)

# Worker / pinned-memory options only make sense when batches are actually
# moved to the GPU.
kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

# Construct Data Loaders (order is kept; the data array is shuffled beforehand)
train_data_loader = torch.utils.data.DataLoader(torch.from_numpy(train_data), batch_size=batch_size, shuffle=False, **kwargs)
test_data_loader = torch.utils.data.DataLoader(torch.from_numpy(test_data), batch_size=batch_size, shuffle=False, **kwargs)

In [None]:
# Build the PMF model from the table sizes and the embedding width.
model = PMF(
    n_users=NUM_USERS,
    n_items=NUM_ITEMS,
    n_factors=embedding_feature_size,
    no_cuda=no_cuda,
)

# Put the model on the GPU when one is available and not disabled.
if not no_cuda and cuda:
    model.cuda()
    print("Model moved to CUDA")

# Summed (not averaged) squared error.
loss_function = nn.MSELoss(reduction="sum")

# Adam is the active optimizer; swap the two lines below to use SGD instead.
# optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

In [None]:
# Function for training one epoch
def train(epoch, train_data_loader):
    """Run one optimisation pass over ``train_data_loader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_function``,
    ``cuda`` and ``no_cuda``.

    Args:
        epoch: Index of the current epoch. Accepted for call compatibility;
            the body does not use it.
        train_data_loader: DataLoader yielding 2-D tensors whose column 0 is
            the user index, column 1 the item index and column 2 the target.

    Returns:
        The sum of the batch losses divided by the number of rows in the
        loader's dataset (a 0-dim tensor once at least one batch was seen).
    """
    model.train()
    use_gpu = cuda and not no_cuda
    epoch_loss = 0.0

    for ele in train_data_loader:
        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()

        # Embedding lookups need integer indices; the loss needs float targets.
        row = ele[:, 0].long()
        col = ele[:, 1].long()
        val = ele[:, 2].float()

        if use_gpu:
            row, col, val = row.cuda(), col.cuda(), val.cuda()

        preds = model(row, col)
        loss = loss_function(preds, val)
        loss.backward()
        optimizer.step()

        # Detach so the running total does not keep the autograd graph alive.
        epoch_loss += loss.detach()

    epoch_loss /= train_data_loader.dataset.shape[0]
    return epoch_loss

In [None]:
# Training Model
count = 0
train_loss_list = []
last_vali_rmse = None
train_rmse_list = []
vali_rmse_list = []
print("parameters are: train ratio:{:f},batch_size:{:d}, epoches:{:d}, weight_decay:{:f}".format(ratio, batch_size, epoches, weight_decay))
print(model)

# The validation inputs are the same for every epoch, so build (and move)
# them once instead of inside the loop.
vali_row = torch.from_numpy(vali_data[:, 0]).long()
vali_col = torch.from_numpy(vali_data[:, 1]).long()
if cuda and not no_cuda:
    vali_row = vali_row.cuda()
    vali_col = vali_col.cuda()

# Go through epochs
for epoch in range(1, epoches + 1):

    # Train epoch
    train_epoch_loss = train(epoch, train_data_loader)
    train_loss_list.append(train_epoch_loss.cpu())

    # Get validation predictions
    vali_preds = model.predict(vali_row, vali_col)

    # Train rmse as a plain float (square root of the per-row training loss)
    train_rmse = float(np.sqrt(train_epoch_loss.item()))

    # Validation rmse, computed on the CPU with NumPy
    vali_rmse = RMSE(vali_preds.detach().cpu().numpy(), vali_data[:, 2])

    # Add losses to rmse loss lists
    train_rmse_list.append(train_rmse)
    vali_rmse_list.append(vali_rmse)

    print("Training epoch:{: d}, training rmse:{: .6f}, vali rmse:{:.6f}". \
            format(epoch, train_rmse, vali_rmse))

    # Early stop as soon as the validation rmse gets worse. Compare against
    # None explicitly: a previous rmse of exactly 0.0 is falsy and would
    # otherwise disable the check.
    if last_vali_rmse is not None and last_vali_rmse < vali_rmse:
        break
    last_vali_rmse = vali_rmse
    

In [None]:
# Testing Model

# Build the test inputs and move them to the GPU under the same condition
# that is used for moving the model (`cuda and not no_cuda`). Checking only
# `cuda` would put the inputs on the GPU while the model stays on the CPU
# whenever no_cuda is True.
use_gpu = cuda and not no_cuda
test_row = torch.from_numpy(test_data[:, 0]).long()
test_col = torch.from_numpy(test_data[:, 1]).long()
if use_gpu:
    test_row = test_row.cuda()
    test_col = test_col.cuda()

# Get test predictions
preds = model.predict(test_row, test_col)

# Get test rmse loss (computed on the CPU with NumPy)
test_rmse = RMSE(preds.detach().cpu().numpy(), test_data[:, 2])
print("Test rmse: {:f}".format(test_rmse))

In [None]:
# Plot the per-epoch training and validation RMSE and annotate the last value
# of each curve.
plt.figure(1)
plt.plot(range(1, len(train_rmse_list)+1), train_rmse_list, color="r", label="train rmse")
# This curve is the validation RMSE, so label it as such (it was labelled
# "test rmse" before).
plt.plot(range(1, len(vali_rmse_list)+1), vali_rmse_list, color="b", label="vali rmse")
plt.legend()
plt.annotate(r"train=%f" % (train_rmse_list[-1]), xy=(len(train_rmse_list), train_rmse_list[-1]),
             xycoords="data", xytext=(-30, 30), textcoords="offset points", fontsize=10,
             arrowprops=dict(arrowstyle="->", connectionstyle="arc3, rad=.2"))
plt.annotate(r"vali=%f" % (vali_rmse_list[-1]), xy=(len(vali_rmse_list), vali_rmse_list[-1]),
             xycoords="data", xytext=(-30, 30), textcoords="offset points", fontsize=10,
             arrowprops=dict(arrowstyle="->", connectionstyle="arc3, rad=.2"))
# Leave room on the right for the annotations.
plt.xlim([1, len(train_rmse_list)+10])
plt.xlabel("iterations")
plt.ylabel("RMSE")
plt.title("RMSE Curve in Training Process")
plt.show()

In [None]:
# Save model weights; the file name records the hyperparameters and the
# number of epochs that were actually run.
path_to_trained_pmf = "../model/pmf/yelp_pd_emb_{:d}_ratio_{:f}_bs_{:d}_e_{:d}_wd_{:f}_lr_{:f}_trained_pmf.pt".format(embedding_feature_size, ratio, batch_size, len(train_rmse_list), weight_decay, lr)
# torch.save does not create missing directories, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(path_to_trained_pmf), exist_ok=True)
torch.save(model.state_dict(), path_to_trained_pmf)

In [None]:
# Compare the prediction for one row of `data` with its target. Both values
# are mapped from [-1, 1] to [0, 1] with (x + 1) / 2.
idx = -2
# Run on whatever device the model lives on instead of a hard-coded "cuda",
# so the cell also works on CPU-only machines.
model_device = next(model.parameters()).device
sample_user = torch.tensor([data[idx][0]]).long().to(model_device)
sample_item = torch.tensor([data[idx][1]]).long().to(model_device)
sample_pred = model.predict(sample_user, sample_item).cpu().data[0]
(sample_pred + 1) / 2, (data[idx][2] + 1) / 2

# Data Processing

In [None]:
import pandas as pd
import pickle
import numpy as np

In [None]:
import plotly.offline as py
import plotly.express as px
py.init_notebook_mode(connected=True)

In [None]:
# Preview the first five rows of the review frame.
df.head(n=5)

In [None]:
# Number of users/items: largest id + 1, as plain Python ints.
users_num = int(df["user_id"].max()) + 1
items_num = int(df["item_id"].max()) + 1

# Per-user interaction lists, keyed by every id in range(users_num) so that
# later code can index users by position.
#   users_dict                -> all (item_id, stars) pairs of the user
#   users_dict_positive_items -> only the pairs with stars >= 4
users_dict = {user: [] for user in range(users_num)}
users_dict_positive_items = {user: [] for user in range(users_num)}

# Iterate over the three columns directly instead of DataFrame.iterrows():
# iterrows() builds a Series for every row, which is slow and can upcast the
# integer ids to the row's common dtype. The loop variables are also named so
# that they do not overwrite the global `data` array.
for user_id, item_id, stars in zip(
    df["user_id"].tolist(), df["item_id"].tolist(), df["stars"].tolist()
):
    interaction = (item_id, stars)
    users_dict[user_id].append(interaction)
    if stars >= 4:
        users_dict_positive_items[user_id].append(interaction)

# Positive-history length per user, ordered by user id. (Iterating over a set
# of ids, as before, does not guarantee that order, but the train/eval slices
# taken from this list assume it.)
users_history_lens = [
    len(users_dict_positive_items[user]) for user in range(users_num)
]

print(users_num, items_num)

In [None]:
# Training setting: the first 80% of the user ids.
train_users_num = int(users_num * 0.8)
# range(train_users_num) covers ids 0 .. train_users_num - 1; the previous
# range(0, train_users_num - 1) silently dropped the last training user while
# the history-length slice below still included it. A missing id maps to an
# empty history rather than None.
train_users_dict = {k: users_dict.get(k, []) for k in range(train_users_num)}
train_users_history_lens = users_history_lens[:train_users_num]

# Evaluating setting: the last 20% of the user ids.
eval_users_num = int(users_num * 0.2)
eval_start = users_num - eval_users_num
eval_users_dict = {
    k: users_dict.get(k, []) for k in range(eval_start, users_num)
}
# Slice from an explicit start index: [-0:] would return the whole list when
# eval_users_num is 0.
eval_users_history_lens = users_history_lens[eval_start:]


In [None]:
# Save processed data
with open("../data/yelp_pd/train_users_dict.pkl", "wb") as file:
    pickle.dump(train_users_dict, file)

with open("../data/yelp_pd/train_users_history_lens.pkl", "wb") as file:
    pickle.dump(train_users_history_lens, file)

with open("../data/yelp_pd/eval_users_dict.pkl", "wb") as file:
    pickle.dump(eval_users_dict, file)

with open("../data/yelp_pd/eval_users_history_lens.pkl", "wb") as file:
    pickle.dump(eval_users_history_lens, file)

with open("../data/yelp_pd/users_history_lens.pkl", "wb") as file:
    pickle.dump(users_history_lens, file)


In [None]:
# Draw one geometric sample per item and fold it into the labels 1..10:
# the sample modulo 10, with a remainder of 0 mapped to 10.
z = np.random.geometric(p=0.35, size=items_num)
w = [10 if remainder == 0 else remainder for remainder in z % 10]

In [None]:
# Show how the values in w are distributed.
px.histogram(w)

In [None]:
# Map each item id in range(items_num) to the value of w at the same position.
item_group = {item_id: group for item_id, group in zip(range(items_num), w)}

In [None]:
# Persist the item -> group mapping.
item_groups_path = "../data/yelp_pd/item_groups.pkl"
with open(item_groups_path, "wb") as groups_file:
    pickle.dump(item_group, groups_file)