<a href="https://colab.research.google.com/github/nishanthjois/2021_DeepLearning/blob/main/yoochoose2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#download data
!apt-get install p7zip
!curl -Lo yoochoose-data.7z https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z
!7z x yoochoose-data.7z

Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip is already the newest version (16.02+dfsg-6).
p7zip set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  273M  100  273M    0     0  28.5M      0  0:00:09  0:00:09 --:--:-- 32.4M

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 287211932 bytes (274 MiB)

Extracting archive: yoochoose-data.7z
--
Path = yoochoose-data.7z
Type = 7z
Physical Size = 287211932
Headers Size = 255
Method = LZMA:24
Solid = +
Blocks = 2

  0%      0% - yoochoose-buys.dat

In [2]:
#installing packages
!pip install git+https://github.com/maciejkula/spotlight.git

Collecting git+https://github.com/maciejkula/spotlight.git
  Cloning https://github.com/maciejkula/spotlight.git to /tmp/pip-req-build-5b2rpqyt
  Running command git clone -q https://github.com/maciejkula/spotlight.git /tmp/pip-req-build-5b2rpqyt
Building wheels for collected packages: spotlight
  Building wheel for spotlight (setup.py) ... [?25l[?25hdone
  Created wheel for spotlight: filename=spotlight-0.1.6-py3-none-any.whl size=33929 sha256=2747ec479f1c17b54d5357d6559a52579c7df9cf65a5ea150d4459a6a6811bfa
  Stored in directory: /tmp/pip-ephem-wheel-cache-pzmg08sr/wheels/e3/d1/9a/e23e52dedfb8b39d6702d11b0b07e2b39728516db407e0579b
Successfully built spotlight
Installing collected packages: spotlight
Successfully installed spotlight-0.1.6


In [3]:
import os
import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import time
import h5py
import hashlib
import json
import shutil
import sys
import torch

from sklearn.model_selection import ParameterSampler
from sklearn.preprocessing import LabelEncoder
random_state = np.random.RandomState(100)

from spotlight.interactions import Interactions
from spotlight.evaluation import mrr_score
from spotlight.evaluation import precision_recall_score
from spotlight.sequence.implicit import ImplicitSequenceModel
from spotlight.sequence.representations import CNNNet
from spotlight.evaluation import sequence_mrr_score
from spotlight.cross_validation import random_train_test_split
from spotlight.cross_validation import user_based_train_test_split
from spotlight.factorization.implicit import ImplicitFactorizationModel

In [4]:
# Make a local checkout of the python-util helpers importable (clone it first if needed).
# !git clone https://github.com/sparsh9012/python-util.git
sys.path.extend(['./python-util', './python-util/recsys'])

In [10]:
%%writefile preprocess.py
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# from reco.evaluate import user_item_crossjoin, filter_by


def encode_user_item(df, user_col, item_col, rating_col, time_col):
    """Label-encode the user and item columns of an interaction frame.

    The input frame is left untouched; all changes are made on a copy.

    Params:
        df (pd.DataFrame): Pandas data frame to be used.
        user_col (string): Name of the user column.
        item_col (string): Name of the item column.
        rating_col (string): Name of the rating column (renamed to "RATING").
        time_col (string): Name of the timestamp column (renamed to "TIMESTAMP").

    Returns:
        tuple: (encoded_df, user_encoder, item_encoder). encoded_df is the copy
        with added "USER" and "ITEM" code columns; the two fitted LabelEncoder
        objects allow the codes to be mapped back to the raw identifiers.
    """
    result = df.copy()

    # One encoder per id space so users and items can be inverted independently.
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    user_codes = user_encoder.fit_transform(result[user_col].values)
    item_codes = item_encoder.fit_transform(result[item_col].values)

    result["USER"] = user_codes
    result["ITEM"] = item_codes

    result = result.rename(columns={rating_col: "RATING", time_col: "TIMESTAMP"})

    print("Number of users: ", len(user_encoder.classes_))
    print("Number of items: ", len(item_encoder.classes_))

    return result, user_encoder, item_encoder


def random_split(df, ratios, shuffle=False, seed=42):
    """Split a DataFrame into consecutive row chunks (e.g. train, validation, test).

    Params:
        df (pd.DataFrame): Pandas data frame to be split. It is not modified.
        ratios (list of floats): list of ratios for split. The ratios have to sum to 1.
        shuffle (boolean): whether to shuffle the rows before splitting.
        seed (int): random seed for the shuffle, so shuffled splits are reproducible.

    Returns:
        list: One pd.DataFrame per ratio, in order. Each carries an extra
        "split_index" column holding the position of its chunk in the list.
    """
    if shuffle:
        # Seeded so that repeated calls produce the same partition.
        df = df.sample(frac=1, random_state=seed)
    samples = df.shape[0]

    # Converts [0.7, 0.2, 0.1] to the cumulative cut points [0.7, 0.9] ...
    cut_fractions = np.cumsum(ratios).tolist()[:-1]
    # ... and then to row positions, bracketed by the start and end of the frame.
    bounds = [0] + [round(x * samples) for x in cut_fractions] + [samples]

    splits = []
    for i, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:])):
        # Positional slicing always yields a DataFrame; .copy() lets us add the
        # marker column without touching (or warning about) the parent frame.
        chunk = df.iloc[start:stop].copy()
        # Marker column lets callers regroup the chunks after concatenation.
        chunk["split_index"] = i
        splits.append(chunk)

    return splits


def user_split(df, ratios, chrono=False):
    """Split a DataFrame into train, validation and test within each user.

    Each user's rows are cut by `ratios` (through random_split, without
    shuffling), so the split happens inside every user's history rather than
    across users.

    Params:
        df (pd.DataFrame): Pandas data frame to be split. Needs a "USER" column,
            and a "TIMESTAMP" column when chrono is True.
        ratios (list of floats): list of ratios for split. The ratios have to sum to 1.
        chrono (boolean): whether to sort in chronological order before splitting.

    Returns:
        list: List of pd.DataFrame split by the given specifications, one per ratio.
    """
    col_time = "TIMESTAMP"
    col_user = "USER"

    if df.empty:
        # Nothing to group: return one empty, labelled frame per ratio instead of
        # letting pd.concat fail on an empty list of groups.
        return [df.iloc[0:0].assign(split_index=i) for i in range(len(ratios))]

    # groupby keeps the row order inside each group, so sorting first makes
    # every user's chunk chronological.
    ordered = df.sort_values(col_time) if chrono else df

    # The group frame yielded by iteration is used directly; no per-user lookup.
    per_user = [
        pd.concat(random_split(group, ratios, shuffle=False))
        for _, group in ordered.groupby(col_user)
    ]

    # Concatenate all users, then regroup by the marker added by random_split.
    splits_all = pd.concat(per_user)
    return [splits_all[splits_all["split_index"] == i] for i in range(len(ratios))]

def neg_feedback_samples(
    df,
    rating_threshold,
    ratio_neg_per_user=1
):
    """Sample negative feedback from a user-item interaction dataset.

    Ratings are binarized (1 = positive, 0 = negative). Every (user, item) pair
    that is not a positive interaction is a candidate negative, and a fixed
    number of candidates is sampled per user.

    Args:
        df (pandas.DataFrame): input data with exactly four columns, in the order
            user, item, rating, timestamp. The frame is not modified.
        rating_threshold (int): ratings at or above this value count as positive (1);
            lower ratings are not kept as positives.
        ratio_neg_per_user (int): ratio of negative feedback w.r.t. the number of
            positive feedback for each user. At least one negative is drawn per
            user whenever a candidate exists.

    Returns:
        pandas.DataFrame: columns ["user_id", "movie_id", "rating"], ordered by user,
        holding each user's positives followed by the sampled negatives.

    Raises:
        ValueError: if df does not have exactly four columns.
    """
    seed = 42

    # Work on a copy so the caller's column names are left alone.
    interactions = df.copy()
    interactions.columns = ["USER", "ITEM", "RATING", "unix_timestamp"]

    # Positive feedback: interactions rated at or above the threshold.
    is_positive = interactions["RATING"] >= rating_threshold
    positives = interactions.loc[is_positive, ["USER", "ITEM"]].copy()
    positives["RATING"] = 1

    # Candidate negatives: every user x item combination that is not a positive.
    # NOTE(review): this stands in for the user_item_crossjoin / filter_by helpers
    # whose import is commented out in this module — confirm the semantics match.
    all_pairs = pd.MultiIndex.from_product(
        [interactions["USER"].unique(), interactions["ITEM"].unique()],
        names=["USER", "ITEM"],
    )
    positive_pairs = pd.MultiIndex.from_frame(positives[["USER", "ITEM"]])
    negatives = all_pairs[~all_pairs.isin(positive_pairs)].to_frame(index=False)
    negatives["RATING"] = 0

    labelled = pd.concat([positives, negatives], ignore_index=True)

    # An explicit loop over users avoids groupby.apply, whose treatment of the
    # grouping column differs between pandas versions.
    pieces = []
    for _, user_rows in labelled.groupby("USER"):
        user_pos = user_rows[user_rows["RATING"] == 1]
        user_neg = user_rows[user_rows["RATING"] == 0]
        # At least one negative per user, never more than the candidates available.
        n_neg = min(max(round(len(user_pos) * ratio_neg_per_user), 1), len(user_neg))
        pieces.append(user_pos)
        if n_neg > 0:
            pieces.append(user_neg.sample(n_neg, random_state=seed, replace=False))

    if not pieces:
        return pd.DataFrame(columns=["user_id", "movie_id", "rating"])

    sampled = pd.concat(pieces, ignore_index=True)
    sampled = sampled.rename(
        columns={"USER": "user_id", "ITEM": "movie_id", "RATING": "rating"}
    )
    return sampled[["user_id", "movie_id", "rating"]]


def sample_data():
    """Return a small hand-made ratings frame (3 users, 3 items, 15 rows) for quick experiments."""
    users = [1] * 5 + [2] * 5 + [3] * 5
    items = [1, 1, 2, 2, 2, 1, 2, 1, 2, 3, 3, 3, 3, 3, 1]
    ratings = [4, 4, 3, 3, 3, 4, 5, 4, 5, 5, 5, 5, 5, 5, 4]
    # Day of January 2000 on which each interaction took place.
    days = [1, 1, 2, 2, 2, 1, 1, 3, 3, 3, 1, 3, 3, 3, 4]
    stamps = ["2000-01-%02d" % day for day in days]

    return pd.DataFrame({
        "user_index": users,
        "item_index": items,
        "rating": ratings,
        "timestamp": stamps,
    })

Overwriting preprocess.py


In [11]:
from IPython.display import SVG, display
from preprocess import encode_user_item, random_split, user_split

Clicks data

In [13]:
# Load the click log. The file has no header row, so the column names are
# supplied here; the dtypes are keyed by column position.
df_clicks = pd.read_csv(
    'yoochoose-clicks.dat',
    sep=',',
    header=None,
    names=["SessionId", "TimeStr", "ItemId", "Item_Type"],
    dtype={0: np.int32, 1: str, 2: np.int64, 3: str},
)
df_clicks.head()

Unnamed: 0,SessionId,TimeStr,ItemId,Item_Type
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0


In [14]:
#category types
'''The categories can be S (for promotion), 0 (when unknown), 
a number between 1-12 when it came from a category on the page
or any other that represents a brand'''

def assign_cat(x):
    """Map a raw category code to a coarse item-type label.

    "S" maps to "PROMOTION", 0 to "NONE", other values below 13 to "CATEGORY",
    and everything else to "BRAND".

    Raises:
        ValueError: if x is neither "S" nor convertible to an integer.
    """
    if x == "S":
        return "PROMOTION"
    # Built-in int(): the np.int alias was removed in NumPy 1.24.
    code = int(x)
    if code == 0:
        return "NONE"
    if code < 13:
        return "CATEGORY"
    return "BRAND"

# Replace the raw category codes with their coarse labels.
# Not idempotent: re-running on already-mapped labels fails in assign_cat's integer conversion.
df_clicks['Item_Type'] = df_clicks.loc[:,'Item_Type'].map(assign_cat)

Buy data

In [15]:
# Load the purchase log. The file has no header row, so the column names are
# supplied here; the dtypes are keyed by column position.
df_buys = pd.read_csv(
    'yoochoose-buys.dat',
    sep=',',
    header=None,
    names=["SessionId", "TimeStr", "ItemId", "Price", "Quantity"],
    dtype={0: np.int32, 1: str, 2: np.int64, 3: np.int64, 4: np.int64},
)
df_buys.head()

Unnamed: 0,SessionId,TimeStr,ItemId,Price,Quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1


In [16]:
# Drop the purchase timestamp and tag every row as a purchase.
# A new frame is built, and an already-removed column is tolerated, so the cell
# can be re-run (an in-place drop raises KeyError the second time).
df_buys = df_buys.drop(columns=["TimeStr"], errors="ignore").assign(Action="BUY")
df_buys.head()

Unnamed: 0,SessionId,ItemId,Price,Quantity,Action
0,420374,214537888,12462,1,BUY
1,420374,214537850,10471,1,BUY
2,281626,214535653,1883,1,BUY
3,420368,214530572,6073,1,BUY
4,420368,214835025,2617,1,BUY


In [None]:
# Attach purchase details to each click on the same item within the same session.
# Left join: clicks with no matching purchase keep NaN in the buy columns.
df = df_clicks.merge(df_buys, how="left", on=["SessionId", "ItemId"])
df.head()

Exploring data

In [None]:
# DataFrame.query template; @ItemId and @SessionId are read from notebook
# variables at the time .query() is called.
query = "ItemId==@ItemId & SessionId==@SessionId"

In [None]:
# Example (item, session) pair to inspect with the query template.
ItemId = 214821371
SessionId = 11

In [None]:
# Clicks on the example item in the example session.
df_clicks.query(query)

In [None]:
# Purchases of the example item in the example session.
df_buys.query(query)

In [None]:
# The same pair in the merged frame (one row per click/purchase combination).
df.query(query)

In [None]:
# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

Data subset selection based on thresholds

In [None]:
# A session is kept only if it has more than this many rows.
SESSION_THRESHOLD = 20
# An item is kept only if it appears in more than this many rows.
ITEM_THRESHOLD = 1000

In [None]:
# Rows per session; keep the sessions with more than SESSION_THRESHOLD rows.
session_lengths = df.groupby(["SessionId"]).size()
session_lengths_w_threshold = session_lengths.loc[session_lengths > SESSION_THRESHOLD].reset_index()
df_with_session_threshold = df.loc[df["SessionId"].isin(session_lengths_w_threshold["SessionId"])]

In [None]:
# Rows per item (counted on the full frame); keep the items with more than ITEM_THRESHOLD rows.
item_lengths = df.groupby(["ItemId"]).size()
item_lengths_w_threshold = item_lengths.loc[item_lengths > ITEM_THRESHOLD].reset_index()
df_with_session_item_threshold = df_with_session_threshold.loc[
    df_with_session_threshold["ItemId"].isin(item_lengths_w_threshold["ItemId"])
]

In [None]:
# Removing rare items shortened some sessions, so recount and re-apply the session threshold.
session_lengths_2 = df_with_session_item_threshold.groupby(["SessionId"]).size()
session_lengths_2_w_threshold = session_lengths_2.loc[session_lengths_2 > SESSION_THRESHOLD].reset_index()

In [None]:
# Keep only rows from sessions that still exceed the threshold after item filtering.
df_final = df_with_session_item_threshold.loc[
    df_with_session_item_threshold["SessionId"].isin(session_lengths_2_w_threshold["SessionId"])
]
df_final.head()

In [None]:
# Rows with no matching purchase have a missing Action: label them as clicks,
# then drop the purchase-only columns.
# df_final is a filtered selection of another frame, and an in-place fillna
# through attribute access is chained assignment (a no-op under pandas
# Copy-on-Write). Building a new frame is reliable and re-runnable.
df_final = (
    df_final
    .assign(Action=lambda frame: frame["Action"].fillna("CLICK"))
    .drop(columns=["Price", "Quantity"], errors="ignore")
)
df_final.head()

Feature engineering: event timestamps and implicit ratings

In [None]:
# Convert the ISO-8601 "...Z" strings to seconds since the Unix epoch.
# They are parsed explicitly as UTC: a naive datetime's .timestamp() is interpreted
# in the machine's local timezone, so the values would shift off a UTC host.
# The vectorised parse also avoids a Python-level strptime call per row.
_event_time = pd.to_datetime(df_final["TimeStr"], utc=True)
_unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")

df_final = (
    df_final
    .assign(
        Time=(_event_time - _unix_epoch) / pd.Timedelta(seconds=1),
        # Implicit rating: 1 for a click, 5 for anything else (i.e. a purchase).
        Rating=np.where(df_final["Action"] == "CLICK", 1, 5),
    )
    .drop(columns=["TimeStr"])
    .sort_values(by=["SessionId", "Time"])
)
df_final.head()

In [None]:
# index=False: the row index carries no information, and writing it would bring
# back a spurious "Unnamed: 0" column when the file is read again.
df_final.to_csv('yoochoose_processed.csv', index=False)

Modeling

In [None]:
# Reload the processed interactions from the CSV file.
# This rebinds `df`, which previously held the merged click/buy frame.
df = pd.read_csv("yoochoose_processed.csv")

In [None]:
# Data Encoding
# Sessions play the role of users: SessionId -> USER, ItemId -> ITEM (0-based codes);
# Rating and Time are renamed to RATING and TIMESTAMP. The fitted encoders are kept
# so the codes can be mapped back to the raw ids later.
DATA, user_encoder, item_encoder = encode_user_item(df, "SessionId", "ItemId", "Rating", "Time")

In [None]:
# Spotlight needs the encoded ids to start at 1 instead of 0, because its sequence
# models use 0 as the padding value. When doing inverse_transform, remember to
# subtract 1.
# The ids are recomputed from the raw columns rather than incremented in place,
# so re-running this cell does not shift them a second time.
DATA = DATA.assign(
    USER=user_encoder.transform(DATA["SessionId"]) + 1,
    ITEM=item_encoder.transform(DATA["ItemId"]) + 1,
)

In [None]:
# Downcast the three model inputs to 32-bit integers in a single call.
DATA = DATA.astype({"RATING": np.int32, "USER": np.int32, "ITEM": np.int32})

In [None]:
# Preview the encoded frame that feeds the Spotlight Interactions object.
DATA.head()

In [None]:
# Positional arguments for Interactions: user ids, item ids, ratings, timestamps.
# Every field is passed as a NumPy array. A pandas Series would be indexed by
# label inside Spotlight, which only coincides with position while the frame
# keeps a default RangeIndex.
df_for_interaction_matrix = (
    DATA.USER.values,
    DATA.ITEM.values,
    DATA.RATING.values,
    DATA.TIMESTAMP.values,
)
df_interaction = Interactions(*df_for_interaction_matrix)

Train, validation and test sets

In [None]:
# Hold out 20% for testing, split by user so a session is never divided ...
train_with_val, test = user_based_train_test_split(
    df_interaction,
    test_percentage=0.2,
    random_state=random_state,
)

# ... then hold out 20% of the remainder for validation, again split by user.
train, val = user_based_train_test_split(
    train_with_val,
    test_percentage=0.2,
    random_state=random_state,
)

Implicit Model

In [None]:
# BPR-loss matrix factorization baseline.
# random_state is passed so the initialisation and sampling are reproducible,
# like the splits and the sequence model, which use the same seeded RandomState.
model_implicit = ImplicitFactorizationModel(n_iter=3, loss='bpr', random_state=random_state)
model_implicit.fit(train)

In [None]:
# Score items for the user of the first test interaction.
# NOTE(review): presumably one score per item id (array index = encoded item id) — confirm against Spotlight's predict docs.
user_for_reco = test.user_ids[0]
pred_for_user = model_implicit.predict(user_for_reco)
pred_for_user

In [None]:
# Indices of the scores sorted from highest to lowest (negating turns the ascending argsort into descending).
rec_item_ids = (-pred_for_user).argsort()
rec_item_ids

In [None]:
# ground truth: the item of the first test interaction (same index 0 as the user picked above)
target = test.item_ids[0]
target

In [None]:
# 0-based position of the ground-truth item in the ranked list (smaller is better).
np.where(rec_item_ids == target)

Evaluation

In [None]:
# NOTE(review): the split above is user-based, so the users in `test` contributed
# no interactions to `train`; confirm that scoring a factorization model on them
# is intended (random_train_test_split is imported but unused).
implicit_mrr_score = mrr_score(model_implicit, test)
(pk, rk) = precision_recall_score(model_implicit, test, k= 5)

# The metrics are returned as arrays; show their averages so the cell reports a result.
{
    "mrr": implicit_mrr_score.mean(),
    "precision@5": pk.mean(),
    "recall@5": rk.mean(),
}

Sequence Model

In [None]:
# Parameters passed to Interactions.to_sequence below.
# NOTE(review): presumably windows of up to 200 items, at least 50 real (non-padding)
# items each, with the window start moving 200 steps at a time — confirm against Spotlight's docs.
max_sequence_length = 200
min_sequence_length = 50
step_size = 200

In [None]:
def _to_sequences(interactions):
    """Convert an Interactions split to sequence form; return it unchanged if already converted."""
    # The converted object has no to_sequence method. Passing it through makes this
    # cell safe to re-run even though it rebinds train / test / val.
    if not hasattr(interactions, "to_sequence"):
        return interactions
    return interactions.to_sequence(max_sequence_length=max_sequence_length,
                                    min_sequence_length=min_sequence_length,
                                    step_size=step_size)


# Same conversion for all three splits.
train, test, val = (_to_sequences(_split) for _split in (train, test, val))

In [None]:
# Shape of the sequence array for the train, test and validation splits, in that order.
for _split in (train, test, val):
    print(_split.sequences.shape)

In [None]:
# Convolutional sequence representation (Spotlight CNNNet) over train.num_items items.
# NOTE(review): dilation lists four values while num_layers=2 — confirm whether the
# extra entries are ignored by CNNNet or whether four layers were intended.
net = CNNNet(train.num_items,
             embedding_dim=128,
             kernel_width=3,
             dilation=[1,1,1,1],
             num_layers=2,
             nonlinearity="relu",
             residual_connections=False)

In [None]:
# Sequence recommender with BPR loss on top of the CNN representation.
# random_state is the shared RandomState created in the import cell; the data
# splits above already drew from it.
model = ImplicitSequenceModel(loss="bpr",
                              representation=net,
                              batch_size=32,
                              learning_rate=0.1,
                              l2=0.0,
                              n_iter=2,
                              random_state=random_state)

In [None]:
# Train on the sequence-formatted training split.
model.fit(train)

Prediction

In [None]:
# Take the second test sequence: every step but the last is the model input, and
# the last step is the item to predict. Negative indexing follows whatever
# max_sequence_length was used, instead of hard-coding 199.
# Note: `query` was a pandas query string earlier in the notebook; it is rebound here.
query = test.sequences[1][:-1]
target = test.sequences[1][-1]

print("Shape of query is : ",query.shape)
print("The value of target is : ",target)

In [None]:
# Score items as the next element of the query sequence.
# NOTE(review): presumably one score per item id (array index = encoded item id) — confirm against Spotlight's predict docs.
pred = model.predict(query)

In [None]:
# Rank indices from highest to lowest score, then locate the held-out target
# (0-based position; smaller is better).
rec_item_ids = (-pred).argsort()
np.where(rec_item_ids == target)

In [None]:
# Item ID that is to be recommended.
# Index 0 is the padding id, not a real item, so it is skipped before taking the
# top-ranked id (otherwise inverse_transform could receive -1). The 1 subtracted
# afterwards undoes the offset added to the encoded ids before modeling.
top_item = rec_item_ids[rec_item_ids != 0][0]
item_encoder.inverse_transform([top_item - 1])[0]