In [1]:
import pandas as pd
import helpers
import json
import numpy as np
import random
import math
import pickle
from sklearn.preprocessing import MinMaxScaler
import collections
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Model parameters

In [2]:
TEST = False
FACTORS = 100
EPOCHS = 20
SEARCH_WEIGHT = 0.15 / 5
KAGGLE = False
MODEL_TYPE = 'rankfm'#rankfm

# Load processed data

In [3]:
items = helpers.load_items_df()
items_dict = helpers.load_items()
domain_item_dict = helpers.load_domain_item_dict(items_dict)
all_items = list(items_dict.keys())
print("Items loaded!")
interactions_train = helpers.load_interactions_df()
if TEST:
    interactions_test = helpers.load_interactions_test_df()
print("Interactions loaded!")

Items loaded!
Interactions loaded!


In [6]:
sold_times = collections.Counter(interactions_train.target)

In [7]:
def process(row):
    nan = float('nan')
    target = int(row['target'])
    itemid = int(row['item_id'])
    row['item_id'] = items_dict[int(row['item_id'])]['domain_id'] if (type(row['item_id']) != float or not math.isnan(row['item_id'])) else nan
    row['target'] = items_dict[int(row['target'])]['domain_id']
    return row
interactions_train.head(100).apply(process, axis=1)

Unnamed: 0,user_id,item_id,event_type,event_timestamp,target
0,0,MLB-SMARTWATCHES,view,2019-10-19T11:25:42.444-0400,MLB-SMARTWATCHES
1,0,MLB-SMARTWATCHES,view,2019-10-20T19:28:41.646-0400,MLB-SMARTWATCHES
2,0,MLB-SMARTWATCHES,view,2019-10-20T19:28:14.619-0400,MLB-SMARTWATCHES
3,0,MLB-SMARTWATCHES,view,2019-10-20T10:37:47.699-0400,MLB-SMARTWATCHES
4,0,MLB-SMARTWATCHES,view,2019-10-20T10:37:23.202-0400,MLB-SMARTWATCHES
...,...,...,...,...,...
95,4,MLB-SMARTWATCHES,search,2019-09-25T08:41:34.424-0400,MLB-SMARTWATCHES
96,4,MLB-KIDS_TENTS,view,2019-09-25T16:16:36.102-0400,MLB-SMARTWATCHES
97,4,MLB-TOYS_AND_GAMES,view,2019-09-25T16:09:22.866-0400,MLB-SMARTWATCHES
98,4,MLB-TOYS_AND_GAMES,view,2019-09-25T16:09:42.478-0400,MLB-SMARTWATCHES


# Functions

In [8]:
def encode_item_features(items, interactions):
    viewed_times = collections.Counter(interactions[interactions['event_type'] == 'view'].item_id)
    items_df = items[['item_id', 'domain_id', 'price', 'condition']].copy()
    #items_df = pd.get_dummies(items_df, columns=['domain_id', 'condition'])
    '''
    domains = items.domain_id.unique() 
    m = int(math.log2(len(domains)) + 1)
    columns = {f'domain_bit_{i}': [] for i in range(m)}
    indexed_domains = {domains[i]: i for i in range(len(domains))}

    def domain_apply(x):
        arr = helpers.bin_array(indexed_domains[x], m)
        for j in range(m):
            columns[f'domain_bit_{j}'].append(arr[j])

    items_df['domain_id'].apply(domain_apply)
'''
    #for k in columns.keys():
    #    items_df[k] = columns[k]
    #le = LabelEncoder()
    
    #items_df['condition'] = items_df['condition'].apply(lambda x: 1 if x == 'new' else 0)
    #items_df['sold_times'] = items_df['item_id'].apply(lambda x: sold_times[x])
    items_df['viewed_times'] = items_df['item_id'].apply(lambda x: viewed_times[x])
    #items_df['domain_id'] = le.fit_transform(items_df['domain_id'])
    #items_df = items_df.drop(columns=['domain_id'])
    #items_df['item_id'] = items_df['item_id'].astype(int)
    #items_df['price'] = items_df['price'].fillna(0)
    #scaler = MinMaxScaler()
    #transformed_price = scaler.fit_transform(items_df['price'].values.reshape(-1, 1)).flatten()
    #items_df['price'] = pd.Series(transformed_price)
    return items_df

def encode_user_features(users, interactions):
    event_dict = dict(list(interactions.groupby('user_id')))
    data = {'user_id': [], 'items_viewed': [], 'searches_done': [], 'categories_viewed': []}
    for u in users:
        events = event_dict[u].values.tolist()
        items_viewed = 0
        categories_viewed = set()
        searches_done = 0
        for event in events:
            user_id, info, event_type, timestamp, target = event
            if np.isnan(info): 
                continue
            
            if event_type == 'search':
                searches_done += 1
            else:
                items_viewed += 1
            
            categories_viewed.add(items_dict[info]['domain_id'])
            
        data['user_id'].append(u)
        data['items_viewed'].append(items_viewed)
        data['searches_done'].append(searches_done)
        data['categories_viewed'].append(len(categories_viewed))
    return pd.DataFrame(data)

In [9]:
def encode_interactions(df):
    new_df = df[pd.notnull(df['item_id'])].copy()
    new_df['user_id'] = new_df['user_id'].astype(float).astype(int)
    new_df['item_id'] = new_df['item_id'].astype(float).astype(int)
    sample_weights = np.array([(3 if x != 'search' else SEARCH_WEIGHT) for x in new_df['event_type']])
    return new_df[['user_id', 'item_id']], sample_weights
    

In [10]:
def build_candidate_pairs(users, valid_item_ids):
    users_column = []
    items_column = []
    user_lengths = []
    i = 0
    for u in users:
        candidates = [x for x in get_candidates(u) if x in valid_item_ids]
        items_column += candidates
        users_column += [u] * len(candidates)
        user_lengths.append((u, len(candidates)))
        if i % 100000 == 0:
            print(f"Progress {i}/{len(users)}")
        i += 1
    pairs = pd.DataFrame({'user_id': users_column, 'item_id': items_column})
    return pairs, users_column, items_column, user_lengths

In [11]:
def build_recommendations(recommendations_pairs, items_column, user_lengths):
    offset = 0
    recommendations = {}
    for user, user_len in user_lengths:
        user_recs = recommendations_pairs[offset:offset+user_len]
        ranked_recs = np.argsort(user_recs)[::-1]
        top_10 = [x for x in ranked_recs if not np.isnan(user_recs[x])][:10]
        recommendations[user] = [items_column[x + offset] for x in top_10]
        offset += user_len
    return recommendations

In [12]:
def get_domains_from_items(items):
    return set(items_dict[int(item)]['domain_id'] for item in items)

def get_candidates(user):
    items_interacted = event_dict[user] if user in event_dict else set()
    domains = get_domains_from_items(items_interacted) if items_interacted else top_domains[:10]
    items_for_domains = [domain_top_items[d] for d in domains]
    item_universe = sum(items_for_domains, [])
        
    for item in item_universe:
        items_interacted.add(item)
            
    return list(items_interacted)


In [13]:
def combine_interactions(i1, i2):
    i1c = i1.copy()
    i2c = i2.copy()
    i2c['user_id'] += i1c.shape[0]
    return i1c.append(i2c)

# Setup training data

In [14]:
%%time
users = None
interactions = None
sample_weights = None
user_features = None

if TEST:
    interactions = (combine_interactions(interactions_train, interactions_test))#shuffle
    validation_users = interactions_test.user_id.unique() + interactions_train.shape[0]
    all_users = np.concatenate([interactions_train.user_id.unique(), validation_users])
else:
    interactions = (interactions_train)
    validation_users = interactions_train.user_id.unique()
    all_users = validation_users

user_target_dict = None
#interactions = interactions[interactions['event_type'] != 'search']

## Calculate user features
#interactions_users = set(interactions.user_id.dropna().unique())
#%time user_features = encode_user_features(interactions_users, interactions)

## Calculate item features
#interactions_items = set(interactions.item_id.dropna().unique())
#items_cp = items.copy()
#items_cp.set_index('item_id', inplace=True, drop=False)
#%time item_features = encode_item_features(items_cp.loc[interactions_items], interactions)
#item_features = item_features.reset_index(drop=True)

## Calculate auxiliary data
interactions, sample_weights = encode_interactions(interactions)
domain_top_items = helpers.load_top_items(interactions_train, domain_item_dict)
top_domains = helpers.load_top_domains(interactions_train, domain_top_items)
event_dict = interactions.groupby('user_id')['item_id'].unique().apply(set).to_dict()
valid_item_ids = set(interactions['item_id'].unique())

CPU times: user 39.8 s, sys: 5.41 s, total: 45.2 s
Wall time: 45.4 s


# Training

In [14]:
if KAGGLE:
    !pip install rankfm
    !pip install lightfM

### LightFM

In [15]:
%%time
if MODEL_TYPE == 'lightfm':
    from lightfm.data import Dataset
    from lightfm import LightFM
    from lightfm.data import Dataset
    from scipy.sparse import coo_matrix
    import scipy

    item_feature_values = set()
    for column in item_features.columns:
        if column == 'item_id': continue
        item_feature_values |= set(item_features[column].unique())

    user_feature_values = set()
    for column in user_features.columns:
        if column == 'user_id': continue
        user_feature_values |= set(user_features[column].unique())

    item_ids = interactions['item_id'].unique()
    user_ids = all_users

    dataset = Dataset()
    dataset.fit(
        user_ids,
        item_ids,
        item_features=item_feature_values,
        user_features=user_feature_values
    )

    train_interactions, train_weights = dataset.build_interactions(
        ((x[1], x[2], sample_weights[i]) for i, x in enumerate(interactions.itertuples())),
    )

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.78 µs


In [16]:
%%time
if MODEL_TYPE == 'lightfm':
    lightfm_item_features = dataset.build_item_features(
        item_features.apply(lambda x: (x['item_id'], [x[y] for y in x.index[1:] if y != 'item_id']), axis=1).values,
    )
    lightfm_user_features = dataset.build_user_features(
        user_features.apply(lambda x: (x['user_id'], [x[y] for y in x.index[1:] if y != 'user_id']), axis=1).values,
    )

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.54 µs


In [17]:
%%time
ITEM_ALPHA = 1e-6
if MODEL_TYPE == 'lightfm':
    
    print(f'Training {train_interactions.shape[0]} samples for {EPOCHS} epochs')
    model = LightFM(
        no_components=25,
        loss='warp',
        item_alpha=ITEM_ALPHA
    )
    ##
    ## Try no normalization
    ## Try changing the get_candidates() method
    ## Do a submit
    ## Try using searches to find the domains
    ##
    model.fit(
        train_interactions,
        epochs=20,
        #sample_weight=train_weights,
        #item_features=lightfm_item_features,
        #user_features=lightfm_user_features,
        num_threads=4,
        verbose=True
    )
    print("Building recommendation pairs...")
    pairs, users_column, items_column, user_lengths = build_candidate_pairs(validation_users, valid_item_ids)

    user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
    print(f"Generating recommendation pairs")
    recommendations_pairs = model.predict(
        np.array([user_id_map[x] for x in users_column]),#user_le.transform(users_column),
        np.array([item_id_map[x] for x in items_column]),#item_le.transform(items_column),
        #item_features=lightfm_item_features,
        #user_features=lightfm_user_features,
        num_threads=4,
    )

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.34 µs


### RankFM

In [18]:
%%time
if MODEL_TYPE == 'rankfm':
    from rankfm.rankfm import RankFM
    
    model = RankFM(factors=FACTORS, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.10, learning_schedule='invscaling')
    
    print(f"Fitting {interactions.shape[0]} interactions...")
    
    %time
    model.fit(
        interactions,
        epochs=EPOCHS,
        verbose=True,
        sample_weight=sample_weights,
        #item_features=item_features,
        #user_features=user_features
    )

Fitting 11999164 interactions...
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.78 µs

training epoch: 0
log likelihood: -5387343.0

training epoch: 1
log likelihood: -3077177.5

training epoch: 2
log likelihood: -1921957.125

training epoch: 3
log likelihood: -1345258.25

training epoch: 4
log likelihood: -1049952.0

training epoch: 5
log likelihood: -878200.0

training epoch: 6
log likelihood: -775879.8125

training epoch: 7
log likelihood: -710139.0625

training epoch: 8
log likelihood: -665708.6875

training epoch: 9
log likelihood: -634133.5625

training epoch: 10
log likelihood: -609428.0

training epoch: 11
log likelihood: -589565.5625

training epoch: 12
log likelihood: -573660.25

training epoch: 13
log likelihood: -560414.625

training epoch: 14
log likelihood: -549603.0625

training epoch: 15
log likelihood: -540292.3125

training epoch: 16
log likelihood: -532446.0625

training epoch: 17
log likelihood: -525471.125

training epoch: 18
log likelihood: -519403.0625

In [19]:
import pickle

with open(f'./data/model_(e=25, t=False, sw=3, vw=3, t=st)2.pickle', 'wb') as f:
    pickle.dump(model, f)

In [None]:
print(f"Generating candidate pairs")

%time
pairs, users_column, items_column, user_lengths = build_candidate_pairs(validation_users, valid_item_ids)

print(f"Generating recommnedation pairs")

%time
recommendations_pairs = model.predict(pairs, cold_start='nan')

Generating candidate pairs
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 134 µs
Progress 0/413163
Progress 100000/413163
Progress 200000/413163


## Create item users pairs to feed the model

In [18]:
def fill(recommendations):
    for k in recommendations.keys():
        if len(recommendations[k]) == 0:
            recommendations[k] = random.choices(all_items, k=10)
        elif len(recommendations[k]) < 10:
            category = items_dict[recommendations[k][0]]['domain_id']
            recommendations[k] += random.choices(domain_item_dict[category], k=(10 - len(recommendations[k])))

# Assert required sizes
            
assert len(recommendations) == len(validation_users)
unfilled = len([True for k in recommendations.keys() if len(recommendations[k]) != 10])
if unfilled > 0:
    print(f"{unfilled} entries were not filled. Extending the items...")
    fill(recommendations)

NameError: name 'recommendations' is not defined

## Scoring (if training)

In [None]:
if not TEST and not user_target_dict:
    user_target_dict = interactions_train.groupby('user_id')['target'].unique().apply(lambda x: x).to_dict()

In [None]:
def _relevance(items_dict, item, target):
    if item == target:
        return 15
    if items_dict[item]['domain_id'] == items_dict[target]['domain_id']:
        return 1
    return 0

def _get_perfect_dcg():
    perfect = [15, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    return sum(perfect[i] / np.log2(i + 2) for i in range(len(perfect))) / len(perfect)

def _dcg(items_dict, recommendations, target):
    
    dcg = sum(_relevance(items_dict, recommendations[i], target) / np.log2(i + 2) for i in range(len(recommendations)))
    return dcg / len(recommendations)

def ndcg_score(items_dict, recommendations, user_targets_dict):
    sum_ndcg = 0
    sum_perfect = 0
    for x in recommendations.keys():
        sum_ndcg += _dcg(items_dict, [int(w) for w in recommendations[x]], int(user_targets_dict[x]))
        sum_perfect += _get_perfect_dcg()

    return sum_ndcg / sum_perfect

In [None]:
if not TEST:
    print(ndcg_score(items_dict, recommendations, user_target_dict))

In [296]:
#0.25357310893765944
#0.25013763660310895
#train with search w=0.1 factors=250 item_features=no -> 0.243
#train with search w=0.2 factors=150 item_features=no -> 0.24443886619434535
#train with search w=0.5 factors=250 item_features=no -> 0.23223811466616529
#train with search w=1.0 factors=250 item_features=no -> 0.19637033782036525
#train with search w=1.0 factors=100 item_features=no -> 0.197764356297992
#train with search w=0.15 factors=100 item_features=no ->0.24546062804586635

# lightfm n=50 no search -> 0.2481555866759734
# lightfm n=25 -> 0.24950549809750702
# lightfm n=25 w integer item features w item alpha -> 0.22521187808442616
# lightfm n=25 -> 0.25071840818293245
# lightfm n=25 w item alpha -> 0.2508261003278925

#lightfm n=25 w search w weights-> 0.2191484993506384

# only domain ~0.07660177371164308
# only target ~0.16885885433
# lightfm only domain with item_f = 0.08535225261531444

## Generating submit (if testing)

In [207]:
if TEST:
    submit = pd.DataFrame(recommendations)
    print(f'Submit shape is {submit.shape}')
    assert submit.shape == (10, 177070)
    submit.transpose().to_csv(f'submit_f={FACTORS}_e={EPOCHS}.csv', index=False, header=False)

In [1]:
if TEST:
    import pickle
    with open(f"./data/recommendations/E={epochs}", "wb") as f:
        pickle.dump(recommendations_pairs, f)

NameError: name 'TEST' is not defined

# Finding domains