## Dataset source
https://www.kaggle.com/gspmoreira/articles-sharing-reading-from-cit-deskdrop

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np

# Import from my module.
from utils import smooth_user_preference, cat_to_id_transform

In [49]:
# Load data
interactions_df = pd.read_csv('data/users_interactions.csv')

# Process interactions_df
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 3.0, 
   'FOLLOW': 4.0,
   'COMMENT CREATED': 5.0,  
}
interactions_df['eventStrength'] = interactions_df['eventType'].apply(lambda x: event_type_strength[x])

In [50]:
interactions_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1.0
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1.0
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,4.0
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,,1.0


In [51]:
# Set column name mappings.
USER_KEY = "personId"
ITEM_KEY = "contentId"
RATE_KEY = "eventStrength"
TIME_KEY = "timestamp"

In [52]:
tmp = interactions_df.groupby([USER_KEY, ITEM_KEY]).size()
users_interactions_count_df = tmp.groupby(USER_KEY).size()

print('# users: %d' % len(users_interactions_count_df))

# users: 1895


In [53]:
# Keep the users with at least two records (interactions).
least_entries = 2

users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= least_entries]
users_with_enough_interactions_df = users_with_enough_interactions_df.reset_index()[[USER_KEY]]

print(f'# users with at least {least_entries} interactions: {len(users_with_enough_interactions_df)}')

# users with at least 2 interactions: 1620


In [54]:
interactions_from_selected_users_df = interactions_df.merge(
    users_with_enough_interactions_df, 
    how = 'right',
    left_on = USER_KEY,
    right_on = USER_KEY,
)
print(f'# of interactions from users with at least {least_entries} interactions: {len(interactions_from_selected_users_df)}')

# of interactions from users with at least 2 interactions: 71893


In [65]:
interactions_full_df = interactions_from_selected_users_df[[USER_KEY, ITEM_KEY, RATE_KEY, TIME_KEY]]

In [70]:
# Transform all ids to categories
u2idx, u_cat = cat_to_id_transform(interactions_full_df[USER_KEY])
i2idx, i_cat = cat_to_id_transform(interactions_full_df[ITEM_KEY])
interactions_full_df[USER_KEY] = u_cat
interactions_full_df[ITEM_KEY] = i_cat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_full_df[USER_KEY] = u_cat
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_full_df[ITEM_KEY] = i_cat


In [224]:
# Create test data.
# Test data is composed of the last interaction of each user.
# Therefore, the rows must be as same as the number of users.

# Initialize the empty dataframe for the test data.
test_df = pd.DataFrame(columns=interactions_full_df.columns)

# There are some records for the same user at the same timestep (duplicates).
# We'll put those records into a list and remove them from the train set later.
exclusives = []

for user_id in np.unique(interactions_full_df['personId']):
    # Get the data of the last timestep.
    max_t = max(interactions_full_df[interactions_full_df['personId'] == user_id]['timestamp'])
    result = interactions_full_df[(interactions_full_df['personId']==user_id) & \
                                  (interactions_full_df['timestamp']==max_t)]

    if len(result)>1:    # which means duplicates.
        exclusives.append(result.index[1])
        result = result.iloc[0, :]
    test_df = test_df.append(result)

In [230]:
len(exclusives)

25

In [227]:
# Create Train data.

# Get the differnce from the test df.
difference = list(set(interactions_full_df.index) - set(test_df.index))
train_df = interactions_full_df.iloc[difference, :]

# Remove duplicates from the train data.
train_df = train_df.drop(exclusives)

In [249]:
# Make sure all columns are in INT.

for k in [USER_KEY, ITEM_KEY, RATE_KEY]:
    train_df[k] = train_df[k].astype('int32')
    test_df[k] = test_df[k].astype('int32')

In [250]:
# Save data to in the format of tsv.
train_df.iloc[:, [0, 1, 2]].to_csv("data/kaggle.train.rating", sep="\t", index=False, header=None)
test_df.iloc[:, [0, 1, 2]].to_csv("data/kaggle.test.rating", sep="\t", index=False, header=None)

In [254]:
# Create negative test data.
# For each user, negative test data are another 99 item candidates that the user hasn't interacted with.

# We first create the interacted dictionary for users and items based on the train data.
interacted = {}
for uid in np.unique(train_df.personId):
    interacted[uid] = train_df[train_df.personId == uid][ITEM_KEY].tolist()

# Get uids from test data.
test_uids = np.unique(test_df.personId)

# Each entry in entries will be a tuple (userID, itemID).
entries = []
for idx, uid in enumerate(test_uids):
    entries.append((uid, test_df[test_df.personId == uid]['contentId'].values[0]))
    row = np.array([], dtype=np.int32)
    for i in range(99):
        j = np.random.choice(interactions_full_df['contentId'])
        while j in interacted[uid]:
            # Random pick an item from the original item set.
            j = np.random.choice(interactions_full_df['contentId'])

        row = np.append(row, j)
    
    if idx == 0:
        rows = np.expand_dims(row, axis=0)
    else:
        row = np.expand_dims(row, axis=0)
        rows = np.concatenate((rows, row), axis=0)

neg_df = pd.DataFrame(rows)

In [256]:
# Put the (userID, itemID) tuples to the first column.
neg_df.insert(loc=0, column="u_i", value=entries)

In [257]:
neg_df

Unnamed: 0,u_i,0,1,2,3,4,5,6,7,8,...,89,90,91,92,93,94,95,96,97,98
0,"(0, 1087)",678,2775,32,2730,2984,2423,1243,2433,860,...,1206,1761,2116,1952,63,2890,113,2878,1241,2553
1,"(1, 2314)",1927,314,209,2180,982,1341,63,1024,922,...,2849,2325,654,2527,2277,1252,1165,465,1764,2023
2,"(2, 847)",2382,840,1776,1537,1236,2539,2485,1039,2707,...,62,1332,454,74,1157,795,79,365,165,506
3,"(3, 865)",683,1396,1022,1268,1873,562,1973,2688,2453,...,1550,2945,2207,83,257,1056,217,1740,2945,2045
4,"(4, 1145)",2197,2349,1781,1612,958,1371,690,957,945,...,668,493,922,1101,775,1058,1212,2976,1746,909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,"(1615, 2975)",408,1586,225,2158,2218,1366,947,1246,439,...,403,2459,2775,2244,1746,2011,1022,2444,433,2406
1616,"(1616, 1399)",47,188,873,1749,138,198,348,622,1211,...,1694,394,2649,1968,1349,2783,900,2535,1782,1231
1617,"(1617, 605)",2390,699,286,842,2899,1140,2722,1746,900,...,339,2725,980,875,805,1029,2070,503,1420,555
1618,"(1618, 847)",1720,1482,1640,225,615,438,210,215,327,...,538,2179,1491,460,1241,1886,2679,1179,1528,1994


In [259]:
neg_df.to_csv("data/kaggle.test.negative", sep="\t", index=False, header=None)