# Data Preparation
We work with implicit-feedback data. An explicit dataset (ratings) is binarized by keeping only interactions with a rating of at least 3 (`rating > 2` in the code). An implicit music dataset (listening counts) is binarized by keeping only interactions with at least 2 listening events (`weight > 1` in the code).

## Pipeline
 - Binarize interactions --> Implicit dataset
 - Define the user *classes*: 
   - *faithful* users ($f\%$, with $f$ fixed)
   - *harmful* users ($h\%$, with $h$ variable), with recsyslearn
 - Train-val-test split: [60%, 20%, 20%] at the user level, with RecBole

Hyperparameter search command:

```bash
python run_hyper.py --config_files=bpr_config.yaml --params_file=bpr_params.yaml --output_file=./bpr_out.yaml --tool=Hyperopt
```

In [1]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
from recsyslearn.dataset.segmentations import InteractionSegmentation
from recsyslearn.dataset.segmentations import ActivitySegmentation
from recsyslearn.dataset.segmentations import PopularityPercentage

In [2]:
def map_series_to_int(
    series: pd.Series,
    path_for_dict=None,
) -> pd.Series:
    """Re-encode the values of ``series`` as consecutive integer codes.

    Codes start at 0 and follow the order of ``series.unique()``.

    Args:
        series: Values to encode.
        path_for_dict: Optional file path. When truthy, the value-to-code
            dictionary is pickled to this location.

    Returns:
        The result of ``series.replace`` applied with the value-to-code
        dictionary.
    """
    import pickle

    value_to_code = {}
    for code, value in enumerate(series.unique()):
        value_to_code[value] = code

    if path_for_dict:
        with open(path_for_dict, 'wb') as handle:
            pickle.dump(value_to_code, handle)

    return series.replace(value_to_code)

In [3]:
# Base directory under which every input and output path in this notebook is built.
BASE_FOLDER = '/home/marta/jku/activity_fair/'

# Dataset to process; the alternatives are kept commented out for quick switching.
DATASET = 'ml-100k'
# DATASET = 'amazon_digital_music'
# DATASET = 'lastfm'

# Folder holding the unfiltered dataset, and its interaction file.
FULL_DATASETS_FOLDER = f'{BASE_FOLDER}datasets/full_datasets/{DATASET}/'
FULL_INTERACTION_FILE = f'{FULL_DATASETS_FOLDER}{DATASET}.inter'

In [4]:
# Load the full interaction table; the .inter file is read as tab-separated.
full_dataset = pd.read_csv(FULL_INTERACTION_FILE, sep='\t')

In [5]:
# Print column names, non-null counts and dtypes of the loaded table.
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   user_id:token    100000 non-null  int64
 1   item_id:token    100000 non-null  int64
 2   rating:float     100000 non-null  int64
 3   timestamp:float  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [6]:
# Number of distinct users in the full interaction table.
full_dataset['user_id:token'].nunique()

943

In [7]:
# Smallest per-user non-null count, taken over every column of the grouped count table.
full_dataset.groupby('user_id:token').count().values.min()

20

In [8]:
# Binarization cut-off: the cells below keep only rows whose rating is strictly above it.
threshold = 1 if DATASET == 'lastfm' else 2

if DATASET == 'lastfm':
    weight_column = full_dataset['weight:float']
    print(weight_column.values.min(), weight_column.values.max())
    # Keep user / artist / weight and rename them to the user / item / rating
    # column names used by the rest of the notebook.
    full_dataset = full_dataset[['user_id:token', 'artist_id:token', 'weight:float']].rename(
        columns={
            'artist_id:token': 'item_id:token',
            'weight:float': 'rating:float',
        }
    )

In [9]:
# Display the interaction table as it stands after the dataset-specific handling above.
full_dataset

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [10]:
# Preview the last rows that pass the binarization cut (rating strictly above threshold).
full_dataset[full_dataset['rating:float'] > threshold].tail()

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float
99992,721,262,3,877137285
99994,378,78,3,880056976
99995,880,476,3,880175444
99996,716,204,5,879795543
99999,12,203,3,879959583


In [11]:
# Binarize: keep the (user, item) pairs whose rating is strictly above the
# threshold, and store both identifiers as strings.
above_threshold = full_dataset['rating:float'] > threshold
binarized = full_dataset.loc[above_threshold, ['user_id:token', 'item_id:token']].astype(str)

In [12]:
# First rows of the binarized (user, item) table.
binarized.head()

Unnamed: 0,user_id:token,item_id:token
0,196,242
1,186,302
5,298,474
7,253,465
8,305,451


In [13]:
# Number of distinct users that still have at least one interaction after binarization.
binarized['user_id:token'].nunique()

943

In [14]:
# Users with more than 4 binarized interactions. The per-user count is computed
# once and kept as a Series so the boolean mask is 1-D and aligned on user id
# (rather than indexing a DataFrame with a 2-D array).
interactions_per_user = binarized.groupby('user_id:token')['item_id:token'].count()
core_users = interactions_per_user[interactions_per_user > 4].index.values

In [15]:
# Keep only the interactions of users listed in core_users.
binarized = binarized[binarized['user_id:token'].isin(core_users)]

In [16]:
# Print the size and dtypes of the table after the core-user filter.
binarized.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82520 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   user_id:token  82520 non-null  object
 1   item_id:token  82520 non-null  object
dtypes: object(2)
memory usage: 1.9+ MB


In [17]:
# Number of distinct users remaining after the core-user filter.
binarized['user_id:token'].nunique()

943

In [18]:
# First rows of the table after the core-user filter.
binarized.head()

Unnamed: 0,user_id:token,item_id:token
0,196,242
1,186,302
5,298,474
7,253,465
8,305,451


## User class definition

In [19]:
def create_folder_if_not_exist(path):
    """Create the directory ``path``, including missing parents, unless it exists.

    Nothing happens when ``path`` already exists. A confirmation message is
    printed only when the directory is newly created.
    """
    if os.path.exists(path):
        return
    os.makedirs(path)
    print(f"Directory {path} created!")

In [20]:
# Proportions passed to the activity segmentation whose result is written to the .uact file.
USER_ACTIVITY_PROP = [0.8, 0.2]
# Shares of users treated as harmful; one set of output files is written per value.
harmful_list = [0.01, 0.05, 0.10]

# The segmentation is called on a frame whose columns are named 'user' / 'item'.
# Work on a renamed copy so the shared `binarized` frame keeps its
# 'user_id:token' / 'item_id:token' names even if an iteration fails midway.
interactions = binarized.copy()
interactions.columns = ['user', 'item']

for harmful in harmful_list:
    faithful = 1. - harmful
    USER_CLASSES_PROP = [faithful, harmful]

    # round() instead of int(): 100 * harmful is a float product that can land
    # just below the intended integer (e.g. 100 * 0.29), which int() would truncate.
    DS_STRING = f'{DATASET}_harm{round(100 * harmful):02d}'

    ACTIVITY_FOLDER = BASE_FOLDER + f'user_activity/{DS_STRING}/'
    ACTIVITY_FILE = ACTIVITY_FOLDER + f'{DS_STRING}.uact'

    INTERACTION_FOLDER = BASE_FOLDER + f'item_interaction/{DS_STRING}/'
    # Built from INTERACTION_FOLDER (it used to point into ACTIVITY_FOLDER by mistake).
    INTERACTION_FILE = INTERACTION_FOLDER + f'{DS_STRING}.iint'

    CLASSES_FOLDER = BASE_FOLDER + f'user_classes/{DS_STRING}/'
    CLASSES_FILE = CLASSES_FOLDER + f'{DS_STRING}.uclass'

    FILTERED_DATASETS_FOLDER = BASE_FOLDER + f'datasets/filtered_datasets/{DS_STRING}/'
    FILTERED_INTERACTION_FILE = FILTERED_DATASETS_FOLDER + f'{DS_STRING}.inter'

    for folder in (
        FILTERED_DATASETS_FOLDER,
        CLASSES_FOLDER,
        INTERACTION_FOLDER,
        ACTIVITY_FOLDER,
    ):
        create_folder_if_not_exist(folder)

    # Two segmentations of the same users: one with the faithful/harmful
    # proportions, one with the fixed activity proportions.
    USER_CLASSES = ActivitySegmentation().segment(interactions, proportions=USER_CLASSES_PROP)
    USER_ACTIVITY = ActivitySegmentation().segment(interactions, proportions=USER_ACTIVITY_PROP)

    # Attach each user's class to every one of that user's interactions.
    binarized_filtered = interactions.merge(USER_CLASSES, how='inner', on='user')

    # Switch to the ':token'-suffixed column names used in the output files.
    binarized_filtered.columns = ['user_id:token', 'item_id:token', 'class:token']
    USER_CLASSES.columns = ['user_id:token', 'class:token']
    USER_ACTIVITY.columns = ['user_id:token', 'activity:token']

    USER_CLASSES.to_csv(CLASSES_FILE, sep='\t', index=None)
    USER_ACTIVITY.to_csv(ACTIVITY_FILE, sep='\t', index=None)
    # Only affects the in-memory frame; the class file has already been written above.
    USER_CLASSES['user_id:token'] = USER_CLASSES['user_id:token'].astype(str)

    # Keep only the interactions of users labelled '1'.
    # NOTE(review): presumably '1' is the group matching the first proportion
    # (the faithful users) — confirm against recsyslearn's labelling.
    binarized_filtered = binarized_filtered[binarized_filtered['class:token'].isin(['1'])]
    binarized_filtered = binarized_filtered.drop(columns=['class:token'])

    binarized_filtered.to_csv(FILTERED_INTERACTION_FILE, sep='\t', index=None)

# END OF DATA PREPARATION