In [1]:
import pandas as pd
import joblib
from sklearn import preprocessing

from collie.cross_validation import stratified_split,random_split
from collie.interactions import Interactions
from collie.utils import convert_to_implicit, remove_users_with_fewer_than_n_interactions



In [2]:
import os

# Project root against which the relative "Data/..." paths in this notebook
# are resolved. Set the WHMP_PROJECT_ROOT environment variable to run the
# notebook from another checkout without editing this cell; when it is unset
# the original location is used.
# The fallback is a raw string, so a single backslash is the path separator.
parent_directory = os.environ.get("WHMP_PROJECT_ROOT", r"d:\When-Hybrid-meets-the-Prompt")

# os.chdir raises FileNotFoundError when the directory does not exist.
os.chdir(parent_directory)
os.getcwd()

'd:\\When-Hybrid-meets-the-Prompt'

In [3]:
# Read a 10,000-row sample of the ratings file (only the three columns used
# below), drop rows with missing values, cast everything to integers, and
# renumber the index from 0.
df_ratings = (
    pd.read_csv(
        "Data/ratings.csv",
        usecols=["item_id", "user_id", "rating"],
        nrows=10000,
    )
    .dropna()
    .astype(int)
    .reset_index(drop=True)
)
df_ratings.info()
df_ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   item_id  10000 non-null  int32
 1   user_id  10000 non-null  int32
 2   rating   10000 non-null  int32
dtypes: int32(3)
memory usage: 117.3 KB


Unnamed: 0,item_id,user_id,rating
0,473160,152417014,4
1,473160,152414066,5
2,473160,152405496,4
3,473160,152400132,5
4,473160,152397002,4


In [4]:
# Turn the explicit ratings into implicit feedback, then apply the
# minimum-interactions-per-user filter.
# NOTE(review): presumably min_rating_to_keep=2 drops ratings below 2 and
# min_num_of_interactions=1 keeps every remaining user - confirm against the
# collie documentation.
df_implicit_ratings = (
    df_ratings
    .pipe(convert_to_implicit, min_rating_to_keep=2)
    .pipe(remove_users_with_fewer_than_n_interactions, min_num_of_interactions=1)
)
df_implicit_ratings.info()
df_implicit_ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9872 entries, 0 to 9871
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   item_id  9872 non-null   int32
 1   user_id  9872 non-null   int32
 2   rating   9872 non-null   int64
dtypes: int32(2), int64(1)
memory usage: 154.4 KB


Unnamed: 0,item_id,user_id,rating
0,473160,113517720,1
1,473160,119333016,1
2,473160,121424348,1
3,473160,113518034,1
4,473160,138236258,1


In [5]:
# Wrap the implicit ratings in a collie Interactions object.
# allow_missing_ids=True is needed because the raw user/item IDs are passed
# through unchanged.
# NOTE(review): the recorded output reports 152417015 users and 473161 items
# for 9872 interactions, so the object appears to be sized by the largest raw
# ID. Consider mapping IDs to contiguous indices first (sklearn's
# `preprocessing` is imported but unused) - confirm what downstream code expects.
user_item_interaction = Interactions(
    allow_missing_ids=True,
    ratings=df_implicit_ratings["rating"],
    items=df_implicit_ratings["item_id"],
    users=df_implicit_ratings["user_id"],
)
user_item_interaction

Checking for and removing duplicate user, item ID pairs...
Checking ``num_negative_samples`` is valid...
Maximum number of items a user has interacted with: 1
Generating positive items set...


Interactions object with 9872 interactions between 152417015 users and 473161 items, returning 10 negative samples per interaction.

In [6]:
# Random hold-out split: 20% of the interactions go to the second (validation)
# set; the fixed seed keeps the split reproducible across runs.
train_interactions, val_interactions = random_split(
    user_item_interaction,
    test_p=0.2,
    seed=42,
)
(train_interactions, val_interactions)

Generating positive items set...
Generating positive items set...
Generating positive items set...


(Interactions object with 7897 interactions between 152417015 users and 473161 items, returning 10 negative samples per interaction.,
 Interactions object with 1975 interactions between 152417015 users and 473161 items, returning 10 negative samples per interaction.)

In [7]:
# Persist both splits to disk with joblib.
# The "_10K" suffix presumably reflects the 10,000-row sample loaded above;
# earlier variants of this cell wrote to the un-suffixed names
# (Data/train_interactions.pkl, Data/val_interactions.pkl) and to the "_1M"
# names (Data/train_interactions_1M.pkl, Data/val_interactions_1M.pkl).
joblib.dump(train_interactions, "Data/train_interactions_10K.pkl")
joblib.dump(val_interactions, "Data/val_interactions_10K.pkl")


['Data/val_interactions_10K.pkl']