In [1]:
import pandas as pd

In [2]:
path = "/home/ubuntu/duc.nm195858/data/ml-1m/movies.dat"

In [3]:
df_user = pd.read_csv('/home/ubuntu/duc.nm195858/data/ml-1m/users.dat',
                              sep='::',
                              engine='python',
                              names=['user', 'gender', 'age', 'occupation', 'Zip-code'],
                              usecols=['user', 'gender'])

In [4]:
df = pd.read_csv(path, sep='::',engine='python',encoding = "ISO-8859-1",names = ['item','Title','Genres'],usecols = ['item','Genres'])

In [5]:
df

Unnamed: 0,item,Genres
0,1,Animation|Children's|Comedy
1,2,Adventure|Children's|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama
4,5,Comedy
...,...,...
3878,3948,Comedy
3879,3949,Drama
3880,3950,Drama
3881,3951,Drama


In [51]:
df_2 = df['Genres'].str.split('|').str[0]

In [53]:
set(df_2.to_list())

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [35]:
df['Genres'].to_list()[4].split('|')[0]

'Comedy'

In [49]:
def genres(df):
    genres_all = df['Genres'].to_list()

    set_genre = set()
    for genres in genres_all:
        set_genre.add(genres.split('|')[0])
    map_genres_to_index = dict()
    for i, genre in enumerate(set_genre):
        map_genres_to_index[genre] = i+1
    
    # for i in range(1, len(df)+1):
    return map_genres_to_index

In [50]:
genres(df)

{'Crime': 1,
 'Musical': 2,
 'Western': 3,
 'Horror': 4,
 'Fantasy': 5,
 'Animation': 6,
 "Children's": 7,
 'Mystery': 8,
 'Action': 9,
 'Thriller': 10,
 'Romance': 11,
 'Adventure': 12,
 'Sci-Fi': 13,
 'Film-Noir': 14,
 'Comedy': 15,
 'War': 16,
 'Drama': 17,
 'Documentary': 18}

In [36]:
genres_all = df['Genres'].to_list()

list_genre = set()
for genres in genres_all:
    list_genre.add(genres.split('|')[0])

In [37]:
list_genre

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [23]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.model_selection import train_test_split

from copy import deepcopy


class DatasetLoader(object):
    def load(self):
        """Minimum condition for dataset:
          * All users must have at least one item record.
          * All items must have at least one user record.
        """
        raise NotImplementedError
class MovieLens1M(DatasetLoader):
    def __init__(self, data_dir):
        self.fpath_rate = os.path.join(data_dir, 'ratings.dat')
        self.fpath_user = os.path.join(data_dir, 'users.dat')
        self.fpath_item = os.path.join(data_dir, 'movies.dat')

    def load(self):
        # Load data
        df_rate = pd.read_csv(self.fpath_rate,
                              sep='::',
                              engine='python',
                              names=['user', 'item', 'rate', 'time'],
                              usecols=['user', 'item', 'rate'])
        df_user = pd.read_csv(self.fpath_user,
                              sep='::',
                              engine='python',
                              names=['user', 'gender', 'age', 'occupation', 'Zip-code'],
                              usecols=['user', 'gender'])
        df_movies = pd.read_csv(self.fpath_item,
                                 sep='::',
                                 engine='python',
                                 encoding = "ISO-8859-1",
                                 names = ['item','Title','Genres'],
                                 usecols = ['item','Genres'])

        df = pd.merge(df_rate, df_user, on='user')
        df = pd.merge(df,df_movies, on='item')
        df = df.dropna(axis=0, how='any')
        # TODO: save negative rating?
        df_negative = df[df['rate'] <=3]
        df = df[df['rate'] > 3]
        
        df = df.reset_index().drop(['index'], axis=1)

        return df, df_negative
    
def create_user_list(df, df_negative):
    """
    This function creates a list of lists where each inner list contains the items associated with a
    specific user in a given dataframe.
    
    :param df: a pandas DataFrame containing user-item interactions data
    :param user_size: The parameter user_size is the number of users in the dataset. It is used to
    initialize an empty list for each user in the user_list variable
    :return: The function `create_user_list` returns a list of lists, where each inner list contains the
    items that a particular user has interacted with, based on the input dataframe `df` and the number
    of users `user_size`.
    user_neg = [u:[neg]]
    """
    # user_list = [list() for u in range(user_size)]
    # user_neg = [list() for u in range(user_size)]
    user_list = dict()
    user_neg = dict()
    for row in df.itertuples():
        if row.user not in user_list:
            user_list[row.user] = []
        user_list[row.user].append(row.item)
    for row in df_negative.itertuples():
        if row.user not in user_neg:
            user_neg[row.user] = []
        user_neg[row.user].append(row.item)

    return user_list, user_neg

In [67]:
def split_data(user_list, user_neg, test_ratio = 0.1, val_ratio = 0.1, seed=0):
    train_data = dict()
    val_data = dict()
    test_data =dict()

    train_data['Positive'] = dict()
    val_data['Positive'] = dict()
    test_data['Positive'] = dict()

    train_data['Negative'] = dict()
    val_data['Negative'] = dict()
    test_data['Negative'] = dict()
    for user_id, item_list in user_list.items():
        if len(item_list) >1:
            train_val_item, test_item = train_test_split(item_list, test_size=test_ratio, random_state=seed)
            if len(train_val_item) >1:
                train_item,val_item = train_test_split(train_val_item, test_size=val_ratio, random_state=seed)
            elif len(train_val_item) == 1:
                train_item, val_item = train_val_item, train_val_item
        elif len(item_list) == 1:
            train_item = item_list
            test_item = item_list
            val_item = item_list
        train_data['Positive'][user_id] = train_item
        test_data['Positive'][user_id] = test_item
        val_data['Positive'][user_id] = val_item
    for user_id, item_list in user_neg.items():
        if len(item_list) >1:
            train_val_item, test_item = train_test_split(item_list, test_size=test_ratio, random_state=seed)
            if len(train_val_item) >1:
                train_item,val_item = train_test_split(train_val_item, test_size=val_ratio, random_state=seed)
            elif len(train_val_item) == 1:
                train_item, val_item = train_val_item, train_val_item
            # train_item,val_item = train_test_split(train_val_item, test_size=val_ratio, random_state=seed)
        elif len(item_list) == 1:
            train_item = item_list
            test_item = item_list
            val_item = item_list
        train_data['Negative'][user_id] = train_item
        test_data['Negative'][user_id] = test_item
        val_data['Negative'][user_id] = val_item
    return train_data, val_data, test_data

In [79]:
list1 = [1, 2, 3, 4, 5]
list2 = [3, 4, 5, 6, 7]
list3 = [1, 2, 5, 5, 9]

# Lấy tất cả các phần tử từ list3 không xuất hiện trong list1 và list2
unique_elements_list3 = list(set(list3) - set(list1) - set(list2))

print("Các phần tử trong list3, khác với list1 và list2:", unique_elements_list3)


Các phần tử trong list3, khác với list1 và list2: [9]


In [29]:
user_list,user_neg = create_user_list(dataset, data_neg)

In [68]:
train_data, val_data, test_data=split_data(user_list, user_neg)

In [72]:
train_data['Negative']

{28: [3260,
  648,
  266,
  1923,
  3101,
  1193,
  2571,
  3471,
  357,
  1208,
  1617,
  901,
  899,
  2881,
  650,
  2174,
  2033,
  593,
  1748,
  1097,
  1302,
  2011,
  2797,
  144,
  3210,
  1,
  454,
  1240,
  3000],
 42: [2528,
  2003,
  34,
  2408,
  1653,
  1320,
  2100,
  2140,
  3623,
  1037,
  1921,
  2009,
  3359,
  1193,
  780,
  858,
  880,
  173,
  1590,
  2058,
  733,
  288,
  296,
  2664,
  1377,
  2642,
  3354,
  1909,
  2797,
  32,
  3701,
  1049,
  2716,
  3698,
  1954,
  1304,
  2161,
  2001,
  1580,
  1924,
  10,
  2115,
  2571,
  3769,
  3780,
  2000,
  494,
  1124,
  3269,
  3448,
  367,
  1544,
  1527,
  3070,
  1608,
  2872,
  737,
  1221,
  1307,
  3153,
  2456,
  1917,
  1591,
  3937,
  1779,
  3361,
  2989,
  1077,
  1831,
  748,
  1220],
 96: [2871,
  1784,
  3623,
  1193,
  2804,
  590,
  17,
  3114,
  3421,
  1694,
  1954,
  1394,
  1266,
  3751,
  36,
  1614,
  47,
  1265,
  3578,
  1907,
  830,
  2997,
  1408,
  1873,
  3148],
 99: [2997,
  1584,
  

In [38]:
min = 10e10
max = 0
for _, value in user_list.items():
    if len(value) <min:
        min = len(value)
    if len(value) > max:
        max = len(value)
    

In [39]:
print(min)
print(max)

1
1435


In [11]:
data_ = MovieLens1M('/home/ubuntu/duc.nm195858/data/ml-1m')

In [12]:
dataset, data_neg = data_.load()

In [19]:
data_neg

Unnamed: 0,user,item,rate,gender,Genres
8,28,1193,3,F,Drama
11,42,1193,3,M,Drama
26,96,1193,3,F,Drama
27,99,1193,2,F,Drama
29,104,1193,2,M,Drama
...,...,...,...,...,...
1000200,5420,1843,3,F,Children's|Comedy
1000201,5433,286,3,F,Action|Sci-Fi|Thriller
1000203,5556,2198,3,M,Documentary
1000205,5675,2703,3,M,Drama


In [62]:
def split_data(user_list, user_neg, test_ratio = 0.1, val_ratio = 0.1, seed=0):
    train_data = dict()
    val_data = dict()
    test_data =dict()

    train_data['Positive'] = dict()
    val_data['Positive'] = dict()
    test_data['Positive'] = dict()

    train_data['Negative'] = dict()
    val_data['Negative'] = dict()
    test_data['Negative'] = dict()
    for user_id, item_list in user_list.items():
        if len(item_list) >1:
            train_val_item, test_item = train_test_split(item_list, test_size=test_ratio, random_state=seed)
            train_item,val_item = train_test_split(train_val_item, test_size=val_ratio, random_state=seed)
        elif len(item_list) == 1:
            train_item = item_list
            test_item = item_list
            val_item = item_list
        train_data['Positive'][user_id] = train_item
        test_data['Positive'][user_id] = test_item
        val_data['Positive'][user_id] = val_item
    for user_id, item_list in user_neg.items():
        if len(item_list) >1:
            train_val_item, test_item = train_test_split(item_list, test_size=test_ratio, random_state=seed)
            train_item,val_item = train_test_split(train_val_item, test_size=val_ratio, random_state=seed)
        elif len(item_list) == 1:
            train_item = item_list
            test_item = item_list
            val_item = item_list
        train_data['Negative'][user_id] = train_item
        test_data['Negative'][user_id] = test_item
        val_data['Negative'][user_id] = val_item
    return train_data, val_data, test_data

In [20]:
def create_user_list(df, user_size):
    """
    This function creates a list of lists where each inner list contains the items associated with a
    specific user in a given dataframe.
    
    :param df: a pandas DataFrame containing user-item interactions data
    :param user_size: The parameter user_size is the number of users in the dataset. It is used to
    initialize an empty list for each user in the user_list variable
    :return: The function `create_user_list` returns a list of lists, where each inner list contains the
    items that a particular user has interacted with, based on the input dataframe `df` and the number
    of users `user_size`.
    """
    user_list = [list() for u in range(user_size)]
    for row in df.itertuples():
        user_list[row.user].append(row.item)

    return user_list


In [1]:
import torch

In [9]:
a = torch.tensor([1,2,3])
b = torch.tensor([4,5,6])
c = list()
c.append(a)
c.append(b)

In [1]:
import torch
print(torch.__version__)


2.1.0+cu121


In [6]:
a = torch.tensor([1,2,3,4])
b = torch.tensor([2,3,6,7,8,7,8])

In [16]:
import numpy as np
np.random.uniform(0,1)

0.2548407311469362