In [20]:
import numpy as np 
import pandas as pd 

In [21]:
# Read the data after the outlier detection
ori_data=pd.read_csv("Dataset_ready.csv")


In [22]:
# The 'watched_episode_normalized' and the "watching_status" is only used for the outlier detectoin
# We'll use the 'user_id', 'anime_id' and 'rating' for our recommendation.
model_data=ori_data.drop(columns=['watched_episode_normalized','watching_status'])
model_data 


Unnamed: 0,user_id,anime_id,rating
0,47,1,10
1,50,1,9
2,58,1,7
3,59,1,8
4,80,1,8
...,...,...,...
59826646,353404,243,7
59826647,353404,507,7
59826648,353404,392,9
59826649,353404,882,6


In [23]:
'''
   Since the data size is still too large for our computational resource, we'll pick the users 
   whose watching records is more than 500 for our model. 

'''

# Step 1: Calculate the count of movies watched by each user
count = model_data['user_id'].value_counts()

# Step 2: Filter user_movie_count to get users with more than 500 movies
index =count[count > 500].index

# Step 3: Filter the original DataFrame to only include these users
filtered_data = model_data[model_data['user_id'].isin(index)]

In [24]:
filtered_data

Unnamed: 0,user_id,anime_id,rating
45,3901,1,6
66,6253,1,9
80,7238,1,8
86,8326,1,8
95,9324,1,7
...,...,...,...
59826401,353398,3712,3
59826402,353398,14075,4
59826403,353398,763,7
59826404,353398,37976,6


In [25]:
# How many unique users are there. 
len(filtered_data['user_id'].unique())




24170

In [26]:
# How many unique animes are there
len(filtered_data['anime_id'].unique())


16882

In [27]:
'''
Since the user_id and anime_id here is not a continous number set, to use them as the coordinates 
in our User-Item Matrix, we'll need to encode to a set of continous number ranging from 0 to 
len(filtered_data['anime_id'].unique()) -1.

'''




from sklearn.preprocessing import LabelEncoder
# Use LabelEncoder to convert users and movies into categorical
# This step also remaps each column integer range to [0, len(df.column.unique())-1]
# Remapping is useful to reduce memory size of nn.embeddings
# Initialize the encoders# Initialize the LabelEncoders
user_encoder = LabelEncoder()
anime_encoder = LabelEncoder()

# Fit and transform the columns using LabelEncoder
filtered_data['user_id'] = user_encoder.fit_transform(filtered_data['user_id'])
filtered_data['anime_id'] = anime_encoder.fit_transform(filtered_data['anime_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['user_id'] = user_encoder.fit_transform(filtered_data['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['anime_id'] = anime_encoder.fit_transform(filtered_data['anime_id'])


In [30]:
class Train_Test_Split:
    def __init__(self, dataset, traintest_ratio= 0.8, Kfold=5):
        self.data=dataset
        self.traintest_ratio=traintest_ratio
        self.K=Kfold
    def train_test_split(self ):
        """
        I hope to pick the data follow standard normal distributon and pick 90% of the data from
        each user for training and 10% for testing, using a random seed to fix my state 
        
        """
        """
        Splits the data into training and testing datasets while maintaining a ratio and ensuring
        each user is represented in both datasets.
        """
        np.random.seed(2)  # Fixing the random state for reproducibility
        # Separate data per user
        tr_data = pd.DataFrame()
        te_data = pd.DataFrame()
        
        for user_id, group in self.data.groupby('user_id'):
            n_samples = len(group)
            n_train = int(self.traintest_ratio * n_samples)
            
            # Shuffle the indices
            shuffled_indices = np.random.permutation(n_samples)
            
            # Split the indices according to the ratio
            train_indices = shuffled_indices[:n_train]
            test_indices = shuffled_indices[n_train:]
            
            # Append split data to the respective DataFrames
            tr_data = pd.concat([tr_data, group.iloc[train_indices]])
            te_data = pd.concat([te_data, group.iloc[test_indices]])
        
        return tr_data, te_data
        
        


In [31]:
a=Train_Test_Split(filtered_data )

In [32]:
tr,te=a.train_test_split()

In [36]:
#Show traing_set
tr

Unnamed: 0,user_id,anime_id,rating
2195640,0,6060,9
2195538,0,10898,7
2195646,0,8284,8
2195347,0,10997,5
2195825,0,57,9
...,...,...,...
59826395,24169,6734,7
59826314,24169,6132,7
59826364,24169,4831,8
59826209,24169,8868,7


In [37]:
#Save training set
tr.to_csv("training_set.csv",index=False )

In [39]:
#Show the number of rows for each user to show the balance
print(tr.groupby('user_id').size())

user_id
0        567
1        545
2        544
3        401
4        467
        ... 
24165    764
24166    459
24167    478
24168    596
24169    612
Length: 24170, dtype: int64


In [40]:
#Show testing_set
te

Unnamed: 0,user_id,anime_id,rating
2195640,0,6060,9
2195538,0,10898,7
2195646,0,8284,8
2195347,0,10997,5
2195825,0,57,9
...,...,...,...
59826395,24169,6734,7
59826314,24169,6132,7
59826364,24169,4831,8
59826209,24169,8868,7


In [41]:
# Save the testing set 
te.to_csv("testing_set.csv",index=False )

In [42]:
#Show the number of rows for each user to show the balance
print(te.groupby('user_id').size())

user_id
0        142
1        137
2        137
3        101
4        117
        ... 
24165    192
24166    115
24167    120
24168    149
24169    154
Length: 24170, dtype: int64
