In [None]:
import numpy as np
import pandas as pd


## dataframes
'''You can create a DataFrame from various data sources, including:
Dictionaries
Lists
NumPy Arrays
CSV/Excel files
SQL Databases
'''

'You can create a DataFrame from various data sources, including:\nDictionaries\nLists\nNumPy Arrays\nCSV/Excel files\nSQL Databases\n'

In [3]:
BASE_PATH = '../dataset/'
rating_df = pd.read_csv(BASE_PATH + 'rating.csv')
print(rating_df.size)
rating_df = rating_df[   
                        (rating_df['rating'] > 0) 
                        & (rating_df['user_id'] != 42653)
                    ]

23441211


In [5]:
rating_df.head   # userId, animeId, rating

<bound method NDFrame.head of          user_id  anime_id  rating
47             1      8074      10
81             1     11617      10
83             1     11757      10
101            1     15451      10
153            2     11771      10
...          ...       ...     ...
7813732    73515     16512       7
7813733    73515     17187       9
7813734    73515     22145      10
7813735    73516       790       9
7813736    73516      8074       9

[6337151 rows x 3 columns]>

In [6]:
anime_df = pd.read_csv(BASE_PATH + 'anime.csv')
anime_df.head()     ### animeId, name....., rating

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [27]:
anime_df.size   ##  86058

86058

# Pivot ratings

In [30]:
r_df = rating_df.reset_index()   ## add index from 0
r_df = r_df.pivot(index='user_id', columns='anime_id', values='rating')
r_df   ## many are NaN
r_df = r_df.fillna(0)    ## user_id: 69599; anime_id: 9927
r_df.shape

(69599, 9927)

In [9]:
r = r_df.to_numpy()
r

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [10., 10., 10., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

## Model-Based CF (svd)

In [10]:
rating_mean = np.mean(r, axis=1)
rating_mean   ## each user has a mean
rating_mean.size   ## 69599

69599

In [11]:

# Example matrix 'r' where rows are users and columns are items
r_test = np.array([
    [4, 5, 3],  # User 1's ratings
    [2, 1, 4],  # User 2's ratings
    [5, 3, 4]   # User 3's ratings
])

r_test_mean = np.mean(r_test, axis=1)
r_test_mean.reshape(-1, 1)
r_test_demeaned = r_test - r_test_mean.reshape(-1, 1)
r_test_demeaned

array([[ 0.        ,  1.        , -1.        ],
       [-0.33333333, -1.33333333,  1.66666667],
       [ 1.        , -1.        ,  0.        ]])

In [12]:
rating_mean.reshape(-1 ,1)    ## turning series into column vector

r_demeaned = r - rating_mean.reshape(-1, 1)
r_demeaned

array([[-4.02941473e-03, -4.02941473e-03, -4.02941473e-03, ...,
        -4.02941473e-03, -4.02941473e-03, -4.02941473e-03],
       [-1.00735368e-03, -1.00735368e-03, -1.00735368e-03, ...,
        -1.00735368e-03, -1.00735368e-03, -1.00735368e-03],
       [-7.01118163e-02, -7.01118163e-02, -7.01118163e-02, ...,
        -7.01118163e-02, -7.01118163e-02, -7.01118163e-02],
       ...,
       [-1.00735368e-03, -1.00735368e-03, -1.00735368e-03, ...,
        -1.00735368e-03, -1.00735368e-03, -1.00735368e-03],
       [ 9.84587489e+00,  9.84587489e+00,  9.84587489e+00, ...,
        -1.54125113e-01, -1.54125113e-01, -1.54125113e-01],
       [-1.81323663e-03, -1.81323663e-03, -1.81323663e-03, ...,
        -1.81323663e-03, -1.81323663e-03, -1.81323663e-03]])

In [13]:
from scipy.sparse.linalg import svds
## SVD compresses this data into a smaller, more manageable size while maintaining user and item relationships

In [14]:
U, sigma, Vt = svds(r_demeaned, k=10)   ##  k=20: The number of singular values and vectors to compute. This parameter controls the rank of the approximation. 
                                    ## A higher k captures more variance but increases computational cost.
sigma = np.diag(sigma)
preds = np.dot(np.dot(U, sigma), Vt) + \
    rating_mean.reshape(-1, 1)

'''
U: A matrix of left singular vectors with shape (m, k), where m is the number of users.
sigma: A 1D array of singular values with length k.
Vt: A matrix of right singular vectors (transposed) with shape (k, n), where n is the number of items.
'''
preds     ## still np

array([[ 2.40282266e-01,  2.27262023e-02,  1.57908243e-01, ...,
         7.45959953e-03,  8.11917348e-03,  8.56142417e-03],
       [-2.98342529e-02, -5.26298215e-02,  4.80962652e-04, ...,
         6.61773864e-04,  6.37344299e-04,  7.92432119e-04],
       [ 1.61417004e+00,  3.29268629e-01,  1.06906702e+00, ...,
         5.56629395e-03,  5.71915573e-03,  7.10756066e-03],
       ...,
       [ 2.02885232e-01,  1.45219519e-01,  1.10014285e-01, ...,
        -1.19922912e-03, -1.24029911e-03, -1.28833393e-03],
       [ 5.61680079e+00,  2.98405826e+00,  4.77407543e+00, ...,
        -1.82542110e-02, -2.05801603e-02, -1.68072583e-02],
       [ 1.95905327e-01,  8.93779985e-02,  1.34623916e-01, ...,
         1.00584816e-03,  1.12365218e-03,  1.32745048e-03]])

In [15]:
pred_df = pd.DataFrame(preds, columns=r_df.columns)     ## userId, animeId1, animeId2...
pred_df

anime_id,1,5,6,7,8,15,16,17,18,19,...,34238,34239,34240,34252,34283,34324,34325,34349,34367,34475
0,0.240282,0.022726,0.157908,-0.030478,-0.007432,-0.051881,-0.259004,-0.025144,-0.034505,-0.101568,...,0.000510,0.008559,-0.044300,0.008594,0.006009,0.007416,0.006733,0.007460,0.008119,0.008561
1,-0.029834,-0.052630,0.000481,-0.024311,0.006699,0.081111,0.036563,0.023040,0.028332,0.045029,...,0.000847,0.000782,-0.006692,0.000592,0.000751,0.000692,0.000753,0.000662,0.000637,0.000792
2,1.614170,0.329269,1.069067,-0.144261,0.032039,0.629785,-0.372729,0.164086,0.174366,0.588720,...,0.024957,0.005427,0.425722,0.005057,0.010805,0.011412,0.018640,0.005566,0.005719,0.007108
3,3.743573,2.056753,2.968061,0.176287,0.237580,2.084301,1.191756,0.591632,1.156027,1.425413,...,0.022735,-0.023716,0.469932,-0.023389,-0.007721,-0.013685,0.001957,-0.014520,-0.017080,-0.019805
4,0.679966,0.892333,0.392075,0.156445,0.003344,-0.542589,-0.729341,-0.138427,-0.146495,-1.162916,...,0.046239,-0.017637,1.224343,-0.009564,0.005169,-0.001414,0.022282,-0.006645,-0.006738,-0.015147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69594,0.651323,0.412860,0.431854,0.246522,-0.007013,-0.146198,0.281721,-0.034052,-0.035986,0.026884,...,-0.007203,-0.007657,0.041713,-0.007634,-0.008952,-0.008323,-0.008040,-0.007903,-0.008246,-0.008336
69595,3.199532,1.867287,2.021102,0.618402,0.015382,-0.014554,0.843020,0.006242,0.249849,0.541339,...,-0.022781,-0.016718,0.047331,-0.016525,-0.021881,-0.019237,-0.019975,-0.016452,-0.017093,-0.017502
69596,0.202885,0.145220,0.110014,0.038396,-0.003443,-0.026151,0.028749,-0.008178,-0.004689,-0.000692,...,-0.001140,-0.001298,0.005727,-0.001281,-0.001555,-0.001354,-0.001238,-0.001199,-0.001240,-0.001288
69597,5.616801,2.984058,4.774075,1.606487,0.147554,0.527910,0.342307,0.033212,0.502144,2.783222,...,-0.029564,-0.020838,-0.307080,-0.024664,-0.013089,-0.013526,-0.032742,-0.018254,-0.020580,-0.016807


In [23]:
# Original ratings matrix (users x items)
rr = np.array([
    [4, 5, 3, 0],
    [2, 1, 4, 5],
    [5, 3, 4, 2],
    [0, 0, 0, 0]  # User with no ratings
])

# Compute mean ratings for each user (ignoring zeros if they represent missing ratings)
rr_rating_mean = np.true_divide(rr.sum(1), (rr != 0).sum(1))
rr_rating_mean = np.nan_to_num(rr_rating_mean)  # Replace NaN with 0 for users with no ratings

# Demean the ratings
rr_demeaned = rr - rr_rating_mean.reshape(-1, 1)
rr_demeaned[rr == 0] = 0  # Optionally keep missing ratings as zero
# Perform SVD with k=2 (for simplicity in this example)
U, sigma, Vt = svds(rr_demeaned, k=2)

# Convert sigma to a diagonal matrix
sigma = np.diag(sigma)

# Reconstruct the approximated ratings
preds = np.dot(np.dot(U, sigma), Vt) + rr_rating_mean.reshape(-1, 1)

print("Original Ratings:\n", rr)
print("Mean Ratings:\n", rr_rating_mean)
print("Demeaned Ratings:\n", rr_demeaned)
print("Predicted Ratings:\n", preds)



Original Ratings:
 [[4 5 3 0]
 [2 1 4 5]
 [5 3 4 2]
 [0 0 0 0]]
Mean Ratings:
 [4.  3.  3.5 0. ]
Demeaned Ratings:
 [[ 0.   1.  -1.   0. ]
 [-1.  -2.   1.   2. ]
 [ 1.5 -0.5  0.5 -1.5]
 [ 0.   0.   0.   0. ]]
Predicted Ratings:
 [[3.80779532 5.10363423 3.21048208 3.87808837]
 [1.91292608 1.04694911 4.09535408 4.94477073]
 [4.90643093 3.05045121 4.10246687 1.94065099]
 [0.         0.         0.         0.        ]]


  rr_rating_mean = np.true_divide(rr.sum(1), (rr != 0).sum(1))


In [33]:
## Filtering Unrated Movies: The function filters out the movies that the user has already rated by using ~movie_df['anime_id'].isin(existing_user_ratings['anime_id']).
## Merging with Predictions: The filtered movie DataFrame is merged with the user's sorted predicted ratings to append the predicted scores to the movie information.
## Renaming and Sorting: The merged DataFrame renames the predicted ratings column as Predictions, sorts it by the predicted ratings in descending order, and selects the top num recommendations using .iloc[:num].

def recommend_movie(pred_df, user_id, movie_df, origin_rating_df, num=5):
    user_index = user_id - 1
    sorted_user_preds = pred_df.iloc[user_index].sort_values(ascending=False)   ## iloc[] is used for integer-location based indexing 
    print(sorted_user_preds)    ## 9927 animess
    existing_user_ratings = origin_rating_df[origin_rating_df['user_id'] == user_id] ## user_id, anime_id, rating
    existing_ratings_df = existing_user_ratings.merge(movie_df, how='left', left_on='anime_id', right_on='anime_id'). \
                          sort_values(['rating'], ascending=False)
    print(f"User {user_id} has already rated {existing_ratings_df.shape[0]} movies")

### ## 将425 => 'Predictions'
    recommends = (movie_df[~movie_df['anime_id'].isin(existing_user_ratings['anime_id'])]). \
                 merge(sorted_user_preds.reset_index(), how='left', left_on='anime_id', right_on='anime_id'). \
                 rename(columns={user_index: 'Predictions'}). \
                 sort_values('Predictions', ascending=False). \
                 iloc[:num]
    
    return existing_ratings_df, recommends


In [34]:
anime_df = anime_df.rename(columns={'rating': 'old_rating'})
already_rated, user_preds = recommend_movie \
            (pred_df, 426, anime_df, rating_df, num=10)


anime_id
199      5.564479
431      4.665917
164      4.075966
523      3.765099
1535     3.633541
           ...   
10030   -0.542235
28701   -0.555043
918     -0.566346
5258    -0.729409
263     -0.789035
Name: 425, Length: 9927, dtype: float64
User 426 has already rated 10 movies


In [26]:
already_rated

Unnamed: 0,user_id,anime_id,rating,name,genre,type,episodes,old_rating,members
2,426,6211,10,Tokyo Magnitude 8.0,Drama,TV,11,8.19,121349
8,426,21105,10,Love Stage!!,"Comedy, Romance, Shounen Ai",TV,10,7.69,83397
9,426,23441,10,Love Stage!! OVA,"Comedy, Romance, Shounen Ai",OVA,1,7.74,23631
3,426,6702,9,Fairy Tail,"Action, Adventure, Comedy, Fantasy, Magic, Sho...",TV,175,8.22,584590
4,426,9926,9,Sekaiichi Hatsukoi,"Comedy, Drama, Romance, Shounen Ai",TV,12,8.15,94820
6,426,11123,9,Sekaiichi Hatsukoi 2,"Comedy, Drama, Romance, Shounen Ai",TV,12,8.31,69253
7,426,13125,9,Shinsekai yori,"Drama, Horror, Mystery, Sci-Fi, Supernatural",TV,25,8.53,288376
0,426,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,426,442,8,Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shin...,"Adventure, Comedy, Drama, Historical, Shounen,...",Movie,1,7.17,120571
5,426,9982,8,Fairy Tail OVA,"Comedy, Ecchi, Fantasy, Magic, Shounen",OVA,5,7.83,83421


In [1]:

import numpy as np
my_array = np.array([1, 2, 3])
print(my_array)

dict = {}
dict['jiamian'] = 'value'
dict[1] = 2

print(dict)

three_dim_array = [[[1], [2]], [[2], [3]], [[3], [4]]]
print(len(three_dim_array))
print(np.shape(three_dim_array))   ## returns a tuple representing the dimensions of the array.

[1 2 3]
{'jiamian': 'value', 1: 2}
3
(3, 2, 1)
