In [2]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from scipy import linalg

In [3]:
# Split into Training Data and Test Data from Data
# Training Data (timestamp >= 1104505203 ~ timestamp <= 1230735592)
# Test Data (timestamp >= 1230735600 이후)
# <userId>,<movieId>,<predicted rating>,<timestamp>
# 구현의 성능 향승을 위한 동시성 사용방안에 대해 고민할 것 
    # 성능 향사을 위해 SVD 등 Linear Algebra 활용
    
# 데이터 불러오는 것도 함수화
# 전처리도 함수화
# pandas 가능하면 사용 지양

In [4]:
df = pd.read_csv("./ml-25m/ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [5]:
# 함수화
import csv
def load_data(file_name):
    with open(file_name, mode='r') as f:
        df = csv.reader(f)
#         for line in df:
#             print(line)
#     return df

In [6]:
# Preprocessing
# load data
rating_data = pd.read_csv("./ml-25m/ratings.csv")
movie_data = pd.read_csv("./ml-25m/movies.csv")
genre_data = pd.read_csv("./ml-25m/genome-scores.csv")

In [7]:
# Check outlier, missing data
rating_data.isnull().sum(), movie_data.isnull().sum(), genre_data.isnull().sum()
# no missing data

(userId       0
 movieId      0
 rating       0
 timestamp    0
 dtype: int64,
 movieId    0
 title      0
 genres     0
 dtype: int64,
 movieId      0
 tagId        0
 relevance    0
 dtype: int64)

In [8]:
# Training data, Test data slicing
train_df = df[(df['timestamp'] >= 1104505203) & (df['timestamp'] <= 1230735592)]
test_df = df[df['timestamp'] >= 1230735600]

In [9]:
# Percenttage of Train_df , Test_df
print("Train: ", len(train_df) / len(df) * 100) # train data is too small...
print("Test: ", len(test_df) / len(df) * 100)

Train:  18.408354048254616
Test:  45.89131361300827


In [10]:
# data를 user_id x movie-id로 rating matrix로 변환
new_train = train_df.set_index("userId")
# Construct rating Matrix
new_train = new_train.groupby(["userId", "movieId"])['rating'].sum().unstack()
# fill 0 value in NaN
new_train = new_train.fillna(0)
new_train
# sparse matrix 

movieId,1,2,3,4,5,6,7,8,9,10,...,64957,64969,64976,64983,64986,64990,64993,64997,65011,65025
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162512,4.0,3.5,3.5,0.0,3.5,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162516,4.5,2.5,0.5,2.0,0.0,4.5,2.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# new_test = test_df.set_index("userId")
# new_test = new_test.groupby(["userId", "movieId"])["rating"].sum().unstack()
# new_test = new_test.fillna(0)
# Unstacked DataFrame is too big, causing int32 overflow

In [10]:
# Matrix 분해 , new_test = new_test1 + new_testf2
test_df
new_test = pd.DataFrame.copy(test_df)
test_df

Unnamed: 0,userId,movieId,rating,timestamp
254,3,1,4.0,1439472215
255,3,29,4.5,1484754967
256,3,32,4.5,1439474635
257,3,50,5.0,1439474391
258,3,111,4.0,1484753849
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [42]:
new_test1 = new_test.iloc[1:3824544]
# len(new_test1)
new_test1 = new_test1.set_index("userId")
new_test1 = new_test1.groupby(["userId", "movieId"])["rating"].sum().unstack().fillna(0)
new_test1

movieId,1,2,3,4,5,6,7,8,9,10,...,208941,208955,209049,209051,209053,209055,209069,209103,209121,209163
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54154,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54158,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
new_test2 = new_test.iloc[8315961:12140250]
# len(new_test2)
new_test2 = new_test2.set_index("userId")
new_test2 = new_test2.groupby(["userId", "movieId"])["rating"].sum().unstack().fillna(0)
new_test2d

movieId,1,2,3,4,5,6,7,8,9,10,...,209141,209143,209145,209147,209151,209153,209155,209157,209169,209171
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
117574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117575,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117577,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117580,3.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,2.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162534,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162538,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# merge new_test1, new_test2
# new_test = pd.concat([new_test1, new_test2])
# new_test = new_test.fillna(0)
# new_test

In [11]:
# train_index = new_train.index
# train_df = new_train[new_train.index.isin(train_index)]

In [12]:
U, s, VT = linalg.svd(new_train, compute_uv = True)

In [13]:
A = U @ linalg.diagsvd(s, new_train.shape[0], new_train.shape[1]) @ VT

In [14]:
A

array([[ 1.23243723e-15, -4.51086518e-15,  2.04546032e-15, ...,
        -2.05001031e-15, -1.24851979e-15,  3.06927396e-19],
       [ 3.50000000e+00, -2.42742319e-14, -2.31132420e-14, ...,
        -6.36020946e-16,  1.10439544e-16, -3.49401896e-15],
       [ 3.50000000e+00, -4.01292939e-15,  2.16728515e-14, ...,
        -1.29228847e-15,  1.24109809e-16,  1.29674361e-15],
       ...,
       [ 2.19620820e-16, -2.64763970e-18, -5.06239223e-15, ...,
        -3.33361675e-16, -1.21430643e-16, -5.03621276e-16],
       [ 4.50000000e+00,  2.50000000e+00,  5.00000000e-01, ...,
        -1.84564244e-15, -6.65493034e-16,  9.96890016e-17],
       [ 4.00000000e+00, -5.48199723e-18, -3.72006871e-15, ...,
        -1.99883767e-15, -2.13155841e-18,  2.94289708e-15]])

In [15]:
np.allclose(A, new_train)

True

In [16]:
new_train

movieId,1,2,3,4,5,6,7,8,9,10,...,64957,64969,64976,64983,64986,64990,64993,64997,65011,65025
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162512,4.0,3.5,3.5,0.0,3.5,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162516,4.5,2.5,0.5,2.0,0.0,4.5,2.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
s

array([3.80081582e+03, 1.18914285e+03, 1.09553007e+03, ...,
       2.77179691e-13, 2.77179691e-13, 4.39746197e-15])

In [19]:
r = s.shape[0] - sum(lx < 1.e-8 for lx in s)

In [20]:
A1 = U[:, :r] * s[:r] @ VT[:r, :]

In [21]:
A1

array([[ 1.23243723e-15, -4.51086518e-15,  2.04546032e-15, ...,
        -2.05001031e-15, -1.24851979e-15,  1.88795174e-17],
       [ 3.50000000e+00, -2.42742319e-14, -2.31132420e-14, ...,
        -6.36020946e-16,  1.10439544e-16, -5.49012875e-17],
       [ 3.50000000e+00, -4.01292939e-15,  2.16728515e-14, ...,
        -1.29228847e-15,  1.24109809e-16, -3.70255042e-17],
       ...,
       [ 2.19620820e-16, -2.64763970e-18, -5.06239223e-15, ...,
        -3.33361675e-16, -1.21430643e-16,  5.17503249e-17],
       [ 4.50000000e+00,  2.50000000e+00,  5.00000000e-01, ...,
        -1.84564244e-15, -6.65493034e-16,  9.96890016e-17],
       [ 4.00000000e+00, -5.48199723e-18, -3.72006871e-15, ...,
        -1.99883767e-15, -2.13155841e-18,  9.56808417e-17]])

In [22]:
A1.shape

(27915, 10533)

In [23]:
# svd를 하고 다시 복원을 하면 빈 값들이 채워진다. 이것을 모델로 만들어서 사용?
# SVD 자체를 numpy로 구현하고 Train data 사용 모델링, 피팅 이후 Test data로 평가...?

In [24]:
U

array([[-1.27012028e-03, -1.84519126e-04,  3.07455523e-03, ...,
        -1.75963319e-04,  4.49771364e-04, -7.15639333e-04],
       [-6.43694762e-03,  7.39938567e-03, -5.48209975e-03, ...,
         1.53738394e-03,  2.62868797e-03, -8.65909196e-03],
       [-1.99277724e-03,  3.59848135e-03,  1.68091115e-03, ...,
         6.90935048e-04,  7.08645413e-03,  1.04703869e-03],
       ...,
       [-4.99584243e-03,  9.77802688e-03,  5.17040224e-03, ...,
         9.52509441e-01,  2.36889458e-03,  1.21443354e-03],
       [-2.62857623e-02, -5.24755382e-02, -9.45123452e-04, ...,
        -1.08420217e-19, -5.55111512e-17,  2.81892565e-18],
       [-7.69795207e-03,  6.72991311e-03, -3.91694492e-03, ...,
        -4.96121327e-04, -3.51554753e-03,  6.70033180e-01]])

In [25]:
s

array([3.80081582e+03, 1.18914285e+03, 1.09553007e+03, ...,
       2.77179691e-13, 2.77179691e-13, 4.39746197e-15])

In [26]:
VT

array([[-7.40819214e-02, -3.34350252e-02, -1.29053131e-02, ...,
        -1.57362451e-04, -2.32780299e-05, -2.48176540e-06],
       [ 7.10145375e-02,  9.55534830e-04, -1.27381117e-02, ...,
        -2.62738984e-04, -2.58713500e-05, -3.11858232e-05],
       [-8.46449752e-04, -3.98757999e-02, -1.27749783e-02, ...,
        -1.11629946e-04, -2.16881827e-05, -2.52141218e-05],
       ...,
       [ 0.00000000e+00, -1.21640460e-26,  8.17969521e-27, ...,
        -3.44361099e-20, -5.79395083e-20, -9.02889533e-17],
       [ 0.00000000e+00,  4.47536636e-26, -3.00944323e-26, ...,
        -7.47332200e-21,  4.65791503e-20,  2.07947365e-17],
       [ 0.00000000e+00, -5.29793848e-18, -9.14076621e-18, ...,
        -2.43448585e-17, -6.76456561e-17, -2.86919848e-03]])

In [13]:
# SVD 를 사용 
# 모든 형태의 Matrix에 적용 가능
# U: AAT 의 고유벡터
# V: ATA 의 고유벡터

In [23]:
# ATA
ATA = new_train.T @ new_train

# AAT
AAT = new_train @ new_train.T

userId,1,2,10,11,12,18,20,43,54,56,...,162488,162490,162495,162498,162510,162511,162512,162513,162516,162521
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1088.00,71.50,22.50,0.00,183.50,75.00,0.00,229.00,12.50,41.50,...,140.75,92.00,141.50,85.25,72.50,80.50,207.25,140.25,433.00,207.75
2,71.50,2814.00,217.00,70.00,563.00,875.00,194.00,1385.00,239.50,356.25,...,788.25,538.00,1440.50,468.00,645.00,75.00,1230.00,591.25,2086.00,718.75
10,22.50,217.00,680.50,20.25,180.25,265.75,107.50,366.00,22.50,160.75,...,249.50,135.25,304.50,54.50,303.75,0.00,226.00,301.50,633.00,185.75
11,0.00,70.00,20.25,256.25,52.25,65.25,42.50,92.25,22.50,47.25,...,99.25,62.00,77.00,46.00,66.75,0.00,61.50,66.75,202.50,82.25
12,183.50,563.00,180.25,52.25,4888.25,789.50,143.00,1531.50,127.75,160.00,...,731.75,439.25,1544.75,292.00,350.75,167.75,1016.25,480.25,3237.25,731.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162511,80.50,75.00,0.00,0.00,167.75,36.00,8.00,190.25,0.00,0.00,...,81.50,41.50,198.25,68.50,25.00,539.75,151.50,50.25,371.75,60.50
162512,207.25,1230.00,226.00,61.50,1016.25,1186.75,228.25,2224.25,181.75,317.25,...,1091.25,661.25,2371.50,413.00,612.50,151.50,4915.75,624.50,3474.25,1252.50
162513,140.25,591.25,301.50,66.75,480.25,648.75,233.50,935.50,103.50,352.50,...,656.00,455.75,747.25,246.50,741.00,50.25,624.50,1387.75,1275.25,489.50
162516,433.00,2086.00,633.00,202.50,3237.25,2228.75,471.75,4745.25,359.75,471.00,...,2599.00,1130.25,6423.50,880.75,1016.25,371.75,3474.25,1275.25,28667.50,2407.00


In [None]:
# Find U
AAT_eigvals, AAT_eigvecs = np.linalg.eig(AAT)

In [9]:
# Find V

In [10]:
train_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
24997287,162521,56174,4.0,1219357742
24997298,162521,59315,4.5,1219339433
24997301,162521,59615,3.5,1219339443
24997302,162521,59725,4.0,1219339392


In [11]:
train_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24997268,24997269,24997278,24997280,24997284,24997287,24997298,24997301,24997302,24997308
userId,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,162521.0,162521.0,162521.0,162521.0,162521.0,162521.0,162521.0,162521.0,162521.0,162521.0
movieId,296.0,306.0,307.0,665.0,899.0,1088.0,1175.0,1217.0,1237.0,1250.0,...,50798.0,50872.0,54004.0,54272.0,55553.0,56174.0,59315.0,59615.0,59725.0,60074.0
rating,5.0,3.5,5.0,5.0,3.5,4.0,3.5,3.5,5.0,4.0,...,1.5,4.0,3.5,4.5,4.0,4.0,4.5,3.5,4.0,4.0
timestamp,1147880000.0,1147869000.0,1147869000.0,1147879000.0,1147869000.0,1147868000.0,1147869000.0,1147878000.0,1147869000.0,1147868000.0,...,1219339000.0,1219339000.0,1219339000.0,1207989000.0,1219339000.0,1219358000.0,1219339000.0,1219339000.0,1219339000.0,1219339000.0


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
24997287,162521,56174,4.0,1219357742
24997298,162521,59315,4.5,1219339433
24997301,162521,59615,3.5,1219339443
24997302,162521,59725,4.0,1219339392


In [17]:
new_train

movieId,1,2,3,4,5,6,7,8,9,10,...,64957,64969,64976,64983,64986,64990,64993,64997,65011,65025
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162512,4.0,3.5,3.5,0.0,3.5,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162516,4.5,2.5,0.5,2.0,0.0,4.5,2.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# new_train 행렬로 변환
train_matrix = new_train.values
train_matrix

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [3.5, 0. , 0. , ..., 0. , 0. , 0. ],
       [3.5, 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [4.5, 2.5, 0.5, ..., 0. , 0. , 0. ],
       [4. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [51]:
x1_data = new_train.index
x2_data = new_train.columns
y_data = new_train.values

In [52]:
# y = ax1 + bx2 ?
n = x1_data.shape[0]
X = np.zeros((n,2))
X[:,0] = x1_data
X[:,1] = x2_data

beta, res, rank, s = linalg.lstsq(X, y_data)

ValueError: could not broadcast input array from shape (10533) into shape (27915)

In [40]:
beta

array([3.04579347e-05, 1.13119000e-05])

In [41]:
x1_data[0], x2_data[0]

(1, 296)

In [44]:
len(A)

27915

In [17]:
# reduced SVD를 사용!
# cold start를 대응하기 위한 방안
# Train이랑 Test 합쳐서 한번에 학습 후 Test에 할당

# r = qTp구현

In [28]:
# Singular Value Decomposition U, s, Vt = svd(new_train)
# new_train data is too sparse
# U: AAT 의 고유벡터
# s: A의 특이값들을 대각항으로 가지는 대각행렬
# VT: ATA의 고유벡터
# s는 eigenvalue에 sqrt값을 취한 값으로 scaling으로 볼수 있음

$r_{ui} = p_{u} \cdot q_{i}$
- $p_{u}$: $U$ 의 한행으로 사용자의 요인을 나타낸다
- $q_{i}$: ${V}^{T}$의 한 열로 영화 요인을 나타냄

In [29]:
U

array([[-1.27012028e-03, -1.84519126e-04,  3.07455523e-03, ...,
        -1.75963319e-04,  4.49771364e-04, -7.15639333e-04],
       [-6.43694762e-03,  7.39938567e-03, -5.48209975e-03, ...,
         1.53738394e-03,  2.62868797e-03, -8.65909196e-03],
       [-1.99277724e-03,  3.59848135e-03,  1.68091115e-03, ...,
         6.90935048e-04,  7.08645413e-03,  1.04703869e-03],
       ...,
       [-4.99584243e-03,  9.77802688e-03,  5.17040224e-03, ...,
         9.52509441e-01,  2.36889458e-03,  1.21443354e-03],
       [-2.62857623e-02, -5.24755382e-02, -9.45123452e-04, ...,
        -1.08420217e-19, -5.55111512e-17,  2.81892565e-18],
       [-7.69795207e-03,  6.72991311e-03, -3.91694492e-03, ...,
        -4.96121327e-04, -3.51554753e-03,  6.70033180e-01]])

In [30]:
VT

array([[-7.40819214e-02, -3.34350252e-02, -1.29053131e-02, ...,
        -1.57362451e-04, -2.32780299e-05, -2.48176540e-06],
       [ 7.10145375e-02,  9.55534830e-04, -1.27381117e-02, ...,
        -2.62738984e-04, -2.58713500e-05, -3.11858232e-05],
       [-8.46449752e-04, -3.98757999e-02, -1.27749783e-02, ...,
        -1.11629946e-04, -2.16881827e-05, -2.52141218e-05],
       ...,
       [ 0.00000000e+00, -1.21640460e-26,  8.17969521e-27, ...,
        -3.44361099e-20, -5.79395083e-20, -9.02889533e-17],
       [ 0.00000000e+00,  4.47536636e-26, -3.00944323e-26, ...,
        -7.47332200e-21,  4.65791503e-20,  2.07947365e-17],
       [ 0.00000000e+00, -5.29793848e-18, -9.14076621e-18, ...,
        -2.43448585e-17, -6.76456561e-17, -2.86919848e-03]])

In [31]:
s

array([3.80081582e+03, 1.18914285e+03, 1.09553007e+03, ...,
       2.77179691e-13, 2.77179691e-13, 4.39746197e-15])

In [35]:
# Construct p_{u}
U.shape, s.shape, VT.shape

((27915, 27915), (10533,), (10533, 10533))

In [38]:
U, s, VT = linalg.svd(new_train, compute_uv = True)

In [40]:
U.shape, s.shape, VT.shape

((27915, 27915), (10533,), (10533, 10533))

In [41]:
r = s.shape[0] - sum(lx < 1.e-8 for lx in s)

In [42]:
pu = U[:, :r] * s[:r]

In [43]:
qu = s[:r] @ VT[:r, :]

In [44]:
pu

array([[-4.82749327e+00, -2.19419599e-01,  3.36826771e+00, ...,
         2.06785883e-05, -8.28371468e-05,  1.20788497e-05],
       [-2.44656524e+01,  8.79892658e+00, -6.00580514e+00, ...,
         5.35650701e-05,  1.01085583e-04,  8.55098342e-06],
       [-7.57417926e+00,  4.27910838e+00,  1.84148872e+00, ...,
        -3.11582707e-05,  4.19008597e-06,  8.87434703e-07],
       ...,
       [-1.89882770e+01,  1.16274708e+01,  5.66433114e+00, ...,
         3.27297757e-05, -2.59386688e-05, -2.73262614e-06],
       [-9.99073414e+01, -6.24009112e+01, -1.03541116e+00, ...,
        -3.54171227e-07,  1.37151988e-06, -1.49808215e-07],
       [-2.92584980e+01,  8.00282807e+00, -4.29113095e+00, ...,
        -4.64532002e-06,  2.17534257e-05, -7.50468522e-06]])

In [45]:
qu

array([-391.29024836,  -39.79124503, -144.50112724, ...,   -4.80125063,
         -0.74099812,   -0.91670203])

In [57]:
pu.shape

(27915, 10326)

In [70]:
g = np.diag(qu)

In [72]:
g.shape

(10533, 10533)

In [74]:
# data에 svd를 바로 적용하여 pu, qu를 구하고 r - pu qu를 최소화하는 sgd를 사용하냐?
# matrix에서 직접 ATA, AAT의 orthogonal 행렬을 구해서 sgd를 사용하냐?

In [75]:
# ATA, AAT의 고유벡터 구하기
 # ATA
ATA = new_train.T @ new_train

# AAT
AAT = new_train @ new_train.T

In [None]:
re_U = np.linalg.eig(AAT)

In [None]:
re_VT = np.linalg.eig(ATA)

In [None]:
print("A")