# SVD test

### read Movielens data

In [125]:
from collections import defaultdict
from pprint import pprint
from pandas import DataFrame
import pandas as pd

def loadMovieLens(path='data/movielens'):
  # Get movie titles
  movies={}
  # ingnore
  for line in open(path+'/u.item',errors='ignore'):
    (id,title)=line.split('|')[0:2]
    movies[id]=title
  
  # Load data
  prefs=defaultdict(dict)
  for line in open(path+'/u.data'):
    (user,movieid,rating,ts)=line.split('\t')
    prefs[int(user)][int(movieid)]=float(rating)
  return prefs,movies

### drop rows that has to much NaN

In [214]:
# data = DataFrame(movies,index=[0])
# data
prefs,movies = loadMovieLens()
data = DataFrame(prefs)
print(data)
counts=[n for n in range(1,1682) if data.ix[n].count()>100]
print(len(counts))
cleanedData = data.ix[counts]
cleanedData=cleanedData.fillna(0)

      1    2    3    4    5    6    7    8    9    10  ...   934  935  936  \
1     5.0  4.0  NaN  NaN  4.0  4.0  NaN  NaN  NaN  4.0 ...   2.0  3.0  4.0   
2     3.0  NaN  NaN  NaN  3.0  NaN  NaN  NaN  NaN  NaN ...   4.0  NaN  NaN   
3     4.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   NaN  NaN  4.0   
4     3.0  NaN  NaN  NaN  NaN  NaN  5.0  NaN  NaN  4.0 ...   5.0  NaN  NaN   
5     3.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   NaN  NaN  NaN   
6     5.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  5.0  NaN ...   NaN  NaN  5.0   
7     4.0  NaN  NaN  NaN  NaN  2.0  5.0  3.0  4.0  4.0 ...   NaN  NaN  4.0   
8     1.0  NaN  NaN  NaN  NaN  4.0  5.0  NaN  NaN  NaN ...   NaN  NaN  NaN   
9     5.0  NaN  NaN  NaN  NaN  4.0  5.0  NaN  NaN  4.0 ...   NaN  1.0  4.0   
10    3.0  2.0  NaN  NaN  NaN  NaN  4.0  NaN  NaN  NaN ...   NaN  NaN  NaN   
11    2.0  NaN  NaN  4.0  NaN  NaN  3.0  3.0  NaN  4.0 ...   NaN  NaN  NaN   
12    5.0  NaN  NaN  NaN  NaN  4.0  5.0  NaN  NaN  5.0 ...   NaN

### svd test

In [127]:
import numpy as np

U,Sigma,VT=np.linalg.svd(cleanedData)
Squ=[value**2 for value in Sigma]
valueSum = sum(Squ)
print(valueSum)
for n in range(0,len(Sigma)+1,20):
    print("%d: %f" % (n,sum(Squ[:n])/valueSum))

955621.0
0: 0.000000
20: 0.621745
40: 0.692749
60: 0.747882
80: 0.792778
100: 0.830471
120: 0.862321
140: 0.889204
160: 0.912028
180: 0.931400
200: 0.947672
220: 0.961278
240: 0.972500
260: 0.981599
280: 0.988843
300: 0.994392
320: 0.998309


# LFM test 

In [253]:
import math

def LFM(user_items, F, N, alpha=0.02, lam=0.01):
    P,Q = InitModel(user_items, F)
    print(P.shape)
    print(Q.shape)
    user_num, item_num = user_items.shape
    for step in range(N):
        total_error = 0.0
        for u in range(user_num):
            for i in range(item_num):
                err = user_items[u][i]-np.dot(P[u],Q[i])
                total_error += err**2
                gp = err*Q[i]+lam*P[u]
                gq = err*P[u]+lam*Q[i]
                P[u] += alpha*gp
                Q[i] += alpha*gq        
        print("step %d: %f"%(step,total_error))
    return P,Q
                    
def InitModel(user_items, F):
    user_num, item_num = user_items.shape
    #归一化
    P = np.random.rand(user_num,F)/math.sqrt(F)
    Q = np.random.rand(item_num,F)/math.sqrt(F)
    return P,Q

def Recommend(user, P, Q):
    rank = dict()
    for f, puf in P[user].items():
        for i, qfi in Q[f].items():
            if i not in rank:
                rank[i] += puf*qfi
    return rank

In [254]:
arr = np.array(cleanedData)
P,Q = LFM(arr,100,1000)
print(P)
print(Q)
print(np.dot(P,Q.T))

(334, 100)
(943, 100)
step 0: 584337.583726
step 1: 490191.278921
step 2: 418864.012668
step 3: 358881.901395
step 4: 321618.951172
step 5: 310418.634103
step 6: 306510.812212
step 7: 303417.442156
step 8: 301803.642731
step 9: 301413.912038
step 10: 301827.688231
step 11: 302747.147785
step 12: 303973.375980
step 13: 305398.626397
step 14: 306872.829500
step 15: 308393.424479
step 16: 311826.061934
step 17: 348221.975988
step 18: 416902.277428
step 19: 545567.512213
step 20: 524578.036288
step 21: 442045.588550
step 22: 396997.031676
step 23: 376903.086617
step 24: 365042.757329
step 25: 357016.699922
step 26: 350958.211261
step 27: 346321.247024
step 28: 342829.326506
step 29: 340029.759324
step 30: 337758.409284
step 31: 335799.030269
step 32: 334216.789684
step 33: 332877.120638
step 34: 331899.110075
step 35: 331735.247804
step 36: 339832.081594
step 37: 348969.708274
step 38: 361326.054613
step 39: 381049.640604
step 40: 375102.573256
step 41: 364623.979324
step 42: 352931.521536

KeyboardInterrupt: 

In [249]:
mat = [[5,5,0,5],[5,0,3,4],[3,4,0,3],[0,0,5,3],[5,4,4,5],[5,4,5,5]]
arr = np.array(mat)
P,Q = LFM(arr,3,100)
print(P)
print(Q)
print(np.dot(P,Q.T))

(6, 3)
(4, 3)
step 0: 315.916934
step 1: 275.978519
step 2: 220.798179
step 3: 159.324154
step 4: 108.119942
step 5: 77.506033
step 6: 64.059623
step 7: 59.247409
step 8: 57.550914
step 9: 56.798797
step 10: 56.321052
step 11: 55.921708
step 12: 55.535920
step 13: 55.135737
step 14: 54.704792
step 15: 54.230633
step 16: 53.701785
step 17: 53.106249
step 18: 52.430523
step 19: 51.658811
step 20: 50.772343
step 21: 49.748806
step 22: 48.561987
step 23: 47.181824
step 24: 45.575145
step 25: 43.707501
step 26: 41.546534
step 27: 39.067262
step 28: 36.259338
step 29: 33.135713
step 30: 29.741041
step 31: 26.157059
step 32: 22.501436
step 33: 18.917414
step 34: 15.554236
step 35: 12.542556
step 36: 9.972335
step 37: 7.880780
step 38: 6.253849
step 39: 5.039088
step 40: 4.163568
step 41: 3.550446
step 42: 3.130299
step 43: 2.846557
step 44: 2.656420
step 45: 2.529215
step 46: 2.443802
step 47: 2.386000
step 48: 2.346458
step 49: 2.319065
step 50: 2.299831
step 51: 2.286146
step 52: 2.276286
s