In [1]:
%matplotlib inline

import csv
import numpy as np 
import scipy as sp
import pandas as pd
from collections import Counter

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.cm as cm

In [2]:
PATH = "./movies_csv/"
NUM_OBS = 500

PLOT_DISABLE = 0
PLOT_ENABLE = 1

In [3]:
def getData(file):
    reader=csv.reader(open(file,"rb"),delimiter=',')
    x=list(reader)
    result=np.array(x).astype(np.float32)
    return result

In [4]:
def genC(n, w):
	w_cumul	= w.cumsum()
	c	= np.zeros((1,n))
	w_arr	= pd.Series(w_cumul)
	c 	= [w_arr[(w_arr > np.random.random())].index[0] for i in range(0,n)]		
	return c

In [5]:
def euclid(a,b):
    return np.linalg.norm(a-b)

In [6]:
def kmeans(x, k, iter_num, plot_req):
    ## Function takes inputs:
    ## x --> input data & k--> k value & iter_num --> number of iterations
    ## Returns:
    ## c --> cluster assignment, 
    ## mu--> mean of classes
    ## L --> Objective function value
    d = np.size(x,axis=1)
    n = np.size(x,axis=0)
    mu = np.random.rand(d,k) # 2xk in our case.
    c = np.zeros((n,1))
    L = np.zeros((1,iter_num))
    #print d,n
    
    #int_res = []  # Intermediate x-mu values for all mu.
    
    for i in range(0,iter_num):
            sum = 0
            temp_sum = np.zeros((d,k)) # holds summation xi (c=k)
            n_k = np.zeros(k) #holds nk
            for j in range(0,n):
                int_res = [euclid(x[j,:], mu[:,m])**2 for m in range(0,k)]
                c[j][0] = np.argmin(int_res)
                sum = sum + np.min(int_res)
                allo_k = int(c[j][0])
                n_k[allo_k] = n_k[allo_k]+1
                temp_sum[:,allo_k] = temp_sum[:,allo_k] + x[j,:]
            mu = temp_sum*1.0/n_k    
                
            L[0][i] = L[0][i-1] if np.isnan(sum) else sum 
    if (plot_req):        
        plt.clf()
        plt.scatter(x[:,0], x[:,1], c=c*500)
        plt.plot(mu[0,:],mu[1,:], 'rx')
        plt.title("Scatter plot for k=%d"%k)
        plt.savefig("./images/k_means_scatter_plot_k=%d.jpg"%k)
        plt.show()
    return c, mu , L

In [21]:
def part1():
    print "Running K-means."
    pi = np.array([0.2, 0.5, 0.3])
    c = genC(NUM_OBS,pi)
    mu = [[0,0],[3,0],[0,3]]
    sig = [[[1,0],[0,1]],
          [[1,0],[0,1]],
          [[1,0],[0,1]]]
    n = np.zeros(np.size(pi))
    samples = []
    for i in range(0,len(n)):
        n[i] = sum([elem==i for elem in c])
        samples=samples+list(np.random.multivariate_normal(mean =mu[i], cov=sig[i], size=int(n[i])))
    samples = np.array(samples)
    #print samples
    
    k_val = [2,3,4,5,6]
    L_list = []
    mu_list = []
    c_list = []
    for k in k_val:
        print "k=%d" %k
        c, mu , L = kmeans(samples, k, 20, PLOT_ENABLE)
        c_list.append(c)
        mu_list.append(mu)
        L_list.append(L)
        #print L
    #print L_list
    
    plt.clf()
    for idx,L in enumerate(L_list):
    #    print str(idx+2) +":"
    #    print (L[0])
        
        plt.plot(L[0], label="L=%d"%(idx+2))
    
    plt.xlabel("Iteration Number")
    plt.ylabel("L values")
    plt.title("Plot of Objective Functions")
    plt.legend(loc='upper right', shadow=True)
    plt.savefig("./images/k_means_L_plot.jpg")
    plt.show()
    
    

In [181]:
def map_infer(M_train, M_test, u, v, lam, var, num_iter):
    
    N1 = np.size(u, axis=0) #N1-->number of users 
    N2 = np.size(v, axis=0) #N2-->number of movies
    d  = np.size(u, axis=1) #d --> rank of matrix
    
    L_list = []
    err_list = []
    
    iden = lam*var*np.eye(d)
    
    j_train = findIdx(M_train)
    i_train = findIdx(M_train.T)

    for iter_number in range(0,num_iter):    
        # Update u
        for i in range(0,N1):
            #u_i = u[i,:]
            tem = 0
            for j in j_train[i]:
                    term_1 = iden
                    term_3 = np.zeros(d)
                    term_2 = np.zeros((d,d))
                    #print "j_list:"
                    #print j_list
                    print "j:"
                    print j
                    M_ij = M_train[i,j]
                    v_j = v[j,:].T
                    term_2 = term_2 + np.dot(v_j,v_j.T)
                    term_3 = term_3 + (M_ij*v_j)
            u[i,:] = np.dot(np.linalg.inv(term_1+term_2), term_3)
    # Update v 
    return L_list, err_list, u, v

In [182]:
def genM(filename):
    temp_list = []
    with open(filename, 'r') as file:
        temp_list.append([elem.split(',') for elem in file.read().split('\n')[:-1]])
    list = np.array(temp_list[0])
    for row in list:
        for idx,col in enumerate(row):
            row[idx]=int(int(col))
    return list

In [183]:
def findIdx(matrix):
    idx_arr=[]
    for idx,row in enumerate(matrix):
        tem_list=[]
        for i,col in enumerate(row):
            if col != 0:
                tem_list.append(i)
        idx_arr.append(tem_list)
    return idx_arr

In [184]:
def part2():
    print "Implementing Matrix Factorization"
    
    ## From the data readme
    ##
    ## This data set consists of:
    ## 100,000 ratings (1-5) from 943 users on 1682 movies. 
    
    N1=943
    N2=1682
    NUM_ITER = 100
    temp_train_list = []
    temp_test_list = []
    with open("./movies_csv/movies.txt", 'r') as file:
        mov_list = file.read().split('\n')[:-1]
        
    train_list = genM("./movies_csv/ratings.txt")
    test_list = genM("./movies_csv/ratings.txt")
#    print test_list
    lam = 10
    d = 10
    var = 0.25
    u = np.random.multivariate_normal(mean =np.zeros(d), cov=(1.0/lam)*np.eye(d), size=N1)
    v = np.random.multivariate_normal(mean =np.zeros(d), cov=(1.0/lam)*np.eye(d), size=N2)
    
    #print u.shape
    #print v.shape
    tem = np.round(np.dot(u,v.T))
    
    #M_train = np.random.randint(1,6,(N1,N2))
    #M_test = np.random.randint(1,6,(N1,N2))
    M_train = np.zeros((N1,N2))
    M_test = np.zeros((N1,N2))
    
    for row in train_list:
        M_train[int(row[0])-1,int(row[1])-1]=int(row[2])
    for row in test_list:
        #print row
        M_test[int(row[0])-1,int(row[1])-1]=int(row[2])

    #print i_train
    L_list, err_list,u,v = map_infer(M_train, M_test, u, v, lam, var, NUM_ITER)
    

In [185]:
def main():
    #part1()
    part2()

In [186]:
main()

Implementing Matrix Factorization
j:
0
j:
1
j:
2
j:
3
j:
4
j:
5
j:
6
j:
7
j:
8
j:
9
j:
10
j:
11
j:
13
j:
14
j:
15
j:
16
j:
18
j:
19
j:
20
j:
21
j:
22
j:
23
j:
24
j:
25
j:
26
j:
27
j:
28
j:
29
j:
30
j:
31
j:
32
j:
33
j:
34
j:
35
j:
36
j:
37
j:
38
j:
39
j:
40
j:
41
j:
42
j:
43
j:
44
j:
45
j:
46
j:
47
j:
48
j:
49
j:
50
j:
51
j:
52
j:
53
j:
54
j:
55
j:
56
j:
57
j:
58
j:
59
j:
60
j:
61
j:
62
j:
63
j:
64
j:
65
j:
66
j:
67
j:
68
j:
69
j:
70
j:
71
j:
72
j:
73
j:
74
j:
75
j:
76
j:
77
j:
78
j:
79
j:
80
j:
81
j:
82
j:
83
j:
84
j:
85
j:
86
j:
87
j:
88
j:
89
j:
91
j:
92
j:
93
j:
94
j:
96
j:
97
j:
98
j:
99
j:
100
j:
101
j:
102
j:
103
j:
104
j:
105
j:
106
j:
107
j:
108
j:
109
j:
110
j:
111
j:
112
j:
113
j:
114
j:
115
j:
116
j:
117
j:
118
j:
119
j:
120
j:
121
j:
122
j:
123
j:
124
j:
125
j:
126
j:
127
j:
128
j:
130
j:
131
j:
132
j:
133
j:
134
j:
135
j:
136
j:
137
j:
138
j:
139
j:
140
j:
141
j:
142
j:
143
j:
144
j:
145
j:
146
j:
147
j:
148
j:
149
j:
150
j:
151
j:
152
j:
153
j:
154
j:
155
j:
156
j:
157
j

ValueError: I/O operation on closed file