In [1]:
import re
import numpy as np
from math import sqrt
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.cross_validation import KFold
from sklearn.feature_extraction import DictVectorizer
import re
from sklearn.feature_extraction.text import CountVectorizer
import json
from pprint import pprint

### Data Extraction and Preprocessing


In [2]:
def load_file(path):
    """ Load the file and extract the data into a list in json format """
    d=[]
    with open(path) as json_data:
        for line in json_data:
            d.append(json.loads(line))
        json_data.close()
    return d
    #pprint(d)
data =load_file(r'reviews_Musical_Instruments.json')
print "Sample data: \n"
pprint(data[0])


Sample data: 

{u'asin': u'0006428320',
 u'helpful': [0, 0],
 u'overall': 3.0,
 u'reviewText': u'The portfolio is fine except for the fact that the last movement of sonata #6 is missing. What should one expect?',
 u'reviewTime': u'03 11, 2014',
 u'reviewerID': u'A1YS9MDZP93857',
 u'reviewerName': u'John Taylor',
 u'summary': u'Parts missing',
 u'unixReviewTime': 1394496000}


In [3]:
def create_lists_from_main_data(main_data):
    """ Extract the uses ID, item ID, Reviews, Ratings from the dataset """
    item = []
    user_id = [ ]
    rating = [ ]
    reviews = [ ]
    for each in main_data:
        item.append("item-"+each['asin']) # appending 'item-' to the item ID for better understanding 
        user_id.append(each['reviewerID'])
        rating.append(each['overall'])
        reviews.append(each['reviewText'])
    return item,rating,reviews,user_id
item,rating,reviews,user_id=create_lists_from_main_data(data)
print "Number of users ",len(user_id)
print "Number of items", len(item)
print "Number of ratings", len(rating)
print "Number of user reviews",len(reviews)

Number of users  500176
Number of items 500176
Number of ratings 500176
Number of user reviews 500176


In [4]:
def create_nested_dict(main_key,nested_key,rating,reviews):
    """ Storing the data set in python dictionary (user { item: (rating,reviews)}"""
    
    data = defaultdict(dict)
    for i in range(0,len(user_id)):
        data[main_key[i]][nested_key[i]] = (rating[i],reviews[i])  
        
    cf_data = defaultdict(dict)
    for i in range(0,len(user_id)):
        cf_data[main_key[i]][nested_key[i]] = rating[i]  
    return cf_data, data    

#creating user->item->rating dictionary
cf_data , user_dict = create_nested_dict(user_id,item,rating,reviews) 

print "Size of the data set:", len(user_dict)

print "\nSample format of data inside dictionary:"
for key in user_dict:
    print key , user_dict[key]
    break

Size of the data set: 339231

Sample format of data inside dictionary:
AX4PH3FN2FEQO {u'item-B004K3AMO2': (3.0, u'This cab is clearly advertised here as containing a tweeter and 4 ohm impedance. I recently received this item and it does not have a tweeter nor is it 4 ohms. It clearly states on the back of the cab that it is 8 ohms. It also does not have speakon connectors, it only has one 1/4 inch connector on the back. The photo is misleading as well as it clearly shows a tweeter. Apparently this is a bare bones simple 4x10 cab, which is why it may be labeled as "extended range." BE WARNED THAT THIS ITEM IS NOT WHAT IS ADVERTISED. However, regardless of it\'s simplicity, it does have that good eden sound (at a shockingly low price) and when paired with my 2x10 with tweeter I get a great sound combination. In a way it is good that the cab is 8 ohms because my 2x10 is also 8 ohms, together I get the 4 ohms needed for my amp.')}


In [5]:
def filter_data(dict_to_filter,threshold):
    """  returns a filtered dictionary and a unique list of nested keys in that filtered dictionary """
    
    #creating new dictionary for storing only filtered items
    filtered_dict={ }   
    
    # create a dictionary of {main_key{nested_key: value}} according to the threshhold value
    for i,each in enumerate(dict_to_filter):
        if len(dict_to_filter[each])> threshold:
            filtered_dict[each] = dict_to_filter[each]
    
    # create a list of unique nested_keys(users/movies) that are in the filtered dictionary
    unique_nested_keys_in_filtered_dict=[]
    for user in filtered_dict:
        for item in filtered_dict[user]:
            if item not in unique_nested_keys_in_filtered_dict:
                unique_nested_keys_in_filtered_dict.append(item)
    
    return filtered_dict,unique_nested_keys_in_filtered_dict

#1.filter users as per the threshold 
#2.find unique number of items for filtered users

filtered_users,unique_items_for_filtered_user=filter_data(user_dict,20)
print "(dict)filtered users with rating as values->",len(filtered_users)
print "(list)unique items for filtered users with rating as values->",len(unique_items_for_filtered_user)


(dict)filtered users with rating as values-> 420
(list)unique items for filtered users with rating as values-> 11664


In [6]:
def convert(data):
    
    """ convert the dictionary from {main_key{nested_key:value}} to {nested_key{main_key:value}}.
    This conversion is to filter a dictionary in both way,eg user-item filter and item-user filter """
    
    item_data = { }
    for user in data:
        for item in data[user]:
            item_data.setdefault(item, {})
            item_data[item][user] = data[user][item]
    return item_data

item_dict = convert(filtered_users)
print "Size of items available" ,len(item_dict)

print "\n Sample item:"
for i in item_dict:
    print i ,  item_dict[i]
    break

Size of items available 11664

 Sample item:
item-B0002PYXA6 {u'A2NYK9KWFMJV4Y': (4.0, u"The good first: these brushes have a heavier and thicker staccato sound than wire bristle brushes. That makes them perfect for Afro-Cuban and Brazilian music. As a substitute for wire brushes they work poorly because legato sweeps are not very distinct. As an additional tool, though they will add to your sound palette.What I do not like is the fact that they are 'throw out' style retractable brushes. You push the bristles into the handle to retract them and a throw motion will extend them. That in itself is clever, but if you use different fan outs for each hand this brush is a distraction. For example, I typically will keep the fan of my right brush tighter so I get a more staccato sound, while fully fanning out on the left one for a legato quarter note sweep. That is awkward to do with these the way they are designed.Physically these have a relatively thick handle at .579 inches, but it's light a

In [7]:
'''filter item dict and then get unique set of users for the filtered items'''
filtered_items, unique_users_for_filtered_item=filter_data(item_dict,2)

print "(dict)filtered items with rating as values->",len(filtered_items)
print "(list)unique users for filtered items with ratings as values->",len(unique_users_for_filtered_item)

for i in filtered_items:
    print i ,  filtered_items[i]
    break
    
item_names = filtered_items.keys()
print len(item_names)    

(dict)filtered items with rating as values-> 774
(list)unique users for filtered items with ratings as values-> 396
item-B000B6FBA2 {u'A3FLOANV9JOFAM': (5.0, u'I am really pleased with the ease of use and accuracy of tone control.  Not just on 6 strings but on 12 string guitars, too.'), u'A3PGQWCSJPCYDH': (3.0, u'You have to turn the knobs every time you want to use this. What a pain. Oh well, it was cheap enough to have around.'), u'A2Z89YMZZJWBHS': (2.0, u"I ordered one of these because I like my other Planet Waves Capo a lot (I believe its the older metal version of this, not the &#34;Lite&#34;). However, this one just does not screw down enough to hold down the strings, especially down at the lower frets where the neck is a little thinner (on a Martin jumbo acoustic). It will barely bend the strings down behind the fret but not enough to prevent buzzing. Your fretting hand will naturally push the capo back and out of position quite easily when playing open chords too.I'm not sure w

In [8]:
def get_movie_reviews_for_countVec(dict_to_use):
    review_dict=[]

    #print filtered_items.values()[0].values()
    for i,item in enumerate(dict_to_use):
      
        for user in dict_to_use[item]:
            #print dict_to_use[item][user][1]
            review_dict.append(dict_to_use[item][user][1])
            break

    return review_dict
    
reviews_to_vectorize=get_movie_reviews_for_countVec(filtered_items)

print "Total Reviews:" , len(reviews_to_vectorize)

print  "\nSample Review:\n" , reviews_to_vectorize[0]


Total Reviews: 774

Sample Review:
I am really pleased with the ease of use and accuracy of tone control.  Not just on 6 strings but on 12 string guitars, too.


In [9]:
def get_rating_csr_matrix(dict_to_convert):
    
    """ Given any dictionary, convert the dictionary to a csr matrix format"""
    v = DictVectorizer(sparse=False)
    list2 = [ ]
    for each_value in dict_to_convert:
        list2.append(dict_to_convert[each_value])
    for l in list2:        
        for key in l:            
            l[key]=float(l[key][0])       
    X = v.fit_transform(list2)
    csr_ratings = csr_matrix(X)
   
    return csr_ratings

#csr matrix of rating with items as rows and users as columns

ratings = get_rating_csr_matrix(filtered_items)
print ratings.shape

csr_ratings=np.transpose(ratings.todense())
print csr_ratings.shape


(774, 396)
(396L, 774L)


### Item - Based Colloborative Filtering

In [10]:
def utility_matrix(dict_to_convert):
    """ Given any dictionary, convert the dictionary to a csr matrix format"""
    v = DictVectorizer(sparse=False)
    list2 = [ ]
    for each_value in dict_to_convert:
        list2.append(dict_to_convert[each_value])
    for l in list2:        
        for key in l:            
            l[key]=float(l[key])       
    X = v.fit_transform(list2)
    csr_ratings = csr_matrix(X)
   
    return csr_ratings

#csr matrix of rating with items as rows and users as columns

ratings = utility_matrix(filtered_items)
print ratings.shape

csr_ratings=np.transpose(ratings.todense())
print csr_ratings.shape

(774, 396)
(396L, 774L)


In [11]:
print ratings.todense()[0]

[[ 5.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  4.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0. 

As you can see the ratings matrix is very sparse. 

In [12]:
def boosted_similarity(data, user1, user2):
    common_rate = [ ]
    for i in data[user1]:
        if i in data[user2]:
            common_rate.append(i)
    n = len(common_rate)
    if n == 0: return 0
    u1 = [data[user1][rate] for rate in common_rate]
    u2 = [data[user2][rate] for rate in common_rate]
    u1 = map(float,u1)
    u2 = map(float,u2)
    sum_u1 = sum([u1[i] for i in range(0,len(u1))])
    sum_u2 = sum([u2[i] for i in range(0,len(u1))])
    square_u1 = sum([u1[i]**2 for i in range(0,len(u1))])
    square_u2 = sum([u2[i]**2 for i in range(0,len(u1))])
    sum_both = sum(u1[i] * u2[i] for i in range(0,len(u1)))
    s_xx = square_u1 - pow(sum_u1,2) / n
    s_yy = square_u2 - pow(sum_u2,2) / n
    num = sum_both / (sum_u1 * sum_u2 / n )
    denom = sqrt( (s_xx * s_yy))
    if denom == 0: 
        return 0
    else: 
        score = num / denom
    return score

sim_scores = [ ]
for i in range(0,len(user_id)-1):
    score = boosted_similarity(cf_data, user_id[i], user_id[i+1])
    sim_scores.append(score)

In [13]:
score = boosted_similarity(cf_data, user_id[2206], user_id[2207])
print "similarity score is", score

similarity score is 0.221785714286


In [14]:
def matrix_factorization(R, P, Q, K, steps=1000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in xrange(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001:
            break
    return P, Q.T

In [15]:
def item_based(ratings):
    """ Prediction of unknown ratings based on the existing ratings"""
    R = np.array(ratings)
    N = len(R)  
    M = len(R[0])
    K = 2
    P = np.random.rand(N,K)
    Q = np.random.rand(M,K)
    nP, nQ = matrix_factorization(R, P, Q, K)
    predicted_rating = np.dot(nP, nQ.T)
    return predicted_rating
predict = item_based(ratings.todense())

In [16]:
def compute_MAE(true, predict):
    """ compute mean absolute error"""
    e = (true - predict )
    error=np.linalg.norm(e)
    return error

def compute_MSE(true, predict):
    """ compute mean squared error """
    error = (true - predict )**2    
    return error

In [17]:
def evaluation(ratings,predict):
    """ Evaluation of recommendation system """
    MAE = [ ]
    MSE=[]
    RMSE=[]
    precision=[]
    ratings_copy = ratings.copy()
    predict_copy = predict.copy()
    for i in range(0,ratings_copy.shape[0]):
        for j in range(0,ratings_copy.shape[1]):
            if ratings_copy[i,j] != 0.:
                mae = compute_MAE(ratings_copy[i,j],predict_copy[i,j])
                MAE.append(mae)
                mse=compute_MSE(ratings_copy[i,j],predict_copy[i,j])
                MSE.append(mse)                
    mean_abs_error = np.sum(MAE) / ratings_copy.todense().shape[0]
    mean_sq_error = np.sum(MSE) / ratings_copy.todense().shape[0]
    root_mean_sq_error = (np.sum(MSE))**0.5 / ratings_copy.todense().shape[0]
    return mean_abs_error,mean_sq_error,root_mean_sq_error
MAE,MSE,RMSE = evaluation(ratings,predict)
print "Mean Absolute Error", MAE
print "Mean Squared Error",MSE
print "Root Mean Squared Error",RMSE

Mean Absolute Error 1.98270245749
Mean Squared Error 1.70030425145
Root Mean Squared Error 0.0468697719075


In [18]:
def item_recommended_user(ratings,predict,item_names):
    ratings_copy = ratings.copy()
    predict_copy = predict.copy()
    predicted_scores = {}
    rec_item_list=[]
    for i in range(0,ratings_copy.shape[0]):
        temp_list=list(predict[i])
        ind=temp_list.index(max(temp_list))
        if ratings_copy[i,ind] == 0.:
            predicted_scores[unique_users_for_filtered_item[i]]=item_names[ind]
        else:
            temp_list.pop(ind) 
            ind2=temp_list.index(max(temp_list))
            predicted_scores[unique_users_for_filtered_item[i]]=item_names[ind2]             
    return predicted_scores
predicted_items_for_users=item_recommended_user(ratings.T,predict.T,item_names)
print "Predicted items for user", len(predicted_items_for_users)
for user in predicted_items_for_users:
    print "user" , user
    print "Item recommended", predicted_items_for_users[user]
    break

Predicted items for user 396
user A3CRG8NYUSK9PS
Item recommended item-B00EUEWR2G


In [19]:
def top_recommendation(ratings,predict,item_names):
    """ Top recommendation for the users """
    ratings_copy = ratings.copy()
    predict_copy = predict.copy()
    scores = { }
    for i in range(0,ratings_copy.shape[0]):
        output1 = [ ]
        for j in range(0,ratings_copy.shape[1]):
            if ratings_copy[i,j] == 0.:
                output1.append((item_names[j],predict_copy[i,j]))
        output1.sort()
        output1.reverse()
        scores[unique_users_for_filtered_item[i]] = output1[0:5]
    return scores
top_recommendations = top_recommendation(ratings.T,predict.T,item_names)

In [20]:
for user in top_recommendations:
    print " User:" , user
    print "\nTop 5 item recommendations: " 
    for item in top_recommendations[user]:
        print item[0]
    break

 User: A3CRG8NYUSK9PS

Top 5 item recommendations: 
item-B00JBIVXGC
item-B00IZCSW3M
item-B00IAD18NM
item-B00HFRXACG
item-B00GTSM8FW


### User Based Recommendation System

In [36]:
user_matrix=csr_matrix(np.transpose(ratings.todense()))
print user_matrix.shape
predict = item_based(user_matrix.todense())
MAE,MSE,RMSE = evaluation(user_matrix,predict)

(396, 774)


In [37]:
print "Mean Absolute Error", MAE
print "Mean Squared Error",MSE
print "Root Mean Squared Error",RMSE

Mean Absolute Error 3.84303607339
Mean Squared Error 3.26310460595
Root Mean Squared Error 0.0907753443746


### Content based Recommendation System

In [21]:
def tokenize(text):
    """Given a string, return a list of tokens such that: (1) all
    tokens are lowercase, (2) all punctuation is removed. Note that underscore (_) 
    is not considered punctuation.
    """
    e=text.lower()
    t=re.sub(r'[^\x00-\x7F]+',' ', e)
    tokens = re.sub('\W+', ' ', t).split()

    return tokens
print "Sample review:"
print reviews[0]+"\n"
token=tokenize(reviews[0])
print "List of tokens:"
print token

Sample review:
The portfolio is fine except for the fact that the last movement of sonata #6 is missing. What should one expect?

List of tokens:
[u'the', u'portfolio', u'is', u'fine', u'except', u'for', u'the', u'fact', u'that', u'the', u'last', u'movement', u'of', u'sonata', u'6', u'is', u'missing', u'what', u'should', u'one', u'expect']


In [22]:
"""Convert a list of reviews into a sparse csr_matrix, where each row is a review and each 
column represents a unique word."""
def do_vectorize(review_list, tokenizer_fn=tokenize, min_df=1,
                 max_df=1., binary=True, ngram_range=(1,1)):
    vec=CountVectorizer(review_list,tokenizer=tokenizer_fn, min_df=min_df,max_df=max_df, binary=binary, 
                        ngram_range=ngram_range,dtype=float)
    matrix=vec.fit_transform(review_list)
    return matrix,vec

print reviews_to_vectorize[0]
matrix, vec = do_vectorize(reviews_to_vectorize)
print ('\n matrix represents %d documents with %d features' % (matrix.shape[0], matrix.shape[1]))

I am really pleased with the ease of use and accuracy of tone control.  Not just on 6 strings but on 12 string guitars, too.

 matrix represents 774 documents with 12149 features


In [23]:
def document_frequencies(movie_term_matrix):
    """ Compute the number of different reviews that each term appears in.It returns numpy array with 
    one element per term in the vocabulary."""
    
    X = movie_term_matrix.copy()
    (i,j) = X.nonzero()
    col_sums = np.zeros(X.shape[1])
    for n in np.asarray(j):
        col_sums[n] += 1.
    return col_sums
    
dfs = document_frequencies(matrix)
print dfs

[  3.  10.   2. ...,   3.   1.   2.]


In [24]:
def tfidf(movie_term_matrix, dfs):
    """ Create a new matrix that transforms item_term_matrix using tfidf.
    Simply divide each value by the document frequency for that term.
      A csr_matrix that is a copy of term_matrix where value
      i,j is divided by the document frequency of term j"""
      
    x = movie_term_matrix.copy()
    x=x/dfs
    a = csr_matrix(x)
    return a
 
# tfidf matrix: row=movie, col=term

tfidf_matrix = tfidf(matrix, dfs)
tfidf_matrix.todense()

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [25]:
print tfidf_matrix.shape
print csr_ratings.shape

(774, 12149)
(396L, 774L)


In [26]:
def make_user_profiles(ratings, tfidf_matrix):
    
    """
    Create a user profile matrix by computing the weighted average of the tfidf
    vectors of each item user has rated. E.g., if a person has rated 
    one item .2 with tfidf vector ([.1, .3]) and rated another item
    .6 with tfidf vector([.2, .4]), then the weighted average is: [(.2*.1 + .6*.2) / (.2 + .6), (.2*.3 + .6*.4) / (.2 + .6)]
     Returns: A csr matrix where each row represents a user and the columns represent terms.
    """
    X = ratings * tfidf_matrix
    s=ratings.sum(axis=1)
    X=X/s
    a = csr_matrix(X)
    return a
print "utility matrix",csr_ratings.shape
print "prediction matrix",predict.T.shape
print "tfidf matrix",tfidf_matrix.shape
user_profiles = make_user_profiles(csr_matrix(ratings.T), tfidf_matrix)
print "csr matrix" , user_profiles.shape
print "\nUser profile Matrix:"
print user_profiles.todense()

utility matrix (396L, 774L)
prediction matrix (396L, 774L)
tfidf matrix (774, 12149)
csr matrix (396, 12149)

User profile Matrix:
[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.03333333  0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]


In [27]:
def norm(vector):
    """
    Compute the Euclidean norm of one row of a csr_matrix.
    https://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
    """
    total=0
    data= vector.data
    for i in data:
        square = np.dot(i,i)
        total += square
    return np.sqrt(total)
    
norm(csr_matrix([3,4]))

5.0

In [28]:
def cosine(v1, v2):
    """
    Compute the cosine similarity between two vectors (rows from a csr_matrix).
    https://en.wikipedia.org/wiki/Cosine_similarity
    """
    
    vec1 = v1.toarray()
    vec2 = v2.toarray()
    numer = np.vdot(vec1,vec2)
    denom = norm(v1) * norm(v2)
    return numer / denom  
    
round(cosine(csr_matrix([2,4]), csr_matrix([3,8])), 5)

0.99451

In [29]:
print user_profiles.shape 
print tfidf_matrix.shape 
print ratings.T.shape

(396, 12149)
(774, 12149)
(396, 774)


In [31]:
def predict_ratings_w_user_profiles(ratings, user_profiles, tfidf_matrix, bias =3.96):
    """
    Make a copy of the ratings matrix. Replace each all the entries with a predicted score
    based on user_profile. Specifically, the ratings of user i for item j is the 
    cosine similarity between user i's profile and item's j tfidf vector.
    """
    ratings_copy = ratings.copy()
    for i in range(0,ratings_copy.shape[0]):
        for j in range(0,ratings_copy.shape[1]):
            if ratings_copy[i,j] == 0.:
                ratings_copy[i,j] = cosine(user_profiles[i],tfidf_matrix[j])
    ratings_copy = ratings_copy.todense() + bias
    return ratings_copy  
    
content_predict = predict_ratings_w_user_profiles(ratings.T, user_profiles, tfidf_matrix)
print content_predict.shape

(396L, 774L)


In [32]:
MAE,MSE,RMSE = evaluation(ratings.T,content_predict)
print "Mean Absolute Error", MAE
print "Mean Squared Error",MSE
print "Root Mean Squared Error",RMSE

Mean Absolute Error 34.29
Mean Squared Error 135.7884
Root Mean Squared Error 0.585576638878


### Hybrid Recommendation System

In [34]:
def hybrid_recommendation (cf_ratings ,ratings, tfidf_matrix):
    """
    Create a user profile matrix by computing the weighted average of the tfidf
    vectors of each item user has rated.
    """
    user_profiles = make_user_profiles(csr_matrix(ratings.T), tfidf_matrix)    
    print "csr matrix" , user_profiles.shape
    print "\nUser profile Matrix:"
    print user_profiles.todense()
    content_predict = predict_ratings_w_user_profiles(ratings.T, user_profiles, tfidf_matrix)
    content_predict = csr_matrix(content_predict)
    hybrid_predict = item_based(content_predict.todense())
    return hybrid_predict
hybrid_predict = hybrid_recommendation (csr_matrix(predict), ratings, tfidf_matrix)   

csr matrix (396, 12149)

User profile Matrix:
[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.03333333  0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]


In [35]:
MAE,MSE,RMSE = evaluation(ratings.T, hybrid_predict)
print "Mean Absolute Error", MAE
print "Mean Squared Error",MSE
print "Root Mean Squared Error",RMSE

Mean Absolute Error 7.21636520021
Mean Squared Error 8.6374124446
Root Mean Squared Error 0.147687669043
