In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from scipy.io import loadmat

In [3]:
# Read the train and test MATLAB files; each result is indexed by variable
# name further down ('trainset' / 'testset').
jokes_train = loadmat('./data/kaggle77b_trainset.mat')
jokes_test = loadmat('./data/kaggle77b_testset.mat')

In [4]:
# Peek at the raw training matrix. The 99. entries look like a "not rated"
# placeholder (the similarity functions only accept values <= 10)
# -- TODO confirm against the dataset description.
jokes_train['trainset']

array([[ -7.82,   8.79,  -9.66, ...,  99.  ,  99.  ,  99.  ],
       [  4.08,  -0.29,   6.36, ...,   0.34,  -4.32,   1.07],
       [ 99.  ,  99.  ,  99.  , ...,  99.  ,  99.  ,  99.  ],
       ..., 
       [  9.13,  -8.16,   8.59, ...,  -8.59,   9.13,   8.45],
       [ 99.  ,  99.  ,  99.  , ...,  99.  ,  99.  ,  99.  ],
       [ 99.  ,  99.  ,  99.  , ...,  99.  ,  99.  ,  99.  ]])

In [5]:
from math import pow, sqrt

In [42]:
def euclidean(ratings, item1, item2):
    """Distance-based similarity between two rows of ``ratings``.

    Only positions where both rows hold a value ``<= 10`` take part; larger
    values are treated as "no rating".

    Returns ``0`` when the rows share no rated position, otherwise
    ``1 / (1 + d)`` where ``d`` is the Euclidean distance over the shared
    positions (so identical rows score 1.0).
    """
    row_a = ratings[item1]
    row_b = ratings[item2]

    # Positions rated in both rows, in ascending order.
    shared = [pos for pos in range(len(row_a))
              if row_a[pos] <= 10 and row_b[pos] <= 10]

    # Nothing in common: similarity is defined as 0.
    if not shared:
        return 0

    squared_distance = sum(pow(row_a[pos] - row_b[pos], 2) for pos in shared)

    return 1 / (1 + sqrt(squared_distance))

In [99]:
def pearson(ratings, item1, item2):
    """Pearson correlation between two rows of ``ratings``.

    Only positions where both rows hold a value ``<= 10`` are used; larger
    values are treated as "no rating".

    Returns ``0`` when the rows share no rated position or when either row
    has no spread over the shared positions (correlation undefined),
    otherwise the correlation coefficient ``num / den``.
    """
    first = ratings[item1]
    second = ratings[item2]

    # Positions rated in both rows
    shared = [i for i in range(len(first))
              if first[i] <= 10 and second[i] <= 10]

    # Find the number of elements
    n = len(shared)

    # If there are no ratings in common, return 0
    if n == 0:
        return 0

    xs = [first[i] for i in shared]
    ys = [second[i] for i in shared]

    # Add up all the preferences
    sum1 = sum(xs)
    sum2 = sum(ys)

    # Sum up the squares
    sum1Sq = sum(pow(x, 2) for x in xs)
    sum2Sq = sum(pow(y, 2) for y in ys)

    # Sum up the products
    pSum = sum(x * y for x, y in zip(xs, ys))

    # Calculate Pearson score
    num = pSum - (sum1 * sum2 / n)
    var1 = sum1Sq - pow(sum1, 2) / n
    var2 = sum2Sq - pow(sum2, 2) / n

    # For a (near-)constant row the variance term can round to a tiny
    # negative number. Taking sqrt of a negative product raises ValueError,
    # and two negative terms would give a meaningless ratio, so treat any
    # non-positive variance as "no correlation".
    if var1 <= 0 or var2 <= 0:
        return 0

    den = sqrt(var1 * var2)

    # The product of two tiny variances can still underflow to zero.
    if den == 0:
        return 0

    return num / den

In [100]:
def topMatches(ratings, item, n=5, similarity=euclidean):
    """Return the ``n`` rows of ``ratings`` most similar to row ``item``.

    ``similarity`` is called as ``similarity(ratings, item, other)`` for
    every other row index. The result is a list of ``(score, other)``
    tuples ordered from highest to lowest.
    """
    scored = []
    for candidate in range(len(ratings)):
        if candidate == item:
            continue  # never compare a row with itself
        scored.append((similarity(ratings, item, candidate), candidate))

    # Ascending sort, flipped so the best scores come first, then truncated.
    return sorted(scored)[::-1][:n]

In [105]:
def calculateSimilarItems(prefs, n=50, similarity=euclidean):
    """Map every row index of ``prefs`` to its ``n`` best matches.

    Each value is the list of ``(score, other)`` tuples produced by
    ``topMatches`` for that row with the given ``similarity`` function.
    """
    return {
        idx: topMatches(prefs, idx, n=n, similarity=similarity)
        for idx in range(len(prefs))
    }

In [106]:
# Pull the rating matrices out of the loaded .mat contents.
ratings_train = jokes_train['trainset']
ratings_test = jokes_test['testset']

In [107]:
# Independent copy of the training matrix; the transpose is taken from it.
ratings_train_copy = ratings_train.copy()

In [108]:
# Transpose so the first index selects what the similarity helpers treat as
# an "item" -- presumably rows become jokes and columns users; TODO confirm
# the orientation of the raw matrix.
ratings_train_transposed = ratings_train_copy.T

In [110]:
# For every row of the transposed training matrix, keep its 20 best matches
# under Pearson correlation. NOTE: every row is compared with every other row
# in pure-Python loops, so this cell may be slow.
item_sim = calculateSimilarItems(ratings_train_transposed, 20, similarity=pearson)

In [111]:
# Independent copy of the test matrix; the transpose is taken from it.
ratings_test_copy = ratings_test.copy()

In [112]:
# Same orientation as the transposed training matrix, so row indices line up
# with the keys of item_sim.
ratings_test_transposed = ratings_test_copy.T

In [116]:
def get_rating(items, ratings_train, user):
    """Predict a rating for ``user`` from a list of neighbouring items.

    ``items`` is a sequence of ``(score, other)`` pairs; the score is not
    used. For every ``other`` whose training rating by ``user`` is ``<= 10``
    the rating is collected, and the result is ``sum / (1 + count)``.
    Because of the extra 1 in the denominator the value is pulled toward 0,
    and it is 0 when no neighbour has a usable rating.
    """
    known = []
    for _score, neighbour in items:
        value = ratings_train[neighbour][user]
        if value <= 10:
            known.append(value)

    # The count is a float once at least one rating was found, and the
    # integer 0 otherwise.
    count = float(len(known)) if known else 0

    return sum(known) / (1 + count)

In [117]:
def fill_ratings(ratings, item_sim, ratings_train):
    """Collect a predicted rating for every entry of ``ratings`` equal to 55.

    ``ratings`` is indexed as ``ratings[item][user]``. For each such entry the
    prediction is ``get_rating(item_sim[item], ratings_train, user)``.

    Returns a dict mapping every user index to the list of its predictions,
    appended in ascending item order (empty if the user has no 55 entries).
    """
    user_count = len(ratings[0])
    submission = {user: [] for user in range(user_count)}

    for item, row in enumerate(ratings):
        for user in range(user_count):
            if row[user] != 55:
                continue  # only the entries marked 55 get a prediction
            predicted = get_rating(item_sim[item], ratings_train, user)
            submission[user].append(predicted)

    return submission

In [118]:
# Predict every test entry marked 55 from the neighbour lists and the
# training ratings; the result maps user index -> list of predictions.
submission = fill_ratings(ratings_test_transposed, item_sim, ratings_train_transposed)

In [119]:
def prepare_submission(submission):
    """Turn per-user predictions into a submission DataFrame.

    ``submission`` maps a 0-based user index to a list of predicted ratings.
    The first three predictions of each user become the columns ``Rating1``,
    ``Rating2`` and ``Rating3``; ``UserId`` is the user index plus one.
    Any predictions beyond the third are ignored.

    Raises ``IndexError`` if a user has fewer than three predictions.
    """
    userId = []
    rating1 = []
    rating2 = []
    rating3 = []

    # .items() works on both Python 2 and Python 3 dicts, whereas
    # .iteritems() exists only on Python 2.
    for user, ratings in submission.items():
        userId.append(user + 1)
        rating1.append(ratings[0])
        rating2.append(ratings[1])
        rating3.append(ratings[2])

    sub_df = pd.DataFrame({'UserId': userId, 'Rating1': rating1, 'Rating2': rating2, 'Rating3': rating3})

    return sub_df

In [120]:
# Shape the per-user predictions into a DataFrame with one row per user.
sub_df = prepare_submission(submission)

In [98]:
# Write the submission without the index column.
# NOTE(review): this cell's execution count (98) is lower than the cells that
# build sub_df, so confirm the file on disk reflects the current sub_df.
sub_df.to_csv('./submission/second.csv', index=False)