In [1]:
# importing libraries
import pandas as pd
import numpy as np

# pass in column names for each CSV as the column name is not given in the file and read them using pandas.
# You can check the column names from the readme file

# reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols,encoding='latin-1')

# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols,encoding='latin-1')

# reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('u.item', sep='|', names=i_cols,
encoding='latin-1')

# After loading the dataset, we should look at the content of each file (users, ratings, items).

# We have 100k ratings for different user and movie combinations. Now finally examine the items file.

# Item Data
#print("\nItem Data :")
#print("shape : ", items.shape)
items = items.drop(columns="IMDb URL")
items = items.drop(columns="release date")
items = items.drop(columns="unknown")
items = items.drop(columns="video release date")
items.head()
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('u1.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('u1.test', sep='\t', names=r_cols, encoding='latin-1')
print(ratings_train.shape)
print(ratings_test.shape)

(80000, 4)
(20000, 4)


In [2]:
import turicreate
train_data = turicreate.SFrame(ratings_train)
test_data = turicreate.SFrame(ratings_test)
popularity_model = turicreate.popularity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating')

In [3]:
popularity_recomm = popularity_model.recommend(users=[1,2],k=10)
popularity_recomm.print_rows(num_rows=20)

+---------+----------+-------------------+------+
| user_id | movie_id |       score       | rank |
+---------+----------+-------------------+------+
|    1    |   1467   |        5.0        |  1   |
|    1    |   1653   |        5.0        |  2   |
|    1    |   1599   |        5.0        |  3   |
|    1    |   1500   |        5.0        |  4   |
|    1    |   1189   |        5.0        |  5   |
|    1    |   1293   |        5.0        |  6   |
|    1    |   1201   |        5.0        |  7   |
|    1    |   1122   |        5.0        |  8   |
|    1    |   1449   | 4.714285714285714 |  9   |
|    1    |   1367   | 4.666666666666667 |  10  |
|    2    |   1467   |        5.0        |  1   |
|    2    |   1653   |        5.0        |  2   |
|    2    |   1599   |        5.0        |  3   |
|    2    |   1500   |        5.0        |  4   |
|    2    |   1189   |        5.0        |  5   |
|    2    |   1293   |        5.0        |  6   |
|    2    |   1201   |        5.0        |  7   |


In [4]:
#Training the model
item_sim_model = turicreate.item_similarity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating', similarity_type='cosine')

#Making recommendations
item_sim_recomm = item_sim_model.recommend(users=[1],k=10)
item_sim_recomm.print_rows(num_rows=20)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    1    |   174    | 1.0417206989394294 |  1   |
|    1    |    98    | 0.9134195433722602 |  2   |
|    1    |    69    | 0.9058463997311063 |  3   |
|    1    |   210    | 0.8839188085661994 |  4   |
|    1    |    56    | 0.871105604701572  |  5   |
|    1    |   202    | 0.8094391557905409 |  6   |
|    1    |   423    | 0.8069580762474625 |  7   |
|    1    |    96    | 0.7885852707756891 |  8   |
|    1    |    82    | 0.7808106678503531 |  9   |
|    1    |   186    | 0.7134801493750678 |  10  |
+---------+----------+--------------------+------+
[10 rows x 4 columns]



In [5]:
pred_data = []
for i in range(1,944):
    temp = []
    for j in item_sim_model.recommend(users=[i],k=20):
        temp.append(j['movie_id'])
    pred_data.append(temp)

In [7]:
ratings_test.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


In [8]:
act_dicti = {}
for ind in ratings_test.index:
    if ratings_test['user_id'][ind] not in act_dicti:
        act_dicti[ratings_test['user_id'][ind]] = []
        act_dicti[ratings_test['user_id'][ind]].append(ratings_test['movie_id'][ind])
    else:
        act_dicti[ratings_test['user_id'][ind]].append(ratings_test['movie_id'][ind])

In [10]:
act_data = []
for key in act_dicti:
    act_data.append(act_dicti[key])

In [44]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [45]:
print(mapk(act_data,pred_data,k=5))

0.39622488501573466


In [14]:
0.41089445654805135

0.41089445654805135

In [15]:
0.3322005274800068

0.3322005274800068

In [16]:
0.050172612197928665

0.050172612197928665

In [17]:
0.05389310220296136

0.05389310220296136

### 0.050380258899676386

In [18]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

In [19]:
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [32]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [21]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [33]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')
hybrid_prediction = (user_prediction+item_prediction)/2

In [51]:
item_prediction

array([[0.44627765, 0.475473  , 0.50593755, ..., 0.58815455, 0.5731069 ,
        0.56669645],
       [0.10854432, 0.13295661, 0.12558851, ..., 0.13445801, 0.13657587,
        0.13711081],
       [0.08568497, 0.09169006, 0.08764343, ..., 0.08465892, 0.08976784,
        0.09084451],
       ...,
       [0.03230047, 0.0450241 , 0.04292449, ..., 0.05302764, 0.0519099 ,
        0.05228033],
       [0.15777917, 0.17409459, 0.18900003, ..., 0.19979296, 0.19739388,
        0.20003117],
       [0.24767207, 0.24489212, 0.28263031, ..., 0.34410424, 0.33051406,
        0.33102478]])

In [52]:
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ...,
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

In [23]:
user_prediction.shape

(943, 1682)

In [24]:
item_prediction.shape

(943, 1682)

In [49]:
second_pred = []
for i in user_prediction:
    some = {k: v for v,k in enumerate(i)}
    soo = list(reversed(sorted(some.keys())))
    soo = soo[1:11]
    temp1 = []
    for i in soo:
        temp1.append(some[i]+1)
    second_pred.append(temp1)

In [50]:
print(mapk(act_data,second_pred,k=10))

0.08374498153170733


In [28]:
from collections import OrderedDict

In [29]:

    some = {k: v for v,k in enumerate(user_prediction[1])}
    type(some)

    soo = list(reversed(sorted(some.keys())))
    soo = soo[1:11]
    soo

    temp1 = []
    for i in soo:
        temp1.append(some[i]+1)
    temp1



[100, 181, 174, 258, 98, 1, 127, 56, 286, 172]