In [402]:
from collections import OrderedDict

In [622]:

# importing libraries
import pandas as pd
import numpy as np

# pass in column names for each CSV as the column name is not given in the file and read them using pandas.
# You can check the column names from the readme file

# reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols,encoding='latin-1')

# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u1.base', sep='\t', names=r_cols,encoding='latin-1')

# reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('u.item', sep='|', names=i_cols,
encoding='latin-1')

# After loading the dataset, we should look at the content of each file (users, ratings, items).

# We have 100k ratings for different user and movie combinations. Now finally examine the items file.

# Item Data
#print("\nItem Data :")

#print("shape : ", items.shape)
items = items.drop(columns="IMDb URL")
items = items.drop(columns="release date")
items = items.drop(columns="unknown")
items = items.drop(columns="video release date")
items.head()
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('u1.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('u1.test', sep='\t', names=r_cols, encoding='latin-1')
print(ratings_train.shape)
print(ratings_test.shape)

(80000, 4)
(20000, 4)


In [623]:
import turicreate
train_data = turicreate.SFrame(ratings_train)
test_data = turicreate.SFrame(ratings_test)
popularity_model = turicreate.popularity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating')

In [624]:
popularity_recomm = popularity_model.recommend(users=[1,2],k=10)
popularity_recomm.print_rows(num_rows=20)

+---------+----------+-------------------+------+
| user_id | movie_id |       score       | rank |
+---------+----------+-------------------+------+
|    1    |   1653   |        5.0        |  1   |
|    1    |   1467   |        5.0        |  2   |
|    1    |   1500   |        5.0        |  3   |
|    1    |   1599   |        5.0        |  4   |
|    1    |   1189   |        5.0        |  5   |
|    1    |   1293   |        5.0        |  6   |
|    1    |   1201   |        5.0        |  7   |
|    1    |   1122   |        5.0        |  8   |
|    1    |   1449   | 4.714285714285714 |  9   |
|    1    |   1367   | 4.666666666666667 |  10  |
|    2    |   1653   |        5.0        |  1   |
|    2    |   1467   |        5.0        |  2   |
|    2    |   1500   |        5.0        |  3   |
|    2    |   1599   |        5.0        |  4   |
|    2    |   1189   |        5.0        |  5   |
|    2    |   1293   |        5.0        |  6   |
|    2    |   1201   |        5.0        |  7   |


In [625]:
#Training the model
item_sim_model = turicreate.item_similarity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating', similarity_type='cosine')

#Making recommendations
item_sim_recomm = item_sim_model.recommend(users=[1],k=10)
item_sim_recomm.print_rows(num_rows=20)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    1    |   174    | 1.1053527699576484 |  1   |
|    1    |    98    | 0.9404675934049819 |  2   |
|    1    |    69    | 0.9301243004975496 |  3   |
|    1    |    56    | 0.9235459936989678 |  4   |
|    1    |   210    | 0.9091658499505785 |  5   |
|    1    |   202    | 0.8433779941664802 |  6   |
|    1    |   423    | 0.8212485216282032 |  7   |
|    1    |    96    | 0.818349755693365  |  8   |
|    1    |    82    | 0.7732973712461966 |  9   |
|    1    |    97    | 0.7402290021931683 |  10  |
+---------+----------+--------------------+------+
[10 rows x 4 columns]



In [626]:
pred_data = []
for i in range(1,944):
    temp = []
    for j in item_sim_model.recommend(users=[i],k=20):
        temp.append(j['movie_id'])
    pred_data.append(temp)

In [627]:
print(pred_data)

[[174, 98, 69, 56, 210, 202, 423, 96, 82, 97, 186, 183, 196, 228, 655, 12, 132, 64, 318, 385], [50, 7, 181, 117, 121, 405, 257, 15, 222, 151, 742, 471, 98, 591, 174, 79, 118, 546, 125, 56], [313, 307, 300, 328, 331, 272, 286, 301, 269, 327, 310, 315, 270, 316, 332, 750, 690, 50, 294, 895], [50, 313, 307, 302, 288, 333, 751, 181, 331, 326, 286, 294, 270, 332, 245, 748, 272, 7, 690, 879], [195, 82, 202, 79, 69, 96, 28, 89, 176, 173, 385, 56, 423, 403, 568, 98, 393, 161, 230, 191], [204, 69, 98, 56, 172, 28, 423, 483, 210, 195, 238, 603, 357, 197, 183, 96, 97, 176, 211, 135], [174, 172, 89, 79, 195, 210, 28, 183, 423, 168, 176, 82, 191, 196, 228, 385, 161, 185, 393, 483], [79, 89, 69, 56, 176, 183, 210, 204, 385, 172, 161, 230, 265, 568, 202, 50, 186, 423, 28, 98], [174, 181, 98, 210, 204, 79, 56, 423, 172, 195, 173, 69, 96, 176, 89, 183, 97, 100, 28, 121], [204, 89, 56, 191, 172, 98, 168, 483, 79, 423, 135, 195, 132, 474, 183, 97, 210, 176, 603, 234], [174, 204, 202, 210, 216, 172, 50, 8

In [628]:
act_dicti = {}
for ind in ratings_test.index:
    if ratings_test['user_id'][ind] not in act_dicti:
        act_dicti[ratings_test['user_id'][ind]] = []
        act_dicti[ratings_test['user_id'][ind]].append(ratings_test['movie_id'][ind])
    else:
        act_dicti[ratings_test['user_id'][ind]].append(ratings_test['movie_id'][ind])

In [629]:
print(act_dicti)

{1: [6, 10, 12, 14, 17, 20, 23, 24, 27, 31, 33, 36, 39, 44, 47, 49, 51, 53, 54, 56, 60, 61, 62, 64, 65, 67, 69, 70, 72, 73, 74, 76, 78, 80, 81, 82, 84, 85, 86, 90, 91, 92, 96, 97, 98, 100, 102, 103, 104, 107, 108, 112, 113, 114, 117, 118, 120, 121, 125, 128, 129, 130, 132, 134, 140, 143, 145, 148, 150, 151, 154, 155, 157, 159, 160, 161, 163, 164, 170, 171, 174, 175, 177, 180, 183, 184, 185, 186, 188, 189, 190, 193, 196, 200, 201, 202, 206, 208, 209, 210, 212, 213, 214, 215, 218, 219, 221, 222, 224, 225, 226, 227, 228, 229, 230, 232, 233, 235, 236, 241, 242, 243, 248, 250, 252, 253, 254, 255, 258, 259, 260, 262, 264, 265, 266, 267, 272], 2: [13, 19, 50, 251, 257, 279, 280, 281, 290, 292, 297, 298, 299, 301, 303, 307, 308, 312, 313, 314, 315, 316], 3: [245, 264, 272, 294, 299, 300, 307, 318, 323, 324, 327, 328, 330, 331, 332, 334, 335, 337, 341, 343, 345, 348, 349, 350, 351, 354], 4: [50, 260, 264, 288, 294, 303, 354, 356, 357, 361], 5: [1, 2, 17, 24, 40, 42, 62, 69, 79, 80, 89, 90, 94, 

In [630]:
act_data = []
for key in act_dicti:
    act_data.append(act_dicti[key])

In [631]:
print(act_data)

[[6, 10, 12, 14, 17, 20, 23, 24, 27, 31, 33, 36, 39, 44, 47, 49, 51, 53, 54, 56, 60, 61, 62, 64, 65, 67, 69, 70, 72, 73, 74, 76, 78, 80, 81, 82, 84, 85, 86, 90, 91, 92, 96, 97, 98, 100, 102, 103, 104, 107, 108, 112, 113, 114, 117, 118, 120, 121, 125, 128, 129, 130, 132, 134, 140, 143, 145, 148, 150, 151, 154, 155, 157, 159, 160, 161, 163, 164, 170, 171, 174, 175, 177, 180, 183, 184, 185, 186, 188, 189, 190, 193, 196, 200, 201, 202, 206, 208, 209, 210, 212, 213, 214, 215, 218, 219, 221, 222, 224, 225, 226, 227, 228, 229, 230, 232, 233, 235, 236, 241, 242, 243, 248, 250, 252, 253, 254, 255, 258, 259, 260, 262, 264, 265, 266, 267, 272], [13, 19, 50, 251, 257, 279, 280, 281, 290, 292, 297, 298, 299, 301, 303, 307, 308, 312, 313, 314, 315, 316], [245, 264, 272, 294, 299, 300, 307, 318, 323, 324, 327, 328, 330, 331, 332, 334, 335, 337, 341, 343, 345, 348, 349, 350, 351, 354], [50, 260, 264, 288, 294, 303, 354, 356, 357, 361], [1, 2, 17, 24, 40, 42, 62, 69, 79, 80, 89, 90, 94, 98, 100, 102, 1

In [632]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [633]:
print(mapk(act_data,pred_data,k=5))

0.413080367949649


In [634]:
0.41089445654805135

0.41089445654805135

In [635]:
0.3322005274800068

0.3322005274800068

In [636]:
0.050172612197928665

0.050172612197928665

In [637]:
0.05389310220296136

0.05389310220296136

### 0.050380258899676386

In [638]:
0.27617886178861784


0.27617886178861784

In [639]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [640]:
ratings.shape

(80000, 4)

In [648]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

In [642]:
n_users

943

In [643]:
n_items

1650

In [644]:
print(data_matrix.shape)

(943, 1650)


In [645]:
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

IndexError: index 1650 is out of bounds for axis 1 with size 1650

In [611]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [612]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [543]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [544]:
user_prediction.shape

(943, 1682)

In [545]:
item_prediction.shape

(943, 1682)

In [551]:
item_prediction

array([[0.44627765, 0.475473  , 0.50593755, ..., 0.58815455, 0.5731069 ,
        0.56669645],
       [0.10854432, 0.13295661, 0.12558851, ..., 0.13445801, 0.13657587,
        0.13711081],
       [0.08568497, 0.09169006, 0.08764343, ..., 0.08465892, 0.08976784,
        0.09084451],
       ...,
       [0.03230047, 0.0450241 , 0.04292449, ..., 0.05302764, 0.0519099 ,
        0.05228033],
       [0.15777917, 0.17409459, 0.18900003, ..., 0.19979296, 0.19739388,
        0.20003117],
       [0.24767207, 0.24489212, 0.28263031, ..., 0.34410424, 0.33051406,
        0.33102478]])

In [552]:
second_pred = []
for i in item_prediction:
    some = {k: v for v,k in enumerate(i)}
    soo = list(reversed(sorted(some.keys())))
    soo = soo[1:11]
    temp1 = []
    for i in soo:
        temp1.append(some[i]+1)
    second_pred.append(temp1)

In [553]:
print(mapk(act_data,second_pred,k=10))

0.00500988907404602


In [549]:
0.3448479607466882

0.3448479607466882

In [649]:
# importing libraries
import pandas as pd
import numpy as np

# pass in column names for each CSV as the column name is not given in the file and read them using pandas.
# You can check the column names from the readme file

# reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols,encoding='latin-1')

# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u1.base', sep='\t', names=r_cols,encoding='latin-1')

# reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('u.item', sep='|', names=i_cols,
encoding='latin-1')

# After loading the dataset, we should look at the content of each file (users, ratings, items).

# We have 100k ratings for different user and movie combinations. Now finally examine the items file.

# Item Data
#print("\nItem Data :")

#print("shape : ", items.shape)
items = items.drop(columns="IMDb URL")
items = items.drop(columns="release date")
items = items.drop(columns="unknown")
items = items.drop(columns="video release date")
items.head()
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('u1.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('u1.test', sep='\t', names=r_cols, encoding='latin-1')
print(ratings_train.shape)
print(ratings_test.shape)
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

(80000, 4)
(20000, 4)


IndexError: index 1650 is out of bounds for axis 1 with size 1650