# Nearest Neighbour and Rating Prediction

look at item-based collaborative filtering to cluster together items having similar ratings and to predict what a certain user might rate a yet unrated item. 

## Prepare dataset

In [1]:
import pandas as pd
df = pd.read_json('/Users/annieclarnette/Desktop/new app/activities.json')
df
#Ranking = 3*like + 1.5*watch later + 1.5*play now + 0.5*select -3*dislike

Unnamed: 0,content_id,activity,user_id,datetime
0,documentary,Select Category,61,2022-04-05 12:09:32.886307
1,panel-discussion,Select Category,61,2022-04-05 12:13:04.402732
2,7978,Select Title,61,2022-04-05 12:13:25.928590
3,7164,Select Title,61,2022-04-05 12:13:52.845408
4,7164,Like,61,2022-04-05 12:14:14.373066
...,...,...,...,...
1773,1457,Like,75,2022-04-05 23:54:30.878571
1774,1828,Select Title,75,2022-04-05 23:54:42.913639
1775,1828,Watch,75,2022-04-05 23:54:50.141263
1776,2071,Select Title,75,2022-04-05 23:54:54.456741


In [2]:
df = df[pd.to_numeric(df['content_id'], errors='coerce').notnull()]
df = df[['content_id', 'activity', 'user_id']]

In [3]:
df = df.replace({'Like': 3, 'Dislike': -3, 'Watchlist': 1.5, 'Watch': 1.5, 'Select Title':0.5})
df = df[pd.to_numeric(df['activity'], errors='coerce').notnull()]

In [4]:
df=df.groupby(['content_id', 'user_id'])['activity'].agg(['sum']).reset_index()
df = df.rename(columns={"sum": "Rating"})
df

Unnamed: 0,content_id,user_id,Rating
0,7,22,0.5
1,14,22,2.0
2,17,32,5.0
3,18,34,5.0
4,22,34,0.5
...,...,...,...
862,7975,31,5.0
863,7976,35,2.0
864,7977,33,0.5
865,7978,35,3.5


## Creating the rating matrix

As covered in week 02, we need to construct a rating matrix out of the ratings dataset. Each row of the matrix are user ratings for a given book.

In [5]:
df=df.pivot(index='content_id', columns='user_id', values='Rating').fillna(0)
df

user_id,1,2,3,4,5,11,12,14,15,21,...,61,62,63,64,65,71,72,73,74,75
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Nearest Neighbors


$$
sim(i, j) = \frac{r_{i} \cdot r_{j}} {||r_{i}||_{2}||r_{j}||_{2}}
$$

compute similarities between books bases on their respective ratings. Remember cosine distance? We will use this with sklearn's NearestNeighbors algorithm.   

In [6]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric='cosine', algorithm='brute')

knn.fit(df.values)

distances, indices = knn.kneighbors(df.values, n_neighbors=6)

In [7]:
#indices
#distances

In [8]:
neighbours = {}

for i in range(0, len(indices)):
    nn = indices[i]
    dist = distances[i]    
    e = nn[0]
    e_isbn = df.index[e]    
    neighbours[e_isbn] = {"nn": [df.index[n] for n in nn[1:]], "dist": [1 - x for x in dist[1:]]}
    
neighbours
    

{1275: {'nn': [611, 586, 1252, 1231, 5145], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 430: {'nn': [89, 7855, 903, 431, 666], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 267: {'nn': [2946, 3600, 717, 937, 5511], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 7437: {'nn': [1956, 1416, 7786, 1987, 1823],
  'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 893: {'nn': [1067, 841, 1158, 921, 6723], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 1233: {'nn': [640, 321, 7970, 7922, 7882], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 2964: {'nn': [6107, 6081, 202, 413, 206], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 293: {'nn': [1010, 1063, 1232, 64, 3310], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 343: {'nn': [4891, 5478, 7856, 90, 1165], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 1220: {'nn': [873, 1198, 1041, 646, 67], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 3287: {'nn': [88, 1155, 2378, 369, 82], 'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 7455: {'nn': [7413, 135, 2170, 7498, 7457],
  'dist': [1.0, 1.0, 1.0, 1.0, 1.0]},
 314: {'nn': [7884, 594, 6723, 1493, 1044],
  '

## Predict rating (based on neighbours)

predict the rating a certain user might give to an item based on this item's neighbours and the potential rating the user gave them. 

In [9]:
def predict_rating(user_id, content_id, neighbours):
    
    if content_id not in neighbours:
        return 0
        
    neighbours = neighbours[content_id]
    
    nn = neighbours['nn']
    dist = neighbours['dist']
    
    numerator = 0
    denominator = 0
    
    for i in range(0, len(nn)):
        
        isbn = nn[i]
        user_rating = df.loc[isbn, user_id]
            
        numerator += user_rating * dist[i]
        denominator += dist[i]
            
    if denominator > 0:
        
        return numerator / denominator
    
    else: 
        
        return 0


In [10]:
all_shows = df.index.tolist()
all_users = df.columns.tolist()

empty=[]

for b in all_shows:
    for u in all_users:
        pr = predict_rating(u, b, neighbours)
        
        result = dict.fromkeys(['content_id','user_id','prediction'])
        result['content_id'] = b
        result['user_id'] = u
        result['prediction'] = pr
        empty.append(result)
        #print(f"{b} - {u}: prediction - {pr}")


In [11]:
empty

[{'content_id': 7, 'user_id': 1, 'prediction': 0},
 {'content_id': 7, 'user_id': 2, 'prediction': 0},
 {'content_id': 7, 'user_id': 3, 'prediction': 0},
 {'content_id': 7, 'user_id': 4, 'prediction': 0},
 {'content_id': 7, 'user_id': 5, 'prediction': 0},
 {'content_id': 7, 'user_id': 11, 'prediction': 0},
 {'content_id': 7, 'user_id': 12, 'prediction': 0},
 {'content_id': 7, 'user_id': 14, 'prediction': 0},
 {'content_id': 7, 'user_id': 15, 'prediction': 0},
 {'content_id': 7, 'user_id': 21, 'prediction': 0},
 {'content_id': 7, 'user_id': 22, 'prediction': 0},
 {'content_id': 7, 'user_id': 23, 'prediction': 0},
 {'content_id': 7, 'user_id': 24, 'prediction': 0},
 {'content_id': 7, 'user_id': 25, 'prediction': 0},
 {'content_id': 7, 'user_id': 31, 'prediction': 0},
 {'content_id': 7, 'user_id': 32, 'prediction': 0},
 {'content_id': 7, 'user_id': 33, 'prediction': 0},
 {'content_id': 7, 'user_id': 34, 'prediction': 0},
 {'content_id': 7, 'user_id': 35, 'prediction': 0},
 {'content_id': 7

In [31]:
df2=pd.DataFrame(empty)
df2 = df2.rename(columns={"content_id": "ID"})

In [27]:
complete = pd.read_csv('/Users/annieclarnette/Desktop/new app/df_recommen_ABC.csv')
complete

Unnamed: 0,title,description,image,description_2,director,Features,Category,Series,diversity,cluster_input,duplicated_detect,duplicates,Cluster,ID,Season,Episode
0,100 Bloody Acres,Organic fertilizer producers and brothers are ...,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW297...,Organic fertilizer producers and brothers are ...,"Colin Caires, Cameron Cairnes","'COMEDY', 'DRAMA', 'MOVIES'",comedy,100 Bloody Acres,0,Organic fertilizer producers and brothers are ...,100 Bloody Acres Organic fertilizer producers ...,False,4,1,,
1,8MMM,An Indigenous crew from a remote radio station...,https://cdn.iview.abc.net.au/thumbs/i/ip/IP133...,An Indigenous crew from a remote radio station...,"Ian Meadows, Shari Sebbens, Geoff Morrell, Tri...",'INDIGENOUS',comedy,8MMM,1,An Indigenous crew from a remote radio station...,8MMM An Indigenous crew from a remote radio st...,False,4,2,,
2,About Sex,About Sex invites teens to take a funny and fr...,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW282...,About Sex invites teens to take a funny and fr...,"Brittany Drisdelle, Nadine Bhabha, Mike Carrozza",'SCIENCE',comedy,About Sex,0,About Sex invites teens to take a funny and fr...,About Sex About Sex invites teens to take a fu...,False,4,3,,
3,Absolutely Anything,Aliens place the fate of humanity in the hands...,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW287...,Aliens place the fate of humanity in the hands...,Terry Jones,'MOVIES',comedy,Absolutely Anything,0,Aliens place the fate of humanity in the hands...,Absolutely Anything Aliens place the fate of h...,False,0,4,,
4,Accidents Happen,A wickedly funny and surprisingly moving fable...,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW297...,A wickedly funny and surprisingly moving fable...,Andrew Lancaster,"'COMEDY', 'DRAMA', 'MOVIES'",comedy,Accidents Happen,0,A wickedly funny and surprisingly moving fable...,Accidents Happen A wickedly funny and surprisi...,False,4,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7976,QI,Sandi Toksvig asks questions on animals beginn...,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW254...,Sandi Toksvig hosts the return of the quiz wit...,Sandi Toksvig,,panel-discussion,QI: Series 19 Episode 13 R Animals,0,Sandi Toksvig asks questions on animals beginn...,QI Sandi Toksvig asks questions on animals beg...,False,0,7977,19.0,6.0
7977,QI,"Bill Bailey, Jack Carroll and Olga Koch join r...",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW254...,Sandi Toksvig hosts the return of the quiz wit...,Sandi Toksvig,,panel-discussion,QI: Series 19 Episode 14 Rogue,0,"Bill Bailey, Jack Carroll and Olga Koch join r...","QI Bill Bailey, Jack Carroll and Olga Koch joi...",False,0,7978,19.0,6.0
7978,QI,Let Sandi Toksvig bring you to your senses wit...,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW321...,Sandi Toksvig hosts the return of the quiz wit...,Sandi Toksvig,,panel-discussion,QI: Series 20 Episode 6 Sensational,0,Let Sandi Toksvig bring you to your senses wit...,QI Let Sandi Toksvig bring you to your senses ...,False,0,7979,20.0,6.0
7979,QI,Sandi Toksvig hosts a specially spooky episode...,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW321...,Sandi Toksvig hosts the return of the quiz wit...,Sandi Toksvig,,panel-discussion,QI: Series 20 Episode 7 Spooky,0,Sandi Toksvig hosts a specially spooky episode...,QI Sandi Toksvig hosts a specially spooky epis...,False,0,7980,20.0,6.0


In [34]:
complete = df2.merge(complete,on='ID')

In [35]:
complete.to_csv('/Users/annieclarnette/Desktop/new app/collab_predictions.csv')