This notebook walks through the concept of content-based recommendation engines, based on the example shown on Analytics Vidhya:

https://www.analyticsvidhya.com/blog/2015/08/beginners-guide-learn-content-based-recommender-systems/

In [1]:
import pandas as pd
import numpy as np

## Build Item Profile

- Items: articles

- Features: topics in articles

In [2]:
# Binary item-profile matrix: one row per article, one column per topic.
# A 1 means the article covers that topic.
articles = pd.DataFrame(
    [
        [1, 0, 1, 0, 1],
        [0, 1, 1, 1, 0],
        [0, 0, 0, 1, 1],
        [0, 0, 1, 1, 0],
        [0, 1, 0, 0, 0],
        [1, 0, 0, 1, 0],
    ],
    columns=['big_data', 'r', 'python', 'machine_learning', 'learning_paths'],
)
articles

Unnamed: 0,big_data,r,python,machine_learning,learning_paths
0,1,0,1,0,1
1,0,1,1,1,0
2,0,0,0,1,1
3,0,0,1,1,0
4,0,1,0,0,0
5,1,0,0,1,0


Note: Since we have a binary matrix, applying the log-scaled term frequency $1 + \log f_{t,d}$ (with 0 where a topic is absent) would give the same matrix as above, so the next step is to normalize each article's row.

In [4]:
# Count how many topics each article is tagged with (row-wise sum of the
# binary matrix); this is the quantity used to normalize each article.
total_attributes = articles.sum(axis='columns')
total_attributes.to_frame(name='total_attributes')

Unnamed: 0,total_attributes
0,3
1,3
2,2
3,2
4,1
5,2


In [5]:
# Divide every article row by sqrt(number of topics in that article).
# Because the matrix is binary, sqrt(row sum) equals the row's Euclidean
# length, so each article becomes a unit vector.
#
# This is a single vectorized division rather than appending rows in a loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic. The row index and column order of `articles` are preserved.
# NOTE(review): an article with no topics would be divided by zero (NaN row).
articles_normalized = articles.div(np.sqrt(total_attributes), axis=0)

In [6]:
# Pin the topic columns to the same order used by the `articles` frame,
# then display the normalized item profiles.
articles_normalized = articles_normalized.loc[
    :, ['big_data', 'r', 'python', 'machine_learning', 'learning_paths']
]
articles_normalized

Unnamed: 0,big_data,r,python,machine_learning,learning_paths
0,0.57735,0.0,0.57735,0.0,0.57735
1,0.0,0.57735,0.57735,0.57735,0.0
2,0.0,0.0,0.0,0.707107,0.707107
3,0.0,0.0,0.707107,0.707107,0.0
4,0.0,1.0,0.0,0.0,0.0
5,0.707107,0.0,0.0,0.707107,0.0


In [7]:
# Document frequency: the number of articles that mention each topic.
df = articles.sum(axis='index')
# Inverse document frequency: log10(total number of articles / document frequency).
idf = np.log10(articles.shape[0] / df)

In [8]:
# Document frequency per topic (column sums of the binary `articles` matrix).
df

big_data            2
r                   2
python              3
machine_learning    4
learning_paths      2
dtype: int64

In [9]:
# Inverse document frequency per topic: log10(number of articles / df).
idf

big_data            0.477121
r                   0.477121
python              0.301030
machine_learning    0.176091
learning_paths      0.477121
dtype: float64

## Build User Profile

In [10]:
# Ask for feedback on each of the six articles, in order. The answers are kept
# as raw strings here and converted to integers in a later cell.
# NOTE(review): the recorded run answered with 1, -1 or 0 — presumably
# like / dislike / no opinion; the prompt itself does not say so.
(input_1, input_2, input_3,
 input_4, input_5, input_6) = [
    input('Do you like article {}:'.format(article_number))
    for article_number in range(1, 7)
]

Do you like article 1:1
Do you like article 2:-1
Do you like article 3:0
Do you like article 4:0
Do you like article 5:0
Do you like article 6:1


In [11]:
# Convert the raw text answers into integer ratings, keeping article order.
user = list(map(int, (input_1, input_2, input_3, input_4, input_5, input_6)))
user

[1, -1, 0, 0, 0, 1]

In [12]:
# Weight each article's normalized topic vector by the user's rating for that
# article and add them up, giving one preference value per topic.
user_profile = articles_normalized.values.T.dot(user)
user_profile

array([ 1.28445705, -0.57735027,  0.        ,  0.12975651,  0.57735027])

## Predictions

In [13]:
import operator
import functools

def sumproduct(*lists):
    """Return the sum of element-wise products across the given sequences.

    The sequences are walked in parallel with ``zip``, so iteration stops at
    the shortest one. For each position the elements are multiplied together,
    and the products are added up. With no sequences, or any empty sequence,
    the result is 0.
    """
    total = 0
    for values in zip(*lists):
        total += functools.reduce(operator.mul, values)
    return total

In [14]:
# Show the normalized item profiles again for reference before scoring.
articles_normalized

Unnamed: 0,big_data,r,python,machine_learning,learning_paths
0,0.57735,0.0,0.57735,0.0,0.57735
1,0.0,0.57735,0.57735,0.57735,0.0
2,0.0,0.0,0.0,0.707107,0.707107
3,0.0,0.0,0.707107,0.707107,0.0
4,0.0,1.0,0.0,0.0,0.0
5,0.707107,0.0,0.0,0.707107,0.0


In [15]:
# Topic vector of the article at position 4 (the fifth row), which is the one scored below.
articles_normalized.iloc[4]

big_data            0.0
r                   1.0
python              0.0
machine_learning    0.0
learning_paths      0.0
Name: 4, dtype: float64

In [16]:
# Score the article at position 4 as the sum over topics of
# (normalized article weight * user preference * idf), then print the verdict.
# NOTE(review): a score of exactly 0 falls into the "recommend" branch.
print(
    "don't recommend"
    if sumproduct(articles_normalized.iloc[4, :], user_profile, idf) < 0
    else "recommend"
)

don't recommend
