### Content-Based Filtering - Recommendation Algorithm

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
# Load the product data; later cells read its `id` and `description` columns.
ds = pd.read_csv("sample-data.csv")

In [4]:
# Preview the first rows to confirm the columns loaded as expected.
ds.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [5]:
# (rows, columns) of the loaded frame.
ds.shape

(500, 2)

In [6]:
# Turn each description into a TF-IDF vector over English-stop-word-filtered
# unigrams, bigrams and trigrams.
# min_df must be a float in [0.0, 1.0] or an int >= 1: recent scikit-learn
# releases reject the integer 0 during parameter validation. min_df=1 keeps
# every term that occurs in at least one document, i.e. the whole vocabulary.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(ds['description'])

In [7]:
# Inspect the sparse TF-IDF matrix; its repr reports the shape and the number of stored elements.
tfidf_matrix

<500x52262 sparse matrix of type '<class 'numpy.float64'>'
	with 148989 stored elements in Compressed Sparse Row format>

In [8]:
# TfidfVectorizer L2-normalises every row by default, so the plain dot product
# computed by linear_kernel is already the cosine similarity. When Y is omitted,
# linear_kernel compares the matrix against itself.
cosine_similarities = linear_kernel(tfidf_matrix)

In [9]:
# For every product, store its most similar products as (score, id) tuples,
# best match first, keyed by the product's own id.
MAX_NEIGHBOURS = 98  # number of neighbours kept per product

# Rows of `cosine_similarities` are positional (row k <-> k-th row of `ds`), so
# ids are looked up by position rather than by index label; this stays correct
# even if `ds` does not carry a default 0..n-1 index.
item_ids = ds['id'].tolist()

results = {}
for pos, item_id in enumerate(item_ids):
    scores = cosine_similarities[pos]
    ranked = scores.argsort()[::-1]  # positions sorted by descending similarity
    # Remove the product itself explicitly. Dropping "the first hit" is not
    # safe: with tied scores (e.g. duplicated descriptions) the product is not
    # guaranteed to be ranked first, and a genuine neighbour would be lost.
    neighbours = ranked[ranked != pos][:MAX_NEIGHBOURS]
    results[item_id] = [(scores[i], item_ids[i]) for i in neighbours]

In [None]:
# Show only a small preview: the full mapping holds a long neighbour list for
# every product and would flood the cell output.
{item_id: recs[:3] for item_id, recs in list(results.items())[:3]}

In [11]:
def item(id):
    """Return the product name for ``id``.

    The name is the part of the first matching row's description that precedes
    the first ' - ' separator (the whole description if there is no separator).
    Raises IndexError when no row of ``ds`` has this id.
    """
    matches = ds.loc[ds['id'] == id, 'description']
    first_description = matches.iloc[0]
    name, _, _ = first_description.partition(' - ')
    return name

In [12]:
def recommend(item_id, num):
    """Print the first ``num`` precomputed matches for ``item_id``.

    Nothing is computed here: the (score, id) pairs are looked up in the
    ``results`` dictionary and printed one per line after a short header.
    """
    print(f"Recommending {num!s} products similar to {item(item_id)}...")
    print("-------")
    for score, rec_id in results[item_id][:num]:
        print(f"Recommended: {item(rec_id)} (score:{score!s})")

In [14]:
# Demo: print the 10 closest matches for the product with id 1.
recommend(item_id=1, num=10)

Recommending 10 products similar to Active classic boxers...
-------
Recommended: Cap 1 boxer briefs (score:0.22037921472617467)
Recommended: Active boxer briefs (score:0.16938950913002365)
Recommended: Cap 1 bottoms (score:0.16769458065321555)
Recommended: Cap 1 t-shirt (score:0.1648552774562297)
Recommended: Cap 3 bottoms (score:0.1481261546058637)
Recommended: Cap 1 bottoms (score:0.14577863284367548)
Recommended: Cap 1 t-shirt (score:0.14137642365361247)
Recommended: Active briefs (score:0.13884463426216961)
Recommended: Cap 4 bottoms (score:0.1387953333136303)
Recommended: Cap 2 bottoms (score:0.13813550299091382)
