In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [8]:
df = pd.read_csv("https://raw.githubusercontent.com/med-cab1/ds-api/master/data/cannabis.csv")

In [9]:
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [53]:
df['Criteria'] = df['Effects'] + ',' + df['Flavor']

In [75]:
word_df = df.filter(['Criteria'], axis=1)

In [76]:
word_df.head()

Unnamed: 0,Criteria
0,"Creative,Energetic,Tingly,Euphoric,Relaxed,Ear..."
1,"Relaxed,Aroused,Creative,Happy,Energetic,Flowe..."
2,"Uplifted,Happy,Relaxed,Energetic,Creative,Spic..."
3,"Tingly,Creative,Hungry,Relaxed,Uplifted,Aprico..."
4,"Happy,Relaxed,Euphoric,Uplifted,Talkative,Citr..."


In [77]:
word_df['ID'] = word_df.index + 1

In [78]:
word_df.head()

Unnamed: 0,Criteria,ID
0,"Creative,Energetic,Tingly,Euphoric,Relaxed,Ear...",1
1,"Relaxed,Aroused,Creative,Happy,Energetic,Flowe...",2
2,"Uplifted,Happy,Relaxed,Energetic,Creative,Spic...",3
3,"Tingly,Creative,Hungry,Relaxed,Uplifted,Aprico...",4
4,"Happy,Relaxed,Euphoric,Uplifted,Talkative,Citr...",5


In [79]:
word_df.isnull().sum()

Criteria    46
ID           0
dtype: int64

In [82]:
word_df = word_df.dropna()

In [83]:
word_df.isnull().sum()

Criteria    0
ID          0
dtype: int64

In [84]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

In [85]:
tfidf_matrix = tf.fit_transform(word_df['Criteria'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
results = {} # dictionary created to store the result in a dictionary format (ID : (Score,item_id))#

In [88]:
for idx, row in word_df.iterrows(): #iterates through all the rows

# the below code 'similar_indice' stores similar ids based on cosine similarity. sorts them in ascending order. [:-5:-1] is then used so that the indices with most similarity are got. 0 means no similarity and 1 means perfect similarity#
    similar_indices = cosine_similarities[idx].argsort()[:-5:-1] 

    #stores 5 most similar books, you can change it as per your needs
    similar_items = [(cosine_similarities[idx][i], word_df['ID'][i]) for i in similar_indices]
    results[row['ID']] = similar_items[1:]
    
#below code 'function item(id)' returns a row matching the id along with Criteria. Initially it is a dataframe, then we convert it to a list#
def item(id):
    return word_df.loc[word_df['ID'] == id]['Criteria'].tolist()[0]

def recommend(id, num):
    if (num == 0):
        print("Unable to recommend any book as you have not chosen the number of book to be recommended")
    elif (num==1):
        print("Recommending " + str(num) + " book similar to " + item(id))
        
    else :
        print("Recommending " + str(num) + " books similar to " + item(id))
        
    print("----------------------------------------------------------")
    recs = results[id][:num]
    for rec in recs:
        print("You may also like to read: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

KeyError: 770