In [104]:
import pandas as pd

import spacy
from spacy.tokenizer import Tokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

nlp = spacy.load("en_core_web_lg")

tokenizer = Tokenizer(nlp.vocab)

In [105]:
df = pd.read_csv("https://raw.githubusercontent.com/med-cab1/ds-api/master/data/cannabis.csv")

In [106]:
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [107]:
df['Criteria'] = df['Effects'] + ',' + df['Flavor']

In [108]:
word_df = df.filter(['Criteria'], axis=1)

In [109]:
word_df.head()

Unnamed: 0,Criteria
0,"Creative,Energetic,Tingly,Euphoric,Relaxed,Ear..."
1,"Relaxed,Aroused,Creative,Happy,Energetic,Flowe..."
2,"Uplifted,Happy,Relaxed,Energetic,Creative,Spic..."
3,"Tingly,Creative,Hungry,Relaxed,Uplifted,Aprico..."
4,"Happy,Relaxed,Euphoric,Uplifted,Talkative,Citr..."


In [110]:
word_df['ID'] = word_df.index + 1

In [111]:
word_df.head()

Unnamed: 0,Criteria,ID
0,"Creative,Energetic,Tingly,Euphoric,Relaxed,Ear...",1
1,"Relaxed,Aroused,Creative,Happy,Energetic,Flowe...",2
2,"Uplifted,Happy,Relaxed,Energetic,Creative,Spic...",3
3,"Tingly,Creative,Hungry,Relaxed,Uplifted,Aprico...",4
4,"Happy,Relaxed,Euphoric,Uplifted,Talkative,Citr...",5


In [112]:
word_df.isnull().sum()

Criteria    46
ID           0
dtype: int64

In [113]:
word_df = word_df.dropna()

In [114]:
word_df.isnull().sum()

Criteria    0
ID          0
dtype: int64

In [115]:
# Function to use spacy tokenizer
def tokenize(document):    
    doc = nlp(document)   
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and 
                                                     (token.is_punct != True) and
                                                     (token.text != ' ')]

In [117]:
tf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')

In [120]:
dtm = tf.fit_transform(word_df['Criteria'].values.astype('U'))

# Get Word Counts for each document
dtm = pd.DataFrame(dtm.todense(), columns=tf.get_feature_names())
dtm.head()

Unnamed: 0,ammonia,apple,apricot,arouse,aroused,berry,blue,blueberry,butter,cheese,...,tar,tea,tingly,tobacco,tree,tropical,uplifted,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.498659,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.37049,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.700807,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.168801,0.0,0.0,0.374513
3,0.0,0.0,0.653842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.296154,0.0,0.0,0.0,0.146332,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.218023,0.0,0.0,0.0


In [85]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
results = {} # dictionary created to store the result in a dictionary format (ID : (Score,item_id))#

In [88]:
for idx, row in word_df.iterrows(): #iterates through all the rows

# the below code 'similar_indice' stores similar ids based on cosine similarity. sorts them in ascending order. [:-5:-1] is then used so that the indices with most similarity are got. 0 means no similarity and 1 means perfect similarity#
    similar_indices = cosine_similarities[idx].argsort()[:-5:-1] 

    #stores 5 most similar books, you can change it as per your needs
    similar_items = [(cosine_similarities[idx][i], word_df['ID'][i]) for i in similar_indices]
    results[row['ID']] = similar_items[1:]
    
#below code 'function item(id)' returns a row matching the id along with Criteria. Initially it is a dataframe, then we convert it to a list#
def item(id):
    return word_df.loc[word_df['ID'] == id]['Criteria'].tolist()[0]

def recommend(id, num):
    if (num == 0):
        print("Unable to recommend any book as you have not chosen the number of book to be recommended")
    elif (num==1):
        print("Recommending " + str(num) + " book similar to " + item(id))
        
    else :
        print("Recommending " + str(num) + " books similar to " + item(id))
        
    print("----------------------------------------------------------")
    recs = results[id][:num]
    for rec in recs:
        print("You may also like to read: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

KeyError: 770