## Load the Data

In [1]:
""" Import Statements """

# Classics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

import spacy
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer
nlp = spacy.load("en_core_web_lg")

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/build-week-med-cabinet-2/ML_Model-Data/master/Cannabis_Strains_Features.csv")
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


## Tokenize Columns

In [3]:
# Merge all the text columns to make a all words columns
df['bag_of_words'] = df['Strain']+" "+df["Effects"] +" "+ df["Flavor"] +" "+ df['Description'] +" "+ df['Type']

In [4]:
df['bag_of_words'].head()

0    100-Og Creative,Energetic,Tingly,Euphoric,Rela...
1    98-White-Widow Relaxed,Aroused,Creative,Happy,...
2    1024 Uplifted,Happy,Relaxed,Energetic,Creative...
3    13-Dawgs Tingly,Creative,Hungry,Relaxed,Uplift...
4    24K-Gold Happy,Relaxed,Euphoric,Uplifted,Talka...
Name: bag_of_words, dtype: object

In [5]:
tokenizer = Tokenizer(nlp.vocab)

In [6]:
tokens = []

""" Make them tokens """
for doc in tokenizer.pipe(df['bag_of_words'], batch_size=500):
    doc_tokens = [token.text for token in doc]
    tokens.append(doc_tokens)

df['tokens'] = tokens

In [7]:
df['tokens'].head()

0    [100-Og, Creative,Energetic,Tingly,Euphoric,Re...
1    [98-White-Widow, Relaxed,Aroused,Creative,Happ...
2    [1024, Uplifted,Happy,Relaxed,Energetic,Creati...
3    [13-Dawgs, Tingly,Creative,Hungry,Relaxed,Upli...
4    [24K-Gold, Happy,Relaxed,Euphoric,Uplifted,Tal...
Name: tokens, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english')

# Create a vocabulary and get word counts per document
#Similar to fit_predict
dtm = tfidf.fit_transform(df['bag_of_words'])

# Print word counts

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.head()

Unnamed: 0,00,001,07,09,10,100,1024,10th,11,115,...,zeta,zeus,zingers,zion,zipping,zkittlez,zombie,zone,zoning,zoom
0,0.0,0.0,0.0,0.0,0.0,0.559372,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.58649,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## KNN Model

In [24]:
# Instantiate
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer


# Fit on TF-IDF Vectors
nn  = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [10]:
# Query Using kneighbors 
nn.kneighbors([dtm.iloc[378]])

(array([[0.        , 1.04064022, 1.09486099, 1.26290318, 1.26293834]]),
 array([[378,  90, 929, 925, 888]], dtype=int64))

In [11]:
df['bag_of_words'][378][:200]

'Bright-Moments Tingly,Uplifted,Sleepy,Giggly,Happy Flowery,Woody,Earthy Bright Moments by Gage Green Group is a delicious cross of heirloom Grape Stomper genetics. Created by crossing Grape Stomper an'

In [12]:
df['bag_of_words'][929][:200]

"Sour-Grapes Happy,Relaxed,Uplifted,Euphoric,Hungry Grape,Sweet,Berry Also called 'Sour Grapes,' Grape Stomper\xa0is a craft hybridstrain created by Gage Green Seeds. A complex cross between breeder JojoR"

## Pickle the Model

In [25]:
from sklearn.externals import joblib 

joblib.dump(nn, 'NN_MJrec.pkl') 

['NN_MJrec.pkl']

In [26]:
from sklearn.externals import joblib

joblib.dump(tfidf, "tfidf.pkl")

['tfidf.pkl']

## Search Function

In [14]:
nn = joblib.load('NN_MJrec.pkl')
tfidf = joblib.load('tfidf.pkl')

In [22]:
def recommend(text):
   # Transform
    text = pd.Series(text)
    vect = tfidf.transform(text)

    # Send to df
    vectdf = pd.DataFrame(vect.todense())
    

    # Return a list of indexes
    top5 = nn.kneighbors([vectdf][0], n_neighbors=5)[1][0].tolist()
   
    
    # Send recomendations to DataFrame
    recommendations_df = df.iloc[top5]
    recommendations_df['index']= recommendations_df.index
    
    return recommendations_df

In [28]:
recommend("I want to a feel like a lemon just cleaned my mouth and wants to have an adventure ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,bag_of_words,tokens,index
1272,Lemon-Og,hybrid,4.3,"Relaxed,Euphoric,Happy,Sleepy,Uplifted","Lemon,Citrus,Earthy",What this skunky indica lacks in longevity it ...,"Lemon-Og Relaxed,Euphoric,Happy,Sleepy,Uplifte...","[Lemon-Og, Relaxed,Euphoric,Happy,Sleepy,Uplif...",1272
672,Django,sativa,4.6,"Energetic,Uplifted,Happy,Talkative,Euphoric","Sweet,Flowery,Earthy","Django means “I awake,” in Romani, and this up...","Django Energetic,Uplifted,Happy,Talkative,Euph...","[Django, Energetic,Uplifted,Happy,Talkative,Eu...",672
1251,Legendary-Lemon,sativa,4.0,"Aroused,Euphoric,Relaxed,Creative,Sleepy","Apricot,Chemical,Blueberry",Legendary Lemon is a sativa-dominant strain th...,"Legendary-Lemon Aroused,Euphoric,Relaxed,Creat...","[Legendary-Lemon, Aroused,Euphoric,Relaxed,Cre...",1251
1274,Lemon-Sativa,sativa,4.2,"Happy,Euphoric,Energetic,Creative,Focused","Lemon,Citrus,Earthy",Lemon Sativa is not your typical high-energy s...,"Lemon-Sativa Happy,Euphoric,Energetic,Creative...","[Lemon-Sativa, Happy,Euphoric,Energetic,Creati...",1274
1816,Sfv-Og,hybrid,4.4,"Happy,Relaxed,Uplifted,Euphoric,Focused","Earthy,Pine,Pungent",SFV OG by Cali Connection is a sativa-dominant...,"Sfv-Og Happy,Relaxed,Uplifted,Euphoric,Focused...","[Sfv-Og, Happy,Relaxed,Uplifted,Euphoric,Focus...",1816
