In [2]:
# import statements
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import squarify

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from collections import Counter
from spacy.tokenizer import Tokenizer
nlp = spacy.load("en_core_web_md")

In [3]:
df = pd.read_csv('./final_df_percentsep.csv', sep='%', index_col='Unnamed: 0')

In [4]:
df = df.fillna('none')
df.head()

Unnamed: 0,name,flavors,race,positive_effects,negative_effects,medical_uses,Rating,Description
1,Afpak,"Earthy, Chemical, Pine, SpicyHerbal",hybrid,"Relaxed, Hungry, Happy, Sleepy, Creative, Focused",Dizzy,"Depression, Insomnia, Pain, Stress, Lack of Ap...",4.2,Afpak named for its direct Afghani and Pakista...
2,African,"SpicyHerbal, Pungent, Earthy, Pepper",sativa,"Euphoric, Happy, Creative, Energetic, Talkativ...",Dry Mouth,"Depression, Pain, Stress, Lack of Appetite, Na...",3.9,African refers to the indigenous varieties of ...
3,Afternoon Delight,"Pepper, Flowery, Pine, Pungent, Citrus, Tropical",hybrid,"Relaxed, Hungry, Euphoric, Uplifted, Tingly, T...","Dizzy, Dry Mouth, Paranoid","Depression, Insomnia, Pain, Stress, Cramps, He...",4.8,Afternoon Delight created by Colorado Seed Inc...
4,Afwreck,"Pine, Earthy, Flowery, Pungent",hybrid,"Relaxed, Happy, Creative, Uplifted, Sleepy, Eu...","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Pain, Stress, Headache, Fatigue, Headaches, Mu...",4.2,Afwreck is a hybrid cross of Afghani and Train...
5,Agent Orange,"Citrus, Orange, Sweet, Earthy",hybrid,"Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Depression, Pain, Stress, Nausea, Headache, He...",4.2,Don’t let the name scare you! The only herbici...


In [5]:
df.isnull().sum()

name                0
flavors             0
race                0
positive_effects    0
negative_effects    0
medical_uses        0
Rating              0
Description         0
dtype: int64

In [10]:
# combine all text features into one string:

df['combined_text'] = df.name + " " + df.flavors +  " " + df.race + " " + df.positive_effects + " " + df.negative_effects + " " + df.medical_uses + " " + df.Description
# Removing punctuations from our string
df["combined_text"] = df['combined_text'].str.replace('[^\w\s]',' ')
df["combined_text"] = df['combined_text'].str.replace('none','')

In [11]:
# Tokenizer

STOP_WORDS = nlp.Defaults.stop_words.union([' ','  ', '-PRON-', 'none'])

tokenizer = Tokenizer(nlp.vocab)
tokens = []

for doc in tokenizer.pipe(df['combined_text'], batch_size=500):
    
    doc_tokens = []
    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())
   
    tokens.append(doc_tokens)
    
df['tokens'] = tokens

In [12]:
df['tokens']

1       [afpak, earthy, chemical, pine, spicyherbal, h...
2       [african, spicyherbal, pungent, earthy, pepper...
3       [afternoon, delight, pepper, flowery, pine, pu...
4       [afwreck, pine, earthy, flowery, pungent, hybr...
5       [agent, orange, citrus, orange, sweet, earthy,...
                              ...                        
1459    [yummy, sweet, earthy, pungent, hybrid, relaxe...
1460    [zen, earthy, woody, flowery, sweet, hybrid, r...
1461    [zeta, sage, sage, diesel, sweet, pungent, sat...
1462    [zkittlez, sweet, berry, grape, indica, relaxe...
1463    [zoom, pie, berry, earthy, pungent, indica, re...
Name: tokens, Length: 1463, dtype: object

In [13]:
df['lemmas'] = df['combined_text'].apply(lambda text: [token.lemma_ for token in nlp(text) if (token.is_stop != True) and (token.is_punct != True)])

In [14]:
df['lemmas'] = df['lemmas'].str.join(' ')
df['lemmas']

1       Afpak Earthy   Chemical   Pine   SpicyHerbal h...
2       African SpicyHerbal   pungent   Earthy   Peppe...
3       afternoon Delight Pepper   flowery   Pine   pu...
4       Afwreck Pine   Earthy   Flowery   pungent hybr...
5       Agent Orange Citrus   Orange   sweet   earthy ...
                              ...                        
1459    yummy Sweet   Earthy   pungent hybrid relaxed ...
1460    Zen Earthy   Woody   Flowery   sweet hybrid re...
1461    Zeta Sage Sage   diesel   sweet   pungent sati...
1462    Zkittlez Sweet   Berry   Grape indica relaxed ...
1463    zoom pie berry   Earthy   pungent indica relax...
Name: lemmas, Length: 1463, dtype: object

### Test with lemmatization

In [15]:
tfidf = TfidfVectorizer()
dtm3 = tfidf.fit_transform(df['lemmas'])
dtm3 = pd.DataFrame(dtm3.todense(), columns=tfidf.get_feature_names())
dtm3.head()

Unnamed: 0,11,12,13,1974,43,44,47,51,69,91,...,zesty,zeta,zinger,zion,zip,zkittlez,zombie,zombies,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Fit on DTM
nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nn.fit(dtm3)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [21]:
# with putting it back into a df:
test_input = ["I need something to help with anxiety and pain but has a sweet flavor"]
user_input = tfidf.transform(test_input)
input_df = pd.DataFrame(user_input.todense())
score, recommended_strains = nn.kneighbors(input_df.todense())

AttributeError: 'DataFrame' object has no attribute 'todense'

In [14]:
# without putting it back into a df:
user_input = "I want something to help with lack of appetite"
user_input = pd.Series(user_input)
vect_input = tfidf.transform(user_input)
score, recommended_strains = nn.kneighbors(vect_input.todense())

In [18]:
print(strain_index)

[1213, 1079, 361, 712, 857]


In [18]:
print(score, recommended_strains)

[[1.2646706  1.28472022 1.30472906 1.31082991 1.31861087]] [[1448 1281 1148  620   77]]


### Test without lemmatization:

In [17]:
# without lemmatization & putting into dataframe
dtm2 = tfidf.fit_transform(df['combined_text'])
dtm2 = pd.DataFrame(dtm2.todense(), columns=tfidf.get_feature_names())
# Fit on DTM
nn = NearestNeighbors(n_neighbors=5, leaf_size=50, algorithm='kd_tree')
nn.fit(dtm2)

NearestNeighbors(algorithm='kd_tree', leaf_size=50, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [19]:
test_input = ["Looking for something to help with headaches"]
user_input = tfidf.transform(test_input)
score, strain_index = nn.kneighbors(user_input.todense())
print(score, strain_index)

[[1.30060439 1.30612465 1.30721683 1.31698194 1.33486556]] [[1309  779  272  530  783]]


In [20]:
strains = [df[['name', 'medical_uses']].loc[n] for n in strain_index]
print(strains)

[                name                                       medical_uses
1309  Tangerine Haze  Depression, Pain, Stress, Headache, Fatigue, E...
779     Killer Queen  Depression, Pain, Stress, Nausea, Headache, Fa...
272        Candyland       Depression, Pain, Stress, Fatigue, Headaches
530      Fruit Loops      Depression, Insomnia, Pain, Stress, Headaches
783        King Kong  Depression, Insomnia, Pain, Stress, Muscle Spasms]


### Using Basilica

In [21]:
import basilica
API_KEY = '6b4eb009-61ef-5937-b4e6-444e5c3acc85'
with basilica.Connection(API_KEY) as c:
    embedded = []
    for row in df['combined_text']:
        sentence = row
        embedding = list(c.embed_sentence(sentence))
        embedded.append(embedding)
    df['embedded'] = embedded


In [22]:
df.head()

Unnamed: 0,name,flavors,race,positive_effects,negative_effects,medical_uses,Rating,Description,combined_text,tokens,lemmas,embedded
1,Afpak,"Earthy, Chemical, Pine, SpicyHerbal",hybrid,"Relaxed, Hungry, Happy, Sleepy, Creative, Focused",Dizzy,"Depression, Insomnia, Pain, Stress, Lack of Ap...",4.2,Afpak named for its direct Afghani and Pakista...,Afpak Earthy Chemical Pine SpicyHerbal hybr...,"[afpak, earthy, chemical, pine, spicyherbal, h...",Afpak Earthy Chemical Pine SpicyHerbal h...,"[0.0285604, -0.341568, 0.219712, -0.127979, -0..."
2,African,"SpicyHerbal, Pungent, Earthy, Pepper",sativa,"Euphoric, Happy, Creative, Energetic, Talkativ...",Dry Mouth,"Depression, Pain, Stress, Lack of Appetite, Na...",3.9,African refers to the indigenous varieties of ...,African SpicyHerbal Pungent Earthy Pepper s...,"[african, spicyherbal, pungent, earthy, pepper...",African SpicyHerbal pungent Earthy Peppe...,"[0.00377018, 0.114895, 0.159532, -0.232496, -0..."
3,Afternoon Delight,"Pepper, Flowery, Pine, Pungent, Citrus, Tropical",hybrid,"Relaxed, Hungry, Euphoric, Uplifted, Tingly, T...","Dizzy, Dry Mouth, Paranoid","Depression, Insomnia, Pain, Stress, Cramps, He...",4.8,Afternoon Delight created by Colorado Seed Inc...,Afternoon Delight Pepper Flowery Pine Punge...,"[afternoon, delight, pepper, flowery, pine, pu...",afternoon Delight Pepper flowery Pine pu...,"[-0.0594041, -0.401205, 0.0903683, -0.143796, ..."
4,Afwreck,"Pine, Earthy, Flowery, Pungent",hybrid,"Relaxed, Happy, Creative, Uplifted, Sleepy, Eu...","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Pain, Stress, Headache, Fatigue, Headaches, Mu...",4.2,Afwreck is a hybrid cross of Afghani and Train...,Afwreck Pine Earthy Flowery Pungent hybrid ...,"[afwreck, pine, earthy, flowery, pungent, hybr...",Afwreck Pine Earthy Flowery pungent hybr...,"[-0.21463, -0.12518, 0.580927, -0.130724, -0.2..."
5,Agent Orange,"Citrus, Orange, Sweet, Earthy",hybrid,"Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Depression, Pain, Stress, Nausea, Headache, He...",4.2,Don’t let the name scare you! The only herbici...,Agent Orange Citrus Orange Sweet Earthy hyb...,"[agent, orange, citrus, orange, sweet, earthy,...",Agent Orange Citrus Orange sweet earthy ...,"[-0.0601283, -0.061624, 0.275476, 0.0656168, -..."


In [176]:
df.to_csv('embedded_df.csv', index=False)

In [77]:
import basilica
import json
import numpy as np
import os
import random
import re
import sklearn.decomposition
import sklearn.neighbors
import sklearn.preprocessing
import time
from sklearn.pipeline import Pipeline

In [183]:
data_input = np.stack(df['embedded'].values, axis=0)


scaler = sklearn.preprocessing.StandardScaler(with_std=False)
pca = sklearn.decomposition.PCA(n_components=75, whiten=True)


data_input = scaler.fit_transform(data_input)
data_input = pca.fit_transform(data_input)
data_input = sklearn.preprocessing.normalize(data_input)
print(data_input.shape)

dtm = pd.DataFrame(data_input)
# Fit on DTM
nn3 = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(dtm)

(1463, 75)


In [184]:
user_input = "I need something to help with anxiety and pain but has a sweet flavor"
with basilica.Connection(API_KEY) as c:
    embedded = c.embed_sentence(user_input)

embedded = np.stack([embedded], axis=0)

user_input = scaler.transform(embedded)
user_input = pca.transform(user_input)
user_input = sklearn.preprocessing.normalize(user_input)

score, strain_index = nn3.kneighbors(user_input)
print(score, strain_index)


[[1.07226059 1.09184254 1.09362405 1.11285674 1.1443225 ]] [[ 188 1075  272 1333  116]]


In [175]:
strains = [df[['name', 'flavors', 'medical_uses']].loc[n] for n in strain_index]
print(strains)

[                    name                          flavors  \
1075        Purple Crack           Earthy, Pungent, Sweet   
188   Blue Mountain Fire     Earthy, Flowery, Pepper, Tar   
272            Candyland  Sweet, Earthy, Pungent, Flowery   
1333             The One    Earthy, Pungent, Pine, Diesel   
116    Black Cherry Soda             Sweet, Berry, Earthy   

                                           medical_uses  
1075      Depression, Insomnia, Pain, Stress, Headaches  
188                                  Depression, Stress  
272        Depression, Pain, Stress, Fatigue, Headaches  
1333     Depression, Pain, Stress, Nausea, Inflammation  
116   Depression, Pain, Stress, Headache, Fatigue, H...  ]


### for pickled model

In [220]:
data_input = np.stack(df['embedded'].values, axis=0)


scaler = sklearn.preprocessing.StandardScaler(with_std=False)
pca = sklearn.decomposition.PCA(n_components=75, whiten=True)
normalizer = sklearn.preprocessing.Normalizer().fit(pcad)
nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')

scaled = scaler.fit_transform(data_input)
print(type(scaled))
pcad = pca.fit_transform(scaled)
normd = sklearn.preprocessing.normalize(pcad)
dtm = pd.DataFrame(normd)
nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(dtm)


dtm.shape

<class 'numpy.ndarray'>


(1463, 75)

In [224]:
joblib.dump(scaler, 'scaler.pkl') 
joblib.dump(pca, 'pcaer.pkl')
joblib.dump(nn, 'nnmodel.pkl') 
joblib.dump(normalizer, 'normd.pkl')

['normd.pkl']

In [205]:
model = joblib.load('nn.pkl')
scaled = joblib.load('scaler.pkl')
pcaer = joblib.load('pcaer.pkl')
nnmodel = joblib.load('nnmodel.pkl')
normd = 

In [209]:
target = "I need something to help with anxiety and pain but has a sweet flavor"
with basilica.Connection(API_KEY) as c:
    embedded = c.embed_sentence(target)

embedded = np.stack([embedded], axis=0)
embedded.shape

user_input = scaled.transform(embedded)
user_input = pcaer.transform(user_input)

# score, strain_index = nn3.kneighbors(embedded1)
# print(score, strain_index)

HTTPError: 500 Server Error: Internal Server Error for url: https://api.basilica.ai/embed/text/english/default

In [181]:
import joblib
joblib.dump(scaled, 'scaled.pkl') 

['preprocessor.pkl']

In [182]:
joblib.dump(nn, 'nn.pkl') 

['nn.pkl']