In [13]:
import pandas as pd

In [14]:
# Load the strain data from the CSV file used by the API and preview the first rows:
csv_path = '../app/api/cannabis_new.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Id,Strain,Type,Rating,Effects,Description,Flavors,Nearest
0,0,Kelly Hill Gold,indica,5.0,"Happy,Energetic,Euphoric,Talkative,Aroused",Cultivated by Joseph Arthur Botanicals in Colo...,"Pepper,Earthy,Coffee",9928974391841877
1,1,Spyder Mon,hybrid,5.0,"Uplifted,Creative,Focused,Happy,Relaxed",Spyder Mon is an uplifting CBD strain with a g...,"Citrus,Earthy,Sweet",12181571627223700
2,2,Mochi,hybrid,5.0,"Sleepy,Happy,Hungry,Relaxed,Tingly",Mochi by Sherbinski is another strain that lea...,"Pungent,Minty,Flowery",2614457208705130
3,3,Molokai Purpz,indica,5.0,"Aroused,Creative,Euphoric,Relaxed,Sleepy",Moloka’i Purpz is a luscious Hawaiian landrace...,"Berry,Grape,Sweet",31478562106350390
4,4,Monolith,indica,5.0,"Relaxed,Sleepy,Tingly,Euphoric,Focused",Monolith is an indica-dominant strain with Afg...,"Pungent,Earthy,Pine",413978621094214000


In [15]:
# Inspect the raw 'Effects' string of the first row (label 0):
df.loc[0, 'Effects']

'Happy,Energetic,Euphoric,Talkative,Aroused'

In [16]:
# (number of rows, number of columns) of the loaded frame:
df.shape

(2155, 8)

In [17]:
# Check for duplicate strain names (any count above 1 means the name appears more than once):
strain_counts = df['Strain'].value_counts()
strain_counts

B Witched               2
Ace Killer Og           1
Velvet Bud              1
Grapefruit Diesel       1
Platinum Kush           1
                       ..
Alien Inferno           1
Diesel Duff             1
Hindu Kush              1
Platinum Sour Diesel    1
Ancient Kush            1
Name: Strain, Length: 2154, dtype: int64

In [18]:
import spacy
# Load the spaCy pipeline named 'en_core_web_lg'; the tokenizer function below
# reads this module-level `nlp` object. The model package must already be
# installed in the kernel's environment.
nlp = spacy.load('en_core_web_lg')

In [19]:
# Define a function to tokenize the text:
def tokenizer(text):
    """Split `text` into lemmas using the module-level spaCy pipeline `nlp`.

    Returns a list with the lemma of every token that is not a stop word,
    not punctuation, and not tagged as a pronoun (`pos_ == 'PRON'`).
    """
    lemmas = []
    for token in nlp(text):
        # Skip stop words and punctuation.
        if token.is_stop or token.is_punct:
            continue
        # Skip pronouns.
        if token.pos_ == 'PRON':
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [26]:
# Build the model: TF-IDF over the 'Effects' strings, tokenized with the
# spaCy-based `tokenizer`, using unigrams and bigrams. Terms that occur in
# more than 95% of the rows or in fewer than 3 rows are dropped.
model = TfidfVectorizer(stop_words='english',
                        ngram_range=(1, 2),
                        max_df=.95,
                        min_df=3,
                        tokenizer=tokenizer)

# Fit and transform the data (sparse document-term matrix):
dtm_sparse = model.fit_transform(df['Effects'])

# Get features:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(model, 'get_feature_names_out'):
    feature_names = model.get_feature_names_out()
else:
    feature_names = model.get_feature_names()

# toarray() gives a plain ndarray (todense() returns the discouraged np.matrix).
dtm = pd.DataFrame(dtm_sparse.toarray(), columns=feature_names)

# Print the feature matrix:
dtm.head()

Unnamed: 0,arouse,arouse creative,arouse energetic,arouse euphoric,arouse happy,arouse relaxed,arouse sleepy,arouse talkative,arouse tingly,arouse uplifted,...,uplifted euphoric,uplifted focus,uplifted focused,uplifted giggly,uplifted happy,uplifted hungry,uplifted relaxed,uplifted sleepy,uplifted talkative,uplifted tingly
0,0.305935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.348341,0.64076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# (number of rows, number of TF-IDF feature columns) of the feature matrix:
dtm.shape

(2155, 190)

In [28]:
# Fit a 5-nearest-neighbours model (KD-tree search) on the TF-IDF feature matrix:
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=5)
nn.fit(dtm)
# fit() returns the estimator itself, so model_t refers to the same fitted object as nn.
model_t = nn

In [29]:
# Find similar strains and return their row positions.
# Vectorize the free-text request with the already-fitted TF-IDF model
# (transform only: re-fitting on the sample would replace the learned vocabulary):
sample = ['I need something to make me sleepy']
vec = model.transform(sample)
# Densify with toarray() (plain ndarray) instead of todense(): todense()
# returns np.matrix, which newer scikit-learn releases refuse as input.
# Reusing dtm's columns gives the query the same layout as the DataFrame
# the neighbour model was fitted on.
dense = pd.DataFrame(vec.toarray(), columns=dtm.columns)
# Find similar effects (one row of neighbour positions per query):
similar = model_t.kneighbors(dense, return_distance=False)
similar.T

array([[ 127],
       [1896],
       [ 151],
       [ 519],
       [ 313]])

In [13]:
# Fifth neighbour position returned for the first (only) query row:
similar[0, 4]

313

In [14]:
# Collect the neighbour positions for the single query as a plain list.
# Taking the whole row avoids hard-coding the number of neighbours, so the
# cell keeps working if n_neighbors is changed on the model.
output = list(similar[0])
output

[127, 1896, 151, 519, 313]

In [15]:
# kneighbors() returns row positions in the matrix the model was fitted on,
# and that matrix was built from df['Effects'] in row order. Select the
# matching rows of df by position (not by comparing against the 'Id' column,
# which is only correct while Id happens to equal the row position), for
# however many neighbours `output` holds. Then rank them best-rated first.
result = df.iloc[output].sort_values(by=['Rating'], ascending=False)
result.head()

Unnamed: 0,Id,Strain,Type,Rating,Effects,Description,Flavors,Nearest
127,127,Eastern European,indica,5.0,"Sleepy,Happy,Relaxed",Eastern European refers to any strain that gro...,"Earthy,Woody,Pine",127180512062491000000
151,151,Club 69,indica,5.0,"Sleepy,Relaxed",Club 69 is a special strain crafted by Josh D ...,"Diesel,Flowery,Earthy",1512115938042021056
313,313,Joliet Jake,indica,4.55,"Relaxed,Sleepy,Euphoric",Joliet Jake by Cresco Labs brings back-to-back...,"Berry,Sweet,Menthol",3137112832017722147
519,519,Shurman 7,hybrid,4.32,"Relaxed,Sleepy,Euphoric",Shurman #7 by Solstice won Best CBD Flower at ...,"Earthy,Sweet,Pine",519876120721532000000
1896,1896,Dawgfather Og,hybrid,2.95,"Sleepy,Relaxed","Dawgfather OG, or simply “The Dawgfather,” is ...","Spicy/Herbal,Pepper,Pungent",18961793290119259131
