#Medical Marijuana DS

We were tasked with recommending users strains, appropriate dosages, intake method, and the intake schedule using some form of natural language processing. With such limited available data the latter 3 were not feasable. The hope is that with enough user data, a model could be built to predict those things.

Given the task and the limited data. We decided a bag of words model was the best approach. Specifically the TFIDF document term matrix combined the a nearest neighbor model. It's possible that a neural network could have provided more accurate results but it's hard to evaluate what a good result is. Even then, the descriptions are so domain specific that it would probably have been more trouble than it was worth.

## Strain Recommender: TFIDF (BOW)

In [0]:
# Read in the csv
# This code has to change to using the api the backend guys built

import pandas as pd

df = pd.read_csv('Cannabis_Strains_Features.csv')

# Dropping 'None' Descriptions
df = df[df['Description'] != 'None']

In [2]:
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [3]:
df.shape

(2346, 6)

#### Applying TFIDF Vectorizor along with nearest neighbor model

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', max_features=1500, max_df=.98, min_df=.02)

# Create a vocabulary and get word counts per document
# Similiar to fit_predict
tfidfmodel = tfidf.fit_transform(df['Description'])

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(tfidfmodel.todense(), columns=tfidf.get_feature_names())
dtm = dtm.drop(dtm.iloc[:, 0:8], axis=1)

# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(2346, 367)


Unnamed: 0,active,activity,afghani,alien,alongside,anxiety,appetite,aroma,aromas,average,away,balanced,berry,best,big,black,blend,blue,blueberry,body,bred,breeder,breeders,bright,brings,bubba,bud,buds,buzz,california,calm,calming,candy,cannabis,cbd,cerebral,cheese,chemdawg,cherry,chocolate,...,sweetness,symptoms,takes,tall,taste,tend,terpene,terpenes,thai,thc,time,times,took,trainwreck,treat,treating,trichome,trichomes,tropical,true,typical,typically,undertones,unique,unknown,uplifting,use,users,varieties,variety,way,week,weeks,white,widow,winning,won,world,yield,yields
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158664,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.158565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.115442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.452836,0.576801,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156389,0.0,0.267952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.204183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244822,0.0,0.0,0.131176,0.153148,0.0,0.0,0.0,0.27364,0.0,0.0,0.0,0.180141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.256413,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.213829,0.0,0.0,0.0,0.0,0.0,0.129583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187229,0.0,0.0,0.0,0.20654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.neighbors import NearestNeighbors

# Fit on DTM
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [31]:
text = [ """appetite blueberry blend taste sweetness treat soul mind """]

text_transformed = tfidf.transform(text)
text_transformed = pd.DataFrame(text_transformed.todense(), columns=tfidf.get_feature_names())
text_transformed = text_transformed.drop(text_transformed.iloc[:, 0:8], axis=1)
text_transformed

Unnamed: 0,active,activity,afghani,alien,alongside,anxiety,appetite,aroma,aromas,average,away,balanced,berry,best,big,black,blend,blue,blueberry,body,bred,breeder,breeders,bright,brings,bubba,bud,buds,buzz,california,calm,calming,candy,cannabis,cbd,cerebral,cheese,chemdawg,cherry,chocolate,...,sweetness,symptoms,takes,tall,taste,tend,terpene,terpenes,thai,thc,time,times,took,trainwreck,treat,treating,trichome,trichomes,tropical,true,typical,typically,undertones,unique,unknown,uplifting,use,users,varieties,variety,way,week,weeks,white,widow,winning,won,world,yield,yields
0,0.0,0.0,0.0,0.0,0.0,0.0,0.335943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.402808,0.0,0.349958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.461861,0.0,0.0,0.0,0.334981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.434352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
top_five = nn.kneighbors(text_transformed)[1][0]
top_five

array([ 331, 1070,   19,  396, 1212])

In [33]:
# Getting the top 5 results
result = []

for index in top_five:
  result.append(df.iloc[index])

result

[Strain                                              Blueberry-Ak
 Type                                                      hybrid
 Rating                                                       4.9
 Effects                  Energetic,Relaxed,Euphoric,Happy,Sleepy
 Flavor                                   Blueberry,Sweet,Pungent
 Description    As the name suggests, Blueberry AK is a hybrid...
 Name: 331, dtype: object,
 Strain                                                 Ice-Cream
 Type                                                      hybrid
 Rating                                                       4.1
 Effects                  Happy,Relaxed,Uplifted,Focused,Euphoric
 Flavor                                       Sweet,Vanilla,Woody
 Description    This indica-dominant 60/40 strain bred by Para...
 Name: 1071, dtype: object,
 Strain                                                      A-10
 Type                                                      indica
 Rating              

In [37]:
for x in result:
  print('\n',x['Description'])


 As the name suggests, Blueberry AK is a hybrid strain that combines the indica-dominant Blueberry with the sativa-dominant AK-47, both popular and potent strains in their own right.  Blueberry AK has a strong musky odor with undertones of berry, and flavors ranging from sweet Kush to the more peppery taste of the AK. The effects of this hybrid are both relaxing and upbeat, often inducing a case of the giggles, and people have used it to treat anxiety, depression, and pain.

 This indica-dominant 60/40 strain bred by Paradise Seeds creates a great hybrid balance of effects. Much like the frozen treat, Ice Cream has a smooth, creamy taste.

 A-10 has an earthy, hashy taste that provides a very heavy body stone.  Frequently used to treat insomnia and chronic pain.

 Bubbleberry is a treat for cannabis consumers on either side of the indica-sativa divide. Combining the sweet, floral taste and aroma of Bubble Gum with the all-star fruity skunkiness of Blueberry, this strain has depth whil

## LDA Topic Modelling with Gensim

In [0]:
"""
  This was for testing purposes only. LDA topic modelling is not very useful for this dataset.
"""

import numpy as np
import gensim

from gensim.utils import simple_preprocess
from gensim.test.utils import common_corpus
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

In [0]:
# Gensim tokenizer
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

tokens = []
for description in df['Description']:
  tokens.append(tokenize(str(description)))

In [11]:
# Creating a df of strain and the tokens
df2 = pd.DataFrame(index=df['Strain'], data={'tokens':tokens})
df2.head(3)

Unnamed: 0_level_0,tokens
Strain,Unnamed: 1_level_1
100-Og,"[og, hybrid, strain, packs, strong, punch, sup..."
98-White-Widow,"[aloha, white, widow, especially, potent, cut,..."
1024,"[sativa, dominant, hybrid, bred, spain, medica..."


In [12]:
# A Dictionary Representation of all the words in our corpus
id2word = corpora.Dictionary(df2['tokens'])
id2word

<gensim.corpora.dictionary.Dictionary at 0x7f3d69aa6eb8>

In [13]:
id2word.token2id['dank']

458

In [14]:
id2word.doc2bow(tokenize("This is a sample message Darcy England England England dank dank dank dank dank  dank drank"))

[(458, 6), (2384, 3), (6751, 1), (6900, 1)]

In [15]:
import sys
print(sys.getsizeof(id2word))
print(sys.getsizeof(tokens))
len(id2word.keys())

56
21048


8360

In [0]:
# Let's remove extreme values from the dataset
id2word.filter_extremes(no_below=5, no_above=0.95)

In [17]:
len(id2word.keys())

2405

In [0]:
lda = LdaMulticore(corpus=common_corpus,
                   id2word=id2word,
                   random_state=723812,
                   num_topics = 4,
                   passes=10,
                   workers=12
                  )

In [19]:
lda.print_topics()

[(0,
  '0.002*"alert" + 0.002*"buds" + 0.002*"body" + 0.000*"green" + 0.000*"effect" + 0.000*"dark" + 0.000*"cerebral" + 0.000*"feeling" + 0.000*"hybrid" + 0.000*"indica"'),
 (1,
  '0.000*"hybrid" + 0.000*"alert" + 0.000*"feeling" + 0.000*"cerebral" + 0.000*"dark" + 0.000*"indica" + 0.000*"green" + 0.000*"effect" + 0.000*"body" + 0.000*"buds"'),
 (2,
  '0.000*"hybrid" + 0.000*"body" + 0.000*"buds" + 0.000*"alert" + 0.000*"indica" + 0.000*"large" + 0.000*"dark" + 0.000*"feeling" + 0.000*"green" + 0.000*"cerebral"'),
 (3,
  '0.007*"effect" + 0.005*"indica" + 0.005*"green" + 0.005*"hybrid" + 0.004*"high" + 0.004*"large" + 0.004*"cerebral" + 0.004*"dark" + 0.004*"feeling" + 0.002*"buds"')]

In [20]:
import re

words = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]
topics = [' '.join(t[0:5]) for t in words]
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
alert buds body green effect

------ Topic 1 ------
hybrid alert feeling cerebral dark

------ Topic 2 ------
hybrid body buds alert indica

------ Topic 3 ------
effect indica green hybrid high



In [21]:
!pip3 install pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, common_corpus, id2word)



In [22]:
test_tokens = ['this', 'herb', 'is', 'dank']
test_description = id2word.doc2bow(tokenize("Great for back pain and body aches. Might want to snak a litle bit too."))

# Get topic probabilities
lda.get_document_topics(test_description)

[(0, 0.8851233), (1, 0.038107812), (2, 0.038108107), (3, 0.038660783)]