In [0]:
# Basic imports
import numpy as np
import pandas as pd

In [0]:
# Load the strain table straight from the project's GitHub repository
DATA_URL = 'https://raw.githubusercontent.com/med-cabinet-5/data-science/master/cannabis.csv'
df = pd.read_csv(DATA_URL)

In [6]:
# Most basic EDA possible: preview the first rows of the table
# (head() with no argument shows the first 5 rows)
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [7]:
# Gather every distinct effect name across all strains, in first-seen order.
# Each 'Effects' value is a comma-separated string, so it is split on ','.
# NOTE: the result is stored under the name `list` (shadowing the builtin)
# because a later cell displays it by that name. The builtin list() is
# deliberately not called here, so re-running this cell keeps working.
list = [
    *dict.fromkeys(
        effect
        for effects in df['Effects']
        for effect in effects.split(',')
    )
]
print(f'There are {len(list)} unique reported effects among the {len(df)} strains.')

There are 16 unique reported effects among the 2351 strains.


In [8]:
# Display the unique effect names collected by the previous cell
list

['Creative',
 'Energetic',
 'Tingly',
 'Euphoric',
 'Relaxed',
 'Aroused',
 'Happy',
 'Uplifted',
 'Hungry',
 'Talkative',
 'None',
 'Giggly',
 'Focused',
 'Sleepy',
 'Dry',
 'Mouth']

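# It's a little bit silly that "None", "Dry" and "Mouth" show up as separate effects ("Dry" and "Mouth" presumably being a single "Dry Mouth" entry split at the comma), but it's hardly important for now.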

In [0]:
# Make a contrived user-preference example: the effects this hypothetical
# user is looking for, spelled the same way as the names in the 'Effects' column
preferences = ['Tingly', 'Aroused', 'Giggly']

In [0]:
# Count, for each strain, how many of its listed effects appear in
# `preferences`. The result is a 1-D array aligned with the rows of `df`:
# position i holds the number of desired effects matched by df.iloc[i].
#
# The counts are built directly as an integer array. Seeding the counter with
# np.zeros_like on the text column would inherit that column's object dtype.
recommendations = np.array(
    [
        # A generator of booleans sums to the number of matching effects
        sum(effect in preferences for effect in effects.split(','))
        for effects in df['Effects']
    ],
    dtype=int,
)

In [17]:
# Row positions of every strain whose match count equals the highest count
# found; several strains may tie for the best match.
best_score = np.amax(recommendations)
# Unpack into a list literal rather than calling list(): that builtin name is
# reassigned earlier in this notebook.
matching_strains = [*np.flatnonzero(recommendations == best_score)]
matching_strains

[144, 515, 765, 2265]

In [22]:
# Break the tie on 'Rating': show the tied strains, highest-rated first
tied_strains = df.iloc[matching_strains]
tied_strains.sort_values(by='Rating', ascending=False)

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
2265,White-Bubblegum,indica,5.0,"Giggly,Aroused,Sleepy,Talkative,Tingly","Sweet,Earthy,Flowery",White Bubblegum by THClones is a sweet and sed...
515,Cherry-Sherbet,hybrid,4.3,"Sleepy,Giggly,Aroused,Talkative,Tingly","Berry,Sweet,Diesel",Cherry Sherbet is an indica-dominant delight. ...
765,Enemy-Of-The-State,indica,4.1,"Euphoric,Aroused,Focused,Tingly,Giggly","Spicy/Herbal,Tree,Fruit,Earthy",Enemy of the State by Super Strains is an old ...
144,Avalon,hybrid,3.6,"Aroused,Tingly,Hungry,Uplifted,Giggly","Sweet,Earthy,Nutty",A selectively bred cross between Afghani and B...


In [23]:
# Final suggestion, produced without any real ML: among the tied strains,
# take the name of the one with the highest rating.
recommendation = (
    df.iloc[matching_strains]
    .sort_values(by='Rating', ascending=False)['Strain']
    .iloc[0]
)
print(f'We recommend you try {recommendation}.')

We recommend you try White-Bubblegum.


# I'm actually a bit befuddled by the idea of attempting to turn this into an ML problem.
- We aren't dealing with separate reviews
- Therefore we have 1 'observation' of each strain
- Multiclass classification (roughly 2,350 classes, one per strain) seems ridiculous given the data we have
- Regression sounds even more absurd

# If we were to have a problem statement akin to "recommend marijuana **type** based on desired effects," or "determine the characteristics of a theoretically optimal (_highest-rated_) weed strain," then we would have a classic, well-defined problem.

# I look forward to all of your perspectives on the matter.

