In [1]:
import os

import pandas as pd

In [2]:
# Importing data
#
# The CSV location can be overridden with the CANNABIS_CSV environment variable so the
# notebook is runnable on other machines; the original absolute path stays as the default.
DATA_PATH = os.environ.get(
    "CANNABIS_CSV",
    "/Users/micahswain/Lambda/Unit-4/Med-Cabinet/Med Cabinet/data/cannabis.csv",
)

df = pd.read_csv(DATA_PATH)

# 'Blue,Cheese' is a single flavor that would otherwise be split on the comma later on.
# regex=False makes the literal match explicit (the default differs between pandas versions).
df['Flavor'] = df['Flavor'].str.replace('Blue,Cheese', 'Blue Cheese', regex=False)

# Drop strains whose effects or flavor are recorded as the literal string 'None'.
has_effects = df['Effects'] != 'None'
has_flavor = df['Flavor'] != 'None'
df = df[has_effects & has_flavor]

# Index rows by strain name.
df = df.set_index('Strain')

In [3]:
# Collect every distinct effect label that occurs in the Effects column.
# Each cell holds a comma-separated list, so glue all cells together and split once.
all_effects = ",".join(df['Effects'].dropna())
set_effects = {label for label in all_effects.split(",")}

In [4]:
# Display the distinct effect labels.
# NOTE(review): the output lists 'Dry' and 'Mouth' as separate labels, so the data
# presumably contains 'Dry,Mouth' for a single "Dry Mouth" effect (same problem as
# 'Blue,Cheese' in the Flavor column) — confirm against the CSV.
set_effects

{'Aroused',
 'Creative',
 'Dry',
 'Energetic',
 'Euphoric',
 'Focused',
 'Giggly',
 'Happy',
 'Hungry',
 'Mouth',
 'Relaxed',
 'Sleepy',
 'Talkative',
 'Tingly',
 'Uplifted'}

In [5]:
# Collect every distinct flavor label that occurs in the Flavor column.
# Each cell holds a comma-separated list, so glue all cells together and split once.
all_flavor = ",".join(df['Flavor'].dropna())
set_flavor = {label for label in all_flavor.split(",")}

In [6]:
# Display the distinct flavor labels.
# NOTE(review): both 'Mint' and 'Minty' show up in the output — possibly the same
# flavor under two spellings; confirm before relying on them as separate features.
set_flavor

{'Ammonia',
 'Apple',
 'Apricot',
 'Berry',
 'Blue Cheese',
 'Blueberry',
 'Butter',
 'Cheese',
 'Chemical',
 'Chestnut',
 'Citrus',
 'Coffee',
 'Diesel',
 'Earthy',
 'Flowery',
 'Fruit',
 'Grape',
 'Grapefruit',
 'Honey',
 'Lavender',
 'Lemon',
 'Lime',
 'Mango',
 'Menthol',
 'Mint',
 'Minty',
 'Nutty',
 'Orange',
 'Peach',
 'Pear',
 'Pepper',
 'Pine',
 'Pineapple',
 'Plum',
 'Pungent',
 'Rose',
 'Sage',
 'Skunk',
 'Spicy/Herbal',
 'Strawberry',
 'Sweet',
 'Tar',
 'Tea',
 'Tobacco',
 'Tree',
 'Tropical',
 'Vanilla',
 'Violet',
 'Woody'}

In [7]:
# Build a single comma-separated text column (effects followed by flavors) so one
# vectorizer can consume both, then discard every row that still has a missing value.
combined_text = df['Effects'] + "," + df['Flavor']
df = df.assign(to_vect=combined_text).dropna()

In [8]:
# Preview the first rows, including the newly added to_vect column.
df.head()

Unnamed: 0_level_0,Type,Rating,Effects,Flavor,Description,to_vect
Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,"Creative,Energetic,Tingly,Euphoric,Relaxed,Ear..."
98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,"Relaxed,Aroused,Creative,Happy,Energetic,Flowe..."
1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,"Uplifted,Happy,Relaxed,Energetic,Creative,Spic..."
13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,"Tingly,Creative,Hungry,Relaxed,Uplifted,Aprico..."
24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...","Happy,Relaxed,Euphoric,Uplifted,Talkative,Citr..."


In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle

In [14]:
# Vectorizing the to_vect column

# Instantiate the TF-IDF vectorizer (English stop words removed, vocabulary capped at 75 terms)
tfidf = TfidfVectorizer(stop_words='english', max_features=75)

# Learn the vocabulary and build the sparse document-term matrix in a single step
dtm_sparse = tfidf.fit_transform(df['to_vect'])

# Feature names become the dataframe column headers.
# get_feature_names() is gone from newer scikit-learn releases; prefer
# get_feature_names_out() and only fall back on installations that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Dense feature matrix, one row per strain (same order and index as df)
dtm = pd.DataFrame(dtm_sparse.toarray(), columns=feature_names, index=df.index)

# Pickle the fitted vectorizer; the context manager guarantees the file is flushed and closed
with open("pickled_vectorizer", 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# View the feature matrix (kept as the last expression so the cell actually displays it)
dtm.head()

In [15]:
from sklearn.neighbors import NearestNeighbors

# Fitting the NearestNeighbors model

# Fit on the document-term matrix; queries return the 5 closest strains
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(dtm)

# Pickle the fitted model; the context manager guarantees the file is flushed and closed
with open("pickled_nn_model", 'wb') as model_file:
    pickle.dump(nn, model_file)

In [16]:
### Testing the NN model

# The model was fitted on "effects,flavors" strings, so the query has to be
# assembled in the same shape: effects first, then flavors, comma-separated.
desired_effects = 'creative,relaxed,happy'
desired_flavor = 'pineapple,Mango,Berry'
desired_combined = ','.join((desired_effects, desired_flavor))

In [17]:
# Load the pickled vectorizer.
# NOTE: unpickling executes arbitrary code, so only load files this notebook wrote itself.
with open("pickled_vectorizer", 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

In [18]:
# Run the query through the loaded vectorizer. transform() takes an iterable of
# documents, so the single query string is wrapped in a one-element list.
query_documents = [desired_combined]
desired_dtm = loaded_vectorizer.transform(query_documents)

In [19]:
# Load the pickled NN model.
# NOTE: unpickling executes arbitrary code, so only load files this notebook wrote itself.
with open("pickled_nn_model", 'rb') as model_file:
    loaded_nn_model = pickle.load(model_file)

In [20]:
### Make predictions

# kneighbors returns a (distances, indices) pair of arrays, one row per query.
# Use toarray() rather than todense(): todense() yields an np.matrix, which newer
# scikit-learn releases refuse as input, while toarray() gives a plain ndarray.
output = loaded_nn_model.kneighbors(desired_dtm.toarray())
output

(array([[0.57329548, 0.68475826, 0.81899101, 0.84182441, 0.8492552 ]]),
 array([[ 525, 1281, 1480, 1459, 1531]]))

In [21]:
### Show the nearest neighbors output

# output[1] holds the neighbor indices; row 0 belongs to the single query.
# Stored under its own name so the fitted NearestNeighbors model in `nn` is not
# overwritten by an index array.
neighbor_indices = output[1][0]

# The indices are row positions in the matrix the model was fitted on, which shares
# df's row order, so positional lookup returns the matching strains.
df.iloc[neighbor_indices].drop(columns='to_vect')

Unnamed: 0_level_0,Type,Rating,Effects,Flavor,Description
Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Colorado-Bubba,indica,4.6,"Relaxed,Happy,Sleepy,Uplifted,Creative","Pineapple,Mango,Pine",Colorado Bubba from Vault Genetics took 1st pl...
Mekamika-Haze,sativa,5.0,"Uplifted,Creative,Energetic,Euphoric,Focused","Pineapple,Mango,Flowery",Mekamika Haze won Best Sativa at the 2015 Amst...
Pineapple-Purps,sativa,4.4,"Creative,Uplifted,Relaxed,Euphoric,Energetic","Earthy,Pineapple,Berry",Pineapple Purps is a sativa-dominant strain th...
Phantom-Cookies,hybrid,4.6,"Happy,Relaxed,Uplifted,Creative,Energetic","Berry,Grape,Mango",Phantom Cookies is a sativa-dominant hybrid fr...
Pure-Love,indica,4.8,"Relaxed,Sleepy,Happy,Focused,Euphoric","Mango,Sweet,Berry",Pure Love is a hunger-inducing indica-dominant...
