In [1]:
# Imports
import pandas as pd

from pathlib import Path
import pickle
import os

import spacy
from spacy.tokenizer import Tokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load the spaCy pipeline once up front; `tokenize` below runs text through it.
SPACY_MODEL = "en_core_web_lg"
nlp = spacy.load(SPACY_MODEL)

# Stand-alone tokenizer that shares the pipeline's vocabulary.
tokenizer = Tokenizer(nlp.vocab)

In [2]:
# Remote CSV sources for the cannabis strain data and the disease data
CANNABIS_CSV_URL = "https://raw.githubusercontent.com/med-cab1/ds-api/master/data/cannabis.csv"
DISEASE_CSV_URL = "https://raw.githubusercontent.com/med-cab1/ds-api/master/data/Disease.csv"

# Read both tables straight from GitHub into DataFrames
df = pd.read_csv(CANNABIS_CSV_URL)
disease = pd.read_csv(DISEASE_CSV_URL)

In [3]:
# Join each row's Effects and Flavor values into one comma-separated
# 'Criteria' string (Effects first, then Flavor).
effects, flavors = df['Effects'], df['Flavor']
df['Criteria'] = effects + ',' + flavors

In [4]:
# spaCy-backed tokenizer
def tokenize(document):
    """Return the stripped lemmas of ``document`` as produced by the spaCy pipeline.

    The text is run through the module-level ``nlp`` pipeline. Tokens flagged
    as stop words or punctuation are skipped, as is any token whose text is
    exactly one space character. Other whitespace tokens are not filtered.

    NOTE(review): presumably the pickled ``tf`` vectorizer loaded later refers
    to this function as its tokenizer, so its behaviour should stay in sync
    with whatever was used at fit time -- confirm before changing it.

    Parameters
    ----------
    document : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        ``token.lemma_.strip()`` for every token that was kept, in order.
    """
    lemmas = []
    for token in nlp(document):
        # Guard clause: discard stop words, punctuation and single-space tokens.
        if token.is_stop or token.is_punct or token.text == ' ':
            continue
        lemmas.append(token.lemma_.strip())
    return lemmas

In [5]:
# The pickled artifacts are read from the parent of the current working directory.
parent_directory = Path().resolve().parent

# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing;
# only load these files if they come from a trusted source.
# `with` guarantees each file handle is closed once its object is loaded
# (the bare open(...) calls used previously leaked the handles).
with open(parent_directory / 'dtm.pkl', 'rb') as dtm_file:
    # Matrix the nearest-neighbour model is fitted on
    # (presumably the document-term matrix -- confirm against the training notebook).
    dtm = pickle.load(dtm_file)

with open(parent_directory / 'tf.pkl', 'rb') as tf_file:
    # Fitted transformer used to vectorize queries via tf.transform(...)
    # (presumably a TfidfVectorizer -- confirm against the training notebook).
    tf = pickle.load(tf_file)

In [6]:
# Nearest-neighbour index over the rows of `dtm`: queries return the
# 5 closest rows by default, searched with a ball tree.
neighbor_settings = {'n_neighbors': 5, 'algorithm': 'ball_tree'}
nn = NearestNeighbors(**neighbor_settings)
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [7]:
# Test query: one comma-separated string of desired effects and flavors,
# wrapped in a list because the vectorizer's transform() takes an iterable of documents.
ideal_strain = [
    'Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus',
]

In [8]:
# Query for similar strains using the test case.
# Vectorize the query text with the loaded `tf` transformer.
new = tf.transform(ideal_strain)

# Densify with .toarray() (plain ndarray) instead of .todense() (np.matrix):
# current scikit-learn releases raise a TypeError on np.matrix input.
# kneighbors returns a (distances, indices) pair, one row per query.
results = nn.kneighbors(new.toarray())

In [9]:
# Name of the closest match. results[1] holds the neighbour indices, so
# [0][0] is the nearest neighbour of the first (only) query row.
# kneighbors reports row *positions* in the fitted matrix, hence .iloc rather
# than a label lookup (assumes `dtm` rows are in the same order as `df` -- confirm).
df['Strain'].iloc[results[1][0][0]]

'100-Og'

In [10]:
# Criteria string of the closest match, to compare against the query text.
# results[1][0][0] is the row *position* of the nearest neighbour, so use
# .iloc rather than a label lookup (assumes `dtm` rows are in the same order
# as `df` -- confirm).
df['Criteria'].iloc[results[1][0][0]]

'Creative,Energetic,Tingly,Euphoric,Relaxed,Earthy,Sweet,Citrus'