In [0]:
!pip install q scikit-learn==0.22 #Changed sklearn version to stop warning
import pickle
from google.colab.files import upload
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from collections import Counter

Collecting q
  Downloading https://files.pythonhosted.org/packages/53/bc/51619d89e0bd855567e7652fa16d06f1ed36a85f108a7fe71f6629bf719d/q-2.6-py2.py3-none-any.whl
Collecting scikit-learn==0.22
[?25l  Downloading https://files.pythonhosted.org/packages/2e/d0/860c4f6a7027e00acff373d9f5327f4ae3ed5872234b3cbdd7bcb52e5eff/scikit_learn-0.22-cp36-cp36m-manylinux1_x86_64.whl (7.0MB)
[K     |████████████████████████████████| 7.0MB 3.4MB/s 
Installing collected packages: q, scikit-learn
  Found existing installation: scikit-learn 0.22.1
    Uninstalling scikit-learn-0.22.1:
      Successfully uninstalled scikit-learn-0.22.1
Successfully installed q-2.6 scikit-learn-0.22


# Globals

In [0]:
url = "https://raw.githubusercontent.com/med-cabinet-5/data-science/master/data/canna.csv"
# Read in data
df = pd.read_csv(url)
# Fill NaN with empty strings
df = df.fillna("")

def lister(x):
    """Function to return top seen words from a desired column"""
    # make new df from preds
    df_preds = df.loc[pred]
    # make empty list
    word_ls = []

    # loop over items in desired column and append into a list and title it
    for x in df_preds[x]:
        x = x.split(" ")
        for x in x:
            word_ls.append(x.strip(",").title())

    # Count the number of times each element appears
    count = Counter(word_ls)
    
    # Create new empty list
    word_ls = []

    # Loop over first 3 most common elements and join together in a string
    for x in range(3):
        word_ls.append(count.most_common(3)[x][0])
    result = ", ".join(word_ls)

    return result

def starter(x):

    # Instantiate vectorizer object
    tfidf = TfidfVectorizer(stop_words="english", min_df=0.025, max_df=.98, ngram_range=(1,3))

    # Create a vocabulary and get word counts per document
    dtm = tfidf.fit_transform(df['alltext'])

    # Get feature names to use as dataframe column headers
    dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
    
    # Fit on TF-IDF Vectors and return 30 neighbors
    nn = NearestNeighbors(n_neighbors=30, algorithm="kd_tree", radius=0.5)
    nn.fit(dtm)
    
    # Turn Review into a list, transform, and predict
    review = [x]
    new = tfidf.transform(review)
    
    global pred
    pred = nn.kneighbors(new.todense())[1][0]

    return

# Endpoint 1

In [0]:
def pred_list(x):
    """
    x = string to predict from (description)
    1. Predict the nearest neighbors to the inputted description
    2. Predict what type of cannabis the user is looking for with probability
       
    """
    starter(x)

    #create empty list
    pred_dict = []

    # only loop through 5 closest neighbors
    for x in pred[:5]:
        # add new dictionary to pred_dict containing predictions
        preds_list ={"strain":df["Strain"][x],
                     "type": df["Type_raw"][x],
                     "description": df["Description_raw"][x],
                     "flavor": df["Flavor_raw"][x],
                     "effects": df["Effects_raw"][x],
                     "ailments": df["Ailment_raw"][x]}
        pred_dict.append(preds_list)
    
    return pred_dict

In [0]:
pred_list("I want to feel uplifted happy")

[{'ailments': '',
  'description': 'Two superstars from the Dutch cannabis scene mingle together to rejuvenate genetics from decades passed into an intriguing hybrid that gained popularity in Colorado’s medical market for being both potent and flavorful. Dutch Treat Haze has a complex but undoubtable Haze aroma of pungent earthy tones from Super Silver Haze\xa0with the influence of Dutch Treat’s crisp, fruity, and floral flavors to create a captivating sativa-dominant hybrid. Its heady sativa effects are felt almost immediately with an energetic burst of creativity and sociability that uplift your mood and stimulate the appetite.\xa0',
  'effects': 'Happy, Euphoric, Relaxed, Energetic, Uplifted',
  'flavor': 'Berry, Earthy, Citrus',
  'strain': 'Dutch Treat Haze',
  'type': 'Hybrid'},
 {'ailments': '',
  'description': 'Afwreck is a hybrid cross of Afghani and Trainwreck. \xa0Strong sativa effects with immediate head-concentrated high.',
  'effects': 'Happy, Euphoric, Relaxed, Uplifted

# Endpoint 2

In [0]:
def pred_list2(x):

    starter(x)

    # Create initial dictionary with tops from relevant columns
    test_dict = {"top_effects": lister("Effects_raw"),
                 "top_flavors": lister("Flavor_raw"),
                 "top_ailments": lister("Ailment_raw")
                }


    model = pickle.load(open("stretch.sav", "rb"))
    #Pull result out
    pred_2 = model.predict(review)[0]
 
    #Grab max predict proba                   
    predict_proba = model.predict_proba(review)[0].max() * 100

    # Mapper to change result into string
    mapper = ({5: "Hybrid",
               4: "Indica",
               3: "Sativa",
               2: "Hybrid, Indica",
               1: "Sativa, Hybrid"})
    
    # Apply mapper to newly made Series
    strain_type = pd.Series(pred_2).map(mapper)[0]
    
    # Add new entry
    test_dict["proba"] = f"There is a {round(predict_proba, 2)}% that your looking for a {strain_type}"  

    return test_dict, pred

In [0]:
pred_list2("I want to feel uplifted happy")

({'proba': 'There is a 69.72% that your looking for a Indica',
  'top_ailments': ', Pain, Depression',
  'top_effects': 'Happy, Uplifted, Relaxed',
  'top_flavors': 'Earthy, Sweet, Citrus'},
 array([ 748,   56, 1946, 1013, 1728,  347,  275, 1086,  159, 1589, 1063,
        1343,  882,  998,  632,  609,  125, 2254, 1317, 1722, 1195,  314,
        2038, 1803, 1914,   25, 1281,  265,  626, 1775]))

In [0]:
#check to make sure preds from pred_list match with preds from pred_list2
df.loc[pred[:5]]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Strain,Type_raw,Effects_raw,Ailment_raw,Flavor_raw,Description_raw,labels,Type_lemma,Effects_lemma,Ailment_lemma,Flavor_lemma,Description_lemma,alltext
748,748,748,Dutch Treat Haze,Hybrid,"Happy, Euphoric, Relaxed, Energetic, Uplifted",,"Berry, Earthy, Citrus",Two superstars from the Dutch cannabis scene m...,5,['hybrid'],"['energetic', 'happy', 'relaxed', 'euphoric', ...",[],"['earthy', 'Berry', 'Citrus']","['superstar', 'dutch', 'cannabis', 'scene', 'm...","['Undoubtable', 'Intriguing', 'Creativity', ' ..."
56,56,56,Afwreck,Hybrid,"Happy, Euphoric, Relaxed, Uplifted, Sleepy",,"Earthy, Pungent, Pine",Afwreck is a hybrid cross of Afghani and Train...,5,['hybrid'],"['happy', 'sleepy', 'relaxed', 'euphoric', 'up...",[],"['earthy', 'Pine', 'pungent']","['Afwreck', 'hybrid', 'cross', 'Afghani', 'Tra...","[' ', 'sativa', 'Hybrid', 'relaxed', 'Effect'..."
1946,1946,1946,Sour Alien,Hybrid,"Giggly, Happy, Relaxed, Creative, Uplifted",,"Pungent, Lemon, Diesel","Sour Alien, bred by Cali Connection, is a 60% ...",5,['hybrid'],"['happy', 'giggly', 'relaxed', 'creative', 'Up...",[],"['lemon', 'diesel', 'pungent']","['Sour', 'Alien', 'breed', 'Cali', 'Connection...","['Cali', ' ', 'Hybrid', 'relaxed', 'Lemon', '..."
1013,1013,1013,Harry Potter,Hybrid,"Giggly, Happy, Relaxed, Creative, Uplifted",,"Lemon, Sweet, Citrus",Harry Potter is an otherworldly hybrid strain ...,5,['hybrid'],"['happy', 'giggly', 'relaxed', 'creative', 'Up...",[],"['lemon', 'Citrus', 'Sweet']","['Harry', 'Potter', 'otherworldly', 'hybrid', ...","['body', 'Otherworldly', 'Hybrid', 'pacify', '..."
1728,1728,1728,Purple Mr Nice,Indica,"Happy, Euphoric, Relaxed, Uplifted, Sleepy",,"Earthy, Grape, Pine",Granddaddy Purple crossed with Mr. Nice. This...,4,['Indica'],"['happy', 'sleepy', 'relaxed', 'euphoric', 'up...",[],"['earthy', 'Pine', 'Grape']","['Granddaddy', 'Purple', 'cross', 'Mr.', 'Nice...","['probability', ' ', 'relaxed', 'powerful', '..."
