# Imports

In [0]:
!pip install q scikit-learn==0.22 #Changed sklearn version to stop warning
import pickle
from google.colab.files import upload
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from collections import Counter



In [0]:
uploaded = upload()

# Pred Route

- I made `review` and `pred` globals to be used outside the functions
- pulled the df out of the function
- might have to pull model fitting out of the function 

In [0]:
url = "https://raw.githubusercontent.com/med-cabinet-5/data-science/master/data/canna.csv"
# Read in data
df = pd.read_csv(url)
# Fill NaN with empty strings
df = df.fillna("")

def pred_list(x):
    """
    x = string to predict from (description)
    1. Predict the nearest neighbors to the inputted description
    2. Predict what type of cannabis the user is looking for with probability
       
    """

    # Instantiate vectorizer object
    tfidf = TfidfVectorizer(stop_words="english", min_df=0.025, max_df=.98, ngram_range=(1,3))

    # Create a vocabulary and get word counts per document
    dtm = tfidf.fit_transform(df['alltext'])

    # Get feature names to use as dataframe column headers
    dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
    
    # Fit on TF-IDF Vectors and return 30 neighbors
    nn = NearestNeighbors(n_neighbors=30, algorithm="kd_tree", radius=0.5)
    nn.fit(dtm)
    
    # Turn review into a global to be used by all
    global review 

    # Turn Review into a list, transform, and predict
    review = [x]
    new = tfidf.transform(review)

    # Turn pred into a global to be used by all
    global pred
    pred = nn.kneighbors(new.todense())[1][0]

    
    #create empty list
    pred_dict = []

    # only loop through 5 closest neighbors
    for x in pred[:5]:
        # add new dictionary to pred_dict containing predictions
        preds_list ={"strain":df["Strain"][x],
                     "type": df["Type_raw"][x],
                     "description": df["Description_raw"][x],
                     "flavor": df["Flavor_raw"][x],
                     "effects": df["Effects_raw"][x],
                     "ailments": df["Ailment_raw"][x]}
        pred_dict.append(preds_list)
    
    # # Load data for model 2
    # model = pickle.load(open("stretch.sav", "rb"))
    # #Pull result out
    # pred_2 = model.predict(review)[0]
 
    # #Grab max predict proba                   
    # predict_proba = model.predict_proba(review)[0].max() * 100

    # # Mapper to change result into string
    # mapper = ({5: "Hybrid",
    #        4: "Indica",
    #        3: "Sativa",
    #        2: "Hybrid, Indica",
    #        1: "Sativa, Hybrid"})
    
    # # Apply mapper to newly made Series
    # strain_type = pd.Series(pred_2).map(mapper)[0]
    
    # # Create new dictionary element
    # new_dict = {"proba":f"There is a {round(predict_proba, 2)}% that your looking for a {strain_type}"}
    
    # # Add new dicitonary to list of dictionaries
    # pred_dict.append(new_dict)

    return pred_dict

In [0]:
ls = pred_list("I want to be more creative")
ls

[{'ailments': 'Stress, Pain, Insomnia, Nausea, Depression',
  'description': '',
  'effects': 'Euphoric, Paranoid, Relaxed, Creative, Uplifted, Mouth, Dry, Sleepy',
  'flavor': '',
  'strain': 'Guido Kush',
  'type': 'Indica'},
 {'ailments': 'Nausea, Depression, Stress',
  'description': '',
  'effects': 'Paranoid, Relaxed, Energetic, Creative, Uplifted, Mouth, Dry',
  'flavor': 'Vanilla, Sweet',
  'strain': 'Root Beer Kush',
  'type': 'Hybrid'},
 {'ailments': '',
  'description': 'Kushashima is a hybrid strain that competed in the 2014 L.A. Cannabis Cup.',
  'effects': 'Relaxed, Energetic, Creative, Uplifted, Sleepy',
  'flavor': 'Pungent, Sweet, Pine',
  'strain': 'Kushashima',
  'type': 'Hybrid'},
 {'ailments': '',
  'description': 'Guava Chem is a hybrid strain that competed in the 2014 L.A. Cannabis Cup.',
  'effects': 'Focused, Happy, Euphoric, Relaxed, Creative',
  'flavor': 'Tropical, Sweet, Chemical',
  'strain': 'Guava Chem',
  'type': 'Hybrid'},
 {'ailments': 'Insomnia, Stre

# Summary Stat

- Create summary statistics based on the input from the user

## Idea:

- I want to return 30 closest neighbors and return summary based on most populated categorical variables from effects and flavors.



In [0]:
# Create new df from the preds
df_preds = df.loc[preds]
df_preds.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Strain,Type_raw,Effects_raw,Ailment_raw,Flavor_raw,Description_raw,labels,Type_lemma,Effects_lemma,Ailment_lemma,Flavor_lemma,Description_lemma,alltext
989,989,989,Guido Kush,Indica,"Euphoric, Paranoid, Relaxed, Creative, Uplifte...","Stress, Pain, Insomnia, Nausea, Depression",,,4,['Indica'],"['dry', 'creative', 'sleepy', 'paranoid', 'rel...","['pain', 'Stress', 'Depression', 'Insomnia', '...",[],[],"['pain', 'stress', 'Dry', ' ', 'Depression', ..."
1827,1827,1827,Root Beer Kush,Hybrid,"Paranoid, Relaxed, Energetic, Creative, Uplift...","Nausea, Depression, Stress","Vanilla, Sweet",,5,['hybrid'],"['energetic', 'dry', 'paranoid', 'relaxed', 'c...","['depression', 'stress', 'nausea']","['vanilla', 'Sweet']",[],"['energetic', 'stress', 'Dry', 'depression', '..."
1232,1232,1232,Kushashima,Hybrid,"Relaxed, Energetic, Creative, Uplifted, Sleepy",,"Pungent, Sweet, Pine",Kushashima is a hybrid strain that competed in...,5,['hybrid'],"['energetic', 'sleepy', 'relaxed', 'creative',...",[],"['Pine', 'pungent', 'sweet']","['Kushashima', 'hybrid', 'strain', 'compete', ...","['Kushashima', 'Cup', 'energetic', 'pungent', ..."
985,985,985,Guava Chem,Hybrid,"Focused, Happy, Euphoric, Relaxed, Creative",,"Tropical, Sweet, Chemical",Guava Chem is a hybrid strain that competed in...,5,['hybrid'],"['happy', 'Creative', 'focused', 'relaxed', 'e...",[],"['Chemical', 'tropical', 'sweet']","['guava', 'Chem', 'hybrid', 'strain', 'compete...","['Cup', 'Chem', 'Guava', 'Happy', 'creative', ..."
1604,1604,1604,Peyote Cookies,Indica,"Happy, Euphoric, Relaxed, Creative, Uplifted, ...","Insomnia, Stress, Depression, Pain","Grape, Sweet",,4,['Indica'],"['happy', 'dry', 'creative', 'relaxed', 'eupho...","['pain', 'Insomnia', 'Stress', 'depression']","['grape', 'Sweet']",[],"['pain', 'Stress', 'Happy', 'Dry', 'Depression..."


In [0]:

def lister(x):
    """Function to return top seen words from a desired column"""
    # make new df from preds
    df_preds = df.loc[pred]
    # make empty list
    word_ls = []

    # loop over items in desired column and append into a list and title it
    for x in df_preds[x]:
        x = x.split(" ")
        for x in x:
            word_ls.append(x.strip(",").title())

    # Count the number of times each element appears
    count = Counter(word_ls)
    
    # Create new empty list
    word_ls = []

    # Loop over first 3 most common elements and join together in a string
    for x in range(3):
        word_ls.append(count.most_common(3)[x][0])
    result = ", ".join(word_ls)

    return result

In [0]:
# Create new dicitonary
test_dict = {"top_effects": lister("Effects_raw"),
            "top_flavors": lister("Flavor_raw"),
            }

In [0]:
test_dict

{'top_effects': 'Creative, Uplifted, Happy',
 'top_flavors': 'Earthy, Sweet, Citrus'}

In [0]:
# Load data for model 2
model = pickle.load(open("stretch.sav", "rb"))
#Pull result out
pred_2 = model.predict(review)[0]
 
#Grab max predict proba                   
predict_proba = model.predict_proba(review)[0].max() * 100

# Mapper to change result into string
mapper = ({5: "Hybrid",
           4: "Indica",
           3: "Sativa",
           2: "Hybrid, Indica",
           1: "Sativa, Hybrid"})
    
# Apply mapper to newly made Series
strain_type = pd.Series(pred_2).map(mapper)[0]
    
# Add new entry
test_dict["proba"] = f"There is a {round(predict_proba, 2)}% that your looking for a {strain_type}"   

In [0]:
test_dict

{'proba': 'There is a 65.46% that your looking for a Hybrid',
 'top_effects': 'Creative, Uplifted, Happy',
 'top_flavors': 'Earthy, Sweet, Citrus'}

# Demo

In [0]:
pred_list("I want it to uplift but also help with my back pain")

[{'ailments': '',
  'description': 'Las Vegas Purple Kush BX is a clone-only strain released in 2013 and has been described as Alphakronik Gene’s most potent indica. This backcross consists of Las Vegas Purple Kush and Sin City Kush genetics, creating a similar experience and aroma to Pre-98 Bubba Kush without the coffee smell and with a slightly sweeter aroma. This strain has been known to help with an array of ailments associated with physical pain and gastrointestinal issues.',
  'effects': 'Happy, Relaxed, Aroused, Creative, Sleepy',
  'flavor': '',
  'strain': 'Las Vegas Purple Kush Bx',
  'type': 'Indica'},
 {'ailments': '',
  'description': 'G-Force by Flying Dutchman is a hearty G13 and Skunk/Northern Lights cross with heavy effects and a generous yield. This potent indica produces massive resinous colas that mature over a nine-week flowering cycle. The effects are extremely heavy on the limbs while still offering a potent euphoric rush that can even challenge the most seasoned

In [0]:
# Check for changing preds
pred

array([1256,  851, 1943,  701,   25,  158, 1072,  197,   77,  127, 2008,
       1757, 1349,   71, 1042,  843,  266, 1131, 1457,  681, 1853, 2012,
        421, 1602,  487,  539, 2099, 1010, 2338, 1250])

In [0]:
# Create new dicitonary
test_dict = {"top_effects": lister("Effects_raw"),
            "top_flavors": lister("Flavor_raw"),
            "top_ailments": lister("Ailment_raw")
            }
test_dict

{'top_ailments': ', Stress, Pain',
 'top_effects': 'Happy, Euphoric, Relaxed',
 'top_flavors': 'Earthy, Sweet, '}

In [0]:
# Load data for model 2
model = pickle.load(open("stretch.sav", "rb"))
#Pull result out
pred_2 = model.predict(review)[0]
 
#Grab max predict proba                   
predict_proba = model.predict_proba(review)[0].max() * 100

# Mapper to change result into string
mapper = ({5: "Hybrid",
           4: "Indica",
           3: "Sativa",
           2: "Hybrid, Indica",
           1: "Sativa, Hybrid"})
    
# Apply mapper to newly made Series
strain_type = pd.Series(pred_2).map(mapper)[0]
    
# Add new entry
test_dict["proba"] = f"There is a {round(predict_proba, 2)}% that your looking for a {strain_type}"  

test_dict

{'proba': 'There is a 51.04% that your looking for a Indica',
 'top_ailments': ', Stress, Pain',
 'top_effects': 'Happy, Euphoric, Relaxed',
 'top_flavors': 'Earthy, Sweet, '}