In [None]:
import pandas as pd
import numpy as np
import sys
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import RegexpTokenizer

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import DistanceMetric, KNeighborsRegressor
from sklearn.datasets import load_iris, load_wine, make_classification

from scipy import sparse

from spellchecker import SpellChecker

### Mess Around With Pandas Options

In [None]:
# Raise pandas' display limits so long descriptions and wide/tall frames are
# printed without truncation.
for option_name, option_value in (
    ('display.max_colwidth', 1000),
    ('display.max_rows', 200),
    ('display.min_rows', 200),
    ('display.max_columns', 200),
):
    pd.set_option(option_name, option_value)

### Load In the Data and Credit Its Source

In [None]:
# Read the raw strain table from the CSV under Data/ (path is relative to the
# notebook's working directory).
strain_data = pd.read_csv('Data/Cannabis_Data.csv')

In [None]:
# Dataset credit: the Kaggle page URL and a date string
# (presumably the date the data was retrieved — TODO confirm).
url_details = ['https://www.kaggle.com/kingburrito666/cannabis-strains', '11/6/2019']

### Search Methods

In [None]:
def find_strain(name):
    """Return the descriptions of all strains whose name contains ``name``.

    Parameters
    ----------
    name : str
        Pattern searched for in the 'Strain' column. The match is
        case-sensitive and, as is the default for ``Series.str.contains``,
        the pattern is interpreted as a regular expression.

    Returns
    -------
    pandas.Series
        The 'Description' values of the matching rows of ``strain_data``.
    """
    # na=False: a row with a missing strain name counts as "no match" instead
    # of putting NaN into the mask, which boolean indexing refuses to accept.
    matches = strain_data['Strain'].str.contains(name, na=False)
    return strain_data['Description'][matches]

In [None]:
def find_descrip(word):
    """Return every row of ``strain_data`` whose description contains ``word``.

    Parameters
    ----------
    word : str
        Pattern searched for in the 'Description' column. The match is
        case-sensitive and, as is the default for ``Series.str.contains``,
        the pattern is interpreted as a regular expression.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with all of their columns.
    """
    # na=False: rows whose description is missing count as "no match" instead
    # of putting NaN into the mask, which boolean indexing refuses to accept.
    matches = strain_data['Description'].str.contains(word, na=False)
    return strain_data[matches]

### Cleeeeaaaaning Time

In [None]:
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, and under pandas' Copy-on-Write
# behaviour the in-place fill happens on a temporary object and never reaches
# strain_data.
strain_data['Flavor'] = strain_data['Flavor'].fillna('Unknown')

In [None]:
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, and under pandas' Copy-on-Write
# behaviour the in-place fill happens on a temporary object and never reaches
# strain_data.
strain_data['Description'] = strain_data['Description'].fillna('None')

In [None]:
# Replace hyphens in strain names with spaces, then turn 'Ar 4' back into 'Ar-4'.
strain_data['Strain'] = (
    strain_data['Strain']
    .str.replace('-', ' ')
    .str.replace('Ar 4', 'Ar-4')
)

In [None]:
# Plain Python list of the cleaned strain names, in row order.
list_of_strains = strain_data['Strain'].tolist()

## Create a New DataFrame to Create New Columns 

In [None]:
# Start the feature frame from the two columns that are carried over unchanged.
strain_breakdown = strain_data[['Strain', 'Rating']].copy()

### Split The Strain Types

In [None]:
# Create dummy variables for Sativa, Indica, and Hybrid.

# Title-case the labels so the generated column names read 'Type: Hybrid', etc.
strain_breakdown['Type'] = strain_data['Type'].str.title()

# Let get_dummies derive the column names from the values actually present
# instead of overwriting all names with a fixed five-item list, which mislabels
# columns if the category order differs and raises if the category count does.
# dtype=int keeps these as 0/1 integers, like the other indicator columns.
strain_breakdown = pd.get_dummies(
    strain_breakdown,
    columns=['Type'],
    prefix='Type',
    prefix_sep=': ',
    dtype=int,
)

### Effect Columns

In [None]:
# Since there are only 12 effect types listed, they are simply enumerated here.
effect_types = [
    'Creative', 'Energetic', 'Tingly', 'Euphoric', 'Relaxed', 'Aroused',
    'Happy', 'Uplifted', 'Hungry', 'Talkative', 'Sleepy', 'Focused',
]

# One 0/1 indicator column per effect, in the order listed above.
for effect in effect_types:
    # na=False: a strain with no listed effects gets 0 rather than a NaN that
    # astype(int) cannot convert.
    strain_breakdown['Effect: ' + effect] = (
        strain_data['Effects'].str.contains(effect, na=False).astype(int)
    )

### Flavor Columns

In [None]:
# # The Flavor Column was a bit more complex, so a for loop was needed to extract all the words used.

# flavor_options = []

# for i in range(len(strain_data)):
#     flavors = strain_data['Flavor'].iloc[i,]
#     flavors = flavors.split(',')
#     for flav in flavors:
#         if flav not in flavor_options:
#             flavor_options.append(flav)
#         else:
#             pass

In [None]:
# print (flavor_options, end=' ')
# # I'm about 50/50 on using these, because I don't know how important taste is to the average user, but here they are.

In [None]:
# for flavors in flavor_options:
#     strain_breakdown['Terpenes: ' + flavors] = (strain_data['Flavor'].str.contains(flavors)).astype(int)
#     # Terpenes are the technical term

### Medical Uses Columns

In [None]:
# This may not be the most effective approach, but the plan is to search the
# description column for medical reasons to use marijuana.

# Lower-case the text once so that every keyword is matched case-insensitively.
# (Previously about half of the searches ran on the raw text and missed
# capitalised words such as a sentence-initial "Insomnia".)
description_lower = strain_data['Description'].str.lower()

# Keywords are lower-case regular-expression fragments; r'\b' anchors a
# fragment to the start of a word.
insomnia = ['sleep', 'insomnia', 'awake']
nausea = ['nausea', 'chemo', 'cancer']
seizure = ['epilep', 'seiz']
anxiety = ['anxi', 'panic']
# r'\beat' rather than 'eat': the bare substring also matched "great",
# "treat", "heat" and similar words.
appetite = ['weigh', 'appetite', r'\beat', 'anorexia']
# 'crohn' added: 'cronh' is a misspelling and cannot match "Crohn's".
cronhs_disease = ['cronh', 'crohn', 'digest']
# 'opioid' added: 'opiod' is a misspelling and cannot match "opioid".
substance_abuse = ['addict', 'alcoho', 'opiod', 'opioid']

# Column name -> keyword fragments, in the order the columns are created.
medical_keywords = {
    # r'\bpain' instead of ' pain' so a description that starts with the word
    # is matched too.
    'Medical: Pain': [r'\bpain'],
    'Medical: Insomnia': insomnia,
    'Medical: Nausea': nausea,
    'Medical: Seizure': seizure,
    'Medical: Anxiety': anxiety,
    'Medical: Muscle Spasms': ['muscle spas'],
    'Medical: Appetite': appetite,
    'Medical: Stress': ['stress'],
    'Medical: Depression': ['depress'],
    "Medical: Crohn's Disease": cronhs_disease,
    'Medical: Substance Abuse': substance_abuse,
    'Medical: PTSD': ['ptsd'],
    'Medical: Arthritis': ['arthritis'],
    'Medical: Fatigue': ['fatigue'],
}

for column, keywords in medical_keywords.items():
    # A row is flagged (1) when any fragment matches; missing text counts as 0.
    strain_breakdown[column] = (
        description_lower
        .str.contains('|'.join(keywords), regex=True, na=False)
        .astype(int)
    )

# strain_breakdown['Medical: '] = (strain_data['Description'].str.lower().str.contains((''))).astype(int)
# strain_breakdown['Medical: '] = (strain_data['Description'].str.lower().str.contains((''))).astype(int)
# strain_breakdown['Medical: '] = (strain_data['Description'].str.lower().str.contains((''))).astype(int)
# strain_breakdown['Medical: '] = (strain_data['Description'].str.lower().str.contains((''))).astype(int)
# strain_breakdown['Medical: '] = (strain_data['Description'].str.lower().str.contains((''))).astype(int)

# Below Sections May Be Replaced With an NLP

### Parent Columns

In [None]:
# List of common parent strains used as keywords to search for in the description column.
# Each name is listed exactly once (a duplicate 'Super Silver Haze' entry was removed).

parent_strains = [

    'Aceh', 'Hindu Kush', 'OG Kush', 'Sour Diesel', 'Granddaddy Purple', 'Northern Lights',
    'Durban Poison', 'Bubba Kush', 'Pre-98 Bubba Kush', 'Jack Herer', 'Blue Dream', 'Trainwreck',
    'Hawaiian', 'Amnesia', 'Super Silver Haze', 'OG Badazz', 'Ms. Universe', 'LSD', 'Banana OG',
    'White Widow', 'Nepali OG', 'Afgoo', 'Appalachia', 'Harlequin', 'Jack the Ripper', 'Pennywise',
    'Lilly', 'Headband', 'Snowdawg', 'Snow Lotus', 'Green Crack', 'Alien OG', 'Alien Kush', 'Alien Dawg',
    'Chernobyl', 'Elephant', 'Apollo 13', 'Space Queen', "Jack’s Cleaner",
    'LA Confidential', 'Maui Wowie', 'Gorilla Glue', 'AK-47', 'Blue Cheese', 'Deep Chunk', 'G13',
    "Rare Dankness #1", 'Skywalker', 'Skywalker OG', 'Master Kush', 'SFV OG', 'Gooberry', 'Hell’s Angel OG',
    'Captain Krypt', 'Sour Bubble', 'Shiva', 'Blue Moonshine', 'Ortega', 'Fire OG', 'Green Ribbon',
    'Triangle Kush', 'Georgia Pine', 'Shishkaberry', 'Great White Shark', 'Mazar I Sharif',

    # Shorter, more generic names.
    'Nepalese', 'Afghani', 'Thai', 'Skunk', 'The White', 'Blackberry', 'Cheese',

]


# These are for strains that will need the first letter to be capitalized, since the word appears in other ways.
# (Currently empty.)
title_parent_strains = [

]

In [None]:
# One 0/1 'Parent: <name>' column per entry of parent_strains, set when the
# name appears (case-sensitively) in the strain's description.
for parents in parent_strains:
    # regex=False: the names are literal text, and some contain regex
    # metacharacters (e.g. the '.' in 'Ms. Universe' would match any character).
    # na=False: a missing description counts as "not mentioned".
    strain_breakdown['Parent: ' + parents] = (
        strain_data['Description']
        .str.contains(parents, regex=False, na=False)
        .astype(int)
    )

In [None]:
def _mentions_any(text, needles):
    """Return True when at least one of ``needles`` is a substring of ``text``."""
    for needle in needles:
        if needle in text:
            return True
    return False


# Girl Scout Cookies: searched case-sensitively in the raw description.
gsc = ['Girl Scout Cookie', 'GSC']
gsc_hits = strain_data['Description'].apply(_mentions_any, args=(gsc,))
strain_breakdown['Parent: Girl Scout Cookies'] = gsc_hits.astype(int)

# Chemdawg 91: searched in the lower-cased description.
chemdawg91 = ['chemdawg ‘91', 'chemdawg 91']
chemdawg91_hits = strain_data['Description'].str.lower().apply(_mentions_any, args=(chemdawg91,))
strain_breakdown['Parent: Chemdawg 91'] = chemdawg91_hits.astype(int)

# Chemdawg under any of its spellings: searched in the lower-cased description.
chemdawg = ['chemdawg', 'chem dawg', 'chem dog']
chemdawg_hits = strain_data['Description'].str.lower().apply(_mentions_any, args=(chemdawg,))
strain_breakdown['Parent: Chemdawg'] = chemdawg_hits.astype(int)

### Other Descriptor Columns

In [None]:
# THC/CBD and Hybrid Details

# Every phrase below is a plain substring searched in the lower-cased description.
description_lower = strain_data['Description'].str.lower()

sativa_dom = ['sativa-dom', 'sativa dom']

indica_dom = ['indica-dom', 'indica dom']

# 'cbd-rich' added: the original list only had the misspelling 'cdb-rich', so
# the correctly spelled phrase was never matched. The misspelling is kept in
# case it occurs in the data.
# NOTE(review): 'cbd levels' is also a prefix of the two 'cbd levels of N%'
# phrases in low_cbd, so such descriptions are flagged both High and Low CBD.
high_cbd = ['high cbd', 'high-cbd', 'cbd rich', 'cdb-rich', 'cbd-rich', 'higher cbd', 'cbd heavy', 'best cbd',
            'cbd levels', 'robust cbd', 'uplifting cbd', 'higher levels of cbd']

high_thc = ['high thc', 'high-thc', 'thc power', 'high level of thc', 'high levels of thc', 'thc content over',
            'thc content passes', 'thc content of up', 'high tolerance to thc', 'thc levels over', 'noted thc']

# 'weak cbd' added for the same reason: only the misspelling 'weak cdb' was listed.
low_cbd = ['low cbd', 'low-cbd', 'weak cdb', 'weak cbd', 'weak-cbd', 'subtle cbd', 'cbd levels of 2%', 'cbd levels of 3%']

low_thc = ['low thc', 'low-thc', 'weak thc', 'weak-thc']

# Column name -> phrases, in the order the columns are created.
descriptor_keywords = {
    'Descriptor: Sativa Dominant': sativa_dom,
    'Descriptor: Indica Dominant': indica_dom,
    'Descriptor: 50/50 Hybrid': ['50/50 hybrid'],
    'Descriptor: High CBD': high_cbd,
    'Descriptor: High THC': high_thc,
    'Descriptor: Low CBD': low_cbd,
    'Descriptor: Low THC': low_thc,
}

for column, keywords in descriptor_keywords.items():
    # A row is flagged (1) when any of the phrases occurs in its description.
    strain_breakdown[column] = (
        description_lower.apply(lambda text: any(word in text for word in keywords))
    ).astype(int)


# one_to_one = ['1:1 CBD/THC', '1:1 THC/CBD', '1:1 THC-CBD', '1:1 CBD-THC', '1:1 ratio of CBD:THC', '1:1 ratio of THC:CBD', 'balanced ratio']
# strain_breakdown['Descriptor: 1:1 CBD/THC'] = (strain_data['Description'].str.lower().apply(lambda x: any(word in x for word in one_to_one))).astype(int)

In [None]:
# Terpene names mentioned in the description (case-insensitive substring search),
# one 0/1 'Descriptor: <name>' column each.
lowercase_descriptions = strain_data['Description'].str.lower()

for terpene in ('Caryophyllene', 'Limonene', 'Humulene', 'Ocimene',
                'Terpinolene', 'Myrcene', 'Pinene', 'Linalool'):
    strain_breakdown['Descriptor: ' + terpene] = (
        lowercase_descriptions.str.contains(terpene.lower()).astype(int)
    )

In [None]:
# Other Keywords

# Every keyword below is a plain substring searched in the lower-cased description.
description_lower = strain_data['Description'].str.lower()

# 'high tolerance' / 'high-tolerance' added: the original list only had the
# misspelling 'tolerence', so the correctly spelled phrase was never matched.
# The misspelled forms are kept in case they occur in the data.
potent = ['potent', 'strong', 'high tolerence', 'high-tolerence', 'high tolerance', 'high-tolerance']

body_high = ['body']

head_high = ['cerebral', 'head']

creative = ['creative', 'inspired']

# Column name -> keywords, in the order the columns are created.
other_keywords = {
    'Descriptor: Kush': ['kush'],
    'Descriptor: Potent': potent,
    'Descriptor: Body High': body_high,
    'Descriptor: Head High': head_high,
    'Descriptor: Daytime': ['day'],
    'Descriptor: Nighttime': ['night'],
    'Descriptor: Outside': ['outside'],
    'Descriptor: Creative': creative,
    'Descriptor: Psychedelic': ['psychedelic'],
    'Descriptor: Lazy': ['lazy'],
    'Descriptor: Calm': ['calm'],
}

for column, keywords in other_keywords.items():
    # A row is flagged (1) when any of the keywords occurs in its description.
    strain_breakdown[column] = (
        description_lower.apply(lambda text: any(word in text for word in keywords))
    ).astype(int)

# The Above Sections May Be Replaced With an NLP

# Not Enough Strains had the CBD to THC Ratio in the Description for it to Help, but I'll Keep this Here for Future Reference

In [None]:
# thc_cbd = pd.DataFrame(strain_data['Strain'])

# def find_ratio(string, name):
#     df = strain_data['Description'].str.extract(string)
#     df['CBD'].fillna(1, inplace=True)
#     df['THC'].fillna(0, inplace=True)
#     thc_cbd[name] = df['THC'].astype(int) / df['CBD'].astype(int)

In [None]:
# Credit to User jlesueur, https://stackoverflow.com/questions/58883944/extracting-specific-numbers-from-text-data?noredirect=1#comment104035437_58883944

# find_ratio(r'THC:CBD ratio of about (?P<THC>\d+):(?P<CBD>\d+)', 'thc_cbd_1')
# find_ratio(r'THC:CBD ratio of (?P<THC>\d+):(?P<CBD>\d+)', 'thc_cbd_2')
# find_ratio(r'THC/CBD ratio of about (?P<THC>\d+):(?P<CBD>\d+)', 'thc_cbd_3')
# find_ratio(r'THC/CBD ratio of (?P<THC>\d+):(?P<CBD>\d+)', 'thc_cbd_4')
# find_ratio(r'(?P<THC>\d+):(?P<CBD>\d+) THC:CBD', 'thc_cbd_5')
# find_ratio(r'(?P<THC>\d+):(?P<CBD>\d+) THC/CBD', 'thc_cbd_6')

# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) CBD/THC', 'cbd_thc_1')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) CBD:THC', 'cbd_thc_2')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) CBD to THC', 'cbd_thc_3')
# find_ratio(r'CBD to THC ratio of (?P<CBD>\d+):(?P<THC>\d+)', 'cbd_thc_4')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) ratio of CBD to THC', 'cbd_thc_5')
# find_ratio(r'CBD:THC ratio of (?P<CBD>\d+):(?P<THC>\d+)', 'cbd_thc_6')
# find_ratio(r'(?P<CBD>\d+)-to-(?P<THC>\d+) CBD:THC', 'cbd_thc_7')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) 59:1 (CBD:THC)', 'cbd_thc_8')
           
           
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')
# find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+)')


# one_to_one = ['1:1 CBD/THC', '1:1 THC/CBD', '1:1 THC-CBD', '1:1 CBD-THC', '1:1 ratio of CBD:THC', '1:1 ratio of THC:CBD'
#               '1:1 ratio of THC:CBD', 'BD:THC ratio can range anywhere from 1:1 – 2:1', 'balanced THC/CBD',
#               '1:1 ratio of CBD to THC', '1:1 ratio of THC to CBD', '1:1 ratio of THC and CBD', '1:1 ratio of CBD and THC',
#               'THC to CBD ratio typically comes out 1:1', '1:1 CBD-THC', 'balanced 1:1 cannabinoid', 'balanced THC:CBD ratio']
# thc_cbd['Equal'] = (strain_data['Description'].apply(lambda x: any(word in x for word in one_to_one))).astype(int)

In [None]:
# X = thc_cbd.drop(axis=1, columns=['Strain', 'Equal'])

In [None]:
# thc_cbd['Total'] = X.sum(axis=1)

In [None]:
# for strain in thc_cbd:
#     if thc_cbd['Equal'] != 0:
#         thc_cbd['Total'] = 1

In [None]:
# thc_cbd[thc_cbd['Total'] != 0].shape

## Set Strain Name As Index

In [None]:
# Use the strain name as the row label; every remaining column is a feature.
strain_frame = strain_breakdown.set_index('Strain')

In [None]:
# Persist the feature frame (strain names are written as the index column).
strain_frame.to_csv('Data/Strain_Frame.csv')

## Now Let Unsupervised Learning Handle This

In [None]:
# Summarise arrays with more than 1000 elements when printing. The 'suppress'
# setting is left as it is (passing None for an option does not change it).
np.set_printoptions(threshold=1000)

In [None]:
# X = strain_frame

In [None]:
# pivot_table with only an index groups the rows by strain name and aggregates
# the other columns (pandas' default aggregation is the mean), so each strain
# name ends up as exactly one row.
pivot = strain_frame.pivot_table(index='Strain')

In [None]:
# Replace missing values with 0 and store the feature matrix in SciPy's CSR sparse format.
pivot_sparse = sparse.csr_matrix(pivot.fillna(0))

In [None]:
# Cosine similarity between every pair of strains, expressed as a percentage
# (100 * (1 - cosine distance)) and rounded to two decimals.
cosine_distance = pairwise_distances(pivot_sparse, metric='cosine')
similarity_percent = 100 * (1 - cosine_distance)
recommender = similarity_percent.round(decimals=2).astype(float)

In [None]:
# Label both axes of the similarity matrix with the strain names so a strain's
# similarities can be looked up by name.
recommender_df = pd.DataFrame(recommender, index=pivot.index, columns=pivot.index)

In [None]:
# recommender_df.to_csv('Data/Strain Recommender.csv') # This file is too big to save in Git.

In [None]:
# Display the dimensions of the similarity matrix.
recommender.shape

### Here we write out our function

In [None]:
# with open('Data/Strain_Dictionary.txt', 'w') as f:
#     f.writelines('\n'.join(list_of_strains))

In [None]:
# Map every strain name to the value 1 and save the mapping as JSON; the file
# is then handed to the spell checker as its local dictionary.
Strain_List = dict.fromkeys(list_of_strains, 1)

with open('Data/Strain_List.json', 'w') as strain_list_file:
    json.dump(Strain_List, strain_list_file)

In [None]:
# Spell checker loaded with the strain-name JSON written to Data/Strain_List.json
# as its local dictionary; used to suggest a name when a lookup fails.
sc = SpellChecker(local_dictionary='Data/Strain_List.json')

In [None]:
def suggestion(strain):
    """Return the ten strains most similar to ``strain``.

    Parameters
    ----------
    strain : str
        Strain name in any letter case; it is title-cased before being looked
        up in ``recommender_df``.

    Returns
    -------
    pandas.Series or str
        On success, a Series indexed by strain name whose values are the
        similarity scores formatted as strings ending in '%', highest first
        (a header line is also printed). If the name is unknown, a message
        string instead: either a "did you mean" hint based on the spell
        checker ``sc`` or a generic not-found message.
    """
    strain_title = strain.title()
    try:
        similarities = recommender_df[strain_title]
    except KeyError:
        # Only an unknown name is handled here; any other error should surface
        # rather than be reported as "not found".
        guess = sc.correction(strain)
        # The spell checker may have no candidate at all (None); do not offer that.
        if guess is not None and guess != strain:
            return (f'Not Found. Did you mean {guess}?')
        return( '''Strain Not Found. 
If you searched the full strain name, try just the initials.''')

    # Print the header only once the strain is known to exist.
    print (f"Strains similar to {strain.upper()} include ")
    # Remove the strain itself by label instead of skipping the first sorted
    # entry: when other strains tie at 100%, the strain is not guaranteed to
    # sort first, and slicing [1:11] could then drop a real match and list the
    # strain as similar to itself.
    result = (
        similarities
        .drop(labels=strain_title)
        .sort_values(ascending=False)
        .head(10)
    )
    return result.astype(str).map(lambda x: x + '%')

#### Testing Time

In [None]:
# Manual check: look up a strain by its initials.
suggestion('gsc')