In [2]:
import numpy as np
import pandas as pd
import nltk
#nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [3]:
# Load the raw strain table (one row per strain) from the project-relative
# data directory, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='./data/cannabis.csv')
df.head(n=5)

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [4]:
# Summarise column dtypes and non-null counts to see which columns have
# missing values before choosing features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 6 columns):
Strain         2351 non-null object
Type           2351 non-null object
Rating         2351 non-null float64
Effects        2351 non-null object
Flavor         2305 non-null object
Description    2318 non-null object
dtypes: float64(1), object(5)
memory usage: 110.3+ KB


In [5]:
# Build the modelling frame: discard the columns that are not used below and
# turn the comma-separated Effects string into a space-separated one so it
# can be treated as a short text document.
clean_df = (
    df
    .drop(columns=['Strain', 'Rating', 'Flavor', 'Description'])
    .assign(Effects=lambda frame: frame["Effects"].str.replace(",", " "))
)
clean_df.head()

Unnamed: 0,Type,Effects
0,hybrid,Creative Energetic Tingly Euphoric Relaxed
1,hybrid,Relaxed Aroused Creative Happy Energetic
2,sativa,Uplifted Happy Relaxed Energetic Creative
3,hybrid,Tingly Creative Hungry Relaxed Uplifted
4,hybrid,Happy Relaxed Euphoric Uplifted Talkative


In [6]:
# Split every Effects string into a list of word tokens with NLTK and keep
# the lists in a new Tokens column.
effect_tokens = clean_df['Effects'].map(word_tokenize)
clean_df['Tokens'] = effect_tokens
clean_df.head()

Unnamed: 0,Type,Effects,Tokens
0,hybrid,Creative Energetic Tingly Euphoric Relaxed,"[Creative, Energetic, Tingly, Euphoric, Relaxed]"
1,hybrid,Relaxed Aroused Creative Happy Energetic,"[Relaxed, Aroused, Creative, Happy, Energetic]"
2,sativa,Uplifted Happy Relaxed Energetic Creative,"[Uplifted, Happy, Relaxed, Energetic, Creative]"
3,hybrid,Tingly Creative Hungry Relaxed Uplifted,"[Tingly, Creative, Hungry, Relaxed, Uplifted]"
4,hybrid,Happy Relaxed Euphoric Uplifted Talkative,"[Happy, Relaxed, Euphoric, Uplifted, Talkative]"


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the Type strings as integer class ids. The fitted encoder `le` is
# kept because its classes_ array is exported further down.
le = LabelEncoder()
clean_df["Target"] = le.fit_transform(clean_df["Type"])
clean_df.head()

Unnamed: 0,Type,Effects,Tokens,Target
0,hybrid,Creative Energetic Tingly Euphoric Relaxed,"[Creative, Energetic, Tingly, Euphoric, Relaxed]",0
1,hybrid,Relaxed Aroused Creative Happy Energetic,"[Relaxed, Aroused, Creative, Happy, Energetic]",0
2,sativa,Uplifted Happy Relaxed Energetic Creative,"[Uplifted, Happy, Relaxed, Energetic, Creative]",2
3,hybrid,Tingly Creative Hungry Relaxed Uplifted,"[Tingly, Creative, Hungry, Relaxed, Uplifted]",0
4,hybrid,Happy Relaxed Euphoric Uplifted Talkative,"[Happy, Relaxed, Euphoric, Uplifted, Talkative]",0


In [8]:
# Learn the TF-IDF vocabulary/weights from the Effects text and encode every
# row in one step. Note the vectorizer is fitted on all rows, i.e. before the
# train/test split performed in the next cell.
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(clean_df['Effects'])

In [9]:
from sklearn.model_selection import train_test_split

# Features are the TF-IDF encoded Effects; the target is the integer-encoded Type.
X = vector
y = clean_df['Target']

# Seed the shuffle so the split -- and therefore the accuracy reported below --
# is the same on every Restart & Run All. The seed matches the one used for
# the classifier.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Train a 200-tree random forest on the training split; the seed is fixed so
# the fitted forest is repeatable for a given split.
clf = RandomForestClassifier(random_state=0, n_estimators=200).fit(X_train, y_train)
clf

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [11]:
# Mean accuracy of the fitted forest on the held-out test split.
clf.score(X_test, y_test)

0.5476190476190477

In [12]:
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails, instead of leaving the handle to
# the garbage collector.
filename = "./model/type_model"
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [13]:
# Export the class labels in encoder order. Type is an object column, so
# le.classes_ is an object array that np.save would pickle; casting to a
# string dtype stores plain text that np.load can read with its default
# allow_pickle=False.
np.save('./model/types.npy', le.classes_.astype(str))

In [14]:
# Persist the fitted TF-IDF vectorizer next to the model. The file is opened
# in a context manager so it is closed deterministically.
# NOTE(review): the path spelling ("vertorizer") is kept unchanged because
# any code that loads this artifact presumably uses this exact name.
vex_filename = "./model/type_vertorizer"
with open(vex_filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)