In [89]:
#imports
import os
import pandas as pd
import numpy as np
import string

from sklearn.model_selection import train_test_split

import gensim

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('word_tokenize')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\taylor.vanvalkenburg\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\taylor.vanvalkenburg\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading word_tokenize: Package 'word_tokenize' not
[nltk_data]     found in index
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\taylor.vanvalkenburg\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#load the data
df1 = pd.read_csv('https://raw.githubusercontent.com/kcelotto/3252TermProject/master/cannabis.csv')

In [3]:
#drop rows with missing values
df1 = df1.loc[(df1['Effects'] != 'None') | (df1['Flavor'] != 'None')]

<p>
    Preprocess the Effects and Flavor columns by turning them into arrarys of words
    Not required for the code below, but I've left it in just in case we need it in the future
</p>

In [5]:
#convert dtypes
str_cols = ['Effects',
            'Flavor',
            'Description']

for col in str_cols:
    df1[col] = df1[col].astype(str)

In [6]:
#create the arrays
df1['Effects'] = df1['Effects'].apply(lambda x: x[0:].split(','))
df1['Flavor'] = df1['Flavor'].apply(lambda x: x[0:].split(','))


<H3>
    The following section shows one method of preprocessing the text using scikit-learn's Count Vectorizer and TF-IDF. This is a bag of words model that will count instances of words in the text, and then compute the term frequency-inverse document frequency to assess the importance of words
</H3>

In [160]:
#imports
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [270]:
#define the cleaning function for scikit learn
def clean_text_skl(text):
    #define list of special characters
    special_list = ['$',
                '!'
                '/',
                '%',
                '-',
                "'",
                ",",
                "(",
                ")",
                '""',
                '‘',
                '’',
                '.',
                ':',
                ';',
                '*',
                '“',
                '”']
    
    #make all words lowercase
    clean_skl = text.lower()
    
    #remove special characters
    [char for char in clean_skl if char not in special_list]

    #lemmatize the text
    [WordNetLemmatizer().lemmatize(w) for w in clean_skl]
    
    #remove stopwords
    clean_skl = [token for token in word_tokenize(clean_skl) if not token in stopwords.words('english')]
    clean_skl = ' '.join(clean_skl)
    
    return clean_skl

In [272]:
#clean the description feature
df1['Descr_clean_skl'] = df1['Description'].apply(clean_text_skl)

In [273]:
#instantiate the vectorizer pipeline
vectorizer = CountVectorizer()
tfidf = TfidfTransformer()

pipe = make_pipeline(vectorizer, tfidf)

In [274]:
#transform X
X = pipe.fit_transform(df1['Descr_clean_skl'])

In [275]:
# X = descr_model[descr_model.wv.vocab]
y = df1['Rating']

In [276]:
#split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=54, test_size=0.3)

In [277]:
#Lasso
lass = Lasso()

lass.fit(X_train, y_train)

y_pred_lass = lass.predict(X_test)

In [278]:
#Ridge
ridge = Ridge()

ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_test)

In [283]:
#Decision Tree Regressor
dtr = DecisionTreeRegressor()

dtr.fit(X_train, y_train)

y_pred_dtr = dtr.predict(X_test)

In [284]:
#measure models
from sklearn.metrics import mean_squared_error as MSE

lass_rmse = MSE(y_test, y_pred_lass)**(1/2)
ridge_rmse = MSE(y_test, y_pred_ridge)**(1/2)
dtr_rmse = MSE(y_test, y_pred_dtr)**(1/2)

print('For context, the standard deviation of the target variable is: {}'.format(df1['Rating'].std()))
print('The test rmse for the Lasso model is: {}'.format(lass_rmse))
print('The test rmse for the Ridge model is: {}'.format(ridge_rmse))
print('The test rmse for the Decision Tree model is: {}'.format(dtr_rmse))

For context, the standard deviation of the target variable is: 0.4323196801579272
The test rmse for the Lasso model is: 0.45377982086804636
The test rmse for the Ridge model is: 0.4491493314384868
The test rmse for the Decision Tree model is: 0.5496960788908559


After we have models, we need to plot learning curves for all of them to see which is the best fit

<H3>
    The models trained on the bag of words model created by CountVectorizer all perform poorly and underfit the data. To get past this, gensim's Word2Vec will be used to create word embeddings rather than a bag of words model. This model will preserve context by using the Word2Vec neural network to determine the probability of certain words appearing near other words
</H3>

First, the data will need to be preprocessed

In [260]:
#define the cleaning function
def clean_text(text):
    #define list of special characters
    special_list = ['$',
                '!'
                '/',
                '%',
                '-',
                "'",
                ",",
                "(",
                ")",
                '""',
                '‘',
                '’',
                '.',
                ':',
                ';',
                '*',
                '“',
                '”']
    
    #make all words lowercase
    clean = text.lower()
    
    #remove special characters
    [char for char in clean if char not in special_list]

    #lemmatize the text
    [WordNetLemmatizer().lemmatize(w) for w in clean]
    
    #remove stopwords
    clean_ex = [token for token in word_tokenize(clean) if not token in stopwords.words('english')]

    return clean_ex


In [168]:
#clean the description feature
df1['Descr_clean'] = df1['Description'].apply(clean_text)

In [203]:
#create the word2vec model
w2v_mod = gensim.models.Word2Vec(df1['Descr_clean'].to_list())

In [206]:
w2v_mx = w2v_mod[w2v_mod.wv.vocab]

  """Entry point for launching an IPython kernel.
