In [1]:
import pandas as pd
import numpy as np
import re

import spacy
nlp = spacy.load('en')

import string

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

for the MVP, files stored in '/mvp_reviews'

48 strains x 104 reviews (Leafly is into multiples of 8)

In [2]:
stored_reviews = pd.read_pickle('/home/nate/ds/metis/class_work/projects/project_fletcher/pkl/mvp_reviews.pkl')

# unpack and check raw data

the way I've stored the reviews is in one large list of lists

we'll unpack this, but first a quick peek

In [3]:
len(stored_reviews)

201

let's just look at the first review of the first strain...

we're concerned with formatting, thankfully everything is already text because of the way we scraped it

however, we still have: numbers, punctuation, capitalization, interesting vocabulary

In [4]:
stored_reviews[0][0]

'wow! what a strain! very nice body and head high. would not recommend for getting stuff done as everything moved at 10 frames per second... just stay home and enjoy this amazing strain'

In [5]:
# let's explore a couple different ways to handle cleaning

remove = str.maketrans('', '', string.punctuation)


clean_list = []
for inner in stored_reviews:
    strain = ''
    
    for review in inner:
        review = review.translate(remove) # translate runs on C, so it's fast, really fast
        # join is another quick function, but not quite as efficient, here's an interpretable implementation using a list comprehension
        review = ''.join([i for i in review if not i.isdigit()]) 
        strain += review.lower()
        
    clean_list.append(strain)

now we have text that's ready to see some stronger stuff

In [6]:
clean_list[0][:100]

'wow what a strain very nice body and head high would not recommend for getting stuff done as everyth'

# exploring spacy

spacy is excellent! after spending some time scouring the docs, I came across quite a few practical examples

first let's get all of the documents into a format we can clean and lemmatize

lemmatization is the process of turning words into their root, so ['describe', 'described', 'describes'] all become 'describe'

this first cell where we call nlp is the bulk of the work

cleaning was fast, but we need something we can throw into a model

In [7]:
def noise(token):
    
    '''
    for each token, determine if it's noise
    
    i.e. if it's a stopword, too short, or punctuation
    '''
    
    noise = False
    if token.is_stop == True:
        noise = True
    elif token.is_punct == True:
        noise = True
    elif token.is_digit == True:
        noise = True
    elif token.is_space == True:
        noise = True
    elif len(token) < 3:
        noise = True
    return noise

In [8]:
tokenized_reviews = []

for strain in clean_list:
    tokenized_strain = nlp(strain)
    strain_review = ''
    for token in tokenized_strain:
        if noise(token):
            pass
        else:
            strain_review += str(token.lemma_) + ' '
            
    tokenized_reviews.append(strain_review)

In [9]:
tokenized_reviews[0][:100]

'wow strain nice body head high recommend get stuff move frame second stay home enjoy amazing strainh'

our data is almost ready for some machine learning!

now we need to split every word into it's own place

the result is one massive list of lists (again)

In [10]:
preprocessed = []

for review in tokenized_reviews:
    
    preprocessed.append(review.split())

In [11]:
preprocessed[0][:20]

['wow',
 'strain',
 'nice',
 'body',
 'head',
 'high',
 'recommend',
 'get',
 'stuff',
 'move',
 'frame',
 'second',
 'stay',
 'home',
 'enjoy',
 'amazing',
 'strainhappy',
 'uplift',
 'flower',
 'smoke']

In [12]:
import pickle

pickle_out = open('preprocessed.pkl', 'wb')
pickle.dump(tokenized_reviews, pickle_out)
pickle_out.close()