In [50]:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
import cPickle as pk
import os
from pymongo import MongoClient
import threading
import requests
from datetime import datetime
import scrape_leafly as sl

# Names of the MongoDB database and of the default collection used by this notebook.
DB_NAME = "leafly"
COLLECTION_NAME = "reviews"

# MongoClient() is called with no arguments, so the driver's default connection settings are used.
client = MongoClient()
db = client[DB_NAME]
# NOTE: the per-strain scraping loop further down rebinds `coll` to db[strain].
coll = db[COLLECTION_NAME]

In [8]:
# Pickle file holding the cached strain listing page (it is unpickled and fed to BeautifulSoup below).
strain_page_file = 'leafly_strains_page.pk'
base_url = 'https://www.leafly.com'
# Strain listing URL; also passed to sl.check_if_strains_uptodate below.
strain_url = base_url + '/explore/sort-alpha'

# Single PhantomJS browser instance shared by the scraping cells in this notebook.
# NOTE(review): PhantomJS support is deprecated in newer Selenium releases — confirm the installed version.
driver = webdriver.PhantomJS()
driver.set_window_size(1920, 1080)

In [3]:
# Load the strain list from the cached listing page when possible, and fall
# back to a fresh scrape when there is no cache or the cache is out of date.
uptodate = False
if os.path.exists(strain_page_file):
    # Use a context manager so the cache file handle is always closed.
    # NOTE(review): the file is opened in the default (text) mode, as before;
    # switch to 'rb' if the cache is written with a binary pickle protocol.
    with open(strain_page_file) as cache_file:
        strain_page = pk.load(cache_file)
    strain_soup = bs(strain_page, 'lxml')
    # get list of strain pages
    strains = sl.get_strains(strain_soup)
    # check for newly-added strains
    uptodate = sl.check_if_strains_uptodate(strains, strain_url, driver)

if not uptodate:
    # Either no cache exists or the cached list is stale: re-scrape the
    # listing page and rebuild the strain list.
    strain_soup = sl.scrape_strainlist(strain_page_file)
    strains = sl.get_strains(strain_soup, update_pk=True)

In [69]:
def scrape_reviews_page(url, verbose=True):
    '''
    scrapes reviews page for all reviews
    url is a string for the specified strain homepage
    verbose toggles the per-page progress messages

    returns list of reviews
    each review consist of a tuple of (user, stars, review_text, datetime_of_review, link_to_full_review)

    uses the notebook-level `driver` (selenium) and `bs` (BeautifulSoup) names
    '''
    # Load the strain page explicitly: the review count is read from it, and
    # relying on a leftover global `soup` breaks on a fresh kernel.
    # NOTE(review): assumes the count element is present on the page at `url`
    # (the strain homepage, per the docstring) — confirm.
    driver.get(url)
    soup = bs(driver.page_source, 'lxml')
    # num photos is index 1
    count_text = soup.findAll('span', {'class': 'hidden-xs'})[0].get_text()
    # The count is rendered like "(257)"; also tolerate surrounding whitespace
    # and thousands separators such as "(1,234)".
    num_reviews = int(count_text.strip().strip('()').replace(',', ''))
    print('%d total reviews to scrape' % num_reviews)

    reviews = []
    page = 0
    # NOTE(review): paging starts at 0, as before; if page 0 and page 1 serve
    # the same content this collects duplicates — confirm against the site.
    while len(reviews) < num_reviews:
        cur_url = url + '/reviews?page=' + str(page)
        if verbose:
            print('scraping %s' % cur_url)
        page += 1
        driver.get(cur_url)
        rev_soup = bs(driver.page_source, 'lxml')
        reviews_soup = rev_soup.findAll('li', {'class': 'page-item divider bottom padding-listItem'})
        if verbose:
            print('%d reviews on page' % len(reviews_soup))
        if not reviews_soup:
            # An empty page means there is nothing left to fetch. Without this
            # guard the loop never terminates when fewer reviews are available
            # than the advertised total.
            break
        reviews.extend(_parse_review(item) for item in reviews_soup)

    return reviews


def _parse_review(item):
    '''
    pulls the fields of a single review out of its <li> element

    returns a tuple of (user, stars, review_text, datetime_of_review, link_to_full_review)
    '''
    user = item.findAll('a', {'class': 'no-color'})[0].get_text()
    stars = item.findAll('span', {'class': 'squeeze ng-isolate-scope'})[0].get('star-rating')
    # [1:-1] drops the first and last character of the review text
    text = item.findAll('p', {'class': 'copy--xs copy-md--md'})[0].get_text()[1:-1]
    date_class = 'copy--xs copy-md--sm timestamp pull-right hidden-xs hidden-sm'
    date = item.findAll('time', {'class': date_class})[0].get('datetime')
    review_link = item.findAll('a', {'class': 'copy--xs copy-md--md'})[0].get('href')
    return (user, stars, text, date, review_link)

In [None]:
# Seconds to pause after each strain before starting the next one.
REQUEST_DELAY_SECONDS = 9

# Scrape the reviews of every strain into its own MongoDB collection.
for s in strains:
    # presumably each entry is a site-relative path such as '/hybrid/100-og'
    # (index 1 = genetics, index 2 = strain name) — verify against sl.get_strains
    path_parts = s.split('/')
    genetics = path_parts[1]
    strain = path_parts[2]
    review_page = base_url + s
    # One collection per strain; note this rebinds the notebook-level `coll`.
    coll = db[strain]
    print('scraping %s' % strain)
    sl.scrape_reviews_page_threads(driver, coll, review_page, genetics)
    time.sleep(REQUEST_DELAY_SECONDS)

scraping 100-og
257 total reviews to scrape
scraping https://www.leafly.com/hybrid/100-og/reviews?page=0
scraping https://www.leafly.com/hybrid/100-og/reviews?page=1
scraping https://www.leafly.com/hybrid/100-og/reviews?page=2
scraping https://www.leafly.com/hybrid/100-og/reviews?page=3
scraping https://www.leafly.com/hybrid/100-og/reviews?page=4
scraping https://www.leafly.com/hybrid/100-og/reviews?page=5
scraping https://www.leafly.com/hybrid/100-og/reviews?page=6
scraping https://www.leafly.com/hybrid/100-og/reviews?page=7
scraping https://www.leafly.com/hybrid/100-og/reviews?page=8
scraping https://www.leafly.com/hybrid/100-og/reviews?page=9
scraping https://www.leafly.com/hybrid/100-og/reviews?page=10
scraping https://www.leafly.com/hybrid/100-og/reviews?page=11
scraping https://www.leafly.com/hybrid/100-og/reviews?page=12
scraping https://www.leafly.com/hybrid/100-og/reviews?page=13
scraping https://www.leafly.com/hybrid/100-og/reviews?page=14
scraping https://www.leafly.com/hybr

In [None]:
# http://stackoverflow.com/questions/14184099/fastest-way-to-remove-duplicate-documents-in-mongodb
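# The mongo shell command below drops duplicate reviews by building a unique index on `text`.
# Note: the `dropDups` option was removed in MongoDB 3.0 and `ensureIndex` is deprecated in favor of `createIndex`.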
#db['100-og'].ensureIndex({text: 1}, {unique: true, dropDups: true}) 

In [None]:
# scrapes synchronously...is very slow
# print 'scraping', reviews_page_100og
# reviews_list = scrape_reviews_page(reviews_page_100og)