In [11]:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
import cPickle as pk
import os
from pymongo import MongoClient
import threading
import requests
from datetime import datetime
from scrape_leafly import scrape_strainlist

# Names of the MongoDB database and of the collection first bound to `coll`.
DB_NAME, COLLECTION_NAME = "leafly", "reviews"

# Connect with pymongo's default connection settings; `db` and `coll` are
# read as globals by the cells below.
client = MongoClient()
db = client.get_database(DB_NAME)
coll = db.get_collection(COLLECTION_NAME)

In [12]:
# Cache file for the strain listing page, plus the site URLs used below.
strain_page_file = 'leafly_strains_page.pk'
base_url = 'https://www.leafly.com'
strain_url = '%s/explore/sort-alpha' % base_url

# One PhantomJS WebDriver session shared (as a global) by the scraping cells.
# NOTE(review): newer selenium releases no longer ship PhantomJS support -- confirm the pinned version.
driver = webdriver.PhantomJS()
driver.set_window_size(width=1920, height=1080)

In [17]:
# Load the cached strain-listing page when the cache file exists; otherwise
# let scrape_strainlist() produce the soup (it is handed the cache file name).
# NOTE(review): unpickling can execute arbitrary code -- only load cache files
# that this project wrote itself.
if os.path.exists(strain_page_file):
    # Pickle data is bytes: open in binary mode, and use a context manager so
    # the handle is closed even if loading fails.
    with open(strain_page_file, 'rb') as cache_file:
        strain_page = pk.load(cache_file)
    strain_soup = bs(strain_page, 'lxml')
else:
    strain_soup = scrape_strainlist(strain_page_file)

# get list of strain pages (relative hrefs) from both kinds of anchor tags
strains = [
    tile.get('href')
    for tile in (strain_soup.findAll('a', {'class': 'ga_Explore_Strain_Tile'})
                 + strain_soup.findAll('a', {'class': 'ng-scope'}))
]

# Persist the href list once; an existing file is never overwritten.
strain_pages_file = 'strain_pages_list.pk'
if not os.path.exists(strain_pages_file):
    # Protocol 2 is a binary pickle format, so the file must be opened 'wb'.
    with open(strain_pages_file, 'wb') as out_file:
        pk.dump(strains, out_file, 2)


In [40]:
def check_if_strains_uptodate():
    '''
    scrapes leafly main page to check if any new strains have been added
    '''
    strain_len = len(strains)
    print 'currently have', strain_len, 'strains'
    driver.get(strain_url)
    alpha_sort_soup = bs(driver.page_source, 'lxml')
    cur_strains = int(alpha_sort_soup.findAll('strong', {'class':'ng-binding'})[0].get_text())
    print 'found', cur_strains, 'strains on leafly'
    
    

In [41]:
# compare the local strain count with the count shown on the live listing page
check_if_strains_uptodate()

currently have 2122 strains
found 2115 strains on leafly


In [7]:
# relative link (href) of the first strain tile in the listing
strain_soup.findAll('a', {'class': 'ga_Explore_Strain_Tile'})[0].get('href')

'/hybrid/100-og'

In [8]:
# absolute URL of that first strain: site root + relative href
base_url + strain_soup.findAll('a', {'class': 'ga_Explore_Strain_Tile'})[0].get('href')

'https://www.leafly.com/hybrid/100-og'

In [9]:
# URL of the first reviews page (page=0) for the first strain tile in the listing.
reviews_page = ''.join([
    base_url,
    strain_soup.findAll('a', {'class': 'ga_Explore_Strain_Tile'})[0].get('href'),
    '/reviews?page=0',
])

In [10]:
# load the reviews page in the shared selenium driver
driver.get(reviews_page)

In [11]:
# Parse whatever page the driver currently holds. This global `soup` is what
# scrape_reviews_page and scrape_reviews_page_threads read the review total from.
soup = bs(driver.page_source, 'lxml')

In [26]:
# show the reviews URL that was loaded
reviews_page

'https://www.leafly.com/hybrid/100-og/reviews?page=0'

In [13]:
# every <li> with this exact class string is treated as one review entry
reviews = soup.findAll('li', {'class': 'page-item divider bottom padding-listItem'})

In [14]:
# reviewer's user name: text of the first <a class="no-color"> in the review
reviews[0].findAll('a', {'class': 'no-color'})[0].get_text()

u'TrippyLama'

In [15]:
# star rating, read as a string from the span's `star-rating` attribute
reviews[0].findAll('span', {'class': 'squeeze ng-isolate-scope'})[0].get('star-rating')

'4'

In [16]:
# review body; [1:-1] drops the first and last character of the paragraph text
# (presumably wrapping whitespace or quotes -- TODO confirm against the raw HTML)
reviews[0].findAll('p', {'class': 'copy--xs copy-md--md'})[0].get_text()[1:-1]

u'I like the high makes me feel motivation'

In [17]:
# review timestamp, taken from the <time> tag's `datetime` attribute
reviews[0].findAll('time', {'class': 'copy--xs copy-md--sm timestamp pull-right hidden-xs hidden-sm'})[0].get('datetime')

'2016-10-03 23:48:46Z'

In [25]:
# total review count: text of the first span.hidden-xs with the surrounding
# parentheses stripped, converted to int
int(soup.findAll('span', {'class': 'hidden-xs'})[0].get_text().strip('(').strip(')'))

257

In [69]:
def scrape_reviews_page(url, verbose=True):
    '''
    scrapes reviews page for all reviews
    url is a string for the specified strain homepage
    
    returns list of reviews
    each review consist of a tuple of (user, stars, review_text, datetime_of_review)
    '''
    # num photos is index 1
    num_reviews = int(soup.findAll('span', {'class': 'hidden-xs'})[0].get_text().strip('(').strip(')'))
    print num_reviews, 'total reviews to scrape'
    reviews = []
    i = 0
    while len(reviews) < num_reviews:
        cur_url = url + '/reviews?page=' + str(i)
        if verbose:
            print 'scraping', cur_url
        i += 1
        driver.get(cur_url)
        rev_soup = bs(driver.page_source, 'lxml')
        reviews_soup = rev_soup.findAll('li', {'class': 'page-item divider bottom padding-listItem'})
        if verbose:
            print len(reviews_soup), 'reviews on page'
        for r in reviews_soup:
            user = r.findAll('a', {'class': 'no-color'})[0].get_text()
            stars = r.findAll('span', {'class': 'squeeze ng-isolate-scope'})[0].get('star-rating')
            text = r.findAll('p', {'class': 'copy--xs copy-md--md'})[0].get_text()[1:-1]
            date = r.findAll('time', \
                             {'class': \
                              'copy--xs copy-md--sm timestamp pull-right hidden-xs hidden-sm'}) \
                                [0].get('datetime')
            reviews.append((user, stars, text, date))

    return reviews

In [134]:
def scrape_a_review_page(url, verbose=True):
    '''
    scrapes review page and puts all the info in a mongodb, because it is unordered
    '''
    res = requests.get(url)
    rev_soup = bs(res.content, 'lxml')
    reviews_soup = rev_soup.findAll('li', {'class': 'page-item divider bottom padding-listItem'})
    if verbose:
        print len(reviews_soup), 'reviews on page'
    for r in reviews_soup:
        user = r.findAll('a', {'class': 'no-color'})[0].get_text()
        stars = r.findAll('span', {'class': 'squeeze'})[0].get('star-rating')
        text = r.findAll('p', {'class': 'copy--xs copy-md--md'})[0].get_text()[1:-1]
        date = r.findAll('time', \
                         {'class': \
                          'copy--xs copy-md--sm timestamp pull-right hidden-xs hidden-sm'}) \
                            [0].get('datetime')
        
        datadict = {}
        datadict['user'] = user
        datadict['stars'] = stars
        datadict['text'] = text
        datadict['date'] = date
        coll.insert_one(datadict)

In [130]:
# total review count parsed from the global `soup`
# (text of the first span.hidden-xs, parentheses stripped)
num_reviews = int(soup.findAll('span', {'class': 'hidden-xs'})[0].get_text().strip('(').strip(')'))
num_reviews

257

In [131]:
# Python 2 integer division (floors): count of full pages, presumably at 8 reviews per page
num_reviews/8

32

In [197]:
def scrape_reviews_page_threads(url, genetics, verbose=True):
    '''
    scrapes reviews page for all reviews
    url is a string for the specified strain homepage
    
    returns list of reviews
    each review consist of a tuple of (user, stars, review_text, datetime_of_review)
    '''
    # num photos is index 1
    num_reviews = int(soup.findAll('span', {'class': 'hidden-xs'})[0].get_text().strip('(').strip(')'))
    print num_reviews, 'total reviews to scrape'
    pages = num_reviews / 8
    scrapetime = datetime.utcnow().isoformat()
    threads = []
    if coll.find({'genetics': genetics}).count() < 1:
        coll.insert_one({'genetics': genetics})
    if coll.find({'name': 'scrape_datetime'}).count() < 1:
        coll.insert_one({'scrape_times': scrapetime})
        coll.insert_one({'review_count': num_reviews})
    else:
        coll.update_one({'scrape_times': {'$exists': true}}, {'$push': {'scrape_times': scrapetime}})
        coll.update_one({'review_count': {'$exists': true}}, {'$push': {'review_count': num_reviews}})
    if coll.find({'review_count':{'$exists':'true'}}).next()['review_count'] == num_reviews:
        print 'already up-to-date'
        return
    for i in range(pages + 1):
        cur_url = url + '/reviews?page=' + str(i)
        if verbose:
            print 'scraping', cur_url
        #scrape_a_review_page(cur_url)
        t = threading.Thread(target=scrape_a_review_page, args=(cur_url,))
        t.start()
        threads.append(t)
    for th in threads:
        th.join()

In [172]:
# number of documents in `coll` whose 'genetics' field equals `genetics`
coll.find({'genetics': genetics}).count()

1

In [149]:
# Rebind the global `coll` to a collection named after the strain.
# NOTE(review): `strain` is assigned in a cell further down this notebook, so
# this cell fails on a fresh top-to-bottom run.
coll = db[strain]

In [150]:
# show which collection `coll` currently points to
coll

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'leafly'), u'100-og')

In [158]:
# number of documents in the current `coll` whose 'genetics' field equals `genetics`
coll.find({'genetics': genetics}).count()

1

In [116]:
# Absolute URL of the first strain tile's page (site root + relative href).
reviews_page_100og = ''.join([
    base_url,
    strain_soup.findAll('a', {'class': 'ga_Explore_Strain_Tile'})[0].get('href'),
])

In [128]:
# The first tile's href has the shape '/<genetics>/<strain>' (e.g. '/hybrid/100-og'),
# so splitting on '/' gives ['', genetics, strain].
genetics = strain_soup.findAll('a', {'class': 'ga_Explore_Strain_Tile'})[0].get('href').split('/')[1]
strain = strain_soup.findAll('a', {'class': 'ga_Explore_Strain_Tile'})[0].get('href').split('/')[2]

In [196]:
# 'review_count' value of the first document in `coll` that has that field
coll.find({'review_count':{'$exists':'true'}}).next()['review_count']

257

In [198]:
# Point the global `coll` at the collection named after the strain, then run
# the threaded scrape, which writes into that collection.
coll = db[strain]
scrape_reviews_page_threads(reviews_page_100og, genetics)

257 total reviews to scrape
already up-to-date


In [None]:
# http://stackoverflow.com/questions/14184099/fastest-way-to-remove-duplicate-documents-in-mongodb
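# The mongo shell command below (JavaScript, not Python) would drop duplicate
# reviews by building a unique index on the review text: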
#db['100-og'].ensureIndex({text: 1}, {unique: true, dropDups: true}) 

In [None]:
# scrapes synchronously...very slow
print 'scraping', reviews_page_100og
reviews_list = scrape_reviews_page(reviews_page_100og)