In [11]:
import os
import pickle
import random
import time
import urllib.request

from bs4 import BeautifulSoup

In [6]:
# Settings used to crawl data from the Memecenter website

# URL template; {year} and {month} are filled in for every page we request
baseUrl = 'http://www.memecenter.com/hall/{year}/{month}'

# Years to crawl: 2013 up to and including 2016
years = range(2013, 2017)

# Lowercase month names, in calendar order, as they are inserted into the URL
months = ('january february march april may june '
          'july august september october november december').split()


## Downloading data

In [9]:
def crawl_single_page(pUrl, timeout=30):
    """Download one page and store its HTML under the ``memecenter`` folder.

    The file name is derived from the URL: every ``/`` is replaced by ``-``
    and only the part after the last ``hall-`` is kept, so a URL ending in
    ``/hall/2013/january`` is stored as ``memecenter/2013-january.html``.

    Errors are not raised. They are reported with an ``[ERROR]`` message so
    that a loop over many pages keeps going.

    Parameters
    ----------
    pUrl : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait on the connection before giving up (default 30).

    Returns
    -------
    None
    """
    print ('Crawling {}'.format(pUrl))
    opener = urllib.request.build_opener()
    # We add header because some websites check for this before responding to request
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    try: # If there is any error, report error message and continue crawling
        # We create file name based on url pattern
        fname = 'memecenter/{}.html'.format(pUrl.replace('/','-').split('hall-')[-1])

        # Download BEFORE touching the output file: if the request fails we
        # must not leave an empty file behind that looks like a crawled page.
        with opener.open(pUrl, timeout=timeout) as response:
            html = response.read().decode('utf-8')

        # Create the output folder on demand instead of requiring it up front
        os.makedirs(os.path.dirname(fname), exist_ok=True)

        # Writing to a file
        with open(fname,'w') as fOut:
            fOut.write(html)
        print ('Stored in {}'.format(fname))
    except Exception as e:
        print ('[ERROR]: {}'.format(e))
    
# Visit the page of every (year, month) combination, one request at a time
for y in years:
    for m in months:
        pageUrl = baseUrl.format(year=y, month=m)
        crawl_single_page(pageUrl)
        # Wait a random fraction of a second before the next request
        pause = random.random()
        time.sleep(pause)

Crawling http://www.memecenter.com/hall/2013/january
Stored in memecenter/2013-january.html
Crawling http://www.memecenter.com/hall/2013/february
Stored in memecenter/2013-february.html
Crawling http://www.memecenter.com/hall/2013/march
Stored in memecenter/2013-march.html
Crawling http://www.memecenter.com/hall/2013/april
Stored in memecenter/2013-april.html
Crawling http://www.memecenter.com/hall/2013/may
Stored in memecenter/2013-may.html
Crawling http://www.memecenter.com/hall/2013/june
Stored in memecenter/2013-june.html
Crawling http://www.memecenter.com/hall/2013/july
Stored in memecenter/2013-july.html
Crawling http://www.memecenter.com/hall/2013/august
Stored in memecenter/2013-august.html
Crawling http://www.memecenter.com/hall/2013/september
Stored in memecenter/2013-september.html
Crawling http://www.memecenter.com/hall/2013/october
Stored in memecenter/2013-october.html
Crawling http://www.memecenter.com/hall/2013/november
Stored in memecenter/2013-november.html
Crawling h

## Working with crawled data

So far we have crawled all the data and stored it under the `memecenter` folder. 

Once all the data is crawled, we can parse it as many times as we like without downloading it again. That is why it's important to crawl the data first, before the analysis.

Let's read the stored data and try to parse it.

## Searching and parsing information

To search for information, I usually use [developer tools](https://developer.chrome.com/devtools) to inspect the HTML content (all good browsers, such as Firefox and Chrome, have them; I don't know about Microsoft's). 

I recommend the same for you: it makes it easier to find information about tags, their location and the available data. Sometimes what you see on the webpage is not all the data in the HTML, and you can obtain more by inspecting it.

Improve your code step by step. First find the outermost element and print it to check that you get the right content. Then try to parse the inner elements.

For instance, Facebook images have an `alt` attribute that carries additional metadata: a list of objects detected in the image ([see](https://github.com/ageitgey/show-facebook-computer-vision-tags)).

In [57]:
def parse_page(fname):
    """Extract the list of memes from one stored HTML page.

    Parameters
    ----------
    fname : str
        Path of the HTML file to parse.

    Returns
    -------
    dict
        ``'filename'`` holds ``fname`` and ``'topList'`` holds one dict per
        ``div.content`` element found inside the ``fdc_contcontainer``
        element, in document order. Each entry has the keys ``'title'``,
        ``'img-src'``, ``'rank'`` (text of the ``hall_badge`` div),
        ``'nLike'`` and ``'nComment'`` (both converted to ``int``).

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the page has no ``fdc_contcontainer`` element, or a like/comment
        counter is not a valid integer.
    AttributeError, TypeError
        If an entry lacks one of the expected inner elements.
    """
    print ('Parsing {}'.format(fname))
    # Let's store all the necessary information in this dictionary
    pageData = dict()
    pageData['filename'] = fname
    pageData['topList'] = list()

    # Read through a context manager so the file handle is always closed
    with open(fname,'r') as fIn:
        html_doc = fIn.read()
    soup = BeautifulSoup(html_doc, 'html.parser') # Parse html content using BeautifulSoup

    contentList = soup.find(id='fdc_contcontainer')
    if contentList is None:
        # Fail with a message that names the problem instead of an opaque
        # "'NoneType' object has no attribute 'find_all'"
        raise ValueError("No element with id 'fdc_contcontainer' in {}".format(fname))

    for memeDiv in contentList.find_all('div', {'class':'content'}):
        tempData = dict()
        tempData['title'] = memeDiv.find('div', {'class':'content-title'}).text
        tempData['img-src'] = memeDiv.find('img')['src']
        tempData['rank'] = memeDiv.find('div', {'class':'hall_badge'}).text.strip()

        # The counters are embedded in button labels; strip the label words
        # ('Like', 'Show ', 'comments') and convert what is left to int
        buttonsDiv = memeDiv.find('div', {'class':'buttons'})
        tempData['nLike'] = int(buttonsDiv.find('div', {'class':'like'}).text.strip().replace('Like',''))
        tempData['nComment'] = int(buttonsDiv.find('div', {'class':'comment'}).text.strip().replace('Show ','').replace('comments',''))

        pageData['topList'].append(tempData)

    print ('{} item collected from page'.format(len(pageData['topList'])))
    return pageData

# Parse every stored page and collect the results, keyed by '<year>-<month>'
memeDataset = dict()
for y in years:
    for m in months:
        fname = 'memecenter/{}-{}.html'.format(y,m)
        try:
            pData = parse_page(fname)
        except Exception as e:
            print ('[ERROR]: {}'.format(e))
            # Skip this month: without this, the previous month's pData
            # would be stored under the current key (or a NameError would
            # be raised if the very first page fails).
            continue
        memeDataset['{}-{}'.format(y,m)] = pData
        
# Store extracted information into a file. Later we can easily load it without repeating preprocessing.
#pickle.dump(memeDataset, open('memecenter/page_data.pkl','wb'))

Parsing memecenter/2013-january.html
20 item collected from page
Parsing memecenter/2013-february.html
20 item collected from page
Parsing memecenter/2013-march.html
20 item collected from page
Parsing memecenter/2013-april.html
20 item collected from page
Parsing memecenter/2013-may.html
20 item collected from page
Parsing memecenter/2013-june.html
20 item collected from page
Parsing memecenter/2013-july.html
20 item collected from page
Parsing memecenter/2013-august.html
20 item collected from page
Parsing memecenter/2013-september.html
20 item collected from page
Parsing memecenter/2013-october.html
20 item collected from page
Parsing memecenter/2013-november.html
20 item collected from page
Parsing memecenter/2013-december.html
20 item collected from page
Parsing memecenter/2014-january.html
20 item collected from page
Parsing memecenter/2014-february.html
20 item collected from page
Parsing memecenter/2014-march.html
20 item collected from page
Parsing memecenter/2014-april.html
2

## Load crawled data and analyze

In [58]:
# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that you created yourself.
# The context manager makes sure the file handle is closed after loading.
with open('memecenter/page_data.pkl','rb') as fIn:
    memeDataset = pickle.load(fIn)
#print (memeDataset.keys())

# Show only the first entry to get an idea of the structure
for k in memeDataset:
    print (k, memeDataset[k])
    break

2014-july {'topList': [{'rank': '1', 'nComment': 220, 'nLike': 7310, 'img-src': 'http://img.memecdn.com/dat-feeling_c_1769367.jpg', 'title': 'Dat Feeling'}, {'rank': '2', 'nComment': 161, 'nLike': 5233, 'img-src': 'http://img.memecdn.com/wars_o_678135.jpg', 'title': 'Wars....'}, {'rank': '3', 'nComment': 156, 'nLike': 5054, 'img-src': 'http://img.memecdn.com/never-mind-me-d_o_1646029.jpg', 'title': 'Never Mind Me'}, {'rank': '4', 'nComment': 153, 'nLike': 4470, 'img-src': 'http://img.memecdn.com/we-could-all-use-it_o_2070627.jpg', 'title': 'We Could All Use It.'}, {'rank': '5', 'nComment': 173, 'nLike': 4407, 'img-src': 'http://img.memecdn.com/what-if-he-is-actually-right_o_2516749.jpg', 'title': 'What If He Is Actually Right? '}, {'rank': '6', 'nComment': 113, 'nLike': 4346, 'img-src': 'http://img.memecdn.com/gandalf-the-homeless_o_2470885.jpg', 'title': 'Gandalf The Homeless. '}, {'rank': '7', 'nComment': 132, 'nLike': 4245, 'img-src': 'http://img.memecdn.com/epic-mortal-kombat-cospl

## Possible ideas

- Check which memes appear in this top list more than once

- Which memes have a high number of likes and comments?

- Temporal changes in likes and comments per meme