#Predicting on Google Books
<strong>Author:</strong> Nicholas Hunt-Walker<br/>
<strong>Desires:</strong>
- Get a list of words from the [top english verbs](http://www.acme2k.co.uk/acme/3star%20verbs.htm)
- For each word, search google blooks
- Organize books into a sensible schema
- Test classifiers on books to predict

In [1]:
import urllib2
from bs4 import BeautifulSoup as bs
import json
import pandas as pd
from admin.admin import booksAPIkey
import time
from pymongo import MongoClient
import pickle

In [2]:
words_url = "http://www.acme2k.co.uk/acme/3star%20verbs.htm"
words_page = urllib2.urlopen(words_url)

In [3]:
soup = bs(words_page)
words = soup.findAll("font", attrs={"color":"#0000CC"})
word_list = [str(word.get_text()) for word in words]

In [14]:
my_books = []
title_list = []
subject_stats = {}

In [11]:
def filter_books(retrieved_booklist, master_booklist):
    for item in retrieved_booklist["items"]:
        if "title" in item["volumeInfo"].keys():
            if ("averageRating" in item["volumeInfo"].keys()) & (item["volumeInfo"]["title"] not in title_list):
                master_booklist.append(item)
                title_list.append(item["volumeInfo"]["title"])
    
    return master_booklist

In [12]:
def get_data(idx, amt_per_page, keyword):
    the_url = "https://www.googleapis.com/books/v1/volumes?q={0}&maxResults={1}&printType=books&startIndex={2}&key={3}".format(keyword, amt_per_page, idx, booksAPIkey)
    the_json_info = urllib2.urlopen(the_url)
    data = json.loads(the_json_info.read())
    time.sleep(5)
    return data

In [13]:
def download_books(keyword, master_booklist):
    per_page = 40
    data = get_data(0, 10, keyword)
    book_total = data["totalItems"]

    if keyword not in subject_stats.keys():
        print keyword
        subject_stats[keyword] = book_total

    max_pages = book_total / per_page + 1
    for ii in range(20):
        data = get_data((ii*per_page) + 1, per_page, keyword)
        if "items" in data.keys():
            master_booklist = filter_books(data, master_booklist)
    
    return master_booklist

In [22]:
t0 = time.time()
for ii in range(47, len(word_list)):
    try:
        my_books = download_books(word_list[ii], my_books)
        
    except urllib2.HTTPError:
        print "Encountered an HTTPError {0} seconds after the loop has started.\nPausing for 10 minutes.".format(time.time() - t0)
        time.sleep(600)
        print "Resuming production"
        my_books = download_books(word_list[ii], my_books)

Encountered an HTTPError 5.96867609024 seconds after the loop has started.
Pausing for 1 minute.
Resuming production


HTTPError: HTTP Error 403: Forbidden

In [23]:
len(my_books)

12992

In [25]:
pickle.dump(my_books, open("books_save.p", "wb"))

I've got books for...47 words. These total about 13,000 and are saved in the text file `books_save.dat` for future reference. I can get books for the rest of the words later. For now move forward by unpickling the books and starting up MongoDB. Also don't forget to push to github.

- start up a mongodb instance with this code:
```
sudo mongod --dbpath /Users/Nick/data/db
```



In [2]:
my_books = pickle.load(open("books_save.p", "rb"))

In [4]:
client = MongoClient()
db = client.google_books
db.drop_collection("book_data")

In [5]:
db.book_data.insert_many(my_books)

<pymongo.results.InsertManyResult at 0x124356dc0>

In [6]:
print "The total number of entries: {0}".format(db.book_data.find().count())

The total number of entries: 12992


In [7]:
my_books[0]

{'_id': ObjectId('568efa09627fdd0f19073a47'),
 u'accessInfo': {u'accessViewStatus': u'SAMPLE',
  u'country': u'US',
  u'embeddable': True,
  u'epub': {u'acsTokenLink': u'http://books.google.com/books/download/Accept_This_Gift-sample-epub.acsm?id=s1vwMUECsgIC&format=epub&output=acs4_fulfillment_token&dl_type=sample&source=gbs_api',
   u'isAvailable': True},
  u'pdf': {u'isAvailable': False},
  u'publicDomain': False,
  u'quoteSharingAllowed': False,
  u'textToSpeechPermission': u'ALLOWED_FOR_ACCESSIBILITY',
  u'viewability': u'PARTIAL',
  u'webReaderLink': u'http://books.google.com/books/reader?id=s1vwMUECsgIC&as_pt=BOOKS&hl=&printsec=frontcover&output=reader&source=gbs_api'},
 u'etag': u'/Nqqi1bcSBM',
 u'id': u's1vwMUECsgIC',
 u'kind': u'books#volume',
 u'saleInfo': {u'buyLink': u'http://books.google.com/books?id=s1vwMUECsgIC&dq=accept&as_pt=BOOKS&hl=&buy=&source=gbs_api',
  u'country': u'US',
  u'isEbook': True,
  u'listPrice': {u'amount': 9.99, u'currencyCode': u'USD'},
  u'offers': 

In [23]:
#get the authors count?
total_authors_and_books_count = db.book_data.aggregate([{"$match" : {"volumeInfo.authors":{"$exists":1}}},
                                       {"$group" : {
                                          "_id" : "$volumeInfo.authors",
                                         "count" : {"$sum": 1}
                                                   }
                                       },{"$sort":
                                         {"count" : -1}
                                         }])
# author_count = 0
# for doc in total_authors: author_count += 1
# print author_count
for doc in total_authors_and_books_count:
    print doc

{u'count': 22, u'_id': [u'Stephen King']}
{u'count': 22, u'_id': [u'James Patterson']}
{u'count': 15, u'_id': [u'Avi']}
{u'count': 12, u'_id': [u'Mike Lupica']}
{u'count': 11, u'_id': [u'Joyce Meyer']}
{u'count': 10, u'_id': [u'Brian Tracy']}
{u'count': 10, u'_id': [u'Margaret Atwood']}
{u'count': 9, u'_id': [u'Jeffrey Archer']}
{u'count': 9, u'_id': [u'William Shakespeare']}
{u'count': 8, u'_id': [u'Albert Ellis']}
{u'count': 8, u'_id': [u'Todd Parr']}
{u'count': 8, u'_id': [u'Ian McEwan']}
{u'count': 8, u'_id': [u'Haruki Murakami']}
{u'count': 8, u'_id': [u'Eric Carle']}
{u'count': 8, u'_id': [u'John F. MacArthur']}
{u'count': 7, u'_id': [u'Dean Koontz']}
{u'count': 7, u'_id': [u'Bart D. Ehrman']}
{u'count': 7, u'_id': [u'Roald Dahl']}
{u'count': 7, u'_id': [u'Ernest Hemingway']}
{u'count': 7, u'_id': [u'Judy Blume']}
{u'count': 7, u'_id': [u'Jim Butcher']}
{u'count': 7, u'_id': [u'George R. R. Martin']}
{u'count': 7, u'_id': [u'Neal Stephenson']}
{u'count': 7, u'_id': [u'Dr. Seuss']

In [29]:
noam_chomsky = db.book_data.find({"volumeInfo.authors": "Noam Chomsky"})
for doc in noam_chomsky:
    print doc["volumeInfo"]["title"]
    print doc["volumeInfo"]["authors"]

Syntactic Structures
[u'Noam Chomsky']
How the World Works
[u'Noam Chomsky']
Profit Over People
[u'Noam Chomsky']
Powers and Prospects
[u'Noam Chomsky']
Topics in the Theory of Generative Grammar
[u'Noam Chomsky']
On Palestine
[u'Noam Chomsky', u'Ilan  Papp\u017d']
Gaza in Crisis
[u'Noam Chomsky', u'Ilan  Papp\x8e']
