#!/usr/bin/env python
#
# The MIT License
#
# Copyright (c) 2010 Kyle Conroy
#
# (Python 3 compatibility fixes made by Mark Nenadov)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import sys
PY3 = sys.version_info[0] == 3

import argparse
import io
import json
import logging
import multiprocessing
import os

if PY3:
    from urllib.request import urlretrieve
    from urllib.request import urlopen
    from urllib.error import HTTPError
    from urllib.parse import urlencode
    from urllib.parse import urlparse
else:
    from urllib import urlretrieve
    from urllib import urlopen
    from urllib import urlencode
    from urllib2 import HTTPError
    from urlparse import urlparse
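
# Profile fields requested for each friend in scrape_friends; these follow
# the Graph API user-field names in use when this script was written.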
USER_FIELDS = [
    "first_name", "last_name", "gender", "username", "email",
    "about", "bio", "birthday", "education", "hometown", "sports",
    "relationship_status", "religion", "website", "work",
]


# Async functions: these run in worker processes from the multiprocessing pool.

def save_file(url, filename):
    """Save a photo.

    Args:
        url: The url of the photo
        filename: The filename of the new photo
    """
    logging.debug("Saving %s" % filename)
    try:
        urlretrieve(url, filename)
    except HTTPError:
        logging.error("Could not open url: %s" % url)


def save_note(note, filename):
    """Save a note.

    Args:
        note: A note object from the Facebook Graph API
        filename: The path for the new note
    """
    logging.debug("Saving note %s" % filename)
    try:
        # io.open writes unicode text on both Python 2 and 3.
        with io.open(filename, "w", encoding="utf-8") as f:
            f.write(note["subject"])
            f.write(u"\n")
            f.write(note["message"])
    except (IOError, KeyError):
        logging.error("Could not save note %s" % filename)


class Scrapebook(object):
    """Scrapebook downloads all your data from Facebook to your computer.

    Scrapebook connects to the Facebook Graph API.
    """

    def __init__(self, token, resources=None):
        """Create a new Scrapebook object.

        Args:
            token: A long and unintelligible key for the Graph API
            resources: Resource types to scrape; defaults to all of them
        """
        self.base = "https://graph.facebook.com"
        self.pool = multiprocessing.Pool(processes=35)
        self.token = token
        self.resources = resources or ["photos", "friends", "videos", "notes"]

    def _clean(self, s):
        """Return a safe and clean filename for any given string.

        Args:
            s: The string to be cleaned

        Returns:
            A cleaned string, suitable for use as a filename
        """
        return "".join([x for x in s if x.isalpha() or x.isdigit()])

    def _create_dir(self, *args):
        """Create a directory inside the Facebook directory.

        Will not complain if the directory already exists.

        Args:
            Various directory names

        Returns:
            The path to the new directory
        """
        path = os.path.join(*args)
        path = os.path.join(os.curdir, path)
        if not os.path.isdir(path):
            logging.debug("Creating directory: %s" % path)
            os.mkdir(path)
        return path
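
    # Note: os.mkdir only creates the leaf directory, so nested paths such as
    # facebook/photos/<album> rely on the parents having been created first,
    # which the scrape_* methods below take care to do.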

    def api_request(self, path, limit=10000, params=None):
        """Perform a Facebook Graph API request and return the data.

        The returned JSON is parsed into native Python objects before
        being returned.

        Args:
            path: relative resource url
            limit: number of results to return. Default 10000
            params: extra query-string parameters

        Returns:
            A dictionary. If an error occurred, the returned dictionary
            is empty.
        """
        params = params or {}
        params["limit"] = limit
        url = ("%s%s?access_token=%s&%s" %
               (self.base, path, self.token, urlencode(params)))
        logging.debug(url)
        try:
            data = urlopen(url)
            if PY3:
                json_data = str(data.read(), 'utf-8')
            else:
                json_data = data.read()
            data = json.loads(json_data)
        except HTTPError:
            logging.error("Could not retrieve %s" % url)
            data = {}
        if "error" in data:
            error = data["error"]
            logging.error("{}: {}".format(error["type"], error["message"]))
            data = {}
        return data
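
    # A successful response is the parsed Graph API envelope; the scrape_*
    # methods below all expect a list of objects under its "data" key.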

    def scrape_photos(self):
        """Scrape all tagged photos and uploaded albums."""
        photo_dir = self._create_dir("facebook", "photos")
        albums = self.api_request("/me/albums")
        if not albums:
            print("Error: Could not scrape photo data")
            return
        photo_albums = [("/%s/photos" % a["id"], a["name"])
                        for a in albums["data"]]
        photo_albums.append(("/me/photos", "me"))
        for album in photo_albums:
            url, name = album
            name = self._clean(name)
            album_dir = self._create_dir("facebook", "photos", name)
            photos = self.api_request(url)
            if not photos:
                print("Error: Could not download album")
                continue
            for i, photo in enumerate(photos["data"]):
                purl = photo["source"]
                filename = os.path.join(album_dir, "%s_%d.jpg" % (name, i))
                self.pool.apply_async(save_file, [purl, filename])

    def scrape_videos(self):
        """Scrape all tagged videos and uploaded videos."""
        videos_dir = self._create_dir("facebook", "videos")
        videos = self.api_request("/me/videos/uploaded")
        tags = self.api_request("/me/videos")
        if not videos or not tags:
            print("Error: Could not scrape your videos")
            return
        for video in videos["data"] + tags["data"]:
            name = self._clean(video["name"])
            ext = os.path.splitext(urlparse(video["source"]).path)[1]
            vurl = video["source"]
            filename = os.path.join(videos_dir, "%s%s" % (name, ext))
            self.pool.apply_async(save_file, [vurl, filename])

    def scrape_notes(self):
        """Scrape all notes a user composed or a user is tagged in."""
        notes_dir = self._create_dir("facebook", "notes")
        notes = self.api_request("/me/notes")
        if not notes:
            print("Error: Could not scrape your notes")
            return
        for n in notes["data"]:
            title = self._clean(n["subject"][:15])
            filename = os.path.join(notes_dir, "%s.txt" % title)
            self.pool.apply_async(save_note, [n, filename])

    def scrape_friends(self):
        """Scrape all friends. Stored as JSON objects."""
        friends_file = os.path.join("facebook", "friends.json")
        options = {"fields": ",".join(USER_FIELDS)}
        friends = self.api_request("/me/friends", params=options)
        if not friends:
            print("Error: Could not scrape your friends")
            return
        with open(friends_file, "w") as f:
            json.dump(friends["data"], f)

    def run(self):
        self._create_dir("facebook")
        if "photos" in self.resources:
            self.scrape_photos()
        if "notes" in self.resources:
            self.scrape_notes()
        if "videos" in self.resources:
            self.scrape_videos()
        if "friends" in self.resources:
            self.scrape_friends()
        self.pool.close()
        self.pool.join()


def main():
    usage = ("To get your auth token, head over to http://developers."
             "facebook.com/docs/api, click on the https://graph.facebook."
             "com/me/photos link, and copy the auth token in the url to "
             "the command line")
    parser = argparse.ArgumentParser(
        description="Facebook shouldn't own your soul")
    parser.add_argument("-t", "--token", dest="token", help=usage)
    parser.add_argument("-d", "--debug", dest="debug", action="store_true",
                        help="Turn on debug information")
    parser.add_argument("resources", type=str, nargs="*",
                        default=None, help="resources to scrape")
    args = parser.parse_args()
    if not args.token:
        parser.error("option -t is required")
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    scraper = Scrapebook(args.token, resources=args.resources)
    scraper.run()


if __name__ == '__main__':
    main()
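
# Example invocation (the script filename here is illustrative):
#
#     python scrapebook.py -t <ACCESS_TOKEN> photos notes
#
# With no resource arguments, photos, friends, videos, and notes are all
# scraped into a ./facebook directory.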