Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
nandhp committed Apr 13, 2013
0 parents commit f503679
Show file tree
Hide file tree
Showing 12 changed files with 1,776 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
imdb.zip
imdb.zip.idx
*.list.gz
23 changes: 23 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Copyright (c) 2013, nandhp <nandhp@gmail.com>
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
This package implements a Python interface to [IMDb plain text data files][1].

[1]: http://www.imdb.com/interfaces

At this time, the API should not be considered stable.

Note that IMDb uses iso-8859-1 encoding (in data files and URLs);
this package uses Unicode in most places.

`python-imdb` supports the following data files (to a greater or lesser degree):

* movies
* aka-titles
* ratings
* plot
* genres
* running-times
* color-info
* certificates
* directors
* writers
* actors
* actresses

The module includes examples of a simple program (`example.py`)
and a WSGI-based JSON API endpoint (`wsgi.py`).
13 changes: 13 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python
"""A simple example to query the database."""

from imdb import IMDb

# Open the database file; uncomment the next line to (re)build it first.
db = IMDb(dbfile='imdb.zip')
# db.rebuild_index('/path/to/imdb')

# search() yields (title, score) pairs; only the titles are needed here.
matches = db.search('War Games (1983)')
titles = [title for title, _score in matches]

# Load the ratings for all titles in a single call, then report each one.
db.populate_rating(titles)
for title in titles:
    rating = title.rating
    print(u'%s has rating %s/10 (%d ratings)'
          % (title, rating.score, rating.nratings))
145 changes: 145 additions & 0 deletions imdb/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""imdb - Read and search IMDb plain text data files.
http://www.imdb.com/interfaces
ftp://ftp.fu-berlin.de/pub/misc/movies/database/
ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/
ftp://ftp.sunet.se/pub/tv+movies/imdb/
"""

import heapq
import re
import os

from chunkedfile import ChunkedFile
from utils import Timer
import parsers
import search

# Notes on handling plot summaries:
# - HTML entities
# - qv links http://www.imdb.com/updates/guide/tgq_qv
# - MovieGuide should use Markdown-escaper (but leave HTML entities)
# Note title formats: http://www.imdb.com/updates/guide/title_formats
# FAQ: http://www.imdb.com/updates/guide/
# Linking to IMDb: http://www.imdb.com/Title?The+Bourne+Ultimatum+(2007)
#   (Old-style URL; degrades to search results if not found.
#   URL-encode with + not %20, and with iso-8859-1 not utf-8.)

class IMDbTitle(object):
    """A single title entry from IMDb.

    When a backend IMDb object is supplied, attributes that have not been
    loaded yet (e.g. rating) are fetched from that backend the first time
    they are read. To load data for several IMDbTitles at once, prefer
    IMDb.populate_rating (or its equivalents), which is much faster.
    """

    # Title-string parser, reachable as IMDbTitle.parse without an instance.
    parse = staticmethod(parsers.parse_title)

    # The per-data-type properties (rating, plot, ...) are attached to this
    # class by _install_parsers, further down in this module.

    def __init__(self, title, backend=None):
        """Store the raw title string and the fields parsed out of it.

        title   -- the full title string for this entry.
        backend -- optional IMDb object used to load missing data on demand.
        """
        self.title = title
        self.backend = backend
        parsed = self.parse(title)
        # The first parsed field is not kept; the remaining four are.
        (self.name, self.year, self.unique, self.cat) = parsed[1:]
        self.aka = None

    def __repr__(self):
        return 'IMDbTitle(%r)' % (self.title,)

    def __unicode__(self):
        return self.title

    def __str__(self):
        # Byte-string form: the Unicode title encoded as UTF-8.
        text = self.__unicode__()
        return text.encode('utf-8')

class IMDb(object):
    """Primary entry point for working with an IMDb database file."""

    def __init__(self, dbfile, debug=False):
        """Remember the database file name and whether to print progress."""
        self.debug = debug
        self.dbfile = dbfile

    def rebuild_index(self, dbdir):
        """Build the database file from the data files found in dbdir.

        Every available parser imports and indexes its data file for random
        access, and afterwards the movie list is indexed for searching.

        Raises Exception if the database file already exists.
        """
        if os.path.exists(self.dbfile):
            raise Exception('%s exists' % self.dbfile)

        verbose = self.debug

        # Step 1: import and index each data file.
        for parsername, parser in parsers.parsers():
            indexer = parser(dbfile=self.dbfile, dbdir=dbdir, debug=verbose)
            if verbose:
                print("Indexing %s..." % parsername)
            with Timer(indent=2, quiet=not verbose):
                indexer.rebuild_index(do_copy=True)

        # Step 2: build the title search index.
        if verbose:
            print("Creating search index...")
        with Timer(indent=2, quiet=not verbose):
            search.create_index(self.dbfile, dbdir, debug=verbose)

    def search(self, query, year=None):
        """Look up query in the database, optionally with an estimated year.

        Returns a list of (IMDbTitle, score) pairs for the highest-scoring
        titles, best first. Titles that have an entry in the aka scores get
        it stored on their ``aka`` attribute.
        """
        scores, akascores = search.search(self.dbfile, query, year,
                                          debug=self.debug)

        # Keep only the best-scoring titles.
        numret = 30
        topscores = heapq.nlargest(numret, scores, scores.get)

        entries = {}
        for title in topscores:
            entry = IMDbTitle(title, backend=self)
            if title in akascores:
                entry.aka = akascores[title]
            entries[title] = entry
        return [(entries[title], scores[title]) for title in topscores]

# For each parser, add a corresponding property to the IMDbTitle class and a
# populator (to load data into one or more IMDbTitles) to the IMDb class.

def imdbtitle_property(name):
    """Build the (getter, setter) pair for one kind of movie data.

    The value lives on the title object under the attribute '_' + name.
    When that attribute is missing, the getter first calls
    self.backend.populate_<name> with a one-element tuple holding the title,
    and then returns the stored value.
    """
    storage_attr = '_' + name
    loader_name = 'populate_' + name

    def getter(self):
        """Auto-generated getter for this property."""
        if hasattr(self, storage_attr):
            return getattr(self, storage_attr)
        # Not loaded yet: ask the backend to populate just this title.
        getattr(self.backend, loader_name)((self,))
        return getattr(self, storage_attr)

    def setter(self, value):
        """Auto-generated setter for this property."""
        setattr(self, storage_attr, value)

    return getter, setter

def imdb_populator(parserclass, prop, default):
    """Build an IMDb method that loads one property for many IMDbTitles.

    parserclass -- called with dbfile= and debug= to obtain the parser.
    prop        -- name of the attribute to set on each title.
    default     -- value assigned when the parser has no data for a title.
    """
    def populate(self, titles):
        """Auto-generated method: load this property from the database for
        every IMDbTitle in titles."""
        # Materialise first: titles is walked twice below.
        titles = tuple(titles)
        # FIXME (carried over): optimize, e.g. depending on whether
        # title._rating is None.
        parser = parserclass(dbfile=self.dbfile, debug=self.debug)
        found = parser.search(entry.title for entry in titles)
        for entry in titles:
            key = entry.title
            # Fall back to the default when no data is available.
            value = found[key] if key in found else default
            setattr(entry, prop, value)
    return populate

def _install_parsers():
    """Wire every available parser into the IMDb and IMDbTitle classes.

    For each (name, parser) pair from parsers.parsers(), an underscore is
    inserted before every capital letter that follows a lowercase letter and
    the result is lowercased. IMDb then gains a populate_<name> method and
    IMDbTitle gains a <name> property.
    """
    capital_after_lower = re.compile(r'(?<=[a-z])([A-Z])')
    for parser_name, parser in parsers.parsers():
        attr = capital_after_lower.sub(r'_\1', parser_name).lower()

        # Bulk loader on the IMDb class.
        setattr(IMDb, 'populate_' + attr,
                imdb_populator(parser, attr, default=parser.default))

        # Load-on-first-read property on the IMDbTitle class.
        fget, fset = imdbtitle_property(attr)
        setattr(IMDbTitle, attr,
                property(fget, fset,
                         doc='IMDb ' + attr + ' autogenerated property.'))

_install_parsers()

117 changes: 117 additions & 0 deletions imdb/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""__main__ - Sample program to search the IMDb from the command line."""

from argparse import ArgumentParser
import sys, os.path
sys.path.append(os.path.dirname(__file__))
from __init__ import IMDb, IMDbTitle

# Kinds of title information that can be requested on the command line. Each
# name is turned into a --option and is also used to find the matching
# populate_<name> method and title attribute.
SUPPORTED_ARGS = (
    'rating', 'plot', 'color_info', 'genres', 'running_time',
    'certificates', 'cast', 'directors', 'writers',
)

def _build_parser():
    """Create the ArgumentParser describing every command-line option."""
    parser = ArgumentParser()
    parser.add_argument('--quiet', action='store_true',
                        help='Do not display debugging messages')
    # No nargs here: with nargs=1 a user-supplied value arrives as a
    # one-element list while the default stays a plain string, so the
    # database name would change type depending on how it was given.
    parser.add_argument('--dbfile', default='imdb.zip',
                        help='Database file')
    parser.add_argument('--rebuild-db', nargs=1, metavar='DIR',
                        help='Rebuild the database file from IMDb data files')
    parser.add_argument('--search', nargs='*',
                        help='Search the database')
    for argname in SUPPORTED_ARGS:
        parser.add_argument('--' + argname.replace('_', '-'), nargs='*',
                            metavar='TITLE',
                            help='Display ' + argname.replace('_', ' '))
    parser.add_argument('--all', nargs='*', metavar='TITLE',
                        help='Display all information')
    return parser


def _pair_queries_with_years(words):
    """Turn the --search words into a list of [query, year] pairs.

    A word that parses as an integer strictly between 1850 and 2100 and that
    directly follows a query is taken as that query's year; every other word
    starts a new query whose year is None.
    """
    queries = []
    expect_year = False
    for word in words:
        if expect_year:
            try:
                year = int(word)
            except ValueError:
                year = None
            if year is not None and 1850 < year < 2100:
                queries[-1][1] = year
                expect_year = False
                continue
        queries.append([word, None])
        expect_year = True
    return queries


def _format_value(argname, val):
    """Return the display form of one piece of title information."""
    if val is None:
        return u'(None)'
    if argname == 'rating':
        return u"%s/10, %7s votes" % (val.score, val.nratings)
    if argname == 'plot':
        # TODO (carried over): optionally append u" (by %s)" % (val.byline,)
        return val.summary
    if argname == 'genres':
        return u", ".join(val)
    if argname == 'running_time':
        return u'%3d minutes' % val
    if argname in ('cast', 'writers', 'directors'):
        return u"\n ".join(str(i) for i in val)
    # Anything else is printed as-is.
    return val


def _main(argv):
    """Command-line interface.

    argv -- list of command-line arguments, without the program name.

    Optionally rebuilds the database, runs the requested searches and then
    prints each requested kind of information. An information option given
    without titles falls back to the best match of every search. Exits via
    parser.error when argv is empty.
    """
    parser = _build_parser()
    if not argv:
        parser.error('nothing to do.')
    args = parser.parse_args(argv)

    iface = IMDb(dbfile=args.dbfile,        # Database filename
                 debug=not args.quiet)

    if args.rebuild_db:
        iface.rebuild_index(args.rebuild_db[0])

    # Best match of each search; used when an option names no titles itself.
    titles = []
    if args.search:
        print("Search results:")
        for query, year in _pair_queries_with_years(args.search):
            results = iface.search(query, year=year)
            for title, score in results:
                print(" %s (%s)" % (title, str(score)))
            if results:
                titles.append(results[0][0])
            print('')

    for argname in SUPPORTED_ARGS:
        # --all, when given, takes the place of every individual option.
        argval = args.all if args.all is not None else getattr(args, argname)
        if argval is None:
            continue
        my_titles = [IMDbTitle(i) for i in argval]
        if not my_titles:
            my_titles = titles
        # Populate the requested information
        populator = getattr(iface, 'populate_' + argname)
        populator(my_titles)
        # Print the information
        for title in my_titles:
            print(u"%s for %s:" % (argname.title().replace('_', ' '), title))
            val = _format_value(argname, getattr(title, argname))
            print(u" %s" % (val,))
            print('')

if __name__ == '__main__':
    # Command-line arguments are decoded from UTF-8 before reaching _main.
    cli_args = [arg.decode('utf-8') for arg in sys.argv[1:]]
    _main(cli_args)
#print search('texas chainsaw massacre', year=1974)
#print search('war games', year=1983)
#print search('dark city - 1998')
#print search('Evangelion 3.0 Q: You Can (Not) Redo (2012)')
#print search('Evangelion Shin Gekijoban: Kyu', year=2012)
#print search('Up')
#print search('R.E.M.')
#print search('secret', year=2007)
#print search('secret (2007)')
#print search('die hard')
#build_index()
#print IMDbRatingsParser().search((u'Not Existing', u'Up (2009)',
# u'Live Free or Die Hard (2007)',
# u'zNotExist'))
#for i in IMDbAkaParser().search():
# pass
#print i

Loading

0 comments on commit f503679

Please sign in to comment.