Initial commit

nandhp · Apr 13, 2013 · f503679 · f503679
commit f503679
Show file tree

Hide file tree

Showing 12 changed files with 1,776 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+imdb.zip
+imdb.zip.idx
+*.list.gz
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,23 @@
+Copyright (c) 2013, nandhp <nandhp@gmail.com>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
@@ -0,0 +1,26 @@
+This package implements a Python interface to [IMDb plain text data files][1].
+
+[1]: http://www.imdb.com/interfaces
+
+At this time, the API should not be considered stable.
+
+Note that IMDb uses iso-8859-1 encoding (in data files and URLs);
+this package uses Unicode in most places.
+
+`python-imdb` supports the following data files (to a greater or lesser degree):
+
+* movies
+* aka-titles
+* ratings
+* plot
+* genres
+* running-times
+* color-info
+* certificates
+* directors
+* writers
+* actors
+* actresses
+
+The module includes examples of a simple program (`example.py`)
+and a WSGI-based JSON API endpoint (`wsgi.py`).
diff --git a/example.py b/example.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+"""A simple example to query the database."""
+
+from imdb import IMDb
+
+# Open the already-built database file (see rebuild_index below to build it).
+imdb = IMDb(dbfile='imdb.zip')
+# imdb.rebuild_index('/path/to/imdb')
+
+# search() returns (title, score) pairs; only the titles are needed here.
+results = imdb.search('War Games (1983)')
+titles = [match[0] for match in results]
+
+# Load the ratings for all titles in one call, then report each of them.
+imdb.populate_rating(titles)
+for title in titles:
+    rating = title.rating
+    print u'%s has rating %s/10 (%d ratings)' % (
+        title, rating.score, rating.nratings)
diff --git a/imdb/__init__.py b/imdb/__init__.py
@@ -0,0 +1,145 @@
+"""imdb - Read and search IMDb plain text data files.
+
+http://www.imdb.com/interfaces
+ftp://ftp.fu-berlin.de/pub/misc/movies/database/
+ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/
+ftp://ftp.sunet.se/pub/tv+movies/imdb/
+"""
+
+import heapq
+import re
+import os
+
+from chunkedfile import ChunkedFile
+from utils import Timer
+import parsers
+import search
+
+# Notes on handling plot summaries:
+# - HTML entities
+# - qv links http://www.imdb.com/updates/guide/tgq_qv
+# - MovieGuide should use Markdown-escaper (but leave HTML entities) 
+# Note title formats: http://www.imdb.com/updates/guide/title_formats
+# FAQ: http://www.imdb.com/updates/guide/
+# Linking to IMDb: http://www.imdb.com/Title?The+Bourne+Ultimatum+(2007)
+#   Old-style URL; degrades to search results if not found.
+#   (URL-encode with + not %20, and as iso-8859-1 not utf-8.)
+
+class IMDbTitle(object):
+    """A single title entry from IMDb.
+
+    The title string is split by parsers.parse_title into the name, year,
+    unique and cat attributes. When a backend IMDb object is supplied, data
+    attributes that have not been loaded yet (e.g. rating) are fetched from
+    it on first access. To fill in several IMDbTitles at once it is much
+    faster to call IMDb.populate_rating (or its equivalent) instead.
+    """
+
+    # The per-data-type properties (e.g. rating) are attached to this class
+    # by _install_parsers, below.
+
+    parse = staticmethod(parsers.parse_title)
+
+    def __init__(self, title, backend=None):
+        self.title = title
+        self.backend = backend
+        # parse() yields one leading element that is not kept, followed by
+        # the four fields stored here.
+        fields = self.parse(title)
+        self.name, self.year, self.unique, self.cat = fields[1:]
+        self.aka = None
+
+    def __unicode__(self):
+        return self.title
+
+    def __str__(self):
+        return self.__unicode__().encode('utf-8')
+
+    def __repr__(self):
+        return 'IMDbTitle(%r)' % (self.title,)
+
+
+class IMDb(object):
+    """Main interface to IMDb.
+
+    dbfile is the path of the converted database file. debug enables
+    progress messages and is passed on to the parsers and the search module.
+    """
+
+    def __init__(self, dbfile, debug=False):
+        self.dbfile = dbfile
+        self.debug = debug
+
+    def rebuild_index(self, dbdir):
+        """Convert and index data files for random access.
+           Index movie list for searching.
+
+        dbdir is passed to every parser and to search.create_index; the
+        command-line help describes it as the directory of IMDb data files.
+        Raises Exception if self.dbfile already exists.
+        """
+        # Import and index data files
+        if os.path.exists(self.dbfile):
+            raise Exception('%s exists' % self.dbfile)
+        for parsername, parser in parsers.parsers():
+            obj = parser(dbfile=self.dbfile, dbdir=dbdir, debug=self.debug)
+            if self.debug:
+                print "Indexing %s..." % parsername
+            with Timer(indent=2, quiet=not self.debug):
+                obj.rebuild_index(do_copy=True)
+
+        # Create index of movie titles
+        if self.debug:
+            print "Creating search index..."
+        with Timer(indent=2, quiet=not self.debug):
+            search.create_index(self.dbfile, dbdir, debug=self.debug)
+
+    def search(self, query, year=None, limit=30):
+        """Search the database for query, optionally with an estimated year.
+
+        Returns a list of (IMDbTitle, score) pairs for at most limit titles,
+        highest score first. Every IMDbTitle has this object as its backend;
+        its aka attribute is set when the search module reported an aka
+        entry for that title and is None otherwise.
+        """
+        scores, akascores = search.search(self.dbfile, query, year,
+            debug=self.debug)
+
+        # Keep only the top-scoring titles; nlargest iterates the keys of
+        # scores and ranks them by their score.
+        results = []
+        for title in heapq.nlargest(limit, scores, scores.get):
+            obj = IMDbTitle(title, backend=self)
+            if title in akascores:
+                obj.aka = akascores[title]
+            results.append((obj, scores[title]))
+        return results
+
+
+# For each parser, add a corresponding property to the IMDbTitle class and a
+# populator (to load data into one or more IMDBTitles) to the IMDb class.
+
+def imdbtitle_property(name):
+    """Create and return an IMDbTitle property for a type of movie data.
+
+    Returns a (getter, setter) pair suitable for property(). The value is
+    stored on the instance under '_' + name. If it has not been set when the
+    getter runs, self.backend.populate_<name> is called with a one-element
+    tuple holding the instance and is expected to set it.
+    """
+    populater = 'populate_'+name
+    data_val = '_'+name
+
+    def getter(self):
+        """Auto-generated getter for this property.
+
+        Raises AttributeError if the value is not loaded and the object has
+        no backend to load it from.
+        """
+        if not hasattr(self, data_val):
+            backend = getattr(self, 'backend', None)
+            if backend is None:
+                # Same exception type as before, but it names the real
+                # problem instead of a missing attribute on None.
+                raise AttributeError(
+                    '%s has not been populated and there is no backend '
+                    'to load it from' % name)
+            populate_func = getattr(backend, populater)
+            populate_func((self,))
+        return getattr(self, data_val)
+
+    def setter(self, value):
+        """Auto-generated setter for this property."""
+        setattr(self, data_val, value)
+
+    return (getter, setter)
+
+
+def imdb_populator(parserclass, prop, default):
+    """Build an IMDb method that loads one kind of data for many titles.
+
+    parserclass is instantiated with the IMDb object's dbfile and debug
+    settings. prop names the IMDbTitle attribute to assign. default is
+    assigned to every title missing from the parser's search results.
+    """
+    def populate(self, titles):
+        """Auto-generated function to populate (from the database) this
+        property for multiple IMDbTitle objects."""
+        # Materialise first: the titles are walked twice below.
+        wanted = tuple(titles)
+        # FIXME (carried over): optimize for titles that are already loaded
+        lookup = parserclass(dbfile=self.dbfile, debug=self.debug)
+        found = lookup.search(entry.title for entry in wanted)
+        for entry in wanted:
+            key = entry.title
+            # Titles the parser has no data for receive the default
+            value = found[key] if key in found else default
+            setattr(entry, prop, value)
+    return populate
+
+
+def _install_parsers():
+    """Wire every available parser into the IMDb and IMDbTitle classes.
+
+    For each (name, parser) pair from parsers.parsers(), an underscore is
+    inserted before every upper-case letter that follows a lower-case one
+    and the name is lower-cased. IMDb then gains a populate_<name> method
+    and IMDbTitle a <name> property.
+    """
+    word_start = re.compile(r'(?<=[a-z])([A-Z])')
+    for raw_name, parser in parsers.parsers():
+        attr = word_start.sub(r'_\1', raw_name).lower()
+        setattr(IMDb, 'populate_' + attr,
+                imdb_populator(parser, attr, default=parser.default))
+        getter, setter = imdbtitle_property(attr)
+        doc = 'IMDb ' + attr + ' autogenerated property.'
+        setattr(IMDbTitle, attr, property(getter, setter, doc=doc))
+
+_install_parsers()
+
diff --git a/imdb/__main__.py b/imdb/__main__.py
@@ -0,0 +1,117 @@
+"""__main__ - Sample program to search the IMDb from the command line."""
+
+from argparse import ArgumentParser
+import sys, os.path
+sys.path.append(os.path.dirname(__file__))
+from __init__ import IMDb, IMDbTitle
+
+SUPPORTED_ARGS = 'rating', 'plot', 'color_info', 'genres', 'running_time', \
+    'certificates', 'cast', 'directors', 'writers'
+
+def _pair_queries_with_years(words):
+    """Group --search words into [query, year] pairs.
+
+    A word that parses as an integer strictly between 1850 and 2100 and
+    directly follows a query becomes that query's year. Every other word is
+    a query of its own, with year None until a year follows it.
+    """
+    queries = []
+    check_for_year = False
+    for word in words:
+        if check_for_year:
+            try:
+                year = int(word)
+            except ValueError:
+                pass            # Not a number: treat it as a query
+            else:
+                if 1850 < year < 2100:
+                    queries[-1][1] = year
+                    # The word after a year always starts a new query
+                    check_for_year = False
+                    continue
+        queries.append([word, None])
+        check_for_year = True
+    return queries
+
+def _format_value(argname, val):
+    """Return the text to display for one piece of title data."""
+    if val is None:
+        return u'(None)'
+    if argname == 'rating':
+        return u"%s/10, %7s votes" % (val.score, val.nratings)
+    if argname == 'plot':
+        # if val.byline: val += u" (by %s)" % (val.byline,)
+        return val.summary
+    if argname == 'genres':
+        return u", ".join(val)
+    if argname == 'running_time':
+        return u'%3d minutes' % val
+    if argname in ('cast', 'writers', 'directors'):
+        # unicode() rather than str(): str() raises UnicodeEncodeError on a
+        # unicode value containing non-ASCII characters.
+        return u"\n  ".join(unicode(i) for i in val)
+    return val
+
+def _main(argv):
+    """Command-line interface.
+
+    argv is the list of command-line arguments without the program name.
+    Optionally rebuilds the database, then runs each --search query and
+    prints its matches, then prints each requested kind of information.
+    An information option given without titles applies to the best match of
+    each search.
+    """
+    parser = ArgumentParser()
+    parser.add_argument('--quiet', action='store_true', default=False,
+        help='Do not display debugging messages')
+    # No nargs here: nargs=1 would deliver a supplied value as a one-item
+    # list while the default stayed a plain string.
+    parser.add_argument('--dbfile', default='imdb.zip',
+        help='Database file')
+    parser.add_argument('--rebuild-db', nargs=1, metavar='DIR',
+        help='Rebuild the database file from IMDb data files')
+    parser.add_argument('--search', nargs='*',
+        help='Search the database')
+    for argname in SUPPORTED_ARGS:
+        parser.add_argument('--' + argname.replace('_', '-'), nargs='*',
+                        metavar='TITLE',
+                        help='Display ' + argname.replace('_',' '))
+    parser.add_argument('--all', nargs='*', metavar='TITLE',
+                    help='Display all information')
+
+    if not argv:
+        parser.error('nothing to do.')
+    args = parser.parse_args(argv)
+
+    iface = IMDb(dbfile=args.dbfile,    # Database filename
+                 debug=not args.quiet)
+
+    if args.rebuild_db:
+        iface.rebuild_index(args.rebuild_db[0])
+
+    # Best match of each search; used when an option names no titles
+    titles = []
+    if args.search:
+        print "Search results:"
+        for query, year in _pair_queries_with_years(args.search):
+            results = iface.search(query, year=year)
+            for title, score in results:
+                print "  %s (%s)" % (title, str(score))
+            if results:
+                titles.append(results[0][0])
+        print ''
+
+    for argname in SUPPORTED_ARGS:
+        # --all, when given, supplies the titles for every kind of data
+        argval = args.all if args.all is not None else getattr(args, argname)
+        if argval is None:
+            continue
+        my_titles = [IMDbTitle(i) for i in argval]
+        if not my_titles:
+            my_titles = titles
+        # Populate the requested information
+        populator = getattr(iface, 'populate_' + argname)
+        populator(my_titles)
+        # Print the information
+        for title in my_titles:
+            print u"%s for %s:" % (argname.title().replace('_',' '), title)
+            val = _format_value(argname, getattr(title, argname))
+            print u"  %s" % (val,)
+        print ''
+
+
+if __name__ == '__main__':
+    # Decode the command-line arguments before handing them to _main.
+    # NOTE(review): this assumes the arguments are UTF-8 encoded - confirm.
+    _main([i.decode('utf-8') for i in sys.argv[1:]])
+    # The lines below are commented-out ad-hoc calls; search, build_index
+    # and the parser classes they name are not imported in this module.
+    #print search('texas chainsaw massacre', year=1974)
+    #print search('war games', year=1983)
+    #print search('dark city - 1998')
+    #print search('Evangelion 3.0 Q: You Can (Not) Redo (2012)')
+    #print search('Evangelion Shin Gekijoban: Kyu', year=2012)
+    #print search('Up')
+    #print search('R.E.M.')
+    #print search('secret', year=2007)
+    #print search('secret (2007)')
+    #print search('die hard')
+    #build_index()
+    #print IMDbRatingsParser().search((u'Not Existing', u'Up (2009)',
+    #                                  u'Live Free or Die Hard (2007)',
+    #                                  u'zNotExist'))
+    #for i in IMDbAkaParser().search():
+    #    pass
+    #print i
+
+