-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit f503679
Showing
12 changed files
with
1,776 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
imdb.zip | ||
imdb.zip.idx | ||
*.list.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
Copyright (c) 2013, nandhp <nandhp@gmail.com> | ||
All rights reserved. | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are met: | ||
|
||
1. Redistributions of source code must retain the above copyright notice, | ||
this list of conditions and the following disclaimer. | ||
|
||
2. Redistributions in binary form must reproduce the above copyright notice, | ||
this list of conditions and the following disclaimer in the documentation | ||
and/or other materials provided with the distribution. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR | ||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | ||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | ||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
This package implements a Python interface to [IMDb plain text data files][1]. | ||
|
||
[1]: http://www.imdb.com/interfaces | ||
|
||
At this time, the API should not be considered stable. | ||
|
||
Note that IMDb uses iso-8859-1 encoding (in data files and URLs); | ||
this package uses Unicode in most places. | ||
|
||
`python-imdb` supports the following data files (to a greater or lesser degree): | ||
|
||
* movies | ||
* aka-titles | ||
* ratings | ||
* plot | ||
* genres | ||
* running-times | ||
* color-info | ||
* certificates | ||
* directors | ||
* writers | ||
* actors | ||
* actresses | ||
|
||
The module includes examples of a simple program (`example.py`) | ||
and a WSGI-based JSON API endpoint (`wsgi.py`). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/usr/bin/env python | ||
"""A simple example to query the database.""" | ||
|
||
from imdb import IMDb | ||
|
||
# Open the database file.  To build it first from the IMDb data files:
# imdb.rebuild_index('/path/to/imdb')
imdb = IMDb(dbfile='imdb.zip')
results = imdb.search('War Games (1983)')
# search() returns (title, score) pairs; only the titles are needed here.
titles = [found for found, _score in results]
# Load the ratings for every title in a single call.
imdb.populate_rating(titles)
for title in titles:
    rating = title.rating
    print(u'%s has rating %s/10 (%d ratings)'
          % (title, rating.score, rating.nratings))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
"""imdb - Read and search IMDb plain text data files. | ||
http://www.imdb.com/interfaces | ||
ftp://ftp.fu-berlin.de/pub/misc/movies/database/ | ||
ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/ | ||
ftp://ftp.sunet.se/pub/tv+movies/imdb/ | ||
""" | ||
|
||
import heapq | ||
import re | ||
import os | ||
|
||
from chunkedfile import ChunkedFile | ||
from utils import Timer | ||
import parsers | ||
import search | ||
|
||
# Notes on handling plot summaries: | ||
# - HTML entities | ||
# - qv links http://www.imdb.com/updates/guide/tgq_qv | ||
# - MovieGuide should use Markdown-escaper (but leave HTML entities) | ||
# Note title formats: http://www.imdb.com/updates/guide/title_formats | ||
# FAQ: http://www.imdb.com/updates/guide/ | ||
# Linking to IMDb: http://www.imdb.com/Title?The+Bourne+Ultimatum+(2007) | ||
# Old-style URL; degrades to search results if not found. | ||
# (URL-encode with + rather than %20, and as iso-8859-1 rather than utf-8.) | ||
|
||
class IMDbTitle(object):
    """A single title entry from IMDb.

    When a backend IMDb object is supplied, attributes that have not been
    set yet (e.g. rating) are loaded from that backend the first time they
    are read.  To fill in many IMDbTitles at once it is much faster to use
    IMDb.populate_rating or its equivalent.
    """

    # The properties for movie data are attached by _install_parsers, below.

    # Splits a title string into its components (see parsers.parse_title).
    parse = staticmethod(parsers.parse_title)

    def __init__(self, title, backend=None):
        self.title = title
        self.backend = backend
        # The first parsed field is not kept; the remaining four are.
        parsed = self.parse(title)
        self.name, self.year, self.unique, self.cat = parsed[1:]
        self.aka = None

    def __repr__(self):
        return 'IMDbTitle(%r)' % (self.title,)

    def __unicode__(self):
        return self.title

    def __str__(self):
        text = self.__unicode__()
        return text.encode('utf-8')
|
||
class IMDb(object):
    """Main interface to IMDb.

    dbfile is the path of the database file; debug turns on progress
    messages and is passed through to the parsers and the search module.
    """

    def __init__(self, dbfile, debug=False):
        self.dbfile = dbfile
        self.debug = debug

    def rebuild_index(self, dbdir):
        """Convert and index data files for random access, then index the
        movie list for searching.

        dbdir is handed to every parser and to search.create_index
        (presumably the directory holding the IMDb data files).
        Raises Exception if self.dbfile already exists.
        """
        # Refuse to touch an existing database file.
        if os.path.exists(self.dbfile):
            raise Exception('%s exists' % self.dbfile)

        # Import and index data files
        for parsername, parser in parsers.parsers():
            obj = parser(dbfile=self.dbfile, dbdir=dbdir, debug=self.debug)
            if self.debug:
                print("Indexing %s..." % parsername)
            with Timer(indent=2, quiet=not self.debug):
                obj.rebuild_index(do_copy=True)

        # Create index of movie titles
        if self.debug:
            print("Creating search index...")
        with Timer(indent=2, quiet=not self.debug):
            search.create_index(self.dbfile, dbdir, debug=self.debug)

    def search(self, query, year=None, limit=30):
        """Search the database for query, optionally with an estimated year.

        Returns a list of (IMDbTitle, score) pairs for the `limit`
        highest-scoring titles, highest score first.
        """
        scores, akascores = search.search(self.dbfile, query, year,
                                          debug=self.debug)

        results = []
        for title in heapq.nlargest(limit, scores, scores.get):
            obj = IMDbTitle(title, backend=self)
            # Attach the aka entry recorded for this title, if there is one.
            if title in akascores:
                obj.aka = akascores[title]
            results.append((obj, scores[title]))
        return results
|
||
# For each parser, add a corresponding property to the IMDbTitle class and a | ||
# populator (to load data into one or more IMDBTitles) to the IMDb class. | ||
|
||
def imdbtitle_property(name):
    """Create and return the (getter, setter) pair of an IMDbTitle property
    for one type of movie data.

    The value is stored on the instance as '_' + name.  When it has not been
    set yet, the getter calls self.backend.populate_<name> to load it.
    """
    populater = 'populate_' + name
    data_val = '_' + name

    def getter(self):
        """Auto-generated getter for this property.

        Raises AttributeError if the value is not loaded and there is no
        backend to load it from.
        """
        if not hasattr(self, data_val):
            backend = self.backend
            if backend is None:
                # Fail with a clear message rather than the opaque
                # "'NoneType' object has no attribute ..." error.
                raise AttributeError(
                    '%s is not loaded and no backend is available to '
                    'populate it' % name)
            populate_func = getattr(backend, populater)
            populate_func((self,))
        return getattr(self, data_val)

    def setter(self, value):
        """Auto-generated setter for this property."""
        setattr(self, data_val, value)

    return (getter, setter)
|
||
def imdb_populator(parserclass, prop, default):
    """Build an IMDb method that loads one property, from the database,
    into several IMDbTitle objects at once."""
    def populate(self, titles):
        """Auto-generated method: set this property on each given IMDbTitle,
        using the default where the database has no entry for it."""
        # Materialise first: the titles are walked twice below.
        batch = tuple(titles)
        # FIXME: Optimize if title._rating is None
        lookup = parserclass(dbfile=self.dbfile, debug=self.debug)
        found = lookup.search(entry.title for entry in batch)
        for entry in batch:
            key = entry.title
            # No data available -> fall back to the default.
            setattr(entry, prop, found[key] if key in found else default)
    return populate
|
||
def _install_parsers():
    """Wire every available parser into the IMDb and IMDbTitle classes.

    For each parser, IMDb gains a populate_<name> method and IMDbTitle gains
    a <name> property.
    """
    camel_hump = re.compile(r'(?<=[a-z])([A-Z])')
    for parser_name, parser_cls in parsers.parsers():
        # CamelCase parser name -> lower_case_with_underscores attribute name.
        attr = camel_hump.sub(r'_\1', parser_name).lower()
        setattr(IMDb, 'populate_' + attr,
                imdb_populator(parser_cls, attr, default=parser_cls.default))
        fget, fset = imdbtitle_property(attr)
        setattr(IMDbTitle, attr,
                property(fget, fset,
                         doc="""IMDb """ + attr + """ autogenerated property."""))
|
||
_install_parsers() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
"""__main__ - Sample program to search the IMDb from the command line.""" | ||
|
||
from argparse import ArgumentParser | ||
import sys, os.path | ||
sys.path.append(os.path.dirname(__file__)) | ||
from __init__ import IMDb, IMDbTitle | ||
|
||
# Kinds of per-title data that can be requested on the command line.  _main
# turns each name into a --option, a populate_<name> call on the IMDb object
# and an attribute lookup on the title.
SUPPORTED_ARGS = (
    'rating', 'plot', 'color_info', 'genres', 'running_time',
    'certificates', 'cast', 'directors', 'writers',
)
|
||
def _pair_queries_with_years(words):
    """Group search words into [query, year] pairs.

    A word that parses as an integer strictly between 1850 and 2100 and
    directly follows a query is taken as that query's year.  Every other
    word starts a new query whose year is None.
    """
    queries = []
    expect_year = False
    for word in words:
        if expect_year:
            try:
                year = int(word)
            except ValueError:
                year = None
            if year is not None and 1850 < year < 2100:
                queries[-1][1] = year
                # A query takes at most one year.
                expect_year = False
                continue
        queries.append([word, None])
        expect_year = True
    return queries


def _main(argv):
    """Command-line interface.

    argv is the list of command-line arguments, without the program name.
    Exits through the argument parser's error handler when argv is empty.
    """
    parser = ArgumentParser()
    parser.add_argument('--quiet', action='store_true',
                        help='Do not display debugging messages')
    # No nargs here: with nargs=1 argparse stores a one-element list, which
    # does not match the plain-string default that is passed to IMDb below.
    parser.add_argument('--dbfile', default='imdb.zip',
                        help='Database file')
    parser.add_argument('--rebuild-db', nargs=1, metavar='DIR',
                        help='Rebuild the database file from IMDb data files')
    parser.add_argument('--search', nargs='*',
                        help='Search the database')
    for argname in SUPPORTED_ARGS:
        parser.add_argument('--' + argname.replace('_', '-'), nargs='*',
                            metavar='TITLE',
                            help='Display ' + argname.replace('_', ' '))
    parser.add_argument('--all', nargs='*', metavar='TITLE',
                        help='Display all information')

    if not argv:
        parser.error('nothing to do.')
    args = parser.parse_args(argv)

    iface = IMDb(dbfile=args.dbfile,  # Database filename
                 debug=not args.quiet)

    if args.rebuild_db:
        iface.rebuild_index(args.rebuild_db[0])

    # Best match of each search; used below when an option names no titles.
    titles = []
    if args.search:
        print("Search results:")
        for query, year in _pair_queries_with_years(args.search):
            results = iface.search(query, year=year)
            for title, score in results:
                print(" %s (%s)" % (title, str(score)))
            if results:
                titles.append(results[0][0])
        print('')

    for argname in SUPPORTED_ARGS:
        # --all overrides the individual options.
        argval = args.all if args.all is not None else getattr(args, argname)
        if argval is None:
            continue
        # Titles named on the command line win; otherwise fall back to the
        # search results collected above.
        my_titles = [IMDbTitle(i) for i in argval] or titles
        # Populate the requested information
        populator = getattr(iface, 'populate_' + argname)
        populator(my_titles)
        # Print the information
        label = argname.title().replace('_', ' ')
        for title in my_titles:
            print(u"%s for %s:" % (label, title))
            val = getattr(title, argname)
            if val is None:
                val = u'(None)'
            elif argname == 'rating':
                val = u"%s/10, %7s votes" % (val.score, val.nratings)
            elif argname == 'plot':
                val = val.summary
                # if val.byline: val += u" (by %s)" % (val.byline,)
            elif argname == 'genres':
                val = u", ".join(val)
            elif argname == 'running_time':
                val = u'%3d minutes' % val
            elif argname in ('cast', 'writers', 'directors'):
                # NOTE(review): str() on entries with non-ASCII text may fail
                # when joined into a unicode string on Python 2 - confirm
                # what type these entries are.
                val = u"\n ".join(str(i) for i in val)
            print(u" %s" % (val,))
        print('')
|
||
if __name__ == '__main__':
    # Decode the command-line arguments from UTF-8 before passing them on.
    _main([arg.decode('utf-8') for arg in sys.argv[1:]])
#print search('texas chainsaw massacre', year=1974) | ||
#print search('war games', year=1983) | ||
#print search('dark city - 1998') | ||
#print search('Evangelion 3.0 Q: You Can (Not) Redo (2012)') | ||
#print search('Evangelion Shin Gekijoban: Kyu', year=2012) | ||
#print search('Up') | ||
#print search('R.E.M.') | ||
#print search('secret', year=2007) | ||
#print search('secret (2007)') | ||
#print search('die hard') | ||
#build_index() | ||
#print IMDbRatingsParser().search((u'Not Existing', u'Up (2009)', | ||
# u'Live Free or Die Hard (2007)', | ||
# u'zNotExist')) | ||
#for i in IMDbAkaParser().search(): | ||
# pass | ||
#print i | ||
|
Oops, something went wrong.