Skip to content
Browse files

initial import

  • Loading branch information...
0 parents commit ee062480385aefe34fdd009ff98da1c0a5025f92 @ns committed Oct 24, 2010
Showing with 589 additions and 0 deletions.
  1. +34 −0 ActorData.java
  2. +264 −0 ImdbParser.java
  3. +23 −0 ImdbReader.java
  4. +177 −0 ImdbReaderImpl.java
  5. +17 −0 Main.java
  6. +26 −0 MovieData.java
  7. +15 −0 README
  8. +33 −0 RoleData.java
34 ActorData.java
@@ -0,0 +1,34 @@
+public class ActorData
+{
+ private final String name;
+ private final RoleData[] movieRoles;
+ private boolean actress = false;
+
+ /**
+ * Create container for actor data.
+ * @param name
+ * name of actor
+ * @param movieRoles
+ * movie roles of actor
+ */
+ ActorData( final String name, final RoleData[] movieRoles, boolean actress )
+ {
+ this.movieRoles = movieRoles;
+ this.name = name;
+ this.actress = actress;
+ }
+
+ public String getName()
+ {
+ return name;
+ }
+
+ public RoleData[] getMovieRoles()
+ {
+ return movieRoles;
+ }
+
+ public boolean isActress() {
+ return this.actress;
+ }
+}
264 ImdbParser.java
@@ -0,0 +1,264 @@
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.ZipInputStream;
+
+/**
+ * A <code>ImdbParser</code> can parse the movie and actor/actress lists from
+ * the imdb text data (http://www.imdb.com/interfaces). It uses an
+ * {@link ImdbReader} forwarding the parsed information.
+ */
+public class ImdbParser
+{
+ private static final String MOVIES_MARKER = "MOVIES LIST";
+ private static final int MOVIES_SKIPS = 2;
+ private static final String ACTRESSES_MARKER = "THE ACTRESSES LIST";
+ private static final int ACTRESS_SKIPS = 4;
+ private static final String ACTOR_MARKER = "THE ACTORS LIST";
+ private static final int ACTOR_SKIPS = 4;
+ private static final int BUFFER_SIZE = 200;
+ private final ImdbReader reader;
+
+ /**
+ * Create a new Imdb parser.
+ * @param reader
+ * reader this parser will use to forward events to
+ */
+ public ImdbParser( final ImdbReader reader )
+ {
+ if ( reader == null )
+ {
+ throw new IllegalArgumentException( "Null ImdbReader" );
+ }
+ this.reader = reader;
+ }
+
+ /**
+ * Parsers a tab-separated movie list file, each line containing a movie
+ * title and the year the movie was released. The file can be .gz or .zip
+ * compressed, and must then have the corresponding file extension.
+ * @param file
+ * name of movie list file
+ * @throws IOException
+ * if unable to open the movie list file
+ */
+ public String parseMovies( final String file ) throws IOException
+ {
+ final List<MovieData> buffer = new LinkedList<MovieData>();
+ if ( file == null )
+ {
+ throw new IllegalArgumentException( "Null movie file" );
+ }
+ BufferedReader fileReader = getFileReader( file, MOVIES_MARKER,
+ MOVIES_SKIPS );
+ String line = fileReader.readLine();
+ int movieCount = 0;
+ while ( line != null )
+ {
+ // get rid of blank lines and TV shows
+ if ( "".equals( line ) || line.indexOf( "(TV)" ) != -1 )
+ {
+ line = fileReader.readLine();
+ continue;
+ }
+ final int yearSep = line.indexOf( '\t' );
+ if ( yearSep > 0 )
+ {
+ final String title = line.substring( 0, yearSep ).trim();
+ String yearString = line.substring( yearSep ).trim();
+ if ( yearString.length() > 4 )
+ {
+ yearString = yearString.substring( 0, 4 );
+ }
+ if ( yearString.length() == 0 || yearString.charAt( 0 ) == '?'
+ || title.contains( "{" ) || title.startsWith( "\"" ) )
+ {
+ line = fileReader.readLine();
+ continue;
+ }
+ final int year = Integer.parseInt( yearString );
+ buffer.add( new MovieData( title, year ) );
+ movieCount++;
+ if ( movieCount % BUFFER_SIZE == 0 )
+ {
+ reader.newMovies( buffer );
+ buffer.clear();
+ }
+ }
+ line = fileReader.readLine();
+ }
+ reader.newMovies( buffer );
+ return (movieCount + " movies parsed and injected.");
+ }
+
+ /**
+ * Parsers a tab-separated actors list file. A line begins with actor name
+ * then followed by a tab and a movie title the actor acted in. Additional
+ * movies the current actor acted in are found on the following line that
+ * starts with a tab followed by the movie title.
+ * @param actorFile
+ * name of actor list file
+ * @param actressFile
+ * TODO
+ * @throws IOException
+ * if unable to open actor list file
+ */
+ public String parseActors( final String actorFile, final String actressFile )
+ throws IOException
+ {
+ if ( actorFile == null )
+ {
+ throw new IllegalArgumentException( "Null actor file" );
+ }
+ if ( actressFile == null )
+ {
+ throw new IllegalArgumentException( "Null actress file" );
+ }
+ String result = "";
+ BufferedReader fileReader;
+ // result += "Actors: " + parseActorItems( fileReader, false ) + "\n";
+ // fileReader.close();
+ fileReader = getFileReader( actressFile, ACTRESSES_MARKER,
+ ACTRESS_SKIPS );
+ result += "Actresses: " + parseActorItems( fileReader, true );
+ return result;
+ }
+
+ private String parseActorItems( BufferedReader fileReader, boolean actress )
+ throws IOException
+ {
+ String line = fileReader.readLine();
+ String currentActor = null;
+ final List<ActorData> buffer = new LinkedList<ActorData>();
+ final List<RoleData> movies = new ArrayList<RoleData>();
+ int movieCount = 0;
+ int actorCount = 0;
+ while ( line != null )
+ {
+ // get rid of blank lines
+ if ( "".equals( line ) )
+ {
+ line = fileReader.readLine();
+ continue;
+ }
+ int actorSep = line.indexOf( '\t' );
+ if ( actorSep >= 0 )
+ {
+ String actor = line.substring( 0, actorSep ).trim();
+ if ( !"".equals( actor ) )
+ {
+ if ( movies.size() > 0 )
+ {
+ buffer.add( new ActorData( currentActor, movies
+ .toArray( new RoleData[movies.size()] ), actress ) );
+ actorCount++;
+ movies.clear();
+ }
+ currentActor = actor;
+ }
+ String title = line.substring( actorSep ).trim();
+ if ( title.length() == 0 || title.contains( "{" )
+ || title.startsWith( "\"" ) || title.contains( "????" ) )
+ {
+ line = fileReader.readLine();
+ continue;
+ }
+ int characterStart = title.indexOf( '[' );
+ int characterEnd = title.indexOf( ']' );
+ String character = null;
+ if ( characterStart > 0 && characterEnd > characterStart )
+ {
+ character = title.substring( characterStart + 1,
+ characterEnd );
+ }
+ int creditStart = title.indexOf( '<' );
+ // int creditEnd = title.indexOf( '>' );
+ // String credit = null;
+ // if ( creditStart > 0 && creditEnd > creditStart )
+ // {
+ // credit = title.substring( creditStart + 1, creditEnd );
+ // }
+ if ( characterStart > 0 )
+ {
+ title = title.substring( 0, characterStart ).trim();
+ }
+ else if ( creditStart > 0 )
+ {
+ title = title.substring( 0, creditStart ).trim();
+ }
+ int spaces = title.indexOf( " " );
+ if ( spaces > 0 )
+ {
+ if ( title.charAt( spaces - 1 ) == ')'
+ && title.charAt( spaces + 2 ) == '(' )
+ {
+ title = title.substring( 0, spaces ).trim();
+ }
+ }
+ movies.add( new RoleData( title, character ) );
+ movieCount++;
+ if ( movieCount % BUFFER_SIZE == 0 )
+ {
+ reader.newActors( buffer );
+ buffer.clear();
+ }
+ }
+ line = fileReader.readLine();
+ }
+ reader.newActors( buffer );
+ return (actorCount + " added including " + movieCount + " characters parsed and injected.");
+ }
+
+ /**
+ * Get fiel reader that corresponds to file extension.
+ * @param file
+ * the file name
+ * @param pattern
+ * TODO
+ * @param skipLines
+ * TODO
+ * @return a file reader that uncompresses data if needed
+ * @throws IOException
+ * @throws FileNotFoundException
+ */
+ private BufferedReader getFileReader( final String file, String pattern,
+ int skipLines ) throws IOException, FileNotFoundException
+ {
+ BufferedReader fileReader;
+ // support compressed files
+ if ( file.endsWith( ".gz" ) )
+ {
+ fileReader = new BufferedReader( new InputStreamReader(
+ new GZIPInputStream( new FileInputStream( file ) ) ) );
+ }
+ else if ( file.endsWith( ".zip" ) )
+ {
+ fileReader = new BufferedReader( new InputStreamReader(
+ new ZipInputStream( new FileInputStream( file ) ) ) );
+ }
+ else
+ {
+ fileReader = new BufferedReader( new FileReader( file ) );
+ }
+
+ String line = "";
+ while ( !pattern.equals( line ) )
+ {
+ line = fileReader.readLine();
+ }
+ for ( int i = 0; i < skipLines; i++ )
+ {
+ line = fileReader.readLine();
+ }
+
+ return fileReader;
+ }
+
+}
23 ImdbReader.java
@@ -0,0 +1,23 @@
+import java.util.List;
+
+ /**
+6 * Reads events from the {@link ImdbParser}.
+7 */
+ public interface ImdbReader
+ {
+ /**
+ * Creates new movies with specified <code>title</code> and
+ * <code>year</code> from a {@link MovieData} list.
+ * Every movie will be indexed.
+ * @param movieList movies to create and index
+ */
+ void newMovies( List<MovieData> movieList );
+
+ /**
+ * Creates new actors specifying what movies the actors acted in
+ * from a {@link ActorData} list.
+ * Every actor will be indexed.
+ * @param actorList actors to create and index
+ */
+ void newActors( List<ActorData> actorList );
+}
177 ImdbReaderImpl.java
@@ -0,0 +1,177 @@
+import java.util.List;
+
+import javax.xml.parsers.*;
+import javax.xml.transform.*;
+import javax.xml.transform.dom.*;
+import javax.xml.transform.stream.*;
+import org.w3c.dom.*;
+import java.util.ArrayList;
+import java.io.BufferedOutputStream;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+
+class ImdbReaderImpl implements ImdbReader
+{
+ private ArrayList<MovieData> movies = new ArrayList<MovieData>();
+ private ArrayList<ActorData> actors = new ArrayList<ActorData>();
+ private ArrayList<ActorData> actresses = new ArrayList<ActorData>();
+
+
+ public void newActors( final List<ActorData> actorList )
+ {
+ for ( ActorData actorData : actorList )
+ {
+ if (!actorData.isActress()) {
+ actors.add(actorData);
+ }
+ else {
+ actresses.add(actorData);
+ }
+ }
+ }
+
+ public void newMovies( final List<MovieData> movieList )
+ {
+ for ( MovieData movieData : movieList )
+ {
+ movies.add(movieData);
+ }
+ }
+
+ public void writeMoviesXML(final String path) throws Exception {
+ DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
+ DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
+ Document document = documentBuilder.newDocument();
+ Element rootElement = document.createElement("movies");
+ document.appendChild(rootElement);
+
+ for (MovieData m : movies) {
+ String element = "movie";
+ Element em = document.createElement(element);
+
+ String elementTitle = "title";
+ Element title = document.createElement(elementTitle);
+ title.appendChild(document.createTextNode(m.getTitle()));
+
+ String elementYear = "year";
+ Element year = document.createElement(elementYear);
+ year.appendChild(document.createTextNode(m.getYear() + ""));
+
+ em.appendChild(title);
+ em.appendChild(year);
+ rootElement.appendChild(em);
+ }
+
+ TransformerFactory transformerFactory = TransformerFactory.newInstance();
+ Transformer transformer = transformerFactory.newTransformer();
+ DOMSource source = new DOMSource(document);
+ StreamResult result = new StreamResult(path);
+ transformer.transform(source, result);
+ }
+
+ public void writeActorsXML(final String path) throws Exception {
+ OutputStream out = null;
+ try {
+ OutputStream outFile = new FileOutputStream(path);
+ out = new BufferedOutputStream(outFile);
+ out.write("<actors>".getBytes());
+
+
+ for (int i = 0; i < actors.size(); i++) {
+ ActorData a = actors.get(i);
+
+ out.write("<actor>".getBytes());
+
+ out.write("<name>".getBytes());
+ out.write(a.getName().getBytes());
+ out.write("</name>".getBytes());
+
+ out.write("<roles>".getBytes());
+ for ( RoleData movieRole : a.getMovieRoles() )
+ {
+
+ if (movieRole == null || movieRole.getRole() == null) {
+
+ }
+ else {
+ out.write("<role>".getBytes());
+ out.write(movieRole.getRole().getBytes());
+ out.write("</role>".getBytes());
+ }
+
+
+ out.write("<movie>".getBytes());
+ out.write(movieRole.getTitle().getBytes());
+ out.write("</movie>".getBytes());
+ }
+ out.write("</roles>".getBytes());
+
+ out.write("</actor>".getBytes());
+
+ int j = 1000;
+ if (i % j == 0) {
+ System.out.println(i + "");
+ }
+ }
+
+ out.write("</actors>".getBytes());
+ } finally {
+ if (out != null) {
+ out.close();
+ }
+ }
+ }
+
+ public void writeActressesXML(final String path) throws Exception {
+ OutputStream out = null;
+ try {
+ OutputStream outFile = new FileOutputStream(path);
+ out = new BufferedOutputStream(outFile);
+
+ out.write("<actresses>".getBytes());
+
+
+ for (int i = 0; i < actresses.size(); i++) {
+ ActorData a = actresses.get(i);
+
+ out.write("<actress>".getBytes());
+
+ out.write("<name>".getBytes());
+ out.write(a.getName().getBytes());
+ out.write("</name>".getBytes());
+
+ out.write("<roles>".getBytes());
+ for ( RoleData movieRole : a.getMovieRoles() )
+ {
+ if (movieRole == null || movieRole.getRole() == null) {
+
+ }
+ else {
+ out.write("<role>".getBytes());
+ out.write(movieRole.getRole().getBytes());
+ out.write("</role>".getBytes());
+ }
+
+
+ out.write("<movie>".getBytes());
+ out.write(movieRole.getTitle().getBytes());
+ out.write("</movie>".getBytes());
+ }
+ out.write("</roles>".getBytes());
+
+ out.write("</actress>".getBytes());
+
+ int j = 1000;
+ if (i % j == 0) {
+ System.out.println(i + "");
+ }
+ }
+
+ out.write("</actresses>".getBytes());
+ } finally {
+ if (out != null) {
+ out.close();
+ }
+ }
+ }
+}
17 Main.java
@@ -0,0 +1,17 @@
+public class Main
+{
+ public static void main(String[] args) throws Exception {
+ ImdbReaderImpl iri = new ImdbReaderImpl();
+ ImdbParser ip = new ImdbParser(iri);
+
+ // ip.parseMovies("/path/to/file/movies.list.gz");
+
+ // this parses actors and actresses
+ ip.parseActors("/path/to/file/actors.list.gz", "/path/to/file/actresses.list.gz");
+
+ // ip.parseMovies("/path/to/file/movies.list.gz");
+ // iri.writeMoviesXML("/path/to/file/movies.xml");
+ // iri.writeActorsXML("/path/to/file/actors.xml");
+ iri.writeActressesXML("/path/to/file/actresses.xml");
+ }
+}
26 MovieData.java
@@ -0,0 +1,26 @@
/**
 * Immutable pair of a movie title and the year the movie was released.
 */
public class MovieData
{
    private final String movieTitle;
    private final int releaseYear;

    /**
     * Builds a movie record from its two parts.
     * @param title title of movie
     * @param year release year of movie
     */
    MovieData( final String title, final int year )
    {
        movieTitle = title;
        releaseYear = year;
    }

    /**
     * @return title given at construction
     */
    public String getTitle()
    {
        return this.movieTitle;
    }

    /**
     * @return release year given at construction
     */
    public int getYear()
    {
        return this.releaseYear;
    }
}
15 README
@@ -0,0 +1,15 @@
+== Java IMDB DB Parser
+
+This Java based parser outputs XML files based on IMDB plain text data files available at http://www.imdb.com/interfaces
+
+== Compile
+
+You'll need to edit Main.java and decide which output you want: movies.xml, actors.xml, or actresses.xml. Comment out the parse lines and output lines for the others.
+
+javac *.java
+
+== Run
+
+You'll need to set a high memory ceiling while the data is loaded into memory. I used something like this:
+
+java -Xms500m -Xmx3g Main
33 RoleData.java
@@ -0,0 +1,33 @@
/**
 * Holds information about what role an actor has in a movie
 */
public class RoleData
{
    private final String title;
    private final String role;

    /**
     * Create container for one role.
     * @param title
     *            title of the movie, must not be <code>null</code>
     * @param role
     *            name of the character played, or <code>null</code> if
     *            unknown
     * @throws IllegalArgumentException
     *             if <code>title</code> is <code>null</code>
     */
    RoleData( final String title, final String role )
    {
        // getTitle() is documented to never return null, so reject a
        // missing title here instead of failing later in a consumer.
        if ( title == null )
        {
            throw new IllegalArgumentException( "Null movie title" );
        }
        this.title = title;
        this.role = role;
    }

    /**
     * Returns the title of the movie, never <code>null</code>.
     * @return title of the movie
     */
    public String getTitle()
    {
        return this.title;
    }

    /**
     * Returns the role the actor had in the movie, may be <code>null</code>
     * if no information is available.
     * @return actor role or null if information not available
     */
    public String getRole()
    {
        return this.role;
    }
}

0 comments on commit ee06248

Please sign in to comment.
Something went wrong with that request. Please try again.