# Overview & motivation

All of our team members enjoy movies. We also enjoy working with APIs and semi-structured data sets, so determining what makes a movie successful using the data available in the Internet Movie Database (IMDb) and Wikipedia seemed like a natural choice.

In [1]:
from imdb import IMDb
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import cPickle as pickle
ia = IMDb(accessSystem='http')
from collections import defaultdict 
import io
from datetime import datetime
import time

In [2]:
# Execute the companion notebook inside this kernel.
# NOTE(review): presumably this is what defines starpower(), which is called near the end of this notebook — confirm.
%run 'Starpower.ipynb'

In [53]:
# Look up IMDb id 5152218 — the same id whose genres are hand-coded in the Data section below.
ia.get_movie('5152218')

<Movie id:5152218[http] title:_Bram Stoker's Dracula (1993) (VG)_>

### Related Work

# Initial Questions

# Data

In [4]:
# Load AAdict (dict of Oscar nominated movies)
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# this project produced itself.
# The `with` block closes the file handle as soon as the load finishes
# (the bare open() call used before left it open).
with open('AAdict.p', 'rb') as aadict_file:
    AAdict = pickle.load(aadict_file)
# Load movies (dict of all movies)
#movies = pickle.load(io.open('moviestemp.p','rb'))

In [5]:
# convert AAdict to pandas: from_dict puts the dict keys on the columns, so
# transpose to get one row per key (the movie id, copied into 'movieid' too)
AAdf = pd.DataFrame.from_dict(AAdict).transpose()
AAdf['movieid'] = AAdf.index
# hand-code genres for one movie that was missing genre info
# Write the cell directly with .at. The previous form,
# `AAdf.loc[id, :].genres = ...`, assigned to a temporary row Series that may
# be a copy, so the patch could silently fail to reach AAdf.
if '5152218' not in AAdf.index:
    # keep the old failure mode instead of letting .at append a new row
    raise KeyError('5152218')
AAdf.at['5152218', 'genres'] = ["Horror","Romance"]
AAdf.head()
#AAdf[AAdf['Nominated Best Actor']==1].head()

Unnamed: 0,Nominated Best Actor,Nominated Best Actress,Nominated Best Animated Feature Film,Nominated Best Art Direction,Nominated Best Cinematography,Nominated Best Costume Design,Nominated Best Director,"Nominated Best Documentary, Feature","Nominated Best Documentary, Short Subject",Nominated Best Film Editing,...,genres,keywords,mpaa,nominations,releasedate,runtime,title,won,year,movieid
35423,False,False,False,False,False,False,False,False,False,False,...,"[Comedy, Fantasy, Romance]","[time-travel, brooklyn-bridge, bridge, time-tr...",PG-13,"[Best Music, Song]",2001-12-25,118,Kate & Leopold,[],2001,35423
80388,Burt Lancaster,Susan Sarandon,False,False,False,False,Louis Malle,False,False,False,...,"[Crime, Drama, Romance]","[drugs, gangster, camera-shot-of-feet, female-...",,"[Best Picture, Best Actor, Best Actress, Best ...",1981-04-03,104,Atlantic City,[],1981,80388
80855,False,False,False,Tambi Larsen (Art Direction); Jim Berkey (Set ...,False,False,False,False,False,False,...,[Western],"[immigrant, sheriff, 1890s, johnson-county-war...",,[Best Art Direction],1980-11-18,149,Heaven's Gate,[],1981,80855
81974,Paul Newman,False,False,False,False,False,False,False,False,False,...,"[Drama, Romance, Thriller]","[murder, newspaper, mafia, reporter, slander, ...",,"[Best Actor, Best Supporting Actress, Best Wri...",1981-11-19,116,Absence of Malice,[],1981,81974
81988,False,False,False,False,False,False,False,"Suzanne Bauman, Paul Neshamkin, Jim Burroughs ...",False,False,...,[Documentary],,,"[Best Documentary, Feature]",,60,Against Wind and Tide: A Cuban Odyssey,[],1981,81988


In [6]:
# Collect every distinct genre label that appears in any movie's genre list.
all_genres = set()
for genre_list in AAdf['genres']:
    all_genres.update(genre_list)

In [7]:
# Tally keyword occurrences across all movies.
# Rows whose 'keywords' entry is not a list (e.g. missing data) are skipped.
keywords_dict = {}
for movie_keywords in AAdf['keywords']:
    if isinstance(movie_keywords, list):
        for keyword in movie_keywords:
            # dict.get is a single hash lookup; the old
            # `keyword in keywords_dict.keys()` built a full key list on
            # every iteration under Python 2.
            keywords_dict[keyword] = keywords_dict.get(keyword, 0) + 1

In [35]:
# Keep only the keywords that occur more than MIN_KEYWORD_COUNT times.
# A new dict is built on purpose: `shortened_dict = keywords_dict` only created
# an alias, so deleting entries also emptied keywords_dict, and deleting while
# iterating over .keys() raises RuntimeError on Python 3.
MIN_KEYWORD_COUNT = 200
shortened_dict = {
    keyword: count
    for keyword, count in keywords_dict.items()
    if count > MIN_KEYWORD_COUNT
}
len(shortened_dict)

25

In [10]:
# Display the keywords that survived the frequency filter, with their counts.
shortened_dict

{u'based-on-novel': 259,
 u'blockbuster': 203,
 u'blood': 259,
 u'character-name-in-title': 265,
 u'cigarette-smoking': 226,
 u'cult-film': 229,
 u'dancing': 220,
 u'death': 337,
 u'family-relationships': 270,
 u'father-daughter-relationship': 252,
 u'father-son-relationship': 305,
 u'female-nudity': 233,
 u'flashback': 218,
 u'friendship': 285,
 u'husband-wife-relationship': 338,
 u'independent-film': 352,
 u'love': 291,
 u'marriage': 216,
 u'mother-daughter-relationship': 211,
 u'mother-son-relationship': 292,
 u'murder': 317,
 u'sex': 201,
 u'singing': 201,
 u'title-spoken-by-character': 232,
 u'violence': 220}

In [18]:
# create genres & keywords sparse matrices
# One 0/1 indicator column per genre and per retained keyword, all starting at 0.
for genre in all_genres:
    AAdf.loc[:,genre] = 0
for keyword in shortened_dict:
    AAdf.loc[:,keyword] = 0

# Flip the indicator to 1 for every genre / retained keyword a movie has.
# Set intersection replaces the inner "is it in set(...)" test, so the set is
# built once per movie instead of once per candidate genre or keyword.
for movieid, movie in AAdf.iterrows():
    if type(movie.genres) == list:
        for genre in all_genres.intersection(movie.genres):
            AAdf.loc[movieid,genre] = 1
    if type(movie.keywords) == list:
        for keyword in set(movie.keywords).intersection(shortened_dict):
            AAdf.loc[movieid,keyword] = 1

In [17]:
# Preview the first rows only; rendering the whole frame bloats the notebook.
AAdf.head(10)

Unnamed: 0,Nominated Best Actor,Nominated Best Actress,Nominated Best Animated Feature Film,Nominated Best Art Direction,Nominated Best Cinematography,Nominated Best Costume Design,Nominated Best Director,"Nominated Best Documentary, Feature","Nominated Best Documentary, Short Subject",Nominated Best Film Editing,...,mother-son-relationship,murder,friendship,husband-wife-relationship,family-relationships,father-son-relationship,singing,based-on-novel,violence,dancing
0035423,False,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
0080388,Burt Lancaster,Susan Sarandon,False,False,False,False,Louis Malle,False,False,False,...,0,1,0,1,0,0,0,0,0,0
0080855,False,False,False,Tambi Larsen (Art Direction); Jim Berkey (Set ...,False,False,False,False,False,False,...,0,1,0,0,0,0,0,0,1,0
0081974,Paul Newman,False,False,False,False,False,False,False,False,False,...,0,1,0,0,0,0,0,0,0,0
0081988,False,False,False,False,False,False,False,"Suzanne Bauman, Paul Neshamkin, Jim Burroughs ...",False,False,...,,,,,,,,,,
0082010,False,False,False,False,False,False,False,False,False,False,...,0,1,1,0,0,0,0,0,0,0
0082012,False,False,False,False,False,False,False,False,Obie Benz (Producer),False,...,,,,,,,,,,
0082031,Dudley Moore,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
0082095,False,False,False,False,False,False,False,False,False,False,...,0,0,0,1,1,0,1,0,0,0
0082096,False,False,False,False,Jost Vacano,False,Wolfgang Petersen,False,False,Hannes Nikel,...,0,0,0,0,0,0,0,1,0,0


In [52]:
# Try starpower() on the first three movies only, passing each row's cast and year.
# NOTE(review): the recorded run of this cell ended in an IMDbError
# ("object '0000212' is not a Movie, Person, Character or Company instance").
test = AAdf.iloc[:3]
test.apply(lambda movie: starpower(movie['cast'], movie['year']), axis=1)

2015-12-04 19:34:45,595 CRITICAL [imdbpy] C:\Users\Steven\Anaconda\lib\site-packages\imdb\_exceptions.py:35: IMDbError exception raised; args: ("object '0000212' is not a Movie, Person, Character or Company instance",); kwds: {}
None
CRITICAL:imdbpy:IMDbError exception raised; args: ("object '0000212' is not a Movie, Person, Character or Company instance",); kwds: {}
None
2015-12-04 19:34:45,598 CRITICAL [imdbpy] C:\Users\Steven\Anaconda\lib\site-packages\imdb\_exceptions.py:35: IMDbError exception raised; args: ("object '0000212' is not a Movie, Person, Character or Company instance",); kwds: {}
Traceback (most recent call last):
  File "C:\Users\Steven\Anaconda\lib\site-packages\pandas\core\frame.py", line 3770, in _apply_standard
    labels=labels)
  File "pandas\src\reduce.pyx", line 625, in pandas.lib.reduce (pandas\lib.c:41385)
  File "pandas\src\reduce.pyx", line 136, in pandas.lib.Reducer.get_result (pandas\lib.c:31446)
  File "pandas\src\reduce.pyx", line 120, in pandas.lib.Re

0000212
0000212


IMDbError: ("object '0000212' is not a Movie, Person, Character or Company instance", u'occurred at index 0035423')

# Exploratory Data Analysis

# Final Analysis

# Presentation