# Imports

In [1]:
%matplotlib notebook
import pandas as pd
import string
from matplotlib import pyplot as plt
from matplotlib import mlab
from math import sqrt
import re
import requests
import json
import tqdm

ImportError: No module named tqdm

## Explore the datasets


### Read in data

In [3]:
data = pd.read_table('./data/u.data', names=['UserId', 'ItemId', 'Rating', 'Timestamp'], index_col=[0,1])
item = pd.read_table('./data/u.item', sep='|', names=['ItemId', 'Title', 'ReleaseDt', 'VideoReleaseDt', 'Url', \
                                                      'Unknown', 'Action', 'Adventure', 'Animation', 'Children',\
                                                      'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Noir',\
                                                      'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller',\
                                                      'War', 'Western'], index_col=0)
user = pd.read_table('./data/u.user', sep='|', names=['UserId', 'Age', 'Gender','Occupation', 'Zip'], index_col=0)


In [4]:
vals = pd.DataFrame()
vals['Fresh'] = data['Rating'].apply(lambda x: x > 2.5)
vals['Rotten'] = data['Rating'].apply(lambda x: x < 2.5)
vals['1'] = data['Rating'].apply(lambda x: x == 1)
vals['2'] = data['Rating'].apply(lambda x: x == 2)
vals['3'] = data['Rating'].apply(lambda x: x == 3)
vals['4'] = data['Rating'].apply(lambda x: x == 4)
vals['5'] = data['Rating'].apply(lambda x: x == 5)
vals = vals.groupby(level=1).sum()
vals.head()

Unnamed: 0_level_0,Fresh,Rotten,1,2,3,4,5
ItemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,417.0,35.0,8.0,27.0,96.0,202.0,119.0
2,106.0,25.0,8.0,17.0,55.0,42.0,9.0
3,59.0,31.0,11.0,20.0,25.0,23.0,11.0
4,179.0,30.0,6.0,24.0,57.0,93.0,29.0
5,71.0,15.0,4.0,11.0,32.0,33.0,6.0


In [5]:
score = pd.merge(item, data.groupby(level=1).agg({'Rating':{'Average':'mean', 'Total':'count'}}), left_index=True, right_index=True) #.sort_values(by = 'Rating', ascending=False)
score = pd.merge(score, vals, left_index=True, right_index=True)
score = score.rename(index=str, columns={(u'Rating', u'Average'):'Average',(u'Rating', u'Total'):'Ratings',(u'Fresh', u'Fresh'):'Fresh',(u'Rotten', u'Rotten'):'Rotten'})
#score[['Title', 'Average', 'Ratings', 'Fresh', 'Rotten']].sort_values(by=['Average', 'Ratings'], ascending=False).head()



## Retrieve more data

The Online Movie Database (OMBD) offers a free, public api which can be used to look up information about movies.  I will use this to gather the runtime for the the list of movies.

In order to get the runtime data from OMDB I parsed the title and year (in case of remakes or other cases of duplicate titles) from the IMDB url provided in the data.


In [6]:
def urlParse(url):
    """
    Parse Url to retrieve properly formatted Title and Year
    """
    p1 = re.compile('http://us\.imdb\.com/M/title-exact\?([^\(]*).*?(\d{4})')
    t, year = re.match(p1, url).groups()
    p2 = re.compile('%20')
    title = re.sub(p2, ' ', t)
    return title, year

#import random
#print urlParse(item.loc[random.randint(0, len(item)), 'Url'])

In [7]:
def titleParse(title):
    """
    Parse title to retrieve properly formated Title and Year
    """
    p = re.compile('^([^,(]*)([^(]*)?\s?\(?(.*)?\)?\s?\((\d{4})\)')
    title,s1,s2,year = re.match(p, title).groups()
    if len(s1) > 2:
        title = s1[2:]+' '+title
    return title.strip(), year


In [8]:
item['temp'] = item['Title'].loc[item['Title']!= 'unknown'].apply(titleParse)
item[['Title', 'Year']] = item['temp'].apply(pd.Series)

In [9]:
def getRuntime(title):
    '''
    IMDB url is parsed to retrieve Title and Year of release for film
    '''
    title, year = title
    url = 'http://omdbapi.com/?'
    #title, year = urlParse(url)
    params = {'t':title, 'y':year}
    try:
        resp = requests.get(url, params=params)
    except e:
       raise Exception('Request returned status'.format(s))
    dat = json.loads(resp.text)
    return int(dat['Runtime'][:-4])

In [10]:
'''
This takes forever to run (2-3 minutes)
I've exported the results to a csv so it can be imported each time 
rather than having to grab the data from OMDB each time
'''
#rt = []
#for i in tqdm.tqdm(item['temp']):
#    try:
#        rt.append(getRuntime(i))
#    except:
#        rt.append(-1)
#runtimes = pd.DataFrame(zip(item.index.values, rt))
#runtimes.to_csv('./data/u.runtime', sep='|')
#runtimes.rename(index=str, columns={0: "ItemId", 1: "Runtime"}, inplace=True)
#runtimes.set_index('ItemId', inplace=True)
runtimes = pd.read_table('./data/u.runtime', sep='|', header = 0, names = ['ItemId', 'Runtime'], index_col=0)
runtimes.head()

Unnamed: 0_level_0,Runtime
ItemId,Unnamed: 1_level_1
1,81
2,130
3,98
4,105
5,123


In [11]:
score = pd.merge(score, runtimes, on='ItemId')

In [12]:
#score.drop(['ReleaseDt', 'VideoReleaseDt', 'Url'], axis=1, inplace=True)
score.head()

Unnamed: 0_level_0,Title,ReleaseDt,VideoReleaseDt,Url,Unknown,Action,Adventure,Animation,Children,Comedy,...,Average,Ratings,Fresh,Rotten,1,2,3,4,5,Runtime
ItemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,3.878319,452,417.0,35.0,8.0,27.0,96.0,202.0,119.0,
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,3.206107,131,106.0,25.0,8.0,17.0,55.0,42.0,9.0,
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,3.033333,90,59.0,31.0,11.0,20.0,25.0,23.0,11.0,
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,3.550239,209,179.0,30.0,6.0,24.0,57.0,93.0,29.0,
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,3.302326,86,71.0,15.0,4.0,11.0,32.0,33.0,6.0,


In [31]:
genres = item.columns.values[5:23]
for n, g in enumerate(genres):
    plt.subplot(3, 6, n+1)
    plt.scatter(score['Year'].loc[score[g]==1], score['Rating'].loc[raw_score[g]==1])
    plt.title(g)


<IPython.core.display.Javascript object>

KeyError: 'Rating'

In [343]:
def confidence(x):
    fresh, rotten = x[0],x[1]
    n = fresh+rotten
    if n == 0:
        return 0
    z = 1.5
    phat = float(fresh) / n
    return ((phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n))

In [344]:
score['Confidence'] = score[['Fresh', 'Rotten']].apply(confidence, axis = 1)


In [345]:
score.sort_values(by='Confidence', ascending = False)

Unnamed: 0_level_0,Title,ReleaseDt,VideoReleaseDt,Url,Unknown,Action,Adventure,Animation,Children,Comedy,...,Sci-Fi,Thriller,War,Western,Average,Ratings,Rotten,Fresh,Runtime,Confidence
ItemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
480,North by Northwest (1959),01-Jan-1959,,http://us.imdb.com/M/title-exact?North%20by%20...,0,0,0,0,0,1,...,0,1,0,0,4.284916,179,1.0,178.0,136,0.977950
483,Casablanca (1942),01-Jan-1942,,http://us.imdb.com/M/title-exact?Casablanca%20...,0,0,0,0,0,0,...,0,0,1,0,4.456790,243,2.0,241.0,102,0.977499
603,Rear Window (1954),01-Jan-1954,,http://us.imdb.com/M/title-exact?Rear%20Window...,0,0,0,0,0,0,...,0,1,0,0,4.387560,209,2.0,207.0,112,0.973883
484,"Maltese Falcon, The (1941)",01-Jan-1941,,http://us.imdb.com/M/title-exact?Maltese%20Fal...,0,0,0,0,0,0,...,0,0,0,0,4.210145,138,1.0,137.0,100,0.971511
178,12 Angry Men (1957),01-Jan-1957,,http://us.imdb.com/M/title-exact?12%20Angry%20...,0,0,0,0,0,0,...,0,0,0,0,4.344000,125,1.0,124.0,96,0.968604
498,"African Queen, The (1951)",01-Jan-1951,,http://us.imdb.com/M/title-exact?African%20Que...,0,1,1,0,0,0,...,0,0,1,0,4.184211,152,2.0,150.0,105,0.964254
166,Manon of the Spring (Manon des sources) (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Manon%20des%2...,0,0,0,0,0,0,...,0,0,0,0,4.120690,58,0.0,58.0,113,0.962656
427,To Kill a Mockingbird (1962),01-Jan-1962,,http://us.imdb.com/M/title-exact?To%20Kill%20a...,0,0,0,0,0,0,...,0,0,0,0,4.292237,219,4.0,215.0,129,0.962471
494,His Girl Friday (1940),01-Jan-1940,,http://us.imdb.com/M/title-exact?His%20Girl%20...,0,0,0,0,0,1,...,0,0,0,0,4.000000,56,0.0,56.0,92,0.961373
194,"Sting, The (1973)",01-Jan-1973,,"http://us.imdb.com/M/title-exact?Sting,%20The%...",0,0,0,0,0,1,...,0,0,0,0,4.058091,241,5.0,236.0,129,0.960413
