# Imports

In [190]:
import json
import random
import re
import string

import pandas as pd
import requests
import tqdm
from matplotlib import mlab
from matplotlib import pyplot as plt

## Explore the datasets


### Read in data

In [191]:
# Ratings: one row per (user, movie) pair; the first two columns form the index.
data = pd.read_table(
    './data/u.data',
    names=['UserId', 'ItemId', 'Rating', 'Timestamp'],
    index_col=[0, 1],
)

# Movies: pipe-separated; five descriptive fields followed by one 0/1 flag per genre.
item = pd.read_table(
    './data/u.item',
    sep='|',
    names=[
        'ItemId', 'Title', 'ReleaseDt', 'VideoReleaseDt', 'Url',
        'Unknown', 'Action', 'Adventure', 'Animation', 'Children',
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Noir',
        'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
        'War', 'Western',
    ],
    index_col=0,
)

# Users: pipe-separated demographic attributes, indexed by UserId.
user = pd.read_table(
    './data/u.user',
    sep='|',
    names=['UserId', 'Age', 'Gender', 'Occupation', 'Zip'],
    index_col=0,
)

item.head()

Unnamed: 0_level_0,Title,ReleaseDt,VideoReleaseDt,Url,Unknown,Action,Adventure,Animation,Children,Comedy,...,Fantasy,Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
ItemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### u.data
|User ID|Item ID|Rating|Timestamp|
|---|---|---|---|

### u.item
|Item ID|Title|Release Date|Video Release Date|URL|19 genre flags (Unknown … Western)|
|---|---|---|---|---|---|

### u.user
|User ID|Age|Gender|Occupation|Zip|
|---|---|---|---|---|

In [192]:
# Per-movie rating statistics, grouped on index level 1 of `data` (ItemId):
# mean rating ("Average") and number of ratings ("Ratings").
# A list of aggregations followed by a column rename is used instead of the
# nested-dict renamer, which newer pandas versions reject.
rating_stats = (
    data.groupby(level=1)['Rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Average', 'count': 'Ratings'})
)

# Inner join on ItemId: only movies that received at least one rating are kept.
score = pd.merge(item, rating_stats, left_index=True, right_index=True)

# Index labels are cast to str; kept so that any code relying on string
# ItemId labels in `score` continues to work.
score = score.rename(index=str)

# Highest-rated movies first, ties broken by the number of ratings.
score[['Title', 'Average', 'Ratings']].sort_values(by=['Average', 'Ratings'], ascending=False).head()

Unnamed: 0_level_0,Title,Average,Ratings
ItemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1189,Prefontaine (1997),5.0,3
1293,Star Kid (1997),5.0,3
1467,"Saint of Fort Washington, The (1993)",5.0,2
1500,Santa with Muscles (1996),5.0,2
814,"Great Day in Harlem, A (1994)",5.0,1


## Retrieve more data

The Open Movie Database (OMDb) offers a free, public API which can be used to look up information about movies.  I will use this to gather the runtime for each movie in the list.

In order to get the runtime data from OMDb, I parsed the title and year (to disambiguate remakes and other duplicate titles) from the IMDb URL provided in the data.


In [193]:
def urlParse(url):
    """
    Parse an IMDb "title-exact" url into a (title, year) pair.

    The title is the text between ``title-exact?`` and the first opening
    parenthesis, with ``%20`` escapes turned into spaces and surrounding
    whitespace removed.  The year is the first run of four digits that
    follows.  Both values are returned as strings.

    Raises ValueError if the url does not have the expected form.
    """
    pattern = re.compile(r'http://us\.imdb\.com/M/title-exact\?([^(]*).*?(\d{4})')
    match = pattern.match(url)
    if match is None:
        raise ValueError('Unrecognised IMDb url: {!r}'.format(url))
    raw_title, year = match.groups()
    # The escaped space that precedes "(year)" would otherwise survive as a
    # trailing blank in the title.
    title = raw_title.replace('%20', ' ').strip()
    return title, year

# Spot-check the parser on one randomly chosen movie.  The choice is made among
# the urls that are actually present, so it can neither hit a label that is not
# in the ItemId index nor a row whose url is missing.
print(urlParse(random.choice(item['Url'].dropna().tolist())))

('Freeway ', '1996')


In [194]:
# Articles that may appear after a final comma in a title ("Killer, The");
# compared case-insensitively.
_TRAILING_ARTICLES = frozenset([
    'the', 'a', 'an',
    'il', 'la', 'le', 'les', 'lo', 'el', 'los', 'las',
    'der', 'die', 'das', 'un', 'une', "l'",
])


def titleParse(title):
    """
    Parse a title of the form "Name (Alternate name) (YYYY)" into (name, year).

    * The name is the text before the first opening parenthesis, so an
      alternate title given in parentheses is dropped.
    * The year is the last four-digit group in parentheses, returned as a
      string.
    * A trailing article after the final comma is moved to the front:
      "Saint of Fort Washington, The" -> "The Saint of Fort Washington".
      Commas that are not followed by a known article are left alone
      ("Paris, Texas" stays as it is).

    Raises ValueError if the title contains no "(YYYY)" group.
    """
    match = re.match(r'^([^(]*).*\((\d{4})\)', title)
    if match is None:
        raise ValueError('No "(YYYY)" year found in title: {!r}'.format(title))
    name, year = match.groups()
    name = name.strip()

    # Split on the LAST comma only, so titles that contain several commas
    # keep everything but the trailing article in place.
    head, sep, article = name.rpartition(', ')
    if sep and article.lower() in _TRAILING_ARTICLES:
        # Elided articles such as "L'" attach directly to the following word.
        joiner = '' if article.endswith("'") else ' '
        name = article + joiner + head
    return name, year


unknown


In [195]:
# Parse every real title into a (title, year) tuple.  Rows whose title is the
# placeholder 'unknown' are skipped and therefore end up as NaN in 'temp'.
item['temp'] = item.loc[item['Title'] != 'unknown', 'Title'].apply(titleParse)

# Expand the tuples into two columns.  'Title' is overwritten with the parsed
# title, so running this cell a second time re-parses titles that no longer
# carry a year and raises.
item[['Title', 'Year']] = item['temp'].apply(pd.Series)

item.head()

Unnamed: 0_level_0,Title,ReleaseDt,VideoReleaseDt,Url,Unknown,Action,Adventure,Animation,Children,Comedy,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,temp,Year
ItemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,"(Toy Story, 1995)",1995
2,GoldenEye,01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,"(GoldenEye, 1995)",1995
3,Four Rooms,01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,"(Four Rooms, 1995)",1995
4,Get Shorty,01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,"(Get Shorty, 1995)",1995
5,Copycat,01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,"(Copycat, 1995)",1995


In [201]:
def getRuntime(title, apikey=None, timeout=10):
    '''
    Look up the runtime of a film, in minutes, through the OMDb API.

    Parameters
    ----------
    title : tuple
        ``(title, year)`` pair identifying the film.
    apikey : str, optional
        Sent as the ``apikey`` query parameter when given.
        NOTE(review): OMDb may reject requests without a key - confirm.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    int
        The runtime in minutes.

    Raises
    ------
    requests.RequestException
        If the request fails or the server answers with an HTTP error status.
    ValueError
        If the response carries no runtime of the form "<number> min"
        (for example when the film is not found or the runtime is "N/A").
    '''
    name, year = title
    params = {'t': name, 'y': year}
    if apikey is not None:
        params['apikey'] = apikey

    resp = requests.get('http://omdbapi.com/?', params=params, timeout=timeout)
    resp.raise_for_status()

    dat = json.loads(resp.text)
    runtime = dat.get('Runtime')
    # The runtime is expected as text such as "81 min"; anything else is
    # reported instead of being converted blindly.
    match = re.match(r'^\s*(\d+)\s*min\s*$', runtime or '')
    if match is None:
        raise ValueError(
            'No usable runtime for {!r} ({}): {!r}'.format(name, year, runtime))
    return int(match.group(1))

In [None]:
# Look up the runtime of every movie.  A value of -1 marks a lookup that
# failed for any reason (no parsed title, request error, no runtime returned).
rt = []

# `tqdm` is imported as a module, so the progress-bar wrapper is `tqdm.tqdm`.
for i in tqdm.tqdm(item['temp']):
    try:
        rt.append(getRuntime(i))
    except Exception:
        # Best effort: one failed lookup must not abort the whole run, but
        # KeyboardInterrupt is deliberately not caught so the loop can be stopped.
        rt.append(-1)

# Pair each ItemId with its runtime; list(...) hands pandas concrete rows
# regardless of whether zip returns a list or an iterator.
runtimes = pd.DataFrame(list(zip(item.index.values, rt)))


In [None]:
# Preview the first rows of the collected (ItemId, runtime) pairs.
runtimes.head()