In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

  from pandas.core import datetools


In [4]:
user_columns = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=user_columns, encoding='latin-1')
users

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,05201
8,9,29,M,student,01002
9,10,53,M,lawyer,90703


In [5]:
movie_columns = ['movies_id','title','release_date','video_release_date','imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=movie_columns, encoding='latin-1', usecols= range(5))
movies = movies.drop('video_release_date', axis=1)
movies

Unnamed: 0,movies_id,title,release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...
6,7,Twelve Monkeys (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Twelve%20Monk...
7,8,Babe (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Babe%20(1995)
8,9,Dead Man Walking (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Dead%20Man%20...
9,10,Richard III (1995),22-Jan-1996,http://us.imdb.com/M/title-exact?Richard%20III...


In [6]:
ratings_columns = ['user_id','movie_id','rating']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_columns, encoding='latin-1', usecols = range(3))
ratings

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


In [7]:
#users.to_csv('data/users.csv',sep=',', encoding = 'utf-8')
#movies.to_csv('data/movies.csv',sep=',', encoding = 'utf-8')
#ratings.to_csv('data/ratings.csv',sep=',', encoding = 'utf-8')

In [8]:
movies = pd.read_csv('data/movies.csv')
movies.head()

Unnamed: 0,movies_id,title,release_date,imdb_url
0,1,Toy Story (1995),1-Jan-95,http://www.imdb.com/title/tt0114709/
1,2,GoldenEye (1995),1-Jan-95,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),1-Jan-95,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),1-Jan-95,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),1-Jan-95,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [9]:
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

In [10]:
make_clickable('www.wvu.edu')

'<a href="www.wvu.edu">www.wvu.edu</a>'

In [11]:
movies.style.format({'imdb_url': make_clickable})

Unnamed: 0,movies_id,title,release_date,imdb_url
0,1,Toy Story (1995),1-Jan-95,http://www.imdb.com/title/tt0114709/
1,2,GoldenEye (1995),1-Jan-95,http://us.imdb.com/M/title-exact?GoldenEye%20(1995)
2,3,Four Rooms (1995),1-Jan-95,http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)
3,4,Get Shorty (1995),1-Jan-95,http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)
4,5,Copycat (1995),1-Jan-95,http://us.imdb.com/M/title-exact?Copycat%20(1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),1-Jan-95,http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)
6,7,Twelve Monkeys (1995),1-Jan-95,http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995)
7,8,Babe (1995),1-Jan-95,http://us.imdb.com/M/title-exact?Babe%20(1995)
8,9,Dead Man Walking (1995),1-Jan-95,http://us.imdb.com/M/title-exact?Dead%20Man%20Walking%20(1995)
9,10,Richard III (1995),22-Jan-96,http://us.imdb.com/M/title-exact?Richard%20III%20(1995)


In [12]:
my_ratings = ratings.where(ratings.user_id == 196).dropna(0)
test_ratings = my_ratings[:15]
test_ratings

my_ratings = my_ratings[~my_ratings.movie_id.isin(test_ratings.movie_id)]
my_ratings

Unnamed: 0,user_id,movie_id,rating
17102,196.0,8.0,5.0
17830,196.0,428.0,4.0
18853,196.0,1118.0,4.0
21605,196.0,70.0,3.0
22271,196.0,66.0,3.0
22773,196.0,257.0,2.0
23189,196.0,108.0,4.0
24030,196.0,202.0,3.0
25726,196.0,340.0,3.0
32721,196.0,287.0,3.0


In [68]:
def get_similar_movies():
    ratings196 = ratings.where(ratings.user_id == 196).dropna(0)
    otherratings = ratings.where(ratings.user_id != 196).dropna(0)
    # merge based on movie id
    merged_df = pd.merge(ratings196,otherratings,how = 'inner', right_on = 'movie_id', left_index = True)
    merged2 = merged_df.where(rating_x == rating_y)

In [69]:
get_similar_movies()

Unnamed: 0,movie_id,user_id_x,movie_id_x,rating_x,user_id_y,movie_id_y,rating_y
5720,940,196.0,393,4.0,16.0,940,2.0
9080,940,196.0,393,4.0,130.0,940,3.0
9391,940,196.0,393,4.0,125.0,940,2.0
11317,940,196.0,393,4.0,393.0,940,2.0
13428,940,196.0,393,4.0,279.0,940,5.0
15491,940,196.0,393,4.0,221.0,940,4.0
15791,940,196.0,393,4.0,363.0,940,2.0
19902,940,196.0,393,4.0,303.0,940,2.0
21883,940,196.0,393,4.0,472.0,940,4.0
28185,940,196.0,393,4.0,450.0,940,2.0
