In [124]:
from sklearn.linear_model import LinearRegression, LogisticRegression
import json
import os
from collections import Counter
import numpy as np

#local library
import config


In [4]:
# Load the movie metadata and the per-user ratings from <BASE_DIR>/data.
# Each file is opened in a `with` block so its handle is closed as soon as the
# JSON has been parsed, rather than being left open for the garbage collector.
with open(os.path.join(config.BASE_DIR, 'data', 'movies.json'), 'rb') as movies_file:
    movies_data = json.load(movies_file)
with open(os.path.join(config.BASE_DIR, 'data', 'ratings.json'), 'rb') as ratings_file:
    ratings_data = json.load(ratings_file)

In [7]:
# Dataset dimensions: one entry per user in the ratings data, one entry per
# movie in the movies data.
NUM_MOVIES = len(movies_data)
NUM_USERS = len(ratings_data)

# Map each movie ID (converted to int) onto a dense index between 0 and
# NUM_MOVIES - 1, and keep the inverse mapping to get from index back to ID.
MOVIE_TO_IDX_LOOKUP = {
    int(raw_id): row for row, raw_id in enumerate(movies_data.keys())
}
IDX_TO_MOVIE_LOOKUP = {
    row: movie_id for movie_id, row in MOVIE_TO_IDX_LOOKUP.items()
}

In [5]:
# Create a genre indicator vector for each movie

In [8]:
# Collect every distinct genre label that appears on any movie.
genre_set = set()
for movie_record in movies_data.values():
    genre_set.update(movie_record['genres'])

# Assign each genre a column index; the numbering follows the set's
# iteration order.
genre_dict = {genre: column for column, genre in enumerate(genre_set)}
genre_dict

{u'(no genres listed)': 14,
 u'Action': 13,
 u'Adventure': 19,
 u'Animation': 10,
 u'Children': 17,
 u'Comedy': 15,
 u'Crime': 6,
 u'Documentary': 16,
 u'Drama': 7,
 u'Fantasy': 8,
 u'Film-Noir': 5,
 u'Horror': 4,
 u'IMAX': 12,
 u'Musical': 9,
 u'Mystery': 0,
 u'Romance': 1,
 u'Sci-Fi': 3,
 u'Thriller': 18,
 u'War': 11,
 u'Western': 2}

In [9]:
# Number of distinct genres, i.e. the number of entries in genre_dict.
NUM_GENRES = len(genre_dict)

In [92]:
# One row per movie and one column per genre, initialised to all zeros.
genre_matrix = np.zeros((NUM_MOVIES, NUM_GENRES))

In [93]:
# Set a 1 in each movie's row for every genre the movie is tagged with.
for movie_record in movies_data.values():
    row = MOVIE_TO_IDX_LOOKUP[movie_record['movieId']]
    for genre in movie_record['genres']:
        column = genre_dict[genre]
        genre_matrix[row, column] = 1

In [94]:
# Sanity check: report the dimensions of the genre matrix.
print 'matrix shape:', genre_matrix.shape

matrix shape: (9125, 20)


In [95]:
# look up Movie Zero in Vector and 

print IDX_TO_MOVIE_LOOKUP[0]
print genre_matrix[0] #the vectors populated should match the genere's from genre_dict
print 'genre idx for movie at zer'
movies_data["73469"]

73469
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.]
genre idx for movie at zer


{u'genres': [u'Documentary'],
 u'movieId': 73469,
 u'title': u'Mr. Warmth: The Don Rickles Project (2007)'}

In [141]:
#now to create a model for each user...http://localhost:5000/user/342
#strong interest in Crime, Drama "Good Films", biased towards high reviews (avg 4.12)

user_id = 342

user_ratings = ratings_data[str(user_id)]
num_user_ratings = len(user_ratings)
print 'User:', user_id, 'Num Ratings:', num_user_ratings

user_movie_idx, ratings = np.zeros((num_user_ratings,)), np.zeros((num_user_ratings,))

for idx, movie in enumerate(user_ratings):
    movie_idx = MOVIE_TO_IDX_LOOKUP[movie['movieId']]
    rating = movie['rating']
    user_movie_idx[idx] = int(movie_idx)
    ratings[idx] = rating

#needs to be integer for lookup
user_movie_idx = user_movie_idx.astype(int)
#ratings = ratings - ratings.mean() data normalization step

User: 342 Num Ratings: 187


In [142]:
# Split the user's rated movies in stored order: the first 80% go to the
# training split and the remainder to the test split (no shuffling).
train_cut_off = int(0.8 * len(user_movie_idx))
user_movie_idx_train, user_movie_idx_test = np.split(user_movie_idx, [train_cut_off])

In [145]:
# Features are the genre rows of the rated movies; targets are the ratings,
# cut at the same position as the index split.
X_train, X_test = genre_matrix[user_movie_idx_train], genre_matrix[user_movie_idx_test]
y_train, y_test = ratings[:train_cut_off], ratings[train_cut_off:]

In [146]:
# Linear regression estimator with default settings; fitted in a later cell.
lr = LinearRegression()

In [147]:
# Fit on the training split; the value returned by fit() is what the cell displays.
lr.fit(X = X_train, y = y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [148]:
# Inspect the fitted coefficients (indexed by genre column) and the intercept.
print lr.coef_
print lr.intercept_

[  4.39677619e-01   8.91430669e-03   1.67273166e-01  -2.87643639e-01
  -1.64752717e-01  -4.11036012e-01   2.02605238e-01  -5.60865873e-02
   2.70979216e-01  -1.00586197e-01   2.13798395e-01   2.78216100e-01
   6.34338948e-01  -4.29967854e-01  -3.59054352e-15  -1.01741328e-01
  -3.68024237e-02  -5.83934152e-01  -1.54191801e-01   5.16118827e-02]
4.37013575705


In [149]:
genre_coefs = [(genre, lr.coef_[idx]) for genre, idx in genre_dict.items()]
for genre, coef in sorted(genre_coefs, key = lambda x : -x[1]):
    print genre, coef

IMAX 0.634338947543
Mystery 0.43967761942
War 0.278216099942
Fantasy 0.270979215958
Animation 0.21379839483
Crime 0.202605238376
Western 0.167273165524
Adventure 0.0516118827287
Romance 0.00891430669054
(no genres listed) -3.59054352021e-15
Documentary -0.0368024237117
Drama -0.0560865873167
Musical -0.100586196847
Comedy -0.101741327837
Thriller -0.154191800539
Horror -0.164752717456
Sci-Fi -0.287643639141
Film-Noir -0.411036012236
Action -0.429967854437
Children -0.583934151875


## Compared to user-based filtering, the results don't seem as good a fit to the user's profile

In [152]:
preds_all_movies = lr.predict(X_test)
for test_idx in np.argsort(-preds_all_movies)[:25]:
    idx = user_movie_idx_test[test_idx]
    movie_id = IDX_TO_MOVIE_LOOKUP[idx]
    print movies_data[str(movie_id)]
    print preds_all_movies[test_idx], y_test[test_idx]

{u'genres': [u'Adventure', u'Animation', u'Children', u'Comedy', u'Crime', u'Fantasy', u'Mystery'], u'movieId': 2987, u'title': u'Who Framed Roger Rabbit? (1988)'}
4.86313262865 3.0
{u'genres': [u'Drama', u'Mystery'], u'movieId': 3068, u'title': u'Verdict, The (1982)'}
4.75372678915 4.0
{u'genres': [u'Drama', u'Mystery', u'Romance', u'Thriller'], u'movieId': 2819, u'title': u'Three Days of the Condor (3 Days of the Condor) (1975)'}
4.6084492953 4.0
{u'genres': [u'Drama', u'Mystery', u'Thriller'], u'movieId': 2712, u'title': u'Eyes Wide Shut (1999)'}
4.59953498861 4.0
{u'genres': [u'Comedy', u'Drama', u'War'], u'movieId': 5060, u'title': u'M*A*S*H (a.k.a. MASH) (1970)'}
4.49052394183 4.0
{u'genres': [u'Drama', u'Fantasy', u'Musical'], u'movieId': 2971, u'title': u'All That Jazz (1979)'}
4.48444218884 4.0
{u'genres': [u'Comedy', u'Crime', u'Drama'], u'movieId': 2973, u'title': u'Crimes and Misdemeanors (1989)'}
4.41491308027 4.0
{u'genres': [u'Crime', u'Drama', u'Thriller'], u'movieId': 

In [159]:
lr2 = LogisticRegression()
y_train2 = np.zeros_like(y_train)
y_train2[y_train >= 4.0] = 1.0
print 'avg score', y_train2.mean()
print
lr2.fit(X = X_train, y = y_train2)
genre_coefs = [(genre, lr2.coef_[0, idx]) for genre, idx in genre_dict.items()]
for genre, coef in sorted(genre_coefs, key = lambda x : -x[1]):
    print genre, coef

avg score 0.872483221477

Mystery 0.828678989114
Fantasy 0.758664283701
Musical 0.539288983304
Horror 0.491428634312
War 0.36904321696
Western 0.35473903545
Crime 0.346038802647
Romance 0.29844210996
Drama 0.224017280444
Film-Noir 0.190754970604
Animation 0.149970181577
IMAX 0.130253978859
(no genres listed) 0.0
Children -0.176223746377
Adventure -0.2166154618
Thriller -0.342724628932
Documentary -0.391764627531
Sci-Fi -0.44706702059
Action -0.455803205083
Comedy -0.55135296112


In [162]:
preds_all_movies = lr2.predict_proba(X_test)[:, 1]
#print preds_all_movies
for test_idx in np.argsort(-preds_all_movies)[:25]:
    idx = user_movie_idx_test[test_idx]
    movie_id = IDX_TO_MOVIE_LOOKUP[idx]
    print movies_data[str(movie_id)]
    print preds_all_movies[test_idx], y_test[test_idx]

{u'genres': [u'Drama', u'Fantasy', u'Musical'], u'movieId': 2971, u'title': u'All That Jazz (1979)'}
0.963840728298 4.0
{u'genres': [u'Adventure', u'Animation', u'Children', u'Comedy', u'Crime', u'Fantasy', u'Mystery'], u'movieId': 2987, u'title': u'Who Framed Roger Rabbit? (1988)'}
0.947855443344 3.0
{u'genres': [u'Drama', u'Musical', u'Romance'], u'movieId': 2565, u'title': u'King and I, The (1956)'}
0.943893967179 4.0
{u'genres': [u'Drama', u'Mystery'], u'movieId': 3068, u'title': u'Verdict, The (1982)'}
0.943412653812 4.0
{u'genres': [u'Drama', u'Mystery', u'Romance', u'Thriller'], u'movieId': 2819, u'title': u'Three Days of the Condor (3 Days of the Condor) (1975)'}
0.941001675234 4.0
{u'genres': [u'Drama', u'Mystery', u'Thriller'], u'movieId': 2712, u'title': u'Eyes Wide Shut (1999)'}
0.9220831923 4.0
{u'genres': [u'Drama', u'Romance'], u'movieId': 2906, u'title': u'Random Hearts (1999)'}
0.907499713613 4.0
{u'genres': [u'Drama', u'Romance'], u'movieId': 2942, u'title': u'Flashda