In [1]:
TMDB_API_KEY = #API KEY с https://developers.themoviedb.org/

In [2]:
import requests

def get_movie_data(tmdbId):  
    url = "https://api.themoviedb.org/3/movie/%s" % tmdbId
    params = {
        "api_key": TMDB_API_KEY,
        "external_source": "imdb_id"
    }
    resp = requests.get(url, params=params)
    return resp.json()

In [3]:
get_movie_data(555)

{'adult': False,
 'backdrop_path': None,
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 53, 'name': 'Thriller'}],
 'homepage': 'http://www.luecke-im-system.de/',
 'id': 555,
 'imdb_id': 'tt0442896',
 'original_language': 'en',
 'original_title': 'Absolut',
 'overview': 'Two guys against globalization want to plant a virus in the network of a finance corporation. On the day of the attack Alex has an accident and cannot remember anything.',
 'popularity': 0.026365,
 'poster_path': '/6YemisOilgHbBp6UtgoONHg8eJk.jpg',
 'production_companies': [{'id': 319, 'name': 'Frenetic Films'}],
 'production_countries': [{'iso_3166_1': 'CH', 'name': 'Switzerland'}],
 'release_date': '2005-04-20',
 'revenue': 0,
 'runtime': 94,
 'spoken_languages': [{'iso_639_1': 'fr', 'name': 'Français'}],
 'status': 'Released',
 'tagline': '',
 'title': 'Absolut',
 'video': False,
 'vote_average': 0.0,
 'vote_count': 0}

In [4]:
import pandas as pd
import numpy as np

In [6]:
links_df = pd.read_csv("/home/ubuntu/data/movielens/ml-latest-small/links.csv")
links_df.tail()

Unnamed: 0,movieId,imdbId,tmdbId
9120,162672,3859980,402672.0
9121,163056,4262980,315011.0
9122,163949,2531318,391698.0
9123,164977,27660,137608.0
9124,164979,3447228,410803.0


In [7]:
import time

movies = []
for cnt, movie_id in enumerate(links_df.movieId.unique()):
    tmdbId = int(links_df.loc[movie_id].tmdbId)
    data = get_movie_data(tmdbId)
    if len(data) == 0:
        continue
    movies.append({
        "movie_id": movie_id, 
        "overview": data["overview"],
        "title": data["title"]
    })
    # 40 requests every 10 second
    time.sleep(10 / 40)

In [8]:
print(len(movies))
movies[:2]

101


[{'movie_id': 1,
  'overview': "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.",
  'title': 'Jumanji'},
 {'movie_id': 2,
  'overview': "A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.",
  'title': 'Grumpier Old Men'}]

In [9]:
texts = [m["overview"] for m in movies]

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
target_m = movies[0]
target_m

{'movie_id': 1,
 'overview': "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.",
 'title': 'Jumanji'}

In [12]:
item_profile = tfidf.transform([target_m["overview"]])
item_profile

<1x1907 sparse matrix of type '<class 'numpy.float64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [13]:
ftr_id_to_term = {ftr_id: term for term, ftr_id in tfidf.vocabulary_.items()}

In [14]:
for ftr_id, score in sorted(
    zip(item_profile.indices, item_profile.data),
    key=lambda x: x[1],
    reverse=True
):
    print(ftr_id_to_term[ftr_id], ":", score)

game : 0.416298942633
alan : 0.322917283351
unwittingly : 0.161458641675
trapped : 0.161458641675
terrifying : 0.161458641675
siblings : 0.161458641675
running : 0.161458641675
risky : 0.161458641675
rhinoceroses : 0.161458641675
proves : 0.161458641675
peter : 0.161458641675
judy : 0.161458641675
invite : 0.161458641675
giant : 0.161458641675
freedom : 0.161458641675
finish : 0.161458641675
enchanted : 0.161458641675
creatures : 0.161458641675
adult : 0.161458641675
26 : 0.161458641675
opens : 0.148184481055
monkeys : 0.148184481055
magical : 0.148184481055
living : 0.148184481055
door : 0.148184481055
years : 0.138766314211
room : 0.138766314211
inside : 0.138766314211
hope : 0.138766314211
evil : 0.138766314211
board : 0.138766314211
discover : 0.131461016462
world : 0.102799826126


In [15]:
r_df = pd.read_csv("/home/ubuntu/data/movielens/ml-latest-small/ratings.csv")
# посчитаем количество рейтингов на пользователя
r_per_user = r_df.userId.value_counts()
# оставим только 10 пользователей с минимум 50 оценками
good_uids = np.random.choice(r_per_user[r_per_user >= 50].index, size=10, replace=False)
good_uids

array([482, 136, 166, 532, 525,  13, 298, 652, 433, 560])

In [29]:
uid = np.random.choice(good_uids)
uid

433

In [30]:
# история рейтингов пользователя
uid_df = r_df[r_df.userId == uid]
uid_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
59860,433,1,4.5,1417930541
59861,433,6,5.0,1417930660
59862,433,11,1.5,1417930769
59863,433,16,4.5,1417930766
59864,433,19,2.5,1417930709


In [31]:
uid_df["liked"] = (uid_df.rating >= uid_df.rating.mean()).astype(int)
uid_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,movieId,rating,timestamp,liked
59860,433,1,4.5,1417930541,1
59861,433,6,5.0,1417930660,1
59862,433,11,1.5,1417930769,0
59863,433,16,4.5,1417930766,1
59864,433,19,2.5,1417930709,0


In [32]:
uid_df.liked.value_counts()

1    115
0     83
Name: liked, dtype: int64

In [33]:
X = []
y = []

for t in uid_df.itertuples():
    for m in movies:
        if m["movie_id"] == t.movieId:
            print(m["title"], ":", t.liked)
            X.append(tfidf.transform([m["overview"]]))
            y.append(t.liked)
            break

Jumanji : 1
Sabrina : 1
Dracula: Dead and Loving It : 0
Sense and Sensibility : 1
Money Train : 0
Babe : 1
It Takes Two : 0
When Night Is Falling : 1
Lamerica : 0
Margaret's Museum : 0


In [34]:
from scipy.sparse import vstack
X = vstack(X, 'csr')
y = np.array(y)

In [35]:
sum(y.dot(X)).data

array([ 0.80729321,  0.80729321,  1.61458642,  0.69383157,  0.80729321,
        0.65730508,  0.74092241,  0.80729321,  0.69383157,  0.80729321,
        0.80729321,  2.08149471,  0.80729321,  0.69383157,  0.69383157,
        0.80729321,  0.80729321,  0.74092241,  0.74092241,  0.74092241,
        0.74092241,  0.80729321,  0.80729321,  0.80729321,  0.80729321,
        0.69383157,  0.80729321,  0.80729321,  0.80729321,  0.80729321,
        0.80729321,  0.51399913,  0.69383157,  1.03183239,  1.32755914,
        1.32755914,  1.14097633,  1.32755914,  1.32755914,  1.32755914,
        1.32755914,  1.32755914,  1.2184152 ,  1.32755914,  1.32755914,
        1.32755914,  1.32755914,  1.32755914,  1.14959554,  1.25257472,
        1.25257472,  1.25257472,  1.25257472,  1.07653065,  1.25257472,
        0.97355148,  1.25257472,  1.25257472,  1.01985712,  1.14959554,
        1.25257472,  1.01985712,  1.25257472,  1.14959554,  1.07653065,
        1.25257472,  1.31158655,  2.62317309,  1.20375577,  1.31