# Item profile

In [120]:
import os

# TMDB API key (obtain one at https://developers.themoviedb.org/).
# It is read from the TMDB_API_KEY environment variable so the secret is never
# stored in the notebook; an empty string is used when the variable is unset.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "")

In [55]:
import requests

def get_movie_data(imdb_id, timeout=10):
    """Look up a title on TMDB by its IMDb id using the ``/3/find`` endpoint.

    Parameters
    ----------
    imdb_id : int or str
        IMDb id without the ``tt`` prefix (e.g. ``466713``). It is left-padded
        with zeros to at least seven characters before the prefix is added.
    timeout : float, optional
        Seconds to wait for TMDB before giving up, so a stalled connection
        cannot block the caller forever.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.HTTPError
        If TMDB answers with a 4xx/5xx status (bad key, rate limit, ...).
    requests.exceptions.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    # Explicit fill/align form: '0' fill, right-aligned, width 7 -> "tt0466713"
    real_imdb_id = 'tt{:0>7}'.format(imdb_id)
    url = "https://api.themoviedb.org/3/find/%s" % real_imdb_id
    params = {
        "api_key": TMDB_API_KEY,
        "external_source": "imdb_id"
    }
    resp = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on an error status instead of handing back an error payload
    resp.raise_for_status()
    return resp.json()

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [70]:
# Load the MovieLens id -> external id mapping, keyed by movieId
links_df = (
    pd.read_csv("/Users/tural/Datasets/ml-20m/links.csv")
    .set_index("movieId")
)
links_df.tail()

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
131254,466713,4436.0
131256,277703,9274.0
131258,3485166,285213.0
131260,249110,32099.0
131262,1724965,286971.0


In [60]:
# Try the lookup on one id: 466713 is the imdbId listed for movieId 131254
get_movie_data(466713)

{'movie_results': [{'adult': False,
   'backdrop_path': '/4fdm7zgQQIgUwtgEcHJsPTRVHQw.jpg',
   'genre_ids': [35],
   'id': 4436,
   'original_language': 'de',
   'original_title': "Kein Bund für's Leben",
   'overview': 'The movie deals with a guy who gets to join the German Bundeswehr involuntarily because a colleague loses his denial papers in order to get the chance to get down on his girlfriend. When entering the Bundeswehr he acts like a giant idiot and of course gets in one room with some of the biggest losers around. The loser turns out to be a hero and leads his loser-colleagues to win a contest with the local US army squad.',
   'popularity': 1.098836,
   'poster_path': '/2gBDKYK0teVaOhzo6zhTnsMpUVM.jpg',
   'release_date': '2007-08-30',
   'title': "Kein Bund für's Leben",
   'video': False,
   'vote_average': 5.4,
   'vote_count': 10}],
 'person_results': [],
 'tv_episode_results': [],
 'tv_results': [],
 'tv_season_results': []}

## Создадим toy-dataset

In [77]:
# Load the ratings table from the local ml-20m dataset and preview its first rows
r_df = pd.read_csv("/Users/tural/Datasets/ml-20m/ratings.csv")
r_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [91]:
# Fixed seed so the same users are drawn on every run of the notebook
SEED = 42
N_USERS = 10
RATINGS_PER_USER = 50

# Count the number of ratings per user
r_per_user = r_df.userId.value_counts()
# Candidates are the users with exactly RATINGS_PER_USER ratings; sorting them
# makes the seeded draw independent of how value_counts orders ties
candidate_uids = np.sort(r_per_user[r_per_user == RATINGS_PER_USER].index)
# Keep only N_USERS of them, drawn without replacement
good_uids = np.random.RandomState(SEED).choice(candidate_uids, size=N_USERS, replace=False)
good_uids

array([130046,  89196, 105781,   2785,  96034, 102295,  55451,  95713,
        80579,  60065])

In [92]:
# Restrict the ratings to the sampled users only
r_df = r_df.loc[r_df["userId"].isin(good_uids)]
r_df.shape

(500, 4)

## Посчитаем TFIDF

In [93]:
# Number of distinct movies rated by the sampled users
len(r_df["movieId"].unique())

325

In [94]:
import time

In [95]:
# Pause between calls: the limit noted for TMDB is 40 requests every 10 seconds
REQUEST_PAUSE_SEC = 10 / 40

# Compute the unique ids once instead of on every progress message
movie_ids = r_df.movieId.unique()

movies = []
for cnt, movie_id in enumerate(movie_ids):
    if cnt % 100 == 0:
        print("Обработано %s из %s фильмов" % (cnt, movie_ids.size))

    imdb_id = int(links_df.loc[movie_id].imdbId)

    found = get_movie_data(imdb_id)
    # Throttle after every request, including lookups that yield no match
    time.sleep(REQUEST_PAUSE_SEC)

    # Prefer a movie match, fall back to a TV match, skip ids TMDB does not know
    matches = found['movie_results'] or found['tv_results']
    if not matches:
        continue
    match = matches[0]

    movies.append({
        "movie_id": movie_id,
        # A missing/null overview becomes "" so every entry is a string
        "overview": match.get("overview") or "",
        # NOTE(review): TV results are expected to carry "name" rather than
        # "title" - confirm against the TMDB find documentation
        "title": match.get("title") or match.get("name"),
    })

Обработано 0 из 325 фильмов
Обработано 100 из 325 фильмов
Обработано 200 из 325 фильмов
Обработано 300 из 325 фильмов


In [100]:
# How many movies were collected, followed by a peek at the first two records
print(len(movies))
movies[:2]

324


[{'movie_id': 111,
  'overview': 'A mentally unstable Vietnam War veteran works as a night-time taxi driver in New York City where the perceived decadence and sleaze feeds his urge for violent action, attempting to save a preadolescent prostitute in the process.',
  'title': 'Taxi Driver'},
 {'movie_id': 296,
  'overview': "A burger-loving hit man, his philosophical partner, a drug-addled gangster's moll and a washed-up boxer converge in this sprawling, comedic crime caper. Their adventures unfurl in three stories that ingeniously trip back and forth in time.",
  'title': 'Pulp Fiction'}]

In [102]:
# Corpus for TF-IDF: one overview string per collected movie, in the same order
texts = [movie["overview"] for movie in movies]

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF model on the overviews, dropping English stop words;
# fit() returns the vectorizer itself, so it can be chained and then displayed
tfidf = TfidfVectorizer(stop_words='english').fit(texts)
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Создадим профиль объекта

In [114]:
# Use the first collected movie as the item whose profile is built below
target_m = movies[0]
target_m

{'movie_id': 111,
 'overview': 'A mentally unstable Vietnam War veteran works as a night-time taxi driver in New York City where the perceived decadence and sleaze feeds his urge for violent action, attempting to save a preadolescent prostitute in the process.',
 'title': 'Taxi Driver'}

In [115]:
# Vectorize the target movie's overview with the fitted TF-IDF model;
# a one-element list is passed, so a single document is transformed
item_profile = tfidf.transform([target_m["overview"]])
item_profile

<1x4612 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [116]:
# Invert the fitted vocabulary (term -> feature id) into feature id -> term
ftr_id_to_term = dict(zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys()))

In [119]:
# List the profile's terms from the highest to the lowest TF-IDF weight
weighted_ftrs = zip(item_profile.indices, item_profile.data)
for ftr_id, score in sorted(weighted_ftrs, key=lambda pair: pair[1], reverse=True):
    print(ftr_id_to_term[ftr_id], ":", score)

vietnam : 0.229581083492
urge : 0.229581083492
taxi : 0.229581083492
sleaze : 0.229581083492
preadolescent : 0.229581083492
perceived : 0.229581083492
mentally : 0.229581083492
feeds : 0.229581083492
driver : 0.229581083492
decadence : 0.229581083492
unstable : 0.214297543826
process : 0.214297543826
attempting : 0.214297543826
veteran : 0.195042561023
prostitute : 0.195042561023
works : 0.188170160193
night : 0.188170160193
violent : 0.182359628084
action : 0.172886620526
york : 0.162042776561
city : 0.153631637724
save : 0.148913757389
war : 0.140948704785
time : 0.139195186443
new : 0.117614735702
