# User profile

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
TMDB_API_KEY = #API KEY с https://developers.themoviedb.org/

In [3]:
import requests

def get_movie_data(imdb_id):
    real_imdb_id = 'tt{:>07}'.format(imdb_id)    
    url = "https://api.themoviedb.org/3/find/%s" % real_imdb_id
    params = {
        "api_key": TMDB_API_KEY,
        "external_source": "imdb_id"
    }
    resp = requests.get(url, params=params)
    return resp.json()

In [4]:
links_df = pd.read_csv("/Users/tural/Datasets/ml-20m/links.csv")
links_df.set_index("movieId", inplace=True)
links_df.tail()

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
131254,466713,4436.0
131256,277703,9274.0
131258,3485166,285213.0
131260,249110,32099.0
131262,1724965,286971.0


## Создадим toy-dataset

In [5]:
r_df = pd.read_csv("/Users/tural/Datasets/ml-20m/ratings.csv")
r_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [6]:
# посчитаем количество рейтингов на пользователя
r_per_user = r_df.userId.value_counts()
# оставим только 10 пользователей с 50 рейтингами
good_uids = np.random.choice(r_per_user[r_per_user == 50].index, size=10, replace=False)
good_uids

array([ 56632,  89693,  18633, 104151, 117552, 119301,   8667,  50302,
        85588,  97712])

In [7]:
r_df = r_df[r_df.userId.isin(good_uids)]
r_df.shape

(500, 4)

## Посчитаем TFIDF

In [8]:
import time

In [9]:
movies = []
for cnt, movie_id in enumerate(r_df.movieId.unique()):
    if cnt % 100 == 0:
        print("Обработано %s из %s фильмов" % (cnt, r_df.movieId.unique().size))
    
    imdb_id = int(links_df.loc[movie_id].imdbId)
    
    data = get_movie_data(imdb_id)
    if len(data['movie_results']) > 0:
        data = data['movie_results'][0]
    elif len(data['tv_results']) > 0:
        data = data['tv_results'][0]
    else:
        continue
        
    movies.append({
        "movie_id": movie_id, 
        "overview": data["overview"],
        "title": data["title"]
    })
    
    #40 requests every 10 second
    time.sleep(10 / 40)

Обработано 0 из 332 фильмов
Обработано 100 из 332 фильмов
Обработано 200 из 332 фильмов
Обработано 300 из 332 фильмов


In [10]:
texts = [m["overview"] for m in movies]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Создадим профиль пользователя

In [12]:
uid = np.random.choice(good_uids)
uid

56632

In [13]:
# история рейтингов пользователя
uid_df = r_df[r_df.userId == uid]
uid_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
8215288,56632,11,4.0,839594819
8215289,56632,21,4.0,839594749
8215290,56632,25,5.0,839594931
8215291,56632,39,4.0,839594749
8215292,56632,47,4.0,839594718


In [14]:
uid_df["liked"] = (uid_df.rating >= uid_df.rating.mean()).astype(int)
uid_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,movieId,rating,timestamp,liked
8215288,56632,11,4.0,839594819,1
8215289,56632,21,4.0,839594749,1
8215290,56632,25,5.0,839594931,1
8215291,56632,39,4.0,839594749,1
8215292,56632,47,4.0,839594718,1


In [15]:
uid_df.liked.value_counts()

1    34
0    16
Name: liked, dtype: int64

In [91]:
X = []
y = []

for t in uid_df.itertuples():
    for m in movies:
        if m["movie_id"] == t.movieId:
            print(m["title"], ":", t.liked)
            X.append(tfidf.transform([m["overview"]]))
            y.append(t.liked)
            break

The American President : 1
Get Shorty : 1
Leaving Las Vegas : 1
Clueless : 1
Se7en : 1
Braveheart : 1
Apollo 13 : 1
Congo : 0
Crimson Tide : 1
The Net : 0
Nine Months : 0
Waterworld : 0
Disclosure : 1
Dumb and Dumber : 1
I.Q. : 1
Interview with the Vampire : 0
Junior : 0
Legends of the Fall : 1
Nell : 1
Outbreak : 1
Leon: The Professional : 0
Quiz Show : 1
The Specialist : 0
The Santa Clause : 1
The Shawshank Redemption : 1
Tommy Boy : 0
While You Were Sleeping : 0
Ace Ventura: Pet Detective : 1
Clear and Present Danger : 1
The Client : 1
Forrest Gump : 1
Four Weddings and a Funeral : 1
The Mask : 0
Speed : 1
True Lies : 0
City Slickers II: The Legend of Curly's Gold : 0
The Firm : 1
The Fugitive : 1
In the Line of Fire : 1
Jurassic Park : 1
Mrs. Doubtfire : 1
Philadelphia : 1
The Piano : 1
Sleepless in Seattle : 1
Home Alone : 0
Ghost : 0
Aladdin : 1
The Silence of the Lambs : 1
Beauty and the Beast : 1
Pretty Woman : 0


In [92]:
from scipy.sparse import vstack
X = vstack(X, 'csr')
y = np.array(y)

## Feature selection

In [93]:
from sklearn.feature_selection import chi2
chi2s, pval = chi2(X, y)

In [94]:
chi2s[np.isnan(chi2s)] = -1
chi2s

array([ 0.07432642, -1.        , -1.        , ..., -1.        ,
       -1.        , -1.        ])

In [95]:
TOP_FTRS = 20
best_ftrs = np.argsort(chi2s)[-TOP_FTRS:]
chi2s[best_ftrs]

array([ 0.74547937,  0.77624087,  0.77624087,  0.77794576,  0.7853112 ,
        0.79402942,  0.7952263 ,  0.79786416,  0.81634676,  0.83371667,
        0.83689944,  0.86639572,  0.86639572,  0.89717871,  0.91694147,
        0.92747911,  0.97327149,  1.04960351,  1.12168425,  1.13657971])

In [96]:
X = X[:, best_ftrs]

In [97]:
X.shape

(50, 20)

## Алгоритм Rocchio

$$
\overrightarrow{Q_m} = 
a \cdot \overrightarrow{Q_0} + 
(b \cdot \frac{1}{|D_r|} \cdot \sum_{\overrightarrow{D_j} \in D_r}\overrightarrow{D_j}) - 
(c \cdot \frac{1}{|D_{nr}|} \cdot \sum_{\overrightarrow{D_k} \in D_{nr}}\overrightarrow{D_k})
$$

In [98]:
a = 0
b = c = 1

In [99]:
r = np.where(y == 1)[0]
nr = np.where(y == 0)[0]

In [100]:
user_profile = b * X[r].sum(axis=0) / r.size - c * X[nr].sum(axis=0) / nr.size
user_profile

matrix([[-0.02192586, -0.02283061, -0.02283061, -0.02288076, -0.02309739,
         -0.02335381, -0.02338901, -0.02346659, -0.0240102 , -0.02452108,
         -0.02461469, -0.02548223, -0.02548223, -0.02638761, -0.02696887,
         -0.0272788 , -0.02862563, -0.03087069, -0.03299071, -0.03342881]])

In [113]:
from sklearn.preprocessing import normalize
np.c_[normalize(X).dot(normalize(user_profile).T), y]

array([[ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [-0.18904583,  0.        ],
       [ 0.        ,  1.        ],
       [-0.38259862,  0.        ],
       [-0.21142189,  0.        ],
       [-0.19914676,  0.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [-0.23519907,  0.        ],
       [-0.20166113,  0.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [-0.20233006,  0.        ],
       [ 0.        ,  1.        ],
       [-0.28822481,  0.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [-0.3943304 ,  0.        ],
       [-0.2681997 ,  0.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,