# recommendMe

In [11]:
import json
import os
import random
import time
import urllib
import urllib.parse
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
import pymongo
from flask import Flask
from IPython.display import clear_output
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.externals import joblib
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook

# NOTE(review): the notebook calls vk.get_friends / vk.get_users_info / vk.get_groups,
# so this is presumably a project-local helper module — confirm.
import vk

## База данных

In [2]:
# Connect to the MongoDB server on localhost and select the database used by this notebook.
client = pymongo.MongoClient(host="localhost", port=27017)
db = client.get_database("vk_users")

In [3]:
# Collection handles: crawled users, movie metadata, and per-user movie ratings.
user_info, movie_info, ratings_info = (
    db[collection_name] for collection_name in ("User_Info", "Movie_Info", "ratings")
)

## Validating movies

In [4]:
# TMDB API key. Prefer the TMDB_API_KEY environment variable so the credential does not
# have to live in the notebook; the literal is kept only as a fallback so existing runs
# keep working.
# NOTE(review): this key has been committed in plain text — rotate it and drop the fallback.
API_KEY = os.environ.get("TMDB_API_KEY", "c30070156d8d7a150ea066f8094a2013")

In [5]:
def get_coded_string(string):
    """Percent-encode ``string`` for use inside a URL.

    The text is encoded to UTF-8 bytes and handed to ``urllib.parse.quote``,
    so spaces and non-ASCII characters come back as ``%XX`` escapes.
    """
    return urllib.parse.quote(string.encode("utf-8"))

In [6]:
def get_movie_info(movie_name, api_key, desired_list=('id', 'title', 'original_title', 'genre_ids', 'poster_path')):
    """Search TMDB for ``movie_name`` and return selected fields of the first hit.

    Parameters
    ----------
    movie_name : str
        Free-text movie title; it is percent-encoded before being put in the query.
    api_key : str
        TMDB API key, inserted verbatim into the request URL.
    desired_list : iterable of str
        Keys copied from the first search result into the returned dict.
        (The default is a tuple so that no mutable object is shared between calls.)

    Returns
    -------
    dict or list
        ``{key: value}`` for every key in ``desired_list``, taken from the first
        search result, or an empty list ``[]`` when the response has no results
        (callers compare the return value against ``[]``).

    Raises
    ------
    KeyError
        If a requested key is missing from the first result.

    Errors raised by ``urllib.request.urlopen`` or ``json.loads`` are not caught
    here and propagate to the caller.
    """
    url = "https://api.themoviedb.org/3/search/movie?api_key={0}&language=ru&query={1}&page=1&include_adult=false"
    url = url.format(api_key, get_coded_string(movie_name))

    # The context manager closes the HTTP response even if reading/decoding fails.
    with urllib.request.urlopen(url) as response:
        payload = response.read().decode("utf-8")

    # A payload without a "results" key is treated the same as an empty result list.
    results = json.loads(payload).get('results') or []
    if not results:
        return []

    best_match = results[0]
    return {key: best_match[key] for key in desired_list}

## Получаем информацию о юзерах, у которых указано не меньше N любимых фильмов

In [7]:
def get_data(id, n, user_db=user_info, movie_db=movie_info, ratings_db=ratings_info, min_num_of_movies=3):
    """Crawl VK friend lists starting from ``id`` until ``user_db`` holds ``n`` users.

    For the current seed user the friend list and the friends' profiles are fetched.
    A friend is processed when the profile lists at least ``min_num_of_movies``
    comma-separated movie names, the seed has at least 5 friends, and the friend is
    not stored in ``user_db`` yet. Each movie name (2+ characters after stripping
    whitespace) is resolved with ``get_movie_info`` and inserted into ``movie_db``.
    If at least ``min_num_of_movies`` names resolve, the friend is inserted into
    ``user_db`` (keys ``movies``, ``groups``, ``_id``) and a ratings document that
    gives every resolved movie the value 10 is inserted into ``ratings_db``.
    Afterwards a random friend that itself has at least 5 friends becomes the next
    seed; if no friend qualifies the crawl stops early.

    Parameters
    ----------
    id : int or str
        VK id of the first seed user.
    n : int
        Target number of documents in ``user_db``.
    user_db, movie_db, ratings_db
        Collections receiving users, movie metadata and ratings.
    min_num_of_movies : int
        Minimum number of listed (and resolved) movies for a user to be stored.

    Returns
    -------
    list
        Movie ids resolved for the last profile whose movies were looked up
        (an empty list if no profile got that far).

    Notes
    -----
    Any ``Exception`` raised while handling a single friend is swallowed so that one
    bad profile does not abort the crawl; ``KeyboardInterrupt`` and ``SystemExit``
    propagate. Errors from the VK calls made outside the per-friend loop propagate.

    NOTE(review): ``Collection.count()`` is deprecated since PyMongo 3.7 and removed
    in 4.0 — switch to ``count_documents({})`` when upgrading.
    """
    min_len = 2            # movie names shorter than this are ignored
    min_friends = 5        # a seed needs at least this many friends
    profiles_per_pause = 100

    count = 0
    movies = []  # bound up front so the final ``return`` can never hit an unbound name

    while user_db.count() < n:
        friends = [str(x) for x in vk.get_friends(str(id))]
        friends_info = vk.get_users_info(friends)

        for friend in friends_info.keys():
            if user_db.count() >= n:
                break

            try:
                # Crude rate limiting: sleep one second after every 100 processed profiles.
                if count == profiles_per_pause:
                    time.sleep(1)
                    count = 0
                else:
                    count += 1

                profile = friends_info[friend]
                if ("movies" in profile
                        and len(profile["movies"].split(",")) >= min_num_of_movies
                        and len(friends) >= min_friends
                        # Skip users that are already stored: their insert would fail on the
                        # duplicate _id anyway, after wasting one TMDB request per movie.
                        and user_db.find_one({"_id": profile["uid"]}) is None):

                    movies = []
                    for movie in profile["movies"].split(","):
                        movie = movie.strip()  # "a, b" splits into "a" and " b"
                        if len(movie) < min_len:
                            continue
                        m_info = get_movie_info(movie, API_KEY)
                        if m_info == []:
                            continue
                        movies.append(m_info['id'])
                        # Use the TMDB id as the MongoDB primary key.
                        m_info['_id'] = m_info.pop('id')
                        try:
                            movie_db.insert_one(m_info)
                        except Exception:
                            # Best effort: presumably the movie is already stored
                            # (duplicate _id); either way the crawl continues.
                            pass

                    if len(movies) >= min_num_of_movies:
                        try:
                            user_doc = {
                                "movies": movies,
                                "groups": vk.get_groups(profile['uid'])['groups']['items'],
                                "_id": profile['uid'],
                            }
                            user_db.insert_one(user_doc)

                            # Every listed favourite counts as a rating of 10.
                            rating = {str(x): 10 for x in movies}
                            rating["_id"] = user_doc["_id"]
                            ratings_db.insert_one(rating)
                        except Exception:
                            # Best effort: a failed group lookup or insert skips this user.
                            pass
            except Exception:
                # One malformed profile must not stop the crawl; unlike a bare ``except``
                # this still lets KeyboardInterrupt through.
                pass

        clear_output()
        print("Size: " + str(user_db.count()))

        if user_db.count() < n:
            # Pick the next seed: try each friend once, in random order, so the search
            # terminates even when the list is empty or nobody qualifies.
            current_id = str(id)
            next_id = None
            for candidate in random.sample(friends, len(friends)):
                if candidate != current_id and len(vk.get_friends(candidate)) >= min_friends:
                    next_id = candidate
                    break
            if next_id is None:
                break  # nowhere left to go from this seed
            id = next_id

    return movies

## Скачиваем юзеров

In [None]:
# Start the crawl at VK user 133506 and run until 1,000,000 users are stored.
get_data(id=133506, n=1000000)

## Строим Sparse матрицу

In [8]:
def build_matrix(user_db=user_info, ratings_db=ratings_info, build_type="groups", matr_type="csr"):
    '''
    Build a sparse users x items matrix from the MongoDB collections.

    build_type = "groups"/"ratings"
        "groups":  one column per group id found in user_db, value 1 per listed membership
        "ratings": one column per movie key found in ratings_db, value = stored rating
    matr_type = "csr"/"lil"
        sparse format of the returned matrix; any value other than "lil" yields CSR

    Returns (matr, users_dict, info_dict):
        matr       -- sparse matrix of shape (number of users, number of items)
        users_dict -- {user _id: row index}, in the order user_db.find() returns users
        info_dict  -- {group id / movie key: column index}, in np.unique (sorted) order

    Raises ValueError for an unknown build_type, and KeyError if ratings_db contains
    a user _id that is not present in user_db.
    '''
    if build_type not in ("groups", "ratings"):
        # Previously this fell through to a NameError on an unbound local.
        raise ValueError('build_type must be "groups" or "ratings", got {!r}'.format(build_type))

    users_arr = np.array([x["_id"] for x in user_db.find()])
    users_dict = {x : i for i, x in enumerate(users_arr)}

    # Parallel lists of (user, item, value) triples.
    user_matr = []
    info_matr = []
    data_matr = []

    if build_type == "groups":
        for x in user_db.find():
            for group in x['groups']:
                user_matr.append(x['_id'])
                info_matr.append(group)
                data_matr.append(1)
    else:  # "ratings"
        for x in ratings_db.find():
            user = x["_id"]
            del x["_id"]  # every remaining key of the document is a movie id
            for movie in x.keys():
                user_matr.append(user)
                info_matr.append(movie)
                data_matr.append(x[movie])

    # Distinct items define the columns.
    info_arr = np.unique(info_matr)
    info_dict = {x : i for i, x in enumerate(info_arr)}

    # Explicit integer dtype keeps the index arrays valid even when there are no triples.
    row_idx = np.array([users_dict[x] for x in user_matr], dtype=int)
    col_idx = np.array([info_dict[x] for x in info_matr], dtype=int)

    # Note: repeated (row, column) pairs are summed by this constructor.
    matr = csr_matrix((data_matr, (row_idx, col_idx)), shape=(len(users_dict), len(info_dict)))
    if matr_type == "lil":
        # Honour the documented option, which used to be silently ignored.
        matr = matr.tolil()
    return matr, users_dict, info_dict

## Похожесть юзеров

In [9]:
def get_most_simular(user_matr, user_index, N=5):
    '''
    Get the N users most similar to row ``user_index`` (cosine similarity).

    Returns an array of row indices ordered from least to most similar, i.e. the
    closest neighbour is the last element. The query row itself is never included.
    Fewer than N indices are returned when the matrix has fewer than N other rows.
    '''
    similarities = cosine_similarity(user_matr[user_index], user_matr)[0]
    order = similarities.argsort()
    # Remove the query row explicitly. Assuming it sorts last is wrong whenever another
    # row has the same similarity (e.g. an identical vector, or an all-zero query row).
    order = order[order != user_index]
    # max(..., 0) keeps N == 0 (empty result) and N > len(order) (everything) correct.
    return order[max(len(order) - N, 0):]

## Функции для рекомендации

In [51]:
def ratings_rec(matrix, user_index, users, movies, return_num=10, movie_db=movie_info, n_components=100):
    """Recommend movies for one user from a truncated-SVD reconstruction of the ratings.

    A ``TruncatedSVD`` is fitted on ``matrix`` (on every call), the user's row is
    projected into the latent space and back, and the columns with the highest
    reconstructed values are looked up in ``movie_db``.

    Parameters
    ----------
    matrix : scipy sparse matrix
        Users x movies ratings matrix, as returned by ``build_matrix``.
    user_index : int
        Row of the user to recommend for.
    users : dict
        ``{user _id: row index}``; accepted for interface compatibility, not used.
    movies : dict
        ``{movie key: column index}``; keys must be convertible with ``int()``.
    return_num : int
        Maximum number of recommendations.
    movie_db
        Collection with movie documents keyed by integer ``_id``.
    n_components : int
        Number of latent factors; capped at the number of columns so that small
        matrices do not make the SVD fail.

    Returns
    -------
    list of dict
        Movie documents, best score first. Movies that are missing from
        ``movie_db`` are skipped, so the list may be shorter than ``return_num``.

    NOTE(review): columns the user has already rated are not excluded from the ranking.
    """
    n_components = max(1, min(n_components, matrix.shape[1]))
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(matrix)

    # The sparse row is passed as is — no dense np.matrix round-trip needed.
    scores = svd.inverse_transform(svd.transform(matrix[user_index]))[0]

    # Invert explicitly instead of relying on dict iteration order matching the indices.
    column_to_movie = {column: movie_key for movie_key, column in movies.items()}

    ranked_columns = sorted(range(len(scores)), key=lambda column: -scores[column])

    recommended = []
    for column in ranked_columns[:return_num]:
        movie_doc = movie_db.find_one({"_id": int(column_to_movie[column])})
        if movie_doc is not None:  # a None entry would break callers reading movie['title']
            recommended.append(movie_doc)
    return recommended

In [58]:
def groups_rec(matrix, user_index, users, return_num=5, movie_db=movie_info, user_db=user_info):
    """Recommend the favourite movies of the users most similar to ``user_index``.

    Parameters
    ----------
    matrix : sparse matrix
        Users x groups matrix, as returned by ``build_matrix``.
    user_index : int
        Row of the user to recommend for.
    users : dict
        ``{user _id: row index}`` matching the rows of ``matrix``.
    return_num : int
        Number of nearest neighbours whose movies are collected
        (not the number of movies returned).
    movie_db, user_db
        Collections with movie and user documents.

    Returns
    -------
    list of dict
        Movie documents, closest neighbour's movies first. Each movie appears at most
        once; neighbours or movies missing from the database are skipped.
    """
    # Invert explicitly instead of relying on dict iteration order matching the indices.
    row_to_user = {row: user_id for user_id, row in users.items()}

    movies = []
    seen_movie_ids = set()

    # get_most_simular returns the closest neighbour last, hence reversed().
    for row in reversed(get_most_simular(matrix, user_index, N=return_num)):
        neighbour = user_db.find_one({"_id": int(row_to_user[row])})
        if neighbour is None:
            continue  # user disappeared from the collection; nothing to recommend from
        for movie_id in neighbour['movies']:
            if movie_id in seen_movie_ids:
                continue  # several neighbours may share a favourite
            seen_movie_ids.add(movie_id)
            movie_doc = movie_db.find_one({"_id": movie_id})
            if movie_doc is not None:
                movies.append(movie_doc)

    return movies

## Тестим

In [27]:
# Users x movies matrix of ratings, plus the {user _id: row} and {movie key: column} maps.
ratings_matrix, r_users, r_movies = build_matrix(
    user_db=user_info, ratings_db=ratings_info, build_type="ratings"
)

In [34]:
# Users x groups membership matrix, plus the {user _id: row} and {group id: column} maps.
group_matrix, g_users, g_groups = build_matrix(
    user_db=user_info, ratings_db=ratings_info, build_type="groups"
)

In [23]:
# Row index (in the ratings matrix) of the user inspected in the cells below.
n = 1075

In [35]:
# Row index -> user _id lookups. The {user _id: row} maps are inverted explicitly
# rather than with enumerate(), which is only correct when dict order equals row order.
r_reverse_users = {row: user_id for user_id, row in r_users.items()}
g_reverse_users = {row: user_id for user_id, row in g_users.items()}

In [36]:
# _id of the user stored at row n of the ratings matrix.
id_ = r_reverse_users[n]

In [30]:
# Print the titles of the movies stored for the selected user.
liked_movie_ids = user_info.find_one({"_id": int(id_)})['movies']
for liked_id in liked_movie_ids:
    print(movie_info.find_one({"_id": liked_id})['title'])

После дождичка в четверг
Достучаться до небес
Кэнди
Счастливое число Слевина


In [32]:
# Print the titles recommended for the selected user by the SVD model on the ratings matrix.
svd_recommendations = ratings_rec(ratings_matrix, n, r_users, r_movies)
for movie in svd_recommendations:
    print(movie['title'])

Дети кукурузы: Апокалипсис
Освобождение: Прорыв
Феи, сон в летнюю ночь
Одна любовь на миллион
В бой идут одни старики
Лицензия на брак
Индиана Джонс и последний крестовый поход
Чикаго
Двенадцать обезьян
Мастер и Маргарита


In [62]:
# Print the movies of the single nearest neighbour (by shared groups) of the selected user.
# The user map must be g_users: no notebook-level variable named `users` is ever defined.
for movie in groups_rec(group_matrix, g_users[id_], g_users, return_num=1):
    print(movie['title'])

Жизнь других
Мой лучший друг Шейлок
Спасти мистера Бэнкса
Честная куртизанка
Анна и король
Токийский мраморный шоколад
Бродвейский идиот
Алешкина любовь
Через вселенную
Спеши любить
Дом у озера
Куда приводят мечты
Октябрьское небо


## Web

In [94]:
# Flask application that exposes the recommender over HTTP.
app = Flask(import_name="recApp")

In [96]:
@app.route("/getRecs/<int:user_id>")
def get_recs(user_id):
    """Return a JSON list of movie documents for the VK user ``user_id``.

    * User already stored in ``user_info``: the ratings matrix is rebuilt and
      ``ratings_rec`` produces the list (matrix build and SVD fit happen on every
      request).
    * Unknown user: the VK profile is fetched and each movie named in its
      comma-separated ``movies`` field is returned — first the ones found in
      ``movie_info`` by title, then the ones resolved through ``get_movie_info``
      (which are also inserted into ``movie_info``).

    The response body is ``json.dumps`` of that list (``[]`` when nothing is found).
    """
    user = user_info.find_one({"_id": user_id})
    recs = []
    if user:
        matr, users, movies = build_matrix(build_type="ratings")
        # Was `matrix`, a name that is not defined in this function.
        recs = ratings_rec(matr, users[user_id], users, movies)
    else:
        # Ids are passed as strings, the same way get_data calls vk.get_users_info.
        vk_user = vk.get_users_info([str(user_id)])[str(user_id)]
        if 'movies' in vk_user and len(vk_user['movies']) != 0:
            # The field is one comma-separated string; iterating it directly would
            # walk over single characters instead of movie names.
            movie_names = [name.strip() for name in vk_user['movies'].split(",")]
            movie_names = [name for name in movie_names if name]

            unchecked = []
            for movie in movie_names:
                m = movie_info.find_one({"title": movie})
                if m:
                    recs.append(m)
                else:
                    unchecked.append(movie)

            for movie in unchecked:
                m = get_movie_info(movie, API_KEY)
                if m != []:
                    # Use the TMDB id as the MongoDB primary key.
                    m["_id"] = m.pop("id")
                    recs.append(m)
                    try:
                        movie_info.insert_one(m)
                    except Exception:
                        # Best effort: presumably a duplicate _id; the movie is still returned.
                        pass
    return json.dumps(recs)

In [None]:
# Start Flask's built-in server on port 8888; the cell keeps running until the server stops.
# NOTE(review): 8888 is also Jupyter's default port — confirm the two do not collide.
app.run(port=8888)