<h1 align='center'>Final Capstone: <i>Betcha Can't Guess What I Watched</i></h1>
<h2 align='center'>Philip Bowman</h2>
<h1 align='center'><u>Final Product</u></h1>

This is the final selected model and its implementation for demo purposes.

In [1]:
import os
from os.path import join
import pandas as pd
import numpy as np
import time
from IPython.display import clear_output
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tmdbv3api import Movie, TMDb

In [2]:
# Directory holding the pickled datasets read below.
# Set the MOVIES_DIR environment variable to point the notebook at another
# location; the original local path is kept as the default so existing runs
# behave exactly as before.
movies_dir = os.environ.get('MOVIES_DIR', r'C:\Users\philb\Datasets\movies_post_exploration')
movies_file = 'movies.pkl'  # read into `movies`
unpop_file = 'less_popular_movies.pkl'  # its index supplies the less-popular movie ids
pool_file = 'recommendation_pool.pkl'  # not needed until the recommendation pool is loaded

In [3]:
# This cell gathers every textual feature of the movies dataframe so that the
# columns can be concatenated into one string per movie (the movies_text variable).

movies = pd.read_pickle(join(movies_dir, movies_file))
unpop_movies_ids = pd.read_pickle(join(movies_dir, unpop_file)).index

# Columns that are flattened with unpack_list further down.
list_features = [
    'spoken_languages', 'genres', 'keywords', 'production_companies',
    'acting_top_5', 'director', 'writers',
]
# Every column that contributes text, in concatenation order.
text_features = [
    'spoken_languages', 'genres', 'overview', 'tagline', 'keywords',
    'production_companies', 'acting_top_5', 'director', 'writers',
]

# Column selection yields a new frame; missing values become a single space.
movies_features = movies[text_features].fillna(' ')

def unpack_list(x):
    """Flatten an iterable into a single string.

    The result starts with one space and then holds str(item) followed by a
    space for every item of x, so an empty iterable gives ' '.
    """
    return ' ' + ''.join(str(item) + ' ' for item in x)

# Collapse every column named in list_features into one space-delimited
# string per row via unpack_list.
for column in list_features:
    movies_features[column] = movies_features[column].apply(unpack_list)
    
# Columns not in list_features (given text_features: overview and tagline).
add_space_columns = [column for column in movies_features.columns if column not in list_features]

# Append a trailing space so neighbouring columns do not fuse when the row is
# concatenated below.
for column in add_space_columns:
    movies_features[column] = movies_features[column].apply(lambda x: x + ' ')
    
# Row-wise sum across the columns; presumably every cell is a string by now,
# which makes this a concatenation into one document per movie -- TODO confirm.
movies_text = movies_features.sum(axis=1)
movies_titles = movies.title
# Index labels of the movies that are absent from the less-popular id set.
pop_movies_indicies = [index for index in movies_titles.index if index not in unpop_movies_ids]
pop_movies = movies_titles[pop_movies_indicies]

# Drop the intermediates; only movies_text, movies_titles and pop_movies are
# kept from the work above.
del movies, text_features, list_features, movies_features, add_space_columns, unpop_movies_ids

rec_pool = pd.read_pickle(join(movies_dir, pool_file))
# Positional locations (not index labels) of the pool movies inside
# movies_titles; used later to slice rows out of the vector matrix.
rec_idxs = [movies_titles.index.get_loc(movie_id) for movie_id in rec_pool.index]

In [4]:
def cosine_pipe(text_docs, feature_space):
    """Fit a CountVectorizer on text_docs and vectorize them.

    feature_space is forwarded as max_features and English stop words are
    removed. Returns a tuple of (fitted vectorizer, matrix produced by
    fit_transform).
    """
    vectorizer = CountVectorizer(stop_words='english', max_features=feature_space)
    return vectorizer, vectorizer.fit_transform(text_docs)

# Vectorize every movie document with no cap on the feature count, then keep
# the rows that belong to the recommendation pool.
tokenizer, movie_vectors = cosine_pipe(movies_text, feature_space=None)
rec_vectors = movie_vectors[rec_idxs]

In [5]:
API_KEY_PATH = r'C:\Users\philb\Datasets\API_KEY.txt'
# The key is the first line of the file. readline() keeps the trailing newline,
# so strip surrounding whitespace before the key is handed to the API client.
# The with-block closes the file on exit; no explicit close() is needed.
with open(API_KEY_PATH) as f:
    API_KEY = f.readline().strip()

In [6]:
# tmdbv3api client objects; `movie` is the one get_id uses to search titles.
tmdb = TMDb()
movie = Movie()
# NOTE(review): the key is assigned on `tmdb` only; presumably tmdbv3api shares
# it with `movie` -- confirm against the library documentation.
tmdb.api_key = API_KEY

In [9]:
def get_id(search_term):
    """Search TMDb for search_term and let the user pick one of the hits.

    Prints the returned results with an index number, prompts on stdin for the
    index of the wanted movie and returns the `id` attribute of that result.

    Returns None, after printing an explanatory message, when the search
    fails, returns nothing, or the entered selection is not a listed index.
    """
    print('------------')
    print('MOVIE SEARCH')
    print('------------')
    try:
        # Kept under its own name so the module-level `movie` client is never
        # shadowed by a loop variable.
        results = list(movie.search(search_term))
    except Exception as err:
        # Best effort: an API/network failure should not kill the demo, but it
        # must not be reported as "no results" either. Catching Exception
        # (not a bare except) lets KeyboardInterrupt through.
        print(f'Movie search failed: {err}')
        return None

    if not results:
        print('No movies were found using that query.')
        return None

    for position, result in enumerate(results):
        print(position, result)
    # Presumably gives the printed list time to flush before the input prompt
    # appears -- TODO confirm.
    time.sleep(.01)

    try:
        movie_sel = int(input('Enter index # of selected movie: '))
    except ValueError:
        print('The selection must be one of the index numbers listed above.')
        return None
    # Reject negatives as well: they would silently pick from the end of the list.
    if not 0 <= movie_sel < len(results):
        print('The selection must be one of the index numbers listed above.')
        return None

    movie_id = results[movie_sel].id
    clear_output(wait=True)
    return movie_id

def top_X_obscure(movie_id, x_sim=10):
    """Print the x_sim recommendation-pool movies most similar to a known movie.

    movie_id is an index label of movies_titles. Its row of movie_vectors is
    compared by cosine similarity against every row of rec_vectors, and the
    best matches are printed as a markdown table of titles and TMDb links.

    The queried movie itself is never recommended. If movie_id is None or is
    not a label of movies_titles, a message is printed and nothing else happens.
    """
    if movie_id is None or movie_id not in movies_titles.index:
        print('That movie is not in the dataset, so no recommendations can be made.')
        return

    query_vector = movie_vectors[movies_titles.index.get_loc(movie_id)]
    cos_sim = cosine_similarity(rec_vectors, query_vector)
    scores = pd.DataFrame(cos_sim, index=rec_pool.index).loc[:, 0]

    # rec_idxs holds positional locations, so testing a movie id against it is
    # meaningless. Remove the queried movie by its id label instead; this is a
    # no-op when the movie is not part of the pool, and it does not rely on the
    # movie sorting first when similarities tie.
    scores = scores.loc[scores.index != movie_id]
    top_X_idx = scores.sort_values(ascending=False).iloc[:x_sim].index

    print(f'Top {x_sim} Obscure Movies Similar to {movies_titles.loc[movie_id]}')
    movie_links = ['https://www.themoviedb.org/movie/'+str(i) for i in top_X_idx]
    movie_df = pd.DataFrame(movies_titles.loc[top_X_idx])
    movie_df['links'] = movie_links
    print(movie_df.to_markdown())
        
        
def user_query(user_input, x_sim=10):
    """Print the x_sim recommendation-pool movies closest to free text.

    user_input is vectorized with the fitted `tokenizer`, compared by cosine
    similarity against every row of rec_vectors, and the best matches are
    printed as a markdown table of titles and TMDb links.
    """
    query_vector = tokenizer.transform([user_input])
    similarity = cosine_similarity(rec_vectors, query_vector)

    # One similarity score per pool movie, labelled with the pool's index.
    scores = pd.DataFrame(similarity, index=rec_pool.index).loc[:, 0]
    best_ids = scores.sort_values(ascending=False).iloc[:x_sim].index

    print(f'Top {x_sim} Obscure Movies Similar to user search: "{user_input}"')
    result_table = pd.DataFrame(movies_titles.loc[best_ids])
    result_table['links'] = ['https://www.themoviedb.org/movie/' + str(tmdb_id) for tmdb_id in best_ids]
    print(result_table.to_markdown())

def get_recommendations(query, kind='movie', x_sim=10):
    """Entry point for the demo: print x_sim obscure-movie recommendations.

    kind='movie' treats query as a title to search for (the user then picks
    the exact movie); kind='query' treats query as free text to match against.

    Raises:
        ValueError: if kind is neither 'movie' nor 'query'.
    """
    if kind == 'movie':
        movie_id = get_id(query)
        if movie_id is None:
            # get_id has already printed why no movie was selected.
            return
        top_X_obscure(movie_id, x_sim)
    elif kind == 'query':
        user_query(query, x_sim)
    else:
        # Previously an unknown kind fell through and silently did nothing.
        raise ValueError(f"kind must be 'movie' or 'query', got {kind!r}")

In [10]:
get_recommendations('lord of the rings', kind='movie')

Top 10 Obscure Movies Similar to The Lord of the Rings: The Fellowship of the Ring
|    id | title                                         | links                                  |
|------:|:----------------------------------------------|:---------------------------------------|
|  7234 | Wizards of the Lost Kingdom                   | https://www.themoviedb.org/movie/7234  |
|  2274 | The Seeker: The Dark Is Rising                | https://www.themoviedb.org/movie/2274  |
| 24993 | Kickboxer 2:  The Road Back                   | https://www.themoviedb.org/movie/24993 |
|  9964 | Bad Taste                                     | https://www.themoviedb.org/movie/9964  |
| 87689 | The Dragon Ring                               | https://www.themoviedb.org/movie/87689 |
|  1362 | The Hobbit                                    | https://www.themoviedb.org/movie/1362  |
| 14034 | What's the Worst That Could Happen?           | https://www.themoviedb.org/movie/14034 |
| 11188 | Ring of the Nibe