# Machine Learning Movie Recommendation Application
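Allows users to search for a movie and find related movies that they may like to watch. This is done by fitting a text model to the movie titles (to match the search term) and by using movie rating data (to find what viewers of the matched movie also liked).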

In [None]:
!pip install Ipython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 4.3 MB/s 
Installing collected packages: jedi
Successfully installed jedi-0.18.1


In [None]:
#imports; the movie data loaded below (movies.csv, ratings.csv) was obtained from GroupLens, a well-known website
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy
from IPython.display import display
import ipywidgets
import pandas

#movie catalogue, read from the working directory; the code below reads its "movieId", "title" and "genres" columns
movies_data = pandas.read_csv("movies.csv")
#user ratings, read from the working directory; the code below reads its "userId", "movieId" and "rating" columns
movie_ratings = pandas.read_csv("ratings.csv")
#strip everything except letters, digits and spaces
def remove_chars(movie_name):
  """Return ``movie_name`` without any character other than a-z, A-Z, 0-9 and space.

  Used to clean both the stored titles and the user's search term so the
  two are compared in the same form.
  """
  disallowed = re.compile("[^a-zA-Z0-9 ]")
  return disallowed.sub("", movie_name)

#searching for movies
def movie_search(movie_title):
  """Return the rows of ``movies_data`` whose titles best match ``movie_title``.

  The search term is cleaned with ``remove_chars``, vectorised with the
  fitted ``vector`` and compared against every row of ``tfid`` using cosine
  similarity.

  Returns up to 5 rows of ``movies_data``, ordered from most to least
  similar, so row 0 is always the best match.
  """
  movie_title = remove_chars(movie_title)
  check_vector = vector.transform([movie_title])
  #one similarity score per title in the dataset
  similarity_score = cosine_similarity(check_vector, tfid).flatten()

  #never ask for more results than there are titles
  top_n = min(5, similarity_score.size)
  if top_n == 0:
    return movies_data.iloc[[]]

  #argpartition only guarantees that the last top_n positions hold the
  #top_n largest scores; their relative order is unspecified, so they are
  #sorted explicitly to put the most similar title first
  candidates = numpy.argpartition(similarity_score, -top_n)[-top_n:]
  best_first = candidates[numpy.argsort(similarity_score[candidates])[::-1]]
  return movies_data.iloc[best_first]

#re-runs the movie search every time the text box value changes
def when_user_types(movie_data):
  """Refresh the recommendations shown in ``data_output``.

  ``movie_data`` is the change notification handed over by the widget
  observer; only its "new" entry (the current text) is read. The output
  area is always cleared. Recommendations are displayed only when the text
  is longer than 5 characters, using the "movieId" of the first row
  returned by ``movie_search``.
  """
  with data_output:
    data_output.clear_output()
    typed_title = movie_data["new"]
    #too short to search on: leave the output area empty
    if len(typed_title) <= 5:
      return
    first_match = movie_search(typed_title).iloc[0]
    display(get_similar_movies(first_match["movieId"]))

#movie recommendation system
#looks at what other viewers who liked the given movie also liked
def get_similar_movies(movie_id):
  """Return up to 15 recommended movies for ``movie_id``.

  Only ratings above 4 count as "liked". The steps are:

  1. Find the users who liked ``movie_id``.
  2. For every movie those users liked, compute the fraction of them who
     liked it, and keep the movies liked by more than 10% of them.
  3. For the kept movies, compute the same kind of fraction over all users
     who liked at least one of the kept movies.
  4. Score each movie as (fraction from step 2) / (fraction from step 3),
     so movies that are popular with everyone are pushed down.

  Returns a DataFrame with the columns "score", "title" and "genres" for
  the 15 highest-scoring movies, highest score first.
  """
  #every "liked" rating, shared by all the filters below
  liked = movie_ratings[movie_ratings["rating"] > 4]

  #users who liked the searched movie, and everything else they liked
  fans = liked[liked["movieId"] == movie_id]["userId"].unique()
  liked_by_fans = liked[liked["userId"].isin(fans)]["movieId"]

  #fraction of those users who liked each movie; keep movies above 10%
  fan_share = liked_by_fans.value_counts() / len(fans)
  fan_share = fan_share[fan_share > 0.10]

  #same fraction over all users who liked any of the kept movies
  liked_by_anyone = liked[liked["movieId"].isin(fan_share.index)]
  overall_share = liked_by_anyone["movieId"].value_counts() / len(liked_by_anyone["userId"].unique())

  #a higher score means the movie is liked disproportionately by those users
  scores = pandas.concat([fan_share, overall_share], axis = 1)
  scores.columns = ["Similar", "All"]
  scores["score"] = scores["Similar"] / scores["All"]
  top_scores = scores.sort_values("score", ascending = False).head(15)

  #merge on movieId to attach the titles and genres
  with_titles = top_scores.merge(movies_data, left_index = True, right_on = "movieId")
  return with_titles[["score", "title", "genres"]]

#add a cleaned copy of every title (see remove_chars) so stored titles and search terms are compared in the same form
movies_data["Altered_Title"] = movies_data["title"].apply(remove_chars)

#turn each cleaned title into a numeric TF-IDF vector (term frequency weighted by inverse document frequency,
#so terms that are rare across titles count for more); movie_search compares a search term against these vectors
#ngram_range (1,2) indexes single words and pairs of adjacent words --> example harry potter 2005 --> "harry potter" and "potter 2005" are terms, as well as each word on its own
vector = TfidfVectorizer(ngram_range = (1,2))
#fit the vectorizer on the cleaned titles and build the matrix with one row per movie title
tfid = vector.fit_transform(movies_data["Altered_Title"])

In [None]:
#interactive search built from Jupyter widgets
#text box for the movie name, pre-filled with "Toy Story"
user_input = ipywidgets.Text(description = "Movie Name:", value = "Toy Story", disabled = False)
#output area that when_user_types clears and displays recommendations in
data_output = ipywidgets.Output()
#run when_user_types on changes to the text box's "value"
#NOTE(review): observe presumably fires only on changes, so the pre-filled value shows no results until it is edited -- confirm
user_input.observe(when_user_types, names = "value")
#show the text box with the output area below it
display(user_input, data_output)

Text(value='Toy Story', description='Movie Name:')

Output()