In [1]:
import numpy as np
import pandas as pd
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

import os
os.chdir("/content/drive/My Drive")
!ls

 churn-bigml-20.csv			     IRIS.csv
 climate_2009_2016.csv			    'Loan Prediction.csv'
'Colab Notebooks'			     Mall_Customers.csv
 Cuisine_rating.csv			     model.pkl
'Data_Science_Week_8_Assignment (3).ipynb'   models
 df_original.csv			     streamlit_app
 df_scaled.csv				     tmdb_5000_credits.csv
 Diabetes-Classification.csv		     tmdb_5000_movies.csv
 fitness_class_2212.csv			     twitter_training.csv
 Foreign_Exchange_Rates.xls		     weather_data_extended.csv
 heart.csv				    'Week 8.ipynb'


In [3]:
#Load the datasets
movie = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

#Merge both datasets on title
movies = movie.merge(credits, on = 'title')

#Keep important columns

movies = movies[['id', 'title', 'overview', 'keywords', 'genres', 'cast', 'crew']]

#Drop rows with null values
movies.dropna(inplace = True)

  credits = pd.read_csv('tmdb_5000_credits.csv')


In [4]:
#Process 'overview' column
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Process 'keywords' column

def convert_keywords(obj):
    l = []
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l

movies['keywords'] = movies['keywords'].apply(convert_keywords)

In [5]:
#Process 'genres' column

def convert_genres(obj):
    l = []
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l

movies['genres'] = movies['genres'].apply(convert_genres)

In [6]:
#Process 'cast' column, top 3 only

def extract_cast(obj):
    l = []
    count = 0
    for i in ast.literal_eval(obj):
        if count != 3:
            l.append(i['name'])
            count += 1
        else:
            break
    return l

movies['cast'] = movies['cast'].apply(extract_cast)

In [8]:
#Process 'crew' column, get the director only

def extract_director(obj):
    l = []
    try:
        crew_list = ast.literal_eval(obj)
    except:
        return l               # return empty list if parsing fails

    for i in crew_list:
        if i.get('job') == 'Director':
            l.append(i.get('name'))
            break
    return l

movies['crew'] = movies['crew'].apply(extract_director)

#Create 'tags' column by combining overview + keywords + genres + cast + crew
movies['tags'] = movies['overview'] + movies['cast'] + movies['crew'] + movies['keywords']

#Final dataset with relevant columns

movies = movies[['id', 'title', 'tags']]

In [9]:
# Remove spaces from tags

movies['tags'] = movies['tags'].apply(lambda x: [i.replace(" ", "") for i in x])

# Stemming

ps = PorterStemmer()

def stemming(text):
    l = []
    for i in text:
        l.append(ps.stem(i))
    return " ".join(l)

movies['tags'] = movies['tags'].apply(stemming)

In [11]:
#Before we go any further, here is a look at the shape of our data frame
movies.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [12]:
print(movies.shape)
movies.info()

(1489, 3)
<class 'pandas.core.frame.DataFrame'>
Index: 1489 entries, 0 to 1491
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1489 non-null   int64 
 1   title   1489 non-null   object
 2   tags    1489 non-null   object
dtypes: int64(1), object(2)
memory usage: 46.5+ KB


In [15]:
movies['tags'].apply(len).describe()

Unnamed: 0,tags
count,1489.0
mean,423.932841
std,151.598932
min,123.0
25%,311.0
50%,410.0
75%,511.0
max,1138.0


In [16]:
#We can also use 'Counter' to see which words are most common before we do vectorizing
from collections import Counter
Counter(" ".join(movies['tags']).split()).most_common(20)

[('the', 4602),
 ('a', 3083),
 ('to', 2616),
 ('and', 2265),
 ('of', 2115),
 ('hi', 1291),
 ('in', 1268),
 ('is', 997),
 ('with', 735),
 ('he', 640),
 ('an', 610),
 ('on', 591),
 ('for', 563),
 ('that', 546),
 ('as', 512),
 ('when', 487),
 ('her', 476),
 ('by', 465),
 ('their', 458),
 ('from', 436)]

In [17]:
# Vectorization
vectorizer = CountVectorizer(max_features=500, stop_words='english')
vectors = vectorizer.fit_transform(movies['tags']).toarray()

# Cosine similarity
similarity = cosine_similarity(vectors)

In [18]:
# Recommendation function
def Recommendation_system(movie_title):
    # Handle case where movie isn't in the dataset
    if movie_title not in movies['title'].values:
        print("Movie not found in the database!")
        return []

    movie_index = movies[movies['title'] == movie_title].index[0]
    distances = sorted(list(enumerate(similarity[movie_index])),
                       reverse=True, key=lambda x: x[1])

    #extract top 19 similar movie titles (skip the movie itself at index 0)
    recommended_titles = [movies.iloc[i[0]].title for i in distances[1:20]]

    #print the recommendations
    for title in recommended_titles:
        print(title)

    return recommended_titles


In [19]:
Recommendation_system('Avatar')

Independence Day
Aliens vs Predator: Requiem
Beowulf
The Thing
Titan A.E.
Treasure Planet
Edge of Tomorrow
Meet Dave
Aliens in the Attic
Predators
Tears of the Sun
Galaxy Quest
Battleship
Home
The Watch
Prometheus
We Were Soldiers
The 5th Wave
Mission to Mars


In [22]:
#Save the model and data
pickle.dump(movies, open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [23]:
from flask import Flask, request, jsonify
import pickle

app = Flask(__name__)

#load model artifacts
movies = pickle.load(open('movies.pkl', 'rb'))
similarity = pickle.load(open('similarity.pkl', 'rb'))

def Recommendation_system(movie_title):
    if movie_title not in movies['title'].values:
        return []

    movie_index = movies[movies['title'] == movie_title].index[0]
    distances = sorted(list(enumerate(similarity[movie_index])),
                       reverse=True, key=lambda x: x[1])

    recommended_titles = [movies.iloc[i[0]].title for i in distances[1:20]]
    return recommended_titles

@app.route('/')
def home():
    return "Movie Recommender is running. Use /recommend?title=MovieName"

@app.route('/recommend')
def recommend():
    title = request.args.get('title')
    if not title:
        return jsonify({"error": "No title provided"}), 400

    recs = Recommendation_system(title)
    if not recs:
        return jsonify({"error": "Movie not in dataset"}), 404

    return jsonify({"recommendations": recs})

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with watchdog (inotify)


In [None]:
#This can now be run locally using app.py