# Movie Recommendation system

```
Recommendation system types
1. Content Based
2. Collaborative Based
3. Hybrid Based
```

In [1]:
import pandas as pd 
import numpy as np
import ast

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.sample(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
4290,1200000,"[{""id"": 18, ""name"": ""Drama""}]",,1412,"[{""id"": 459, ""name"": ""sexual obsession""}, {""id...",en,"Sex, Lies, and Videotape",A sexually repressed woman's husband is having...,10.869492,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"":...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1989-08-18,0,100.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Sex, Lies, and Videotape",6.7,185


In [4]:
movies = movies.merge(credits, on = 'title')

In [5]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

```
Droping columns : 
    Budgets
    Homepage
    original title
    production companies
    production countries
    movie id
```

In [6]:
movies.drop(['budget','homepage',
       'original_title','production_companies',
       'production_countries','movie_id','revenue'],axis = 1,inplace=True)

In [7]:
def converter(lst):
    ls = []
    for i in ast.literal_eval(lst):
        ls.append(i['name'])
    return ls

In [8]:
movies.genres = movies.genres.apply(converter)

In [9]:
movies.keywords = movies.keywords.apply(converter)

In [10]:
def converter1(lst):
    ls = []
    for i in ast.literal_eval(lst):
        ls.append(i['iso_639_1'])
    return ls

In [11]:
movies.spoken_languages = movies.spoken_languages.apply(converter1)

In [12]:
movies.cast = movies.cast.apply(converter)

In [13]:
movies.cast = movies.cast.apply(lambda i : i[:4])

In [14]:
def director(lst):
    ls = []
    for i in ast.literal_eval(lst):
        if i['job'] == 'Director':
            ls.append(i['name'])
    return ls

In [15]:
movies['director'] = movies.crew.apply(director)

In [16]:
movies.overview = movies.overview.apply(lambda x : str(x).split())

In [17]:
movies.tagline = movies.tagline.apply(lambda x : str(x).split())

In [18]:
movies.genres = movies.genres.apply(lambda x : [i.replace(' ', '') for i in x] )

In [19]:
movies.cast = movies.cast.apply(lambda x : [i.replace(' ', '') for i in x] )

In [20]:
movies.director = movies.director.apply(lambda x : [i.replace(' ', '') for i in x] )
movies.keywords = movies.keywords.apply(lambda x : [i.replace(' ', '') for i in x] )

In [21]:
movies.drop(['crew'],axis = 1,inplace=True)

In [22]:
movies.head(1)

Unnamed: 0,genres,id,keywords,original_language,overview,popularity,release_date,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,director
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",en,"[In, the, 22nd, century,, a, paraplegic, Marin...",150.437577,2009-12-10,162.0,"[en, es]",Released,"[Enter, the, World, of, Pandora.]",Avatar,7.2,11800,"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]


In [23]:
movies['tags'] = movies.overview + movies.genres + movies.cast + movies.keywords + movies.director + movies.tagline

In [24]:
movies.drop(['overview','keywords','genres','cast','director','tagline'],axis = 1, inplace = True)

In [25]:
' '.join(['overview','keywords','genres','cast','director','tagline'])

'overview keywords genres cast director tagline'

In [26]:
movies.tags = movies.tags.apply(lambda x: ' '.join(x))

In [27]:
movies.head(1)

Unnamed: 0,id,original_language,popularity,release_date,runtime,spoken_languages,status,title,vote_average,vote_count,tags
0,19995,en,150.437577,2009-12-10,162.0,"[en, es]",Released,Avatar,7.2,11800,"In the 22nd century, a paraplegic Marine is di..."


In [28]:
movies_n = movies[['id','title','tags']]

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [31]:
vectors = cv.fit_transform(movies_n['tags']).toarray()

In [32]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [33]:
def stemmer(txt):
    ls = []
    for i in txt.split():
        ls.append(ps.stem(i))
    
    return  " ".join(ls)

In [34]:
movies_n.tags = movies_n.tags.apply(stemmer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_n.tags = movies_n.tags.apply(stemmer)


In [35]:
similarity = cosine_similarity(vectors)

In [36]:
def recommender(movie,n):
    index = movies_n[movies_n.title == movie].index[0]
    cos_sim = similarity[index]
    index_ls = sorted(list(enumerate(cos_sim)),reverse=True,key=lambda x:x[1])[1:n+1]
    
    for i in index_ls:
        print(movies_n.iloc[i[0]].title)

In [37]:
 recommender('Batman Begins',10)

The Dark Knight
Batman
The Dark Knight Rises
Batman
Batman & Robin
Batman v Superman: Dawn of Justice
Defendor
Batman Forever
Mi America
Batman Returns


In [38]:
import pickle

In [39]:
pickle.dump(movies_n,open('movies.pkl','wb'))

In [40]:
movies_n.title.values 

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [41]:
pickle.dump(similarity,open('sim.pkl','wb')) 