# library imports

In [1]:
import numpy as np
import pandas as pd
import pickle

import ast              # for literal_eval()  json to list convert
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Loading

In [2]:
credits = pd.read_csv('/content/drive/MyDrive/Datasets/5000 TMDB movies/tmdb_5000_credits.csv')
movies = pd.read_csv('/content/drive/MyDrive/Datasets/5000 TMDB movies/tmdb_5000_movies.csv')

# Understanding dataset

In [3]:
credits.shape

(4803, 4)

In [4]:
credits.head(5)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


Detailed inpection of cast column in credits

In [5]:
ast.literal_eval(credits.iloc[0]['cast'])[:3]

[{'cast_id': 242,
  'character': 'Jake Sully',
  'credit_id': '5602a8a7c3a3685532001c9a',
  'gender': 2,
  'id': 65731,
  'name': 'Sam Worthington',
  'order': 0},
 {'cast_id': 3,
  'character': 'Neytiri',
  'credit_id': '52fe48009251416c750ac9cb',
  'gender': 1,
  'id': 8691,
  'name': 'Zoe Saldana',
  'order': 1},
 {'cast_id': 25,
  'character': 'Dr. Grace Augustine',
  'credit_id': '52fe48009251416c750aca39',
  'gender': 1,
  'id': 10205,
  'name': 'Sigourney Weaver',
  'order': 2}]

In [6]:
ast.literal_eval(credits.iloc[0]['crew'])[:3]

[{'credit_id': '52fe48009251416c750aca23',
  'department': 'Editing',
  'gender': 0,
  'id': 1721,
  'job': 'Editor',
  'name': 'Stephen E. Rivkin'},
 {'credit_id': '539c47ecc3a36810e3001f87',
  'department': 'Art',
  'gender': 2,
  'id': 496,
  'job': 'Production Design',
  'name': 'Rick Carter'},
 {'credit_id': '54491c89c3a3680fb4001cf7',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Sound Designer',
  'name': 'Christopher Boyes'}]

In [7]:
print(movies.shape)
movies.head(1)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


Detailed inspection on str of list of dicts

In [8]:
print('length : ' , len(ast.literal_eval(movies.iloc[0]['genres'])))
ast.literal_eval(movies.iloc[0]['genres'])[:3]

length :  4


[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'}]

In [9]:
print('length : ' ,len(ast.literal_eval(movies.iloc[0]['keywords'])))
ast.literal_eval(movies.iloc[0]['keywords'])[:3]

length :  21


[{'id': 1463, 'name': 'culture clash'},
 {'id': 2964, 'name': 'future'},
 {'id': 3386, 'name': 'space war'}]

In [10]:
print('length : ' , len(ast.literal_eval(movies.iloc[0]['production_companies'])))
ast.literal_eval(movies.iloc[0]['production_companies'])[:3]

length :  4


[{'name': 'Ingenious Film Partners', 'id': 289},
 {'name': 'Twentieth Century Fox Film Corporation', 'id': 306},
 {'name': 'Dune Entertainment', 'id': 444}]

In [11]:
print('length : ' , len(ast.literal_eval(movies.iloc[0]['production_countries'])))
ast.literal_eval(movies.iloc[0]['production_countries'])[:3]

length :  2


[{'iso_3166_1': 'US', 'name': 'United States of America'},
 {'iso_3166_1': 'GB', 'name': 'United Kingdom'}]

In [12]:
print('length : ' , len(ast.literal_eval(movies.iloc[0]['spoken_languages'])))
ast.literal_eval(movies.iloc[0]['spoken_languages'])[:3]

length :  2


[{'iso_639_1': 'en', 'name': 'English'},
 {'iso_639_1': 'es', 'name': 'Español'}]

# merging movies and credits datasets on the bases of title

In [13]:
movies = movies.merge(credits , on = 'title')
movies.shape

(4809, 23)

In [14]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# Missing values

In [15]:
print('missing values in each column')
movies.isnull().sum()

missing values in each column


Unnamed: 0,0
budget,0
genres,0
homepage,3096
id,0
keywords,0
original_language,0
original_title,0
overview,3
popularity,0
production_companies,0


**If there are missing values in a column, we will not use that column, so ignore it.**

In [16]:
movies.duplicated('id').sum()

6

# choosing the columns

In [17]:
movies = movies[['id' ,'title' , 'runtime' ,'vote_average' , 'popularity' , 'genres','keywords' , 'overview' , 'production_companies' , 'production_countries' , 'cast' , 'crew']]
movies.shape

(4809, 12)

In [18]:
movies.head(1)

Unnamed: 0,id,title,runtime,vote_average,popularity,genres,keywords,overview,production_companies,production_countries,cast,crew
0,19995,Avatar,162.0,7.2,150.437577,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


**keep this columns It is just as it was `id , title , runtime , vote_average , popularity` .**  
**and create a new column with help of rest columns called `tag column`**

## Extracting all the genres into list from dictionary

In [19]:
def get_genres(line):
  lst = []
  for key in ast.literal_eval(line):
    lst.append(key['name'])

  return lst

In [20]:
movies['genres'] = movies['genres'].apply(get_genres)

## Extracting all the keywords into list

In [21]:
def get_keywords(line):
  lst = []
  for key in ast.literal_eval(line):
    lst.append(key['name'])

  return lst

In [22]:
movies['keywords'] = movies['keywords'].apply(get_keywords)

## Converting overview into list from string

In [23]:
movies['overview'] = movies['overview'].str.split(' ')

## Extracting production companies

In [24]:
def get_prod_com(line):
  lst = []
  for key in ast.literal_eval(line):
    lst.append(key['name'])

  return lst

In [25]:
movies['production_companies'] = movies['production_companies'].apply(get_prod_com)

## Extracting Production Countries

In [26]:
def get_prod_country(line):
  lst = []
  for key in ast.literal_eval(line):
    lst.append(key['name'])

  return lst

In [27]:
movies['production_countries'] = movies['production_countries'].apply(get_prod_country)

## Extracting top 5 Actors

In [28]:
def get_actor_name(line):
  lst = []
  i = 0
  for key in ast.literal_eval(line):
    if i == 5:
      break

    lst.append(key['name'])
    i = i + 1

  return lst

In [29]:
movies['cast'] = movies['cast'].apply(get_actor_name)

## Extracting Director , Sound designer , Writer , Producer , ScreenPlay

In [30]:
def get_crew(line):
  lst = []
  for dict in ast.literal_eval(line):
    if (dict['job']  == 'Director') or ('Sound' in dict['job']) or ('Writer' in dict['job']) or ('Producer' in dict['job']) or ('Screenplay' in dict['job']):
      lst.append(dict['name'])

  # extracting only unique names
  return list(set(lst))

In [31]:
 movies['crew'] = movies['crew'].apply(get_crew)

In [32]:
movies.head(1)

Unnamed: 0,id,title,runtime,vote_average,popularity,genres,keywords,overview,production_companies,production_countries,cast,crew
0,19995,Avatar,162.0,7.2,150.437577,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Ingenious Film Partners, Twentieth Century Fo...","[United States of America, United Kingdom]","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[Addison Teague, Ken Fischer, Joyce Cox, Jenny..."


### Removing all Nan rows and duplicated rows (on the basis of id's)

In [33]:
movies.dropna(inplace = True)

In [34]:
movies.shape

(4806, 12)

In [35]:
# removing rows that have duplicat id's
movies.drop_duplicates('id' , inplace = True)

In [36]:
movies.shape

(4800, 12)

## Remove Space of each element have more then 1 word in all columns
Christopher  Nolan  --> ChristopherNolan

In [37]:
def remove_space(line):
  lst = []
  for element in line:
    element = element.split(' ')
    lst.append(''.join(element))

  return lst

In [38]:
movies['genres'] = movies['genres'].apply(remove_space)
movies['keywords'] = movies['keywords'].apply(remove_space)
movies['overview'] = movies['overview'].apply(remove_space)
movies['production_companies'] = movies['production_companies'].apply(remove_space)
movies['production_countries'] = movies['production_countries'].apply(remove_space)
movies['cast'] = movies['cast'].apply(remove_space)
movies['crew'] = movies['crew'].apply(remove_space)

In [39]:
movies.head(1)

Unnamed: 0,id,title,runtime,vote_average,popularity,genres,keywords,overview,production_companies,production_countries,cast,crew
0,19995,Avatar,162.0,7.2,150.437577,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[IngeniousFilmPartners, TwentiethCenturyFoxFil...","[UnitedStatesofAmerica, UnitedKingdom]","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...","[AddisonTeague, KenFischer, JoyceCox, JennyFos..."


### Creating a single column called `tags` from `'genres' , 'keywords' , 'overview' ,'production companies' , 'production countries' , 'cast' ,'crew'`

In [40]:
movies['tags'] = movies['genres'] + movies['keywords'] + movies['overview']  + movies['production_companies'] + movies['production_countries'] + movies['cast'] + movies['crew']

## Deleting 'genres' , 'keywords' , 'overview' ,'production_companies' , 'production_countries' , 'cast' ,'crew' columns

In [41]:
movies.drop(columns = ['genres' , 'keywords' , 'overview' ,'production_companies' , 'production_countries' , 'cast' ,'crew'] , inplace = True)

In [42]:
movies.head()

Unnamed: 0,id,title,runtime,vote_average,popularity,tags
0,19995,Avatar,162.0,7.2,150.437577,"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,285,Pirates of the Caribbean: At World's End,169.0,6.9,139.082615,"[Adventure, Fantasy, Action, ocean, drugabuse,..."
2,206647,Spectre,148.0,6.3,107.376788,"[Action, Adventure, Crime, spy, basedonnovel, ..."
3,49026,The Dark Knight Rises,165.0,7.6,112.31295,"[Action, Crime, Drama, Thriller, dccomics, cri..."
4,49529,John Carter,132.0,6.1,43.926995,"[Action, Adventure, ScienceFiction, basedonnov..."


# converting all the list of tags into String to  perform Vectorization

In [43]:
movies['tags'] = movies['tags'].apply(lambda x : ' '.join(x))

these are stop words that are removed

`stop words = ['who', 'i', 'detail', 'anywhere', 'have', 'namely', 'yourself', 'empty', 'latter', 'than', 'his', 'latterly', 'ltd', 'wherever', 'then', 'two', 'up', 'yourselves', 'has', 'if', 'nowhere', 'itself', 'for', 'etc', 'name', 'although', 'put', 'former', 'before', 'eight', 'amongst', 'into', 'please', 'been', 'below', 'eleven', 'neither', 'alone', 'were', 'while', 'during', 'whereas', 'is', 'part', 'thereafter', 'ever', 'which', 'give', 'perhaps', 'thus', 'whom', 'other', 'themselves', 'ourselves', 'around', 'along', 'third', 'already', 'yours', 'bill', 'another', 'everyone', 'becoming', 'we', 'or', 'at', 'sixty', 'will', 'between', 'twenty', 'my', 'each', 'formerly', 'might', 'describe', 'its', 'hasnt', 'still', 'moreover', 'also', 'throughout', 'thereby', 'anyway', 'inc', 'too', 'because', 'am', 'nevertheless', 'it', 'first', 'had', 'your', 'on', 'there', 'himself', 'others', 'ours', 'now', 'whatever', 'de', 'everywhere', 'made', 'done', 'many', 'more', 'else', 'whose', 'whither', 'somehow', 'rather', 'whence', 'keep', 'and', 'her', 'without', 'sincere', 'with', 'mine', 'all', 'herein', 'twelve', 'what', 'about', 'hundred', 'myself', 'per', 'anyone', 'amoungst', 'thru', 'otherwise', 'none', 'seem', 'very', 'once', 'three', 'ie', 'when', 'fifty', 'thin', 'full', 'someone', 'again', 'whenever', 'never', 'well', 'thence', 'seemed', 'become', 'back', 'front', 'onto', 'eg', 'afterwards', 'why', 'whole', 'five', 'forty', 'meanwhile', 'mostly', 'those', 'where', 'amount', 'even', 'least', 'though', 'an', 'nine', 'whereafter', 'own', 'everything', 'being', 'something', 'cry', 'upon', 'whoever', 'whereby', 'toward', 'in', 'within', 'side', 'anything', 'take', 'should', 'out', 'several', 'among', 'beforehand', 'un', 'our', 'via', 'due', 'wherein', 'towards', 'above', 'be', 'so', 'one', 'enough', 'found', 'serious', 'somewhere', 'nobody', 'bottom', 'off', 'show', 'move', 'almost', 'few', 'however', 'system', 'yet', 'must', 'thick', 'a', 'can', 'him', 'further', 'hers', 'less', 'becomes', 'until', 'next', 'whereupon', 'was', 'elsewhere', 'every', 'mill', 'them', 'against', 'he', 'through', 'fifteen', 'fire', 'hereby', 'would', 'down', 'therefore', 'from', 'get', 'noone', 'con', 'fill', 'these', 'but', 'hereupon', 'besides', 'find', 'since', 'either', 'sometimes', 'over', 'do', 'together', 'much', 'me', 'after', 'they', 'always', 'cant', 'became', 'herself', 'last', 'no', 'nothing', 'couldnt', 'beside', 'their', 'some', 'except', 'see', 'six', 'not', 'four', 're', 'you', 'seems', 'behind', 'seeming', 'that', 'here', 'any', 'such', 'us', 'sometime', 'under', 'ten', 'by', 'beyond', 'of', 'the', 'whether', 'this', 'co', 'how', 'top', 'are', 'may', 'often', 'both', 'she', 'therein', 'hence', 'cannot', 'hereafter', 'as', 'thereupon', 'only', 'to', 'could', 'across', 'same', 'most', 'indeed', 'call', 'anyhow', 'nor', 'interest', 'go']`




## Creating a CountVectorizer object to perform Vectorization

In [44]:
movies.head(1)

Unnamed: 0,id,title,runtime,vote_average,popularity,tags
0,19995,Avatar,162.0,7.2,150.437577,Action Adventure Fantasy ScienceFiction cultur...


**without** `'runtime' , 'vote_average' and 'popularity'`

In [45]:
cv = CountVectorizer( stop_words = 'english')

In [46]:
vector = cv.fit_transform(movies['tags']).toarray()

In [47]:
print(cv.get_feature_names_out().shape)
words = cv.get_feature_names_out().tolist()
words

(56851,)


['00',
 '000',
 '007',
 '07am',
 '10',
 '100',
 '1000',
 '100bares',
 '101',
 '1019entertainment',
 '101ststreetfilms',
 '108',
 '10th',
 '10thholeproductions',
 '10weststudios',
 '11',
 '114',
 '117',
 '118',
 '119',
 '11th',
 '12',
 '1200',
 '120dbfilms',
 '120films',
 '1215',
 '1250',
 '125th',
 '12th',
 '13',
 '1300',
 '13ghostsproductionscanadainc',
 '13th',
 '14',
 '140',
 '1408',
 '142',
 '1429',
 '148',
 '1492pictures',
 '14pm',
 '14th',
 '15',
 '150',
 '150th',
 '1520s',
 '1536',
 '15th',
 '15thcentury',
 '16',
 '1600s',
 '161',
 '1630s',
 '1644',
 '1681',
 '1691',
 '16blockproductions',
 '16th',
 '16thcentury',
 '17',
 '170',
 '1700s',
 '173rd',
 '1748',
 '1776',
 '17th',
 '17thcentury',
 '18',
 '180',
 '1800',
 '1812productions',
 '1818',
 '1820',
 '1820s',
 '1821pictures',
 '1824',
 '1831',
 '1834',
 '1836',
 '1838',
 '1839',
 '1841',
 '1845',
 '1850',
 '1850s',
 '1856',
 '1857',
 '1860',
 '1862',
 '1863',
 '1870s',
 '1875',
 '1876',
 '1879',
 '1880s',
 '1882',
 '1885',
 '1

In [48]:
list(cv.get_stop_words())

['further',
 'may',
 'with',
 'down',
 'someone',
 'when',
 'while',
 'for',
 'hereby',
 'eg',
 'who',
 'across',
 'will',
 'except',
 'whatever',
 'the',
 'how',
 'sincere',
 'but',
 'often',
 'detail',
 'most',
 'cannot',
 'everywhere',
 'thus',
 'made',
 'they',
 'yourself',
 'nobody',
 'con',
 'himself',
 'would',
 'their',
 'four',
 'these',
 'moreover',
 'get',
 'whereas',
 'hers',
 'do',
 'own',
 'were',
 'if',
 'why',
 'none',
 'alone',
 'by',
 'anything',
 'those',
 'see',
 'thereafter',
 'whereby',
 'as',
 'full',
 'whither',
 'could',
 'there',
 'all',
 'five',
 'first',
 'was',
 'that',
 'is',
 'to',
 'whole',
 'on',
 'beforehand',
 'last',
 'me',
 'whoever',
 'put',
 'be',
 'wherein',
 'or',
 'system',
 'myself',
 'meanwhile',
 'around',
 'only',
 'anywhere',
 'never',
 'move',
 'your',
 'found',
 'thereby',
 'due',
 'more',
 'much',
 'both',
 'it',
 'part',
 'we',
 'however',
 'ever',
 're',
 'hereafter',
 'towards',
 'interest',
 'afterwards',
 'them',
 'should',
 'than'

In [49]:
print(vector.shape)
vector

(4800, 56851)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## now performing cosine similarity   
because Euclidean distance is less reliable is high dimension due to `curse of Dimensionality`

In [50]:
similarity = cosine_similarity(vector)

In [51]:
print(similarity.shape)
similarity

(4800, 4800)


array([[1.        , 0.06475239, 0.06547285, ..., 0.02817181, 0.03178209,
        0.01484785],
       [0.06475239, 1.        , 0.04578685, ..., 0.03283546, 0.01852169,
        0.0173058 ],
       [0.06547285, 0.04578685, 1.        , ..., 0.02656064, 0.01498222,
        0.01399868],
       ...,
       [0.02817181, 0.03283546, 0.02656064, ..., 1.        , 0.06446584,
        0.0451754 ],
       [0.03178209, 0.01852169, 0.01498222, ..., 0.06446584, 1.        ,
        0.05096472],
       [0.01484785, 0.0173058 , 0.01399868, ..., 0.0451754 , 0.05096472,
        1.        ]])

In [52]:
# but here i lost all the indexes to identify movies title
sorted(similarity[0] , reverse = True)

[0.9999999999999991,
 0.1942571724714529,
 0.18443891499772885,
 0.1767766952966369,
 0.17504391791097776,
 0.16331864497962195,
 0.15720703638063313,
 0.14708710135363803,
 0.14617633655117157,
 0.14541714529637195,
 0.1450647132964149,
 0.1436067394758882,
 0.14315928173122225,
 0.14229426304616102,
 0.13979035613822097,
 0.13664859862498716,
 0.13608276348795437,
 0.13608276348795437,
 0.13382584475782772,
 0.13370957553973667,
 0.13095238095238096,
 0.13094570021973104,
 0.1307440900921227,
 0.12963624321753373,
 0.12909944487358058,
 0.1276884796138123,
 0.1272937693043289,
 0.12712834523274566,
 0.12580490207067818,
 0.12436708454077278,
 0.12422599874998833,
 0.12379689211803457,
 0.12354155277685021,
 0.12329924047901801,
 0.1216660658480719,
 0.12087344460380706,
 0.12087344460380706,
 0.12071432805669138,
 0.1203324787301926,
 0.11952286093343936,
 0.11907241805196006,
 0.11858541225631423,
 0.11666666666666667,
 0.11551721334727869,
 0.11504474832710555,
 0.11433239009500593

# Final Method

In [53]:
def recommend_movies(movie_name):
  index = movies[movies['title'] == movie_name].index[0]
  for i in list(sorted(enumerate(similarity[index]) , reverse = True , key = lambda x : x[1]))[1:6]:
    print(movies.iloc[i[0]]['title'])

# Testing

In [54]:
recommend_movies('The Dark Knight Rises')

The Dark Knight
Batman Begins
Batman Returns
Batman Forever
Batman


In [55]:
movies.sample(5)

Unnamed: 0,id,title,runtime,vote_average,popularity,tags
2139,1164,Babel,143.0,6.9,25.785925,Drama terror lossofmother gun morocco deaf-mut...
2373,36811,The Last Station,112.0,6.7,3.659292,Drama Romance duringcreditsstinger A historica...
3219,8069,Barbarella,98.0,5.7,14.171759,ScienceFiction sexualfantasy alienplanet dista...
4597,10238,Cries and Whispers,91.0,7.8,11.347855,Drama sistersisterrelationship sweden dyingand...
1360,365222,Ip Man 3,105.0,6.5,19.167377,Action Drama History biography When a band of ...


# With  `'runtime' , 'vote_average' and 'popularity'`

In [56]:
new_vector = np.hstack( (movies[['runtime', 'vote_average' , 'popularity']].values , vector) )
print(new_vector.shape)
new_vector

(4800, 56854)


array([[162.      ,   7.2     , 150.437577, ...,   0.      ,   0.      ,
          0.      ],
       [169.      ,   6.9     , 139.082615, ...,   0.      ,   0.      ,
          0.      ],
       [148.      ,   6.3     , 107.376788, ...,   0.      ,   0.      ,
          0.      ],
       ...,
       [120.      ,   7.      ,   1.444476, ...,   0.      ,   0.      ,
          0.      ],
       [ 98.      ,   5.7     ,   0.857008, ...,   0.      ,   0.      ,
          0.      ],
       [ 90.      ,   6.3     ,   1.929883, ...,   0.      ,   0.      ,
          0.      ]])

In [57]:
sim = cosine_similarity(new_vector)

In [58]:
def rec(movie_name):
  index = movies[movies['title'] == movie_name].index[0]
  for i in list(sorted(enumerate(sim[index]) , reverse = True , key = lambda x : x[1]))[1:6]:
    print(movies.iloc[i[0]]['title'])

In [59]:
movies.head()

Unnamed: 0,id,title,runtime,vote_average,popularity,tags
0,19995,Avatar,162.0,7.2,150.437577,Action Adventure Fantasy ScienceFiction cultur...
1,285,Pirates of the Caribbean: At World's End,169.0,6.9,139.082615,Adventure Fantasy Action ocean drugabuse exoti...
2,206647,Spectre,148.0,6.3,107.376788,Action Adventure Crime spy basedonnovel secret...
3,49026,The Dark Knight Rises,165.0,7.6,112.31295,Action Crime Drama Thriller dccomics crimefigh...
4,49529,John Carter,132.0,6.1,43.926995,Action Adventure ScienceFiction basedonnovel m...


In [61]:
recommend_movies('The Dark Knight Rises')

The Dark Knight
Batman Begins
Batman Returns
Batman Forever
Batman


In [62]:
rec('The Dark Knight Rises')

Man of Steel
Harry Potter and the Half-Blood Prince
Exodus: Gods and Kings
Jupiter Ascending
Spider-Man


<p color = 'green ' >without 'runtime' , 'vote_average' and 'popularity' is performing better

## similarrity and vector is big files

In [63]:
print(pd.DataFrame(similarity).info() , end='\n\n\n')
print(pd.DataFrame(vector).info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Columns: 4800 entries, 0 to 4799
dtypes: float64(4800)
memory usage: 175.8 MB
None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Columns: 56851 entries, 0 to 56850
dtypes: int64(56851)
memory usage: 2.0 GB
None


# final model

<p color ='green'>most optimized model

In [83]:
getsizeof(similarity)

184320128

In [84]:
getsizeof(similarity.astype('float16'))

46080128

# reducing memory usages from 18.4 cr to 4.6 cr bytes ***(memory efficiency has improved by 75%)***

In [85]:
similarity = similarity.astype('float16')

In [86]:
cv = CountVectorizer(max_features=2500, stop_words = 'english')

vector = cv.fit_transform(movies['tags']).toarray()

similarity = cosine_similarity(vector)

def recommend_movies(movie_name):
  index = movies[movies['title'] == movie_name].index[0]
  for i in list(sorted(enumerate(similarity[index]) , reverse = True , key = lambda x : x[1]))[1:6]:
    print(movies.iloc[i[0]]['title'])

In [87]:
recommend_movies('The Dark Knight Rises')

The Dark Knight
Batman Begins
Batman Forever
Batman Returns
Batman


In [88]:
# Exporting essential thing that are used in web page
pickle.dump(movies[['id' ,'title']] ,open('movies.pkl' , 'wb'))
pickle.dump(similarity ,open('similarity.pkl' , 'wb'))