In [1]:
import pandas as pd
import numpy as np
import time
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the Hindi movie DataFrame from its pickle file. The context manager
# closes the file handle as soon as the object has been read (the bare
# open(...) call left it open until garbage collection).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("hindi_movies.pkl", 'rb') as hindi_file:
    hindi_movies = pickle.load(hindi_file)

In [3]:
# Summarise the Hindi frame: column names, non-null counts and dtypes.
hindi_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1851 entries, 0 to 1850
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title                 1851 non-null   object 
 1   release_year          1851 non-null   int32  
 2   cast_tmdb             1851 non-null   object 
 3   director_tmdb         1851 non-null   object 
 4   tmdb_id               1851 non-null   int64  
 5   overview              1851 non-null   object 
 6   rating                1851 non-null   float64
 7   poster                1851 non-null   object 
 8   genres_tmdb           1851 non-null   object 
 9   production_companies  1851 non-null   object 
 10  imdb_id               1851 non-null   object 
dtypes: float64(1), int32(1), int64(1), object(8)
memory usage: 152.0+ KB


In [4]:
# Load the Telugu movie DataFrame from its pickle file. The context manager
# closes the file handle as soon as the object has been read (the bare
# open(...) call left it open until garbage collection).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("telugu_movies_data.pkl", 'rb') as telugu_file:
    telugu_movies = pickle.load(telugu_file)

In [5]:
# Summarise the Telugu frame: column names, non-null counts and dtypes.
telugu_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title                 1522 non-null   object 
 1   release_year          1522 non-null   int64  
 2   cast_tmdb             1522 non-null   object 
 3   director_tmdb         1522 non-null   object 
 4   tmdb_id               1522 non-null   int64  
 5   overview              1522 non-null   object 
 6   rating                1522 non-null   float64
 7   poster                1522 non-null   object 
 8   genres_tmdb           1522 non-null   object 
 9   production_companies  1522 non-null   object 
 10  imdb_id               1522 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 130.9+ KB


In [6]:
# Stack the Hindi and Telugu frames row-wise and renumber the rows from 0.
indian_movies = pd.concat(
    objs=[hindi_movies, telugu_movies],
    axis=0,
    ignore_index=True,
)

In [7]:
# Confirm the combined frame's row count, columns and dtypes.
indian_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3373 entries, 0 to 3372
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title                 3373 non-null   object 
 1   release_year          3373 non-null   int64  
 2   cast_tmdb             3373 non-null   object 
 3   director_tmdb         3373 non-null   object 
 4   tmdb_id               3373 non-null   int64  
 5   overview              3373 non-null   object 
 6   rating                3373 non-null   float64
 7   poster                3373 non-null   object 
 8   genres_tmdb           3373 non-null   object 
 9   production_companies  3373 non-null   object 
 10  imdb_id               3373 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 290.0+ KB


https://image.tmdb.org/t/p/original/oLhXBu8u2WUJkViIw5rl9Aq04W8.jpg

https://image.tmdb.org/t/p/original/5grbkwNnMylKw9dGq0X6CGn7E4O.jpg

poster

In [20]:
# Look at a single stored poster value (row label 1) to see its format.
indian_movies['poster'][1]

'/9xl6TNdI2QM1kKBrsuNKJkWadu4.jpg'

In [22]:
# Prepend 'https://image.tmdb.org/t/p/original' to every path in the poster column to get a full image URL

In [23]:
def get_poster(s):
    """Return the full TMDB image URL for a stored poster path.

    Parameters
    ----------
    s : str
        Poster path as held in the ``poster`` column, e.g.
        ``'/9xl6TNdI2QM1kKBrsuNKJkWadu4.jpg'``.

    Returns
    -------
    str
        ``'https://image.tmdb.org/t/p/original'`` followed by the path.
        A value that already starts with that base URL is returned unchanged,
        so calling the function twice on the same value is harmless.

    Raises
    ------
    TypeError
        If ``s`` is not a string.
    """
    base = 'https://image.tmdb.org/t/p/original'

    if not isinstance(s, str):
        raise TypeError(f"poster path must be a str, got {type(s).__name__}")

    # Idempotence guard: the notebook overwrites the poster column in place,
    # so a re-run would otherwise glue the base URL on a second time.
    if s.startswith(base):
        return s

    # The base has no trailing slash; make sure exactly one separates it from
    # a non-empty path that was stored without its leading '/'.
    if s and not s.startswith('/'):
        s = '/' + s

    return base + s

In [26]:
# Spot-check get_poster on a single row (label 3000).
get_poster(indian_movies['poster'][3000])

'https://image.tmdb.org/t/p/original/pV6xLQh54ga8oaQqJtUqLp12LwQ.jpg'

In [31]:
# Expand every stored poster path into a full image URL. This cell overwrites
# its own input column, so values that already carry the base URL are left
# alone - re-running the cell no longer prefixes them a second time.
POSTER_URL_PREFIX = 'https://image.tmdb.org/t/p/original'
indian_movies['poster'] = indian_movies['poster'].apply(
    lambda path: path if str(path).startswith(POSTER_URL_PREFIX) else get_poster(path)
)

Build display-friendly strings for cast, genres, director and production companies

In [125]:
# cast: render each cast_tmdb value as plain text for display.
# str() of a list looks like "['A', 'B']"; the translation table deletes the
# brackets and quotes (plus . - : ; + |) so only "A, B" remains.
# The table is built once here instead of once per row inside a lambda.
# NOTE(review): '.' and '-' are also removed from inside names.
indian_movies['cast_display'] = (
    indian_movies['cast_tmdb']
    .map(str)
    .str.translate(str.maketrans('', '', ".[]-:;+|'\""))
)

In [126]:
# Show one randomly chosen cast_display value (unseeded, so the row differs between runs).
indian_movies['cast_display'].sample()

898    Nazia Hussain, Aasad Mirza, Salim Khan, Saira ...
Name: cast_display, dtype: object

In [127]:
# " ".join([cast for cast in indian_movies['cast_tmdb'][0]])

In [128]:
# genres: render each genres_tmdb value as plain text for display.
# str() of a list looks like "['Crime', 'Drama']"; deleting the brackets and
# quotes (plus . - : ; + |) leaves "Crime, Drama".
# The translation table is built once here instead of once per row.
indian_movies['genres_display'] = (
    indian_movies['genres_tmdb']
    .map(str)
    .str.translate(str.maketrans('', '', ".[]-:;+|'\""))
)

In [129]:
# Show five random genres_display values (unseeded, so the rows differ between runs).
indian_movies['genres_display'].sample(5)

1825       Crime, Drama
2484            Romance
1514    Romance, Comedy
1464    Thriller, Drama
174     Family, Fantasy
Name: genres_display, dtype: object

In [130]:
# director: render each director_tmdb value as plain text for display by
# deleting the characters . [ ] - : ; + | ' " from its str() form.
# The translation table is built once here instead of once per row.
# NOTE(review): '.' and '-' are also removed from inside names.
indian_movies['director_display'] = (
    indian_movies['director_tmdb']
    .map(str)
    .str.translate(str.maketrans('', '', ".[]-:;+|'\""))
)

In [131]:
# Show five random director_display values (unseeded, so the rows differ between runs).
indian_movies['director_display'].sample(5)

235     Govind Nihalani
1043            unknown
2575            unknown
1892            unknown
3146            unknown
Name: director_display, dtype: object

In [132]:
# production companies (the previous "#director" label on this cell was a
# copy-paste slip): render each production_companies value as plain text by
# deleting the characters . [ ] - : ; + | ' " from its str() form.
# The translation table is built once here instead of once per row.
# NOTE(review): hyphens inside names are removed too ("T-Series" -> "TSeries").
indian_movies['production_companies_display'] = (
    indian_movies['production_companies']
    .map(str)
    .str.translate(str.maketrans('', '', ".[]-:;+|'\""))
)

In [133]:
# Show five random production_companies_display values (unseeded, so the rows differ between runs).
indian_movies['production_companies_display'].sample(5)

1787                               Clean Slate Films
1396     Saptrishi Cinevision, Storm Motion Pictures
2808                            Daydream Productions
1432    Balaji Motion Pictures, Pi Films Productions
2878                                       no result
Name: production_companies_display, dtype: object

In [134]:
# Ten random rows of the four display columns side by side (unseeded sample).
indian_movies[['genres_display', 'cast_display', 'director_display', 'production_companies_display']].sample(10)

Unnamed: 0,genres_display,cast_display,director_display,production_companies_display
840,"Drama, Thriller","Amitabh Bachchan, Saif Ali Khan, Deepika Paduk...",Prakash Jha,"Base Industries Group, Prakash Jha Productions"
1626,"Comedy, Drama","Shardul Bhardwaj, Mahender Nath, Nutan Sinha, ...",Prateek Vats,Na Ma Productions
2436,"Comedy, Drama","Hidetaka Yoshioka, Koyuki, Shinichi Tsutsumi, ...",Takashi Yamazaki,no result
1361,"Drama, Comedy, Family","Saif Ali Khan, Svar Kamble, Padmapriya Janakir...",Raja Menon,"Bandra West Pictures, TSeries, Abundantia Ente..."
2395,"Comedy, Drama","Krishnudu, M S Narayana, Krishna Bhagavan",Satish Vegesna,no result
2768,"Drama, Family","Sharwanand, Anupama Parameswaran, Prakash Raj,...",Satish Vegesna,Sri Venkateswara Creations
2245,Thriller,"Manjula Ghattamaneni, Charmy Kaur, Shashank, I...",V K Prakash,Indira Productions
2054,"Action, Adventure, Drama, Romance","Steve Howey, Mike Vogel, Cameron Richardson, S...",Steve Boyum,"Tag Entertainment, 20th Century Fox"
2840,"Comedy, Horror","Aadi Saikumar, Vaibhavi Shandilya, Brahmaji, R...",Prabhakar Podakandla,V4 Movies
2838,"Romance, Drama","Ram Pothineni, Anupama Parameswaran, Lavanya T...",Kishore Tirumala,"Sri Sravanthi Movies, PR Productions"


Remove the spaces inside each entity name so that every name becomes a single token (e.g. "Kamal Haasan" becomes "kamalhaasan" after lower-casing)

In [137]:
# Build the token columns 'cast', 'genres' and 'director' from their
# '*_display' counterparts. For each value:
#   1. drop the spaces, so a multi-word name collapses into one word;
#   2. turn the commas that separated the names into spaces;
#   3. lower-case the result.
for feature in ('cast', 'genres', 'director'):
    indian_movies[feature] = indian_movies[feature + '_display'].apply(
        lambda text: text.replace(' ', '').replace(',', ' ').lower()
    )
# production_companies is left as-is; its tokenisation was disabled in this cell.

In [139]:
# Ten random rows of the tokenised columns (unseeded sample).
indian_movies[['genres', 'cast', 'director']].sample(10)

Unnamed: 0,genres,cast,director
2180,drama,raja sneha kotasrinivasarao jdchakravarthi rag...,giribabu
1571,action history war romance,arjunkapoor kritisanon sanjaydutt mohnishbehl ...,ashutoshgowariker
1649,documentary,unknown,mehrsingh
3011,horror thriller,nanditaraj satyamrajesh vidyullekharaman prasa...,raajakiran
969,crime drama thriller,aamirkhan kareenakapoorkhan ranimukerji nawazu...,reemakagti
271,drama action thriller crime,anilkapoor sanjaydutt sameerareddy koenamitra ...,sanjaygupta
329,drama,payalrohatgi aryanvaid siddharthkoirala hinata...,unknown
2439,action drama fantasy,nagarjunaakkineni anushkashetty prakashraj gan...,srinivasreddy
1814,drama,kapilsharma shahanagoswami sayanigupta swanand...,nanditadas
296,thriller,arbaazkhan gracysingh shamitashetty satishkaus...,gautamadhikari


In [140]:
# Re-check the schema now that the display and token columns have been added.
indian_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3373 entries, 0 to 3372
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Title                         3373 non-null   object 
 1   release_year                  3373 non-null   int64  
 2   cast_tmdb                     3373 non-null   object 
 3   director_tmdb                 3373 non-null   object 
 4   tmdb_id                       3373 non-null   int64  
 5   overview                      3373 non-null   object 
 6   rating                        3373 non-null   float64
 7   poster                        3373 non-null   object 
 8   genres_tmdb                   3373 non-null   object 
 9   production_companies          3373 non-null   object 
 10  imdb_id                       3373 non-null   object 
 11  cast_display                  3373 non-null   object 
 12  genres_display                3373 non-null   object 
 13  dir

In [141]:
# First rows of the enriched frame, original and derived columns together.
indian_movies.head()

Unnamed: 0,Title,release_year,cast_tmdb,director_tmdb,tmdb_id,overview,rating,poster,genres_tmdb,production_companies,imdb_id,cast_display,genres_display,director_display,production_companies_display,cast,genres,director
0,Abhay,2001,"[Kamal Haasan, Manisha Koirala, Raveena Tandon...",Suresh Krishna,66346,"Nandu and Vijay are brothers, who were abused ...",7.2,https://image.tmdb.org/t/p/original/5grbkwNnMy...,[Thriller],[V Creations],tt0294264,"Kamal Haasan, Manisha Koirala, Raveena Tandon,...",Thriller,Suresh Krishna,V Creations,kamalhaasan manishakoirala raveenatandon golla...,thriller,sureshkrishna
1,Aamdani Atthanni Kharcha Rupaiya,2001,"[Govinda, Juhi Chawla, Johnny Lever, Tabu, Ket...",K Raghavendra Rao,25801,"Jhoomri and her husband, Bhimsha, move into a ...",5.2,https://image.tmdb.org/t/p/original/9xl6TNdI2Q...,[Comedy],no result,tt0305173,"Govinda, Juhi Chawla, Johnny Lever, Tabu, Ketk...",Comedy,K Raghavendra Rao,no result,govinda juhichawla johnnylever tabu ketkidave ...,comedy,kraghavendrarao
2,Aashiq,2001,"[Bobby Deol, Karisma Kapoor, Ashok Saraf, Muke...",Indra Kumar,362100,Pooja and Chander Kapoor have a heart-to-heart...,6.0,https://image.tmdb.org/t/p/original/ebH0vv6e5C...,"[Action, Romance]",no result,tt0273406,"Bobby Deol, Karisma Kapoor, Ashok Saraf, Mukes...","Action, Romance",Indra Kumar,no result,bobbydeol karismakapoor ashoksaraf mukeshrishi...,action romance,indrakumar
3,Ajnabee,2001,"[Akshay Kumar, Bobby Deol, Kareena Kapoor Khan...",Mustan Alibhai Burmawalla,22316,Raj and Priya befriend their new neighbors Vic...,5.611,https://image.tmdb.org/t/p/original/rIscKIZxR4...,"[Action, Crime, Drama, Thriller]",[Film Folks],tt0278291,"Akshay Kumar, Bobby Deol, Kareena Kapoor Khan,...","Action, Crime, Drama, Thriller",Mustan Alibhai Burmawalla,Film Folks,akshaykumar bobbydeol kareenakapoorkhan bipash...,action crime drama thriller,mustanalibhaiburmawalla
4,Aks,2001,"[Amitabh Bachchan, Manoj Bajpayee, Raveena Tan...",Rakeysh Omprakash Mehra,152035,"On the trail of an underworld contract killer,...",5.1,https://image.tmdb.org/t/p/original/4vGi15Vx0K...,"[Crime, Thriller]",no result,tt0289845,"Amitabh Bachchan, Manoj Bajpayee, Raveena Tand...","Crime, Thriller",Rakeysh Omprakash Mehra,no result,amitabhbachchan manojbajpayee raveenatandon na...,crime thriller,rakeyshomprakashmehra


In [142]:
# List every column name currently on the frame.
indian_movies.columns

Index(['Title', 'release_year', 'cast_tmdb', 'director_tmdb', 'tmdb_id',
       'overview', 'rating', 'poster', 'genres_tmdb', 'production_companies',
       'imdb_id', 'cast_display', 'genres_display', 'director_display',
       'production_companies_display', 'cast', 'genres', 'director'],
      dtype='object')

0                                             V Creations
1                                               no result
2                                               no result
3                                              Film Folks
4                                               no result
                              ...                        
3368                                      AB2 Productions
3369          Zee Studios, Kiran Korrapati Creative Works
3370    Retrophiles, T-Series, AA Films, UV Creations,...
3371                        WIT STUDIO, CloverWorks, TOHO
3372                      Hasya Movies, AK Entertainments
Name: production_companies, Length: 3373, dtype: object

In [118]:
# Preview cast_display with the spaces removed; the result is only displayed, not stored.
indian_movies['cast_display'].apply(lambda x : x.replace(' ', ''))

0       KamalHaasan,ManishaKoirala,RaveenaTandon,Golla...
1       Govinda,JuhiChawla,JohnnyLever,Tabu,KetkiDave,...
2       BobbyDeol,KarismaKapoor,AshokSaraf,MukeshRishi...
3       AkshayKumar,BobbyDeol,KareenaKapoorKhan,Bipash...
4       AmitabhBachchan,ManojBajpayee,RaveenaTandon,Na...
                              ...                        
3368    VJSunny,Saptagiri,AksaKhan,Nakshatra,PosaniKri...
3369    Samuthirakani,AnasuyaBharadwaj,MeeraJasmine,Ma...
3370    Prabhas,SaifAliKhan,KritiSanon,SunnySinghNijja...
3371    TakuyaEguchi,AtsumiTanezaki,SaoriHayami,Kenich...
3372    SreeVishnu,RebaMonicaJohn,Naresh,SrikanthIyeng...
Name: cast_display, Length: 3373, dtype: object