### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

### Reading the data

In [3]:
movies = pd.read_csv('movie.csv') # the csv contains movies and genres

In [4]:
# shape of the data
movies.shape

(27278, 3)

In [5]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# checking for null values
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [7]:
# checking for different combinations of genres
# pd.set_option('display.max_rows', None ) # for displaying the whole data
movies['genres'].value_counts() #different combination of genres

Drama                                        4520
Comedy                                       2294
Documentary                                  1942
Comedy|Drama                                 1264
Drama|Romance                                1075
                                             ... 
Adventure|Animation|Children|Comedy|Crime       1
Drama|Film-Noir|Horror|Mystery                  1
Action|Drama|Horror|IMAX                        1
Documentary|Drama|Mystery                       1
Action|Comedy|Film-Noir|Musical|Sci-Fi          1
Name: genres, Length: 1342, dtype: int64

In [8]:
# number of distinct genre combinations
movies['genres'].nunique()

1342

In [9]:
# keep only the movies that have at least one genre listed
movies = movies.loc[movies['genres'] != '(no genres listed)']

In [10]:
# Renumber the rows 0..n-1 after the filtering above.
# drop=True discards the old labels instead of inserting them as an extra
# 'index' column, which also keeps this cell safe to re-run.
movies = movies.reset_index(drop=True)

In [11]:
type(movies)

pandas.core.frame.DataFrame

### Updating the movie names --> to lower case for easy search

### Data preprocessing

In [12]:
# Build the search key: title text before the first " (" (i.e. without the year), lower-cased.
# str.partition returns the whole title when " (" is absent, whereas slicing with
# str.find's -1 result would silently cut off the last character.
movie_names = movies['title'].apply(lambda title: title.partition(' (')[0]).str.lower()

In [13]:
# One list of genre labels per movie, e.g. 'Adventure|Children|Fantasy' -> ['Adventure', 'Children', 'Fantasy'].
# Iterating over the values (instead of looking rows up by label 0..n-1) keeps
# this cell independent of how the frame is indexed.
genre_list = [genres.split('|') for genres in movies['genres']]

In [14]:
# importing libraries to do encoding
from mlxtend.preprocessing import TransactionEncoder


In [15]:
genre_encoder = TransactionEncoder()

In [16]:
movies_genre_list = genre_encoder.fit_transform(genre_list)

In [17]:
movies_genre_list = pd.DataFrame(movies_genre_list,columns = genre_encoder.columns_)

### New dataframe with encoded genres

In [18]:
movies_genre_list

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,False,True,True,True,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False
1,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27027,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False
27028,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
27029,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
27030,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [20]:
# Convert the boolean indicators to 1/0 in one step; replacing only False
# would leave the True values unconverted.
movies_genre_list = movies_genre_list.astype(int)

In [21]:
movies_genre_list = movies_genre_list.astype(int)

In [22]:
#movies_genre_list

In [23]:
movies_genre_list['movie_name'] = movies['title']

In [24]:
movies_genre_list['movie_name_lower'] = movie_names #adding lower case movie names column for easy search

In [25]:
movies_genre_list

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_name,movie_name_lower
0,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,Toy Story (1995),toy story
1,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,Jumanji (1995),jumanji
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,Grumpier Old Men (1995),grumpier old men
3,0,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,Waiting to Exhale (1995),waiting to exhale
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Father of the Bride Part II (1995),father of the bride part ii
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27027,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Forklift Driver Klaus: The First Day on the Jo...,forklift driver klaus: the first day on the job
27028,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Kein Bund für's Leben (2007),kein bund für's leben
27029,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Feuer, Eis & Dosenbier (2002)","feuer, eis & dosenbier"
27030,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,The Pirates (2014),the pirates


In [26]:
# Feature matrix for clustering: only the genre indicator columns produced by the encoder.
# Selecting them explicitly (rather than dropping the two name columns) keeps this cell
# idempotent: re-running it after the 'cluster' column has been added to movies_genre_list
# does not leak the labels into the features.
movies_genre_notitle = movies_genre_list[list(genre_encoder.columns_)]

### Clustering 

In [27]:
from sklearn.cluster import KMeans

In [28]:
model = KMeans(n_clusters = 10,n_init=40,random_state=100)

In [29]:
#help(KMeans)

In [30]:
movies_genre_notitle.head() # checking the data

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
model.fit(movies_genre_notitle) 

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=40, n_jobs=None, precompute_distances='auto',
       random_state=100, tol=0.0001, verbose=0)

In [32]:
movies_genre_list['cluster'] = model.labels_ #giving cluster names to each movies

In [33]:
movies_genre_list.head(4)

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_name,movie_name_lower,cluster
0,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,Toy Story (1995),toy story,7
1,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,Jumanji (1995),jumanji,7
2,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,Grumpier Old Men (1995),grumpier old men,2
3,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,Waiting to Exhale (1995),waiting to exhale,1


In [34]:
# Column labels of the clustering frame, i.e. the genre names.
# NOTE(review): this rebinds `genre_list`, which earlier held the per-movie genre
# lists fed to the encoder; re-running the encoder cell without first rebuilding
# that list would fit it on these column names instead.
genre_list = list(movies_genre_notitle)

In [35]:
genre_list

['Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [36]:
# Share of movies carrying each genre within every cluster,
# collected as (cluster id, genre, share) tuples for cluster profiling.
count_genres = []
for i in range(model.n_clusters):  # follow the fitted model instead of a hard-coded 10
    in_cluster = movies_genre_list['cluster'] == i  # computed once per cluster, not once per genre
    cluster_size = in_cluster.sum()
    for j in genre_list:
        count_genres.append((i, j, movies_genre_list.loc[in_cluster, j].sum() / cluster_size))

In [37]:
count_genres

[(0, 'Action', 0.6141579152080903),
 (0, 'Adventure', 0.17036172695449242),
 (0, 'Animation', 0.017891870867366783),
 (0, 'Children', 0.002333722287047841),
 (0, 'Comedy', 0.028393621159082068),
 (0, 'Crime', 0.13146635550369506),
 (0, 'Documentary', 0.00038895371450797355),
 (0, 'Drama', 0.07312329832749903),
 (0, 'Fantasy', 0.023337222870478413),
 (0, 'Film-Noir', 0.016725009723842863),
 (0, 'Horror', 0.0019447685725398677),
 (0, 'IMAX', 0.025670945157526253),
 (0, 'Musical', 0.028004667444574097),
 (0, 'Mystery', 0.09257098405289771),
 (0, 'Romance', 0.04278490859587709),
 (0, 'Sci-Fi', 0.23298327499027616),
 (0, 'Thriller', 0.5087514585764294),
 (0, 'War', 0.055620381174640215),
 (0, 'Western', 0.11979774406845585),
 (1, 'Action', 0.046270442760271244),
 (1, 'Adventure', 0.049062624650977264),
 (1, 'Animation', 0.006781013163143199),
 (1, 'Children', 0.0043877143996808934),
 (1, 'Comedy', 0.2947746310331073),
 (1, 'Crime', 0.04547267650578381),
 (1, 'Documentary', 0.001994415636218

### Naming clusters - cluster profiling

##### From the above list we can identify the predominant genres in each cluster.
##### Here we replace the cluster numbers with the genres that occur most often in each cluster.

##### 0 ---> Action-Thriller
##### 1 ---> Drama - Romance - comedy
##### 2 ---> Romance - comedy
##### 3 ---> Comedy - Drama
##### 4 ---> Documentary
##### 5 ---> Crime - Thriller - Mystery
##### 6 ---> Comedy
##### 7 ---> Adventure - Animation - Children
##### 8 ---> Horror
##### 9 ---> Drama 

In [38]:
clust_genres = ['Action-Thriller','Drama-Romance-Comedy','Romance-Comedy','Comedy-Drama','Documentary','Crime-Thriller-Mystery','Comedy','Adventure-Animation-Children','Horror','Drama']

In [39]:
# swap every numeric cluster id for its descriptive label in a single replace call
movies_genre_list['cluster'] = movies_genre_list['cluster'].replace(dict(enumerate(clust_genres)))

In [40]:
movies_genre_list # checking the cluster names

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_name,movie_name_lower,cluster
0,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,Toy Story (1995),toy story,Adventure-Animation-Children
1,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,Jumanji (1995),jumanji,Adventure-Animation-Children
2,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,Grumpier Old Men (1995),grumpier old men,Romance-Comedy
3,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,Waiting to Exhale (1995),waiting to exhale,Drama-Romance-Comedy
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,Father of the Bride Part II (1995),father of the bride part ii,Comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27027,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,Forklift Driver Klaus: The First Day on the Jo...,forklift driver klaus: the first day on the job,Comedy
27028,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,Kein Bund für's Leben (2007),kein bund für's leben,Comedy
27029,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,"Feuer, Eis & Dosenbier (2002)","feuer, eis & dosenbier",Comedy
27030,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,The Pirates (2014),the pirates,Adventure-Animation-Children


In [41]:
movies_genre_list['cluster'].value_counts() # number of movies in each cluster

Drama                           6946
Comedy                          4065
Action-Thriller                 2571
Drama-Romance-Comedy            2507
Documentary                     2417
Horror                          2183
Crime-Thriller-Mystery          1794
Comedy-Drama                    1746
Adventure-Animation-Children    1463
Romance-Comedy                  1340
Name: cluster, dtype: int64

### Find the similar type of movies

#### Function  to recommend similar movies

#### Using Jaccard distance

In [66]:
def movie_recommendation(movie, top_n=7):
    """Recommend movies from the same cluster with the most similar genre profile.

    Parameters
    ----------
    movie : str
        Movie name without the year; matched case-insensitively against
        ``movies_genre_list['movie_name_lower']``.
    top_n : int, default 7
        Maximum number of titles to return.

    Returns
    -------
    list of str or str
        Titles ordered by increasing Jaccard distance to the query's genre
        vector (ties broken by row index). The string ``" Enter an older movie"``
        is returned when the name is unknown or when no other movie shares
        the query's cluster.
    """
    from scipy.spatial import distance

    is_query = movies_genre_list['movie_name_lower'] == movie.lower()
    if not is_query.any():
        print(" The inputed movie not available in database")
        return(" Enter an older movie")

    # Several movies can share the same lower-cased, year-less name; use the
    # first match so the query is always a single 1-D genre vector.
    movie_index = is_query.idxmax()
    query_vector = movies_genre_notitle.loc[movie_index].values
    cluster_in = movies_genre_list.loc[movie_index, 'cluster']

    # Candidates are the other movies of the same cluster. The query row is
    # removed explicitly: slicing off the first sorted entry is wrong whenever an
    # earlier movie has an identical genre vector (distance 0, smaller index).
    candidates = movies_genre_notitle[movies_genre_list['cluster'] == cluster_in].drop(index=movie_index)

    # Compute every distance first and sort once, instead of re-sorting the
    # growing list on each iteration.
    ranked = sorted(
        (distance.jaccard(query_vector, genre_vector), row_index)
        for row_index, genre_vector in zip(candidates.index, candidates.values)
    )
    movie_list = [movies['title'].loc[row_index] for _, row_index in ranked[:top_n]]

    if len(movie_list)==0 :
        return(" Enter an older movie")
    else:
        return(movie_list)

In [67]:
movie_recommendation('JuMaNji')

['Kids of the Round Table (1995)',
 'Indian in the Cupboard, The (1995)',
 'NeverEnding Story III, The (1994)',
 'Escape to Witch Mountain (1975)',
 "Darby O'Gill and the Little People (1959)",
 'Return to Oz (1985)',
 'NeverEnding Story, The (1984)']

In [68]:
movie_recommendation(input("Enter a movie name for recommendation : "))

Enter a movie name for recommendation : phani
 The inputed movie not available in database


' Enter an older movie'

### Adding a GUI to our recommendation engine

In [75]:
from tkinter import Button, Entry, Label, RAISED, Tk, Toplevel
import PIL.Image, PIL.ImageTk
root=Tk()
def movie_recommendation():
    """Button callback: show recommendations for the title typed into ``entry_1``.

    Reads the entry text, looks the movie up case-insensitively in
    ``movies_genre_list``, ranks the other movies of the same cluster by Jaccard
    distance between genre vectors and lists up to seven titles in a child window.

    NOTE: this definition rebinds the name ``movie_recommendation``; it takes no
    arguments, unlike the one-argument function defined earlier in the notebook.
    """
    from scipy.spatial import distance
    result_font = ('calibri', 15, 'italic')
    movie=entry_1.get()

    # A Toplevel child window shares root's event loop; creating a second Tk()
    # with its own nested mainloop() per click is not needed.
    r=Toplevel(root)

    is_query = movies_genre_list['movie_name_lower'] == movie.lower()
    if not is_query.any():
        # Unknown title: report it in the window instead of raising inside the callback
        Label(r, text=" The inputed movie not available in database", font=result_font).pack()
        return

    # Several movies can share the same lower-cased name; use the first match
    # so the query is a single 1-D genre vector.
    movie_index = is_query.idxmax()
    query_vector = movies_genre_notitle.loc[movie_index].values
    cluster_in = movies_genre_list.loc[movie_index, 'cluster']

    # Other movies of the same cluster; the query row is excluded explicitly so
    # it can never be recommended to itself.
    sim_movies_1 = movies_genre_notitle[movies_genre_list['cluster'] == cluster_in].drop(index=movie_index)

    # Compute all distances, then sort once (ties are broken by row index)
    sort_dist = sorted(
        (distance.jaccard(query_vector, genre_vector), row_index)
        for row_index, genre_vector in zip(sim_movies_1.index, sim_movies_1.values)
    )[:7]
    l = [movies['title'].loc[row_index] for _, row_index in sort_dist]

    # One label per recommendation; works for fewer than seven results as well
    for title in l:
        Label(r, text=title, font=result_font).pack()
label_1 = Label(root, text=" Enter a movie name :",bg='Light blue',fg='white',width=20,bd=5,font = ('calibri', 30),cursor='pirate',relief=RAISED)
entry_1 = Entry(root,width=21,bd=5,bg='white',fg='black',font = ('calibri', 30),relief=RAISED)
label_1.pack()
entry_1.pack()
button7 = Button(root, text='Get recommendations',bg='light green',fg='white',width=20,bd=5,font = ('calibri', 30),cursor='pirate',relief=RAISED,command=movie_recommendation)
button7.pack()
root.mainloop()

In [None]:
movie_recommendation()