In [1]:
from sklearn import linear_model
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans

In [2]:
# Tunable settings for the TF-IDF vectoriser and the clustering step.

# Number of clusters to find.
n_clusters = 10

# Maximum number of words kept in the vocabulary.
n_features = 10000

# Maximum fraction of documents a word may occur in.
max_df = 0.8

# Minimum number of documents a word must occur in.
min_df = 2

In [3]:
# Data source: https://www.kaggle.com/tmdb/tmdb-movie-metadata
# Rows whose 'overview' is missing are discarded right after loading.
df = pd.read_csv('data/tmdb_5000_movies.csv.gz').dropna(subset=['overview'])
print("Length of Dataset ", df.shape[0])
df.head(2)

Length of Dataset  4800


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
# Turn each movie overview into a TF-IDF weighted row vector.
# English stop words are removed and the vocabulary is bounded by
# min_df / max_df / n_features from the configuration cell.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=min_df,
    max_df=max_df,
    max_features=n_features,
    use_idf=True,
)
X = vectorizer.fit_transform(df['overview'].values)
X.shape

(4800, 10000)

In [5]:
%%time
km = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
km.fit(X)

CPU times: user 3.31 s, sys: 9.26 ms, total: 3.32 s
Wall time: 3.33 s


In [6]:
# Label every movie with the cluster the fitted model assigns to its overview vector.
df['cluster_id'] = km.predict(X)

# Original titles bucketed by cluster id; get_movies() reads from this grouping.
clusters = df.groupby(by='cluster_id')['original_title']
def get_movies(cluster_id, n_movies):
    """Return up to ``n_movies`` titles from the cluster ``cluster_id``.

    Reads the module-level ``clusters`` grouping (titles grouped by cluster id).

    Args:
        cluster_id: Key of the group to look up in ``clusters``.
        n_movies: Maximum number of titles to return.

    Returns:
        A list of titles in the order they appear within the group. An empty
        list is returned when no group exists for ``cluster_id`` (instead of
        letting ``get_group`` raise ``KeyError``).
    """
    try:
        titles = clusters.get_group(cluster_id)
    except KeyError:
        # Nothing was assigned to this cluster, so there is nothing to list.
        return []
    # Explicit positional slice: independent of the group's index labels.
    return titles.iloc[:n_movies].tolist()

In [7]:
print("Movie Clusters\n\n")

n_top_terms = 10      # terms listed per cluster
n_example_movies = 5  # titles listed per cluster

# For every centroid, column indices ordered from the largest weight to the smallest.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for i in range(n_clusters):
    print("Cluster %d" % i)
    print("Terms:", end='')
    for ind in order_centroids[i, :n_top_terms]:
        print(' %s' % terms[ind], end='')
    # The leading newline terminates the "Terms:" line printed above.
    print("\nMovies :", get_movies(i, n_example_movies))
    print()

Movie Clusters


Cluster 0
Terms: life years man story friends woman wife night lives death
Movies : ["Pirates of the Caribbean: At World's End", 'The Chronicles of Narnia: Prince Caspian', 'The Hobbit: The Battle of the Five Armies', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass']

Cluster 1
Terms: young man woman boy girl life people lives men friend
Movies : ['Man of Steel', 'Jack the Giant Slayer', 'Indiana Jones and the Kingdom of the Crystal Skull', 'Jupiter Ascending', 'The Polar Express']

Cluster 2
Terms: world war earth planet ii save crew story human evil
Movies : ['John Carter', 'Avengers: Age of Ultron', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'The Avengers']

Cluster 3
Terms: family father son old mother year home life daughter brother
Movies : ['The Amazing Spider-Man', 'Robin Hood', 'Furious 7', 'Hugo', 'Warcraft']

Cluster 4
Terms: love film story life true based man falls set woman
Movies : ['Spider-Man 3', 'Pirates of the Caribbean: On 