In [95]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

In [110]:
df = pd.read_csv('movie_metadata.csv')
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [5]:
df.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

Relations I want to explore
1) Duration vs Score, budget
2) Most famous type of movies - IMDB score based, Gross based and budget based to get producers view
Get the top 250 movie by Budget, IMDB Score and Gross 

In [113]:
df = df[df['num_voted_users'] > 100][['plot_keywords', 'gross', 'budget','duration','imdb_score', 'movie_title']]
tags = df['plot_keywords'].copy().str.split('|').apply(pd.Series, 1).stack()
tags.index = tags.index.droplevel(-1)
tags.index
tags.name = 'tags'
df = df.join(tags)
df.head()

Unnamed: 0,plot_keywords,gross,budget,duration,imdb_score,movie_title,tags
0,avatar|future|marine|native|paraplegic,760505847.0,237000000.0,178.0,7.9,Avatar,avatar
0,avatar|future|marine|native|paraplegic,760505847.0,237000000.0,178.0,7.9,Avatar,future
0,avatar|future|marine|native|paraplegic,760505847.0,237000000.0,178.0,7.9,Avatar,marine
0,avatar|future|marine|native|paraplegic,760505847.0,237000000.0,178.0,7.9,Avatar,native
0,avatar|future|marine|native|paraplegic,760505847.0,237000000.0,178.0,7.9,Avatar,paraplegic


In [114]:
count_vect = CountVectorizer(ngram_range=(1,2), stop_words ={'and', 'in', 'of', 'the', 'on','to', 'title','reference',\
                                                             'female','male'})
X_train_counts = count_vect.fit_transform(df.tags.dropna())
X_train_counts

<23933x11144 sparse matrix of type '<type 'numpy.int64'>'
	with 45049 stored elements in Compressed Sparse Row format>

In [126]:
MaxFeatureValues = X_train_counts.todense().sum(axis = 0).tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(MaxFeatureValues)), MaxFeatureValues) if pair[1] > 0]
TopTags = sorted(phrase_scores, key=lambda t: t[1] * -1)[:30]
ind = [x[0] for x in TopTags]
featurelist = count_vect.get_feature_names()
TopTagsNames = [featurelist[j] for j in ind]
TopTagsNames

[u'police',
 u'war',
 u'friend',
 u'death',
 u'love',
 u'based',
 u'murder',
 u'nudity',
 u'relationship',
 u'school',
 u'escape',
 u'girl',
 u'space',
 u'time',
 u'century',
 u'christmas',
 u'detective',
 u'alien',
 u'family',
 u'gang',
 u'high',
 u'memory',
 u'nazi',
 u'soldier',
 u'travel',
 u'vietnam',
 u'american',
 u'boxing',
 u'brother',
 u'crime']

In [125]:
Top250 = df[['movie_title','imdb_score']].drop_duplicates().sort_values('imdb_score').tail(250)
Top250IMDBScore = Top250.join(df[['tags']])
X_train_counts = count_vect.fit_transform(Top250IMDBScore.tags.dropna())
X_train_counts

<1221x1562 sparse matrix of type '<type 'numpy.int64'>'
	with 2349 stored elements in Compressed Sparse Row format>

In [127]:
MaxFeatureValues = X_train_counts.todense().sum(axis = 0).tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(MaxFeatureValues)), MaxFeatureValues) if pair[1] > 0]
TopTags = sorted(phrase_scores, key=lambda t: t[1] * -1)[:30]
ind = [x[0] for x in TopTags]
featurelist = count_vect.get_feature_names()
TopTagsNamesIMDB250 = [featurelist[j] for j in ind]
TopTagsNamesIMDB250

[u'police',
 u'war',
 u'friend',
 u'death',
 u'love',
 u'based',
 u'murder',
 u'nudity',
 u'relationship',
 u'school',
 u'escape',
 u'girl',
 u'space',
 u'time',
 u'century',
 u'christmas',
 u'detective',
 u'alien',
 u'family',
 u'gang',
 u'high',
 u'memory',
 u'nazi',
 u'soldier',
 u'travel',
 u'vietnam',
 u'american',
 u'boxing',
 u'brother',
 u'crime']

In [128]:
Top250 = df[['movie_title','gross']].drop_duplicates().sort_values('gross').tail(250)
Top250Grossing = Top250.join(df[['tags']])
X_train_counts = count_vect.fit_transform(Top250Grossing.tags.dropna())
X_train_counts

<1009x1312 sparse matrix of type '<type 'numpy.int64'>'
	with 1987 stored elements in Compressed Sparse Row format>

In [129]:
MaxFeatureValues = X_train_counts.todense().sum(axis = 0).tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(MaxFeatureValues)), MaxFeatureValues) if pair[1] > 0]
TopTags = sorted(phrase_scores, key=lambda t: t[1] * -1)[:30]
ind = [x[0] for x in TopTags]
featurelist = count_vect.get_feature_names()
TopTagsNames250Grossing = [featurelist[j] for j in ind]
TopTagsNames250Grossing

[u'nudity',
 u'relationship',
 u'sex',
 u'based',
 u'film',
 u'police',
 u'death',
 u'frontal',
 u'frontal nudity',
 u'love',
 u'girl',
 u'new',
 u'party',
 u'city',
 u'cult',
 u'friend',
 u'new york',
 u'school',
 u'series',
 u'war',
 u'woman',
 u'word',
 u'york',
 u'father',
 u'man',
 u'one',
 u'panties',
 u'trip',
 u'zombie',
 u'blood']

In [131]:
Top250 = df[['movie_title','budget']].drop_duplicates().sort_values('budget').tail(250)
Top250Budget = Top250.join(df[['tags']])
X_train_counts = count_vect.fit_transform(Top250Budget.tags.dropna())
X_train_counts

<1137x1423 sparse matrix of type '<type 'numpy.int64'>'
	with 2299 stored elements in Compressed Sparse Row format>

In [132]:
MaxFeatureValues = X_train_counts.todense().sum(axis = 0).tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(MaxFeatureValues)), MaxFeatureValues) if pair[1] > 0]
TopTags = sorted(phrase_scores, key=lambda t: t[1] * -1)[:30]
ind = [x[0] for x in TopTags]
featurelist = count_vect.get_feature_names()
TopTagsNames250Budget = [featurelist[j] for j in ind]
TopTagsNames250Budget

[u'nudity',
 u'school',
 u'sex',
 u'new',
 u'new york',
 u'relationship',
 u'york',
 u'frontal',
 u'frontal nudity',
 u'friend',
 u'police',
 u'high',
 u'one',
 u'city',
 u'friendship',
 u'high school',
 u'based',
 u'brother',
 u'love',
 u'death',
 u'girl',
 u'series',
 u'word',
 u'york city',
 u'best',
 u'by',
 u'father',
 u'hair',
 u'mother',
 u'singer']