In [2]:
#https://www.kaggle.com/arthurtok/principal-component-analysis-with-kmeans-visuals
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.decomposition import PCA # Principal Component Analysis module
from sklearn.cluster import KMeans # KMeans clustering 
import matplotlib.pyplot as plt # Python defacto plotting library
# import seaborn as sns # More snazzy plotting library
%matplotlib inline 

In [3]:
import json
import pandas as pd
#___________________________
def load_tmdb_movies(path):
    df = pd.read_csv(path)
    df['release_date'] = pd.to_datetime(df['release_date']).apply(lambda x: x.date())
    json_columns = ['genres', 'keywords', 'production_countries',
                    'production_companies', 'spoken_languages']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df
#___________________________
def load_tmdb_credits(path):
    df = pd.read_csv(path)
    json_columns = ['cast', 'crew']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df
#___________________
LOST_COLUMNS = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'aspect_ratio',
    'cast_total_facebook_likes',
    'color',
    'content_rating',
    'director_facebook_likes',
    'facenumber_in_poster',
    'movie_facebook_likes',
    'movie_imdb_link',
    'num_critic_for_reviews',
    'num_user_for_reviews']
#____________________________________
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'gross',
    'title': 'movie_title',
    'runtime': 'duration',
    'original_language': 'language',
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users'}
#_____________________________________________________
IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}
#_____________________________________________________
def safe_access(container, index_values):
    # return missing value rather than an error upon indexing/key failure
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    except IndexError or KeyError:
        return pd.np.nan
#_____________________________________________________
def get_director(crew_data):
    directors = [x['name'] for x in crew_data if x['job'] == 'Director']
    return safe_access(directors, [0])
#_____________________________________________________
def pipe_flatten_names(keywords):
    return '|'.join([x['name'] for x in keywords])
#_____________________________________________________
def convert_to_original_format(movies, credits):
    tmdb_movies = movies.copy()
    tmdb_movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES, inplace=True)
    tmdb_movies['title_year'] = pd.to_datetime(tmdb_movies['release_date']).apply(lambda x: x.year)
    # I'm assuming that the first production country is equivalent, but have not been able to validate this
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    tmdb_movies['actor_1_name'] = credits['cast'].apply(lambda x: safe_access(x, [1, 'name']))
    tmdb_movies['actor_2_name'] = credits['cast'].apply(lambda x: safe_access(x, [2, 'name']))
    tmdb_movies['actor_3_name'] = credits['cast'].apply(lambda x: safe_access(x, [3, 'name']))
    tmdb_movies['genres'] = tmdb_movies['genres'].apply(pipe_flatten_names)
    tmdb_movies['plot_keywords'] = tmdb_movies['plot_keywords'].apply(pipe_flatten_names)
    return tmdb_movies

In [6]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# import seaborn as sns
import math, nltk, warnings
from nltk.corpus import wordnet
from sklearn import linear_model
from sklearn.neighbors import NearestNeighbors
# from fuzzywuzzy import fuzz
from wordcloud import WordCloud, STOPWORDS
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
%matplotlib inline
warnings.filterwarnings('ignore')
PS = nltk.stem.PorterStemmer()
#__________________
# load the dataset
credits = load_tmdb_credits("./tmdb_5000_credits.csv")
movies = load_tmdb_movies("./tmdb_5000_movies.csv")
df_initial = convert_to_original_format(movies, credits)
print('Shape:',df_initial.shape)
#__________________________________________
# info on variable types and filling factor
tab_info=pd.DataFrame(df_initial.dtypes).T.rename(index={0:'column type'})
tab_info=tab_info.append(pd.DataFrame(df_initial.isnull().sum()).T.rename(index={0:'null values'}))
tab_info=tab_info.append(pd.DataFrame(df_initial.isnull().sum()/df_initial.shape[0]*100).T.
                         rename(index={0:'null values (%)'}))
tab_info

Shape: (4803, 26)


Unnamed: 0,budget,genres,homepage,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,gross,duration,spoken_languages,status,tagline,movie_title,vote_average,num_voted_users,title_year,country,director_name,actor_1_name,actor_2_name,actor_3_name
column type,int64,object,object,int64,object,object,object,object,float64,object,object,object,int64,float64,object,object,object,object,float64,int64,float64,object,object,object,object,object
null values,0,0,3091,0,0,86,0,3,0,0,0,1,0,2,0,0,844,0,0,0,1,174,30,53,63,93
null values (%),0,0,64.3556,0,0,1.79055,0,0.062461,0,0,0,0.0208203,0,0.0416406,0,0,17.5724,0,0,0,0.0208203,3.62274,0.62461,1.10348,1.31168,1.93629


In [9]:
movie = df_initial
movie.head()

Unnamed: 0,budget,genres,homepage,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,gross,duration,spoken_languages,status,tagline,movie_title,vote_average,num_voted_users,title_year,country,director_name,actor_1_name,actor_2_name,actor_3_name
0,237000000,Action|Adventure|Fantasy|Science Fiction,http://www.avatarmovie.com/,19995,culture clash|future|space war|space colony|so...,English,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{'id': 289, 'name': 'Ingenious Film Partners'...","[{'name': 'United States of America', 'iso_316...",2009-12-10,2787965087,162.0,"[{'name': 'English', 'iso_639_1': 'en'}, {'nam...",Released,Enter the World of Pandora.,Avatar,7.2,11800,2009.0,United States of America,James Cameron,Zoe Saldana,Sigourney Weaver,Stephen Lang
1,300000000,Adventure|Fantasy|Action,http://disney.go.com/disneypictures/pirates/,285,ocean|drug abuse|exotic island|east india trad...,English,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{'id': 2, 'name': 'Walt Disney Pictures'}, {'...","[{'name': 'United States of America', 'iso_316...",2007-05-19,961000000,169.0,"[{'name': 'English', 'iso_639_1': 'en'}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,2007.0,United States of America,Gore Verbinski,Orlando Bloom,Keira Knightley,Stellan Skarsgård
2,245000000,Action|Adventure|Crime,http://www.sonypictures.com/movies/spectre/,206647,spy|based on novel|secret agent|sequel|mi6|bri...,Français,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{'id': 5, 'name': 'Columbia Pictures'}, {'id'...","[{'name': 'United Kingdom', 'iso_3166_1': 'GB'...",2015-10-26,880674609,148.0,"[{'name': 'Français', 'iso_639_1': 'fr'}, {'na...",Released,A Plan No One Escapes,Spectre,6.3,4466,2015.0,United Kingdom,Sam Mendes,Christoph Waltz,Léa Seydoux,Ralph Fiennes
3,250000000,Action|Crime|Drama|Thriller,http://www.thedarkknightrises.com/,49026,dc comics|crime fighter|terrorist|secret ident...,English,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{'id': 923, 'name': 'Legendary Pictures'}, {'...","[{'name': 'United States of America', 'iso_316...",2012-07-16,1084939099,165.0,"[{'name': 'English', 'iso_639_1': 'en'}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,2012.0,United States of America,Christopher Nolan,Michael Caine,Gary Oldman,Anne Hathaway
4,260000000,Action|Adventure|Science Fiction,http://movies.disney.com/john-carter,49529,based on novel|mars|medallion|space travel|pri...,English,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{'id': 2, 'name': 'Walt Disney Pictures'}]","[{'name': 'United States of America', 'iso_316...",2012-03-07,284139100,132.0,"[{'name': 'English', 'iso_639_1': 'en'}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,2012.0,United States of America,Andrew Stanton,Lynn Collins,Samantha Morton,Willem Dafoe


In [10]:
str_list = [] # empty list to contain columns with strings (words)
for colname, colvalue in movie.iteritems():
    if type(colvalue[1]) == str:
         str_list.append(colname)
# Get to the numeric columns by inversion            
num_list = movie.columns.difference(str_list)         

In [11]:
movie_num = movie[num_list]
#del movie # Get rid of movie df as we won't need it now
movie_num.head()

Unnamed: 0,budget,duration,gross,id,num_voted_users,popularity,production_companies,production_countries,release_date,spoken_languages,title_year,vote_average
0,237000000,162.0,2787965087,19995,11800,150.437577,"[{'id': 289, 'name': 'Ingenious Film Partners'...","[{'name': 'United States of America', 'iso_316...",2009-12-10,"[{'name': 'English', 'iso_639_1': 'en'}, {'nam...",2009.0,7.2
1,300000000,169.0,961000000,285,4500,139.082615,"[{'id': 2, 'name': 'Walt Disney Pictures'}, {'...","[{'name': 'United States of America', 'iso_316...",2007-05-19,"[{'name': 'English', 'iso_639_1': 'en'}]",2007.0,6.9
2,245000000,148.0,880674609,206647,4466,107.376788,"[{'id': 5, 'name': 'Columbia Pictures'}, {'id'...","[{'name': 'United Kingdom', 'iso_3166_1': 'GB'...",2015-10-26,"[{'name': 'Français', 'iso_639_1': 'fr'}, {'na...",2015.0,6.3
3,250000000,165.0,1084939099,49026,9106,112.31295,"[{'id': 923, 'name': 'Legendary Pictures'}, {'...","[{'name': 'United States of America', 'iso_316...",2012-07-16,"[{'name': 'English', 'iso_639_1': 'en'}]",2012.0,7.6
4,260000000,132.0,284139100,49529,2124,43.926995,"[{'id': 2, 'name': 'Walt Disney Pictures'}]","[{'name': 'United States of America', 'iso_316...",2012-03-07,"[{'name': 'English', 'iso_639_1': 'en'}]",2012.0,6.1


In [29]:
movie_num = movie_num.fillna(value=0, axis=1)

In [35]:
X = movie_num.values
print(X.shape)
# Data Normalization
from sklearn.preprocessing import StandardScaler


for x in X:
    print(x)
    break

# StandardScaler().fit_transform(X)


# my_patient_data_X = []
# my_patient_data_X = [StandardScaler().fit_transform(X) for X in my_patient_data_X]
# print(my_patient_data_X)

# print(scaler.fit(X))
# sc = StandardScaler()
# print(sc)
# sc.fit(X)
# X_std = sc.fit_transform(X)


(4803, 12)
[237000000 162.0 2787965087 19995 11800 150.437577
 list([{'id': 289, 'name': 'Ingenious Film Partners'}, {'id': 306, 'name': 'Twentieth Century Fox Film Corporation'}, {'id': 444, 'name': 'Dune Entertainment'}, {'id': 574, 'name': 'Lightstorm Entertainment'}])
 list([{'name': 'United States of America', 'iso_3166_1': 'US'}, {'name': 'United Kingdom', 'iso_3166_1': 'GB'}])
 datetime.date(2009, 12, 10)
 list([{'name': 'English', 'iso_639_1': 'en'}, {'name': 'Español', 'iso_639_1': 'es'}])
 2009.0 7.2]


In [22]:
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
scaler = StandardScaler()
print(scaler.fit(data))

StandardScaler(copy=True, with_mean=True, with_std=True)
