# Configuration Variables

In [None]:
# --- Runtime switches ---
ENABLE_COLAB = True        # gates the pip installs, pycaret's enable_colab() and the Drive mount
USE_GPU = True             # forwarded to pycaret setup(use_gpu=...)
GENERATE_PROFILE = False   # when True, pandas-profiling HTML reports are written

# --- Modelling knobs ---
NUM_CLUSTERS = 5            # num_clusters handed to create_model()
MOVIES_FOR_ANALYSIS = 0.08  # fraction of the filtered movies sampled for modelling
DATA_FOR_ANALYSIS = 1.0     # fraction of the joined rows sampled into data_analysis

# --- Input / output locations (Google Drive) ---
DATA_FILE_ROOT_PATH = '/content/gdrive/MyDrive/ML1000/Project_Movies2/Original_Data/'
PANDA_PROFILE_OUTPUT_ROOT = '/content/gdrive/MyDrive/ML1000/Project_Movies2/Original_Data/'

SAVE_MODEL_ROOT = '/content/gdrive/MyDrive/Colab Notebooks/ML1000_Project2a/'
SAVE_DATA_CLUSTERED_CSV = '/content/gdrive/MyDrive/Colab Notebooks/ML1000_Project2a/'

# --- Plot toggles ---
ENABLE_PLOTS = True   # silhouette / distribution plots
PLOT_ELBOW = True     # elbow plot

# --- Model selection (string handed to create_model) ---
MODEL_TYPE = 'kmeans'
#MODEL_TYPE='ap'

#didn't want to cluster with "usual" movie info
#MODEL_TYPE='dbscan'

# Environment

In [None]:
if ENABLE_COLAB:
  !pip install pycaret -q
  !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip -q
  !pip install matplotlib -q
  !pip install pandasql -q
  
else:
  display('Google Colab not enabled')

In [None]:
import json
import time

import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

import pandasql as ps

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

%matplotlib inline

In [None]:
# Call pycaret's enable_colab() helper only when running on Colab.
if not ENABLE_COLAB:
  display('Google Colab not enabled')
else:
  from pycaret.utils import enable_colab
  enable_colab()

In [None]:
# Mount Google Drive at /content/gdrive (the root of every path in the config cell).
if not ENABLE_COLAB:
  display('Google Colab not enabled')
else:
  from google.colab import drive
  drive.mount('/content/gdrive', force_remount=True)


# Data Loading and Display

In [None]:
#change DATA_FILE_ROOT_PATH to your project folder
path_names = f"{DATA_FILE_ROOT_PATH}IMDb names.csv"
path_movies = f"{DATA_FILE_ROOT_PATH}IMDb movies.csv"
path_ratings = f"{DATA_FILE_ROOT_PATH}IMDb ratings.csv"
path_credits = f"{DATA_FILE_ROOT_PATH}IMDb title_principals.csv"

#low_memory=False avoids a dtype error on load: with chunked type inference
#pandas only looks at part of the file and may guess a column type wrongly
data_names = pd.read_csv(path_names, low_memory=False)
data_movies = pd.read_csv(path_movies, low_memory=False)
data_ratings = pd.read_csv(path_ratings, low_memory=False)
data_credits = pd.read_csv(path_credits, low_memory=False)


In [None]:
# Optional pandas-profiling HTML reports, one per loaded table.
if not GENERATE_PROFILE:
  display("Generate profile is off")
else:
  #uses too much memory so commented out
  #profile_names = ProfileReport(data_names, title="Movies Dataset - Names", html={'style': {'full_width': True}})
  #profile_names.to_file(output_file= PANDA_PROFILE_OUTPUT_ROOT + "DataProfile_Names.html")

  profile_movies = ProfileReport(
      data_movies,
      title="Movies Dataset - Movies",
      html={'style': {'full_width': True}},
  )
  profile_movies.to_file(output_file=f"{PANDA_PROFILE_OUTPUT_ROOT}DataProfile_Movies.html")

  profile_ratings = ProfileReport(
      data_ratings,
      title="Movies Dataset - Ratings",
      html={'style': {'full_width': True}},
  )
  profile_ratings.to_file(output_file=f"{PANDA_PROFILE_OUTPUT_ROOT}DataProfile_Ratings.html")

  profile_credits = ProfileReport(
      data_credits,
      title="Movies Dataset - Credits",
      html={'style': {'full_width': True}},
  )
  profile_credits.to_file(output_file=f"{PANDA_PROFILE_OUTPUT_ROOT}DataProfile_Credits.html")

In [None]:
display (data_names.shape)
data_names.head(2)

In [None]:
display (data_movies.shape)
data_movies.head(2)

In [None]:
display(data_ratings.shape)
data_ratings.head(2)

In [None]:
display (data_movies.shape)
data_movies.head(2)

In [None]:
display(data_credits.shape)
data_credits.head(2)

# Data Manipulation

In [None]:
display(data_movies.info())

In [None]:
#remove bad data (single record)
data_movies = data_movies.loc[(data_movies["year"] != "TV Movie 2019")]
display(data_movies.info())

In [None]:
data_movies = data_movies.dropna(subset=['genre'])
display(data_movies.info())

In [None]:
#convert year to a number for clustering
data_movies["year"] = pd.to_numeric(data_movies["year"])
display(data_movies.info())

In [None]:
#remove unneeded columns from movies

#data_movies_narrow = data_movies[['imdb_title_id',
#                           'title',
#                           #'genre',
#                           'year',
#                           'country',
#                           #'director',
#                           #'writer',
#                           'avg_vote',
#                           #'votes'
#                           #'metascore',
#                           #'reviews_from_users',
#                           #'reviews_from_critics'
#]]
#display(data_movies_narrow.info())
#data_movies_narrow.head(2)

In [None]:
data_credits.info()

In [None]:
display(data_credits.shape)
data_credits.head(5)
#Can drop columns:
#-characters (don't care)
#-job (don't care)
#-ordering (although we should filter by it)

#Filtering:
#-any director
#-actor/actress with a 1,2 beside them

In [None]:
#data_actors = data_credits.loc[data_credits['category'].isin(['actor','actress']) & data_credits['ordering']<=2]
data_actors = ps.sqldf("select imdb_title_id, imdb_name_id from data_credits where category in('actress', 'actor') and ordering in(1,2)")
#data_actors = ps.sqldf("select imdb_title_id, imdb_name_id from data_credits where category in('actress', 'actor')")
data_directors = ps.sqldf("select imdb_title_id, imdb_name_id from data_credits where category='director'")
display(data_actors.shape)
display(data_directors.shape)

In [None]:
result_step1 = pd.merge(data_movies, data_actors, how="inner", on=["imdb_title_id"])
result_step1.rename(columns={"imdb_name_id": "imdb_actor_id"},errors="raise",inplace=True)
display(result_step1.shape)
result_step1.head(5)

In [None]:
result_step2 = pd.merge(result_step1, data_directors, how="inner", on=["imdb_title_id"])
result_step2.rename(columns={"imdb_name_id": "imdb_director_id"},errors="raise",inplace=True)
display(result_step2.shape)
result_step2.head(5)

In [None]:
#time to split out and include the parsed genre
data_genres = data_movies[['imdb_title_id','genre']]
data_genres.head(2)

In [None]:
data_genres_expanded = data_genres['genre'].str.split(',', expand=True)
data_genres_expanded.columns = ['genre'+str(i) for i in data_genres_expanded.columns]

data_genres_expanded_concat = pd.concat([data_genres,data_genres_expanded], axis=1)

movie_genre_full_df = pd.melt(data_genres_expanded_concat, id_vars=['imdb_title_id'], value_vars=data_genres_expanded.columns, var_name='Genre Number', value_name='Genre').dropna()
movie_genre_full_df.drop('Genre Number', axis=1, inplace=True)

In [None]:
movie_genre_full_df['Genre'] = movie_genre_full_df['Genre'].str.strip()
movie_genre_full_df.groupby(['Genre']).size()

In [None]:
display(movie_genre_full_df.info())
movie_genre_full_df.head(5)

In [None]:
#testing before and after
data_genres[data_genres['imdb_title_id']=='tt0000574']

In [None]:
#testing before and after
movie_genre_full_df[movie_genre_full_df['imdb_title_id']=='tt0000574']

In [None]:
result_step3 = pd.merge(result_step2, movie_genre_full_df, how="inner", on=["imdb_title_id"])
display(result_step3.shape)
result_step3.head(5)

In [None]:
#Clean up the country from movies and merge back in
data_movies.groupby(['country']).size()

In [None]:
data_movies_country = data_movies[['imdb_title_id','country']]
display(data_movies_country.shape)
data_movies_country.head(5)
#test imdb_title_id  from Germany, Denmark

In [None]:
data_country_expanded = data_movies_country['country'].str.split(',', expand=True)
data_country_expanded.columns = ['Country'+str(i) for i in data_country_expanded.columns]

data_country_expanded_concat = pd.concat([data_movies_country,data_country_expanded], axis=1)

data_country_full = pd.melt(data_country_expanded_concat, id_vars=['imdb_title_id'], value_vars=data_country_expanded.columns, var_name='Country Number', value_name='Country').dropna()
data_country_full.drop('Country Number', axis=1, inplace=True)
data_country_full['Country'] = data_country_full['Country'].str.strip()
data_country_full.groupby(['Country']).size()


In [None]:
result_step4 = pd.merge(result_step3, data_country_full, how="inner", on=["imdb_title_id"])
#result_step4.drop('country', axis=1, inplace=True)
display(result_step4.shape)
result_step4.head(5)

In [None]:
#Set result dataframe to whatever you want to analyze
result = result_step4

In [None]:
#Trim down the number of movies we are using for analysis
#limit movie size to something reasonable so it runs in a reasonable time
#this is a join filter for the working data set so it includes ALL records
#for any movie inside

#add in Als filter for movies
#na=False makes a missing language/country count as "no match" explicitly
#instead of leaving NaN inside the boolean mask; regex=False matches the text literally
data_movies_short = data_movies.loc[(data_movies.year > 1935) &
                                    (data_movies.avg_vote > 4) &
                                    (data_movies.language.str.contains("English", na=False, regex=False)) &
                                    (data_movies.country.str.contains("USA", na=False, regex=False))]
display(data_movies_short.shape)



#removed random state to see if there is model impact
#NOTE: without a random_state every run models a different set of movies
data_movies_active = data_movies_short.sample(frac=MOVIES_FOR_ANALYSIS)
#withheld = everything not sampled; computed before the indexes are renumbered
data_movies_withheld = data_movies_short.drop(data_movies_active.index).reset_index(drop=True)
data_movies_active = data_movies_active.reset_index(drop=True)

print('Number of Movies for Modeling: ' + str(data_movies_active.shape))
print('Number of Withheld Movies    : ' + str(data_movies_withheld.shape))

In [None]:
#trim columns for only imdb_title_id
data_movies_merge = data_movies_active[['imdb_title_id']]
#info() prints its report and returns None, so it is not wrapped in display()
data_movies_merge.info()
#a bare expression in the middle of a cell is not rendered, so display it explicitly
display(data_movies_merge.head(2))

#keep only the joined rows that belong to the sampled (active) movies
result_analysis = pd.merge(result, data_movies_merge, how="inner", on=["imdb_title_id"])
result_analysis.info()

# Data Setup

In [None]:
#Sample the rows used for modelling; whatever is left over is held back as unseen data.
data_analysis = result_analysis.sample(frac=DATA_FOR_ANALYSIS, random_state=54321)
#computed before the modelling rows are renumbered, while the original index still lines up
data_unseen = result_analysis.drop(data_analysis.index).reset_index(drop=True)
data_analysis = data_analysis.reset_index(drop=True)

print(f'Data for Modeling: {data_analysis.shape}')
print(f'Unseen Data For Predictions {data_unseen.shape}')

In [None]:
from pycaret.clustering import *

# Only the columns NOT listed in ignore_features reach the model; the
# commented-out names (imdb_actor_id, imdb_director_id, Genre) are therefore
# the features the clustering is built on.
setup_movies = setup(
    data_analysis,
    ignore_features=[
        'imdb_title_id', 'title', 'original_title',
        'year', 'date_published',
        'genre', 'duration', 'country', 'language',
        'director', 'writer', 'production_company', 'actors',
        'description',
        'avg_vote', 'votes',
        'budget', 'usa_gross_income',
        'worlwide_gross_income',  # spelling kept exactly as written in the original cell
        'metascore',
        #'imdb_actor_id',
        #'imdb_director_id',
        #'Genre',
        'Country',
        'reviews_from_users', 'reviews_from_critics',
    ],
    #bin_numeric_features = ['avg_vote'],
    #high_cardinality_features = (['imdb_actor_id', 'imdb_director_id']),
    #combine_rare_levels = True,
    #normalize=True,
    silent=True,
    use_gpu=USE_GPU,
    session_id=123,
)

In [None]:
kmeans = create_model(MODEL_TYPE, num_clusters=NUM_CLUSTERS)



In [None]:
plot_model(kmeans)

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot ='silhouette')
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution')
else:
  display("ENABLE_PLOTS not enabled")  

In [None]:
if PLOT_ELBOW:
  plot_model(kmeans, plot='elbow')
else:
  display("PLOT_ELBOW not enabled")

# Model Distribution

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature='imdb_director_id')
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature='imdb_actor_id')
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:  
  plot_model(kmeans, plot='distribution', feature='original_title')
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature='Genre')
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature='metascore')
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature='reviews_from_users')
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature='reviews_from_critics')
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature="Country")
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature="year")
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature="avg_vote")
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
if ENABLE_PLOTS:
  plot_model(kmeans, plot='distribution', feature="votes")
else:
  display("ENABLE_PLOTS not enabled")

In [None]:
save_model(kmeans, SAVE_MODEL_ROOT + "movies2_final_model.202110301240")

# Data Prep and Save for Recommender

In [None]:
#Constants for recommender functionality

#column that every getTopFor* helper fills with time.time()
UNIQUE_TS_COLUMN_NAME = 'uniqueTempCol'

#if using the full set ~472K rows it crashes with out of memory error
QUICK_TEST_RECOMMENDER = True

#how many of the highest distinct avg_vote values each getTopFor* helper keeps
WEIGHT_CLUSTER = 1
WEIGHT_CLUSTER_GENRE = 3
WEIGHT_CLUSTER_DIRECTOR = 10
WEIGHT_CLUSTER_DIRECTOR_ACTOR = 10
WEIGHT_CLUSTER_GENRE_ACTOR = 10
WEIGHT_CLUSTER_GENRE_DIRECTOR = 10

#number of rows magicRecommender keeps
NUM_RECOMMENDED_MOVIES = 10

In [None]:
#save active movies in our data set to csv for recommender
data_movies_active.to_csv(SAVE_DATA_CLUSTERED_CSV + 'data_movies_active.csv')

In [None]:
#choose the data set we are going to use
if QUICK_TEST_RECOMMENDER:
  display("Quick test set")
  movies_analysis_format = result_analysis
  
else:
  #full data set crashes runtime due to lack of memory. Need function to 
  #predict with subsets and append together
  display("Full test set")
  movies_analysis_format = result

display(movies_analysis_format.info())

In [None]:
#Need the cluster information for lookup
movies_analysis_format_wCluster = predict_model(kmeans, data=movies_analysis_format)
display(movies_analysis_format_wCluster.info())


In [None]:
movies_analysis_format_wCluster.to_csv(SAVE_DATA_CLUSTERED_CSV + "movies_analysis_format_wCluster.csv")

# Recommender Functions

In [None]:
def get_movie_details_for_analysis(imdb_title_id):
  """Return the rows of movies_analysis_format whose imdb_title_id equals the given id.

  The result is a (possibly empty) DataFrame.
  """
  is_requested_title = movies_analysis_format['imdb_title_id'] == imdb_title_id
  return movies_analysis_format[is_requested_title]

In [None]:
#Returns every movies_analysis_format_wCluster row of the given cluster whose
#avg_vote is one of that cluster's WEIGHT_CLUSTER highest distinct avg_vote values.
#Votes are often tied, so more than WEIGHT_CLUSTER rows can come back.

def getTopForCluster(Cluster):
  clustered = movies_analysis_format_wCluster

  in_cluster = clustered.Cluster.isin([Cluster])

  #distinct (Cluster, avg_vote) pairs, best vote first, limited to the configured count
  top_votes = (
      clustered.loc[in_cluster, ['Cluster', 'avg_vote']]
      .drop_duplicates()
      .sort_values(by=['avg_vote'], ascending=False)
      .head(WEIGHT_CLUSTER)
  )

  #join back to recover the full rows that carry one of those votes
  matches = pd.merge(top_votes, clustered, how="inner", on=['avg_vote', 'Cluster'])
  #stamp every returned row with the time of this call
  matches[UNIQUE_TS_COLUMN_NAME] = time.time()

  return matches


                  

In [None]:
#Returns every movies_analysis_format_wCluster row matching the given cluster and
#genre whose avg_vote is one of the WEIGHT_CLUSTER_GENRE highest distinct
#avg_vote values for that combination. Tied votes can return extra rows.

def getTopForClusterGenre(Cluster,Genre):
  clustered = movies_analysis_format_wCluster

  selected = clustered.Cluster.isin([Cluster]) & clustered.Genre.isin([Genre])

  #distinct key/vote combinations, best vote first, limited to the configured count
  top_votes = (
      clustered.loc[selected, ['Cluster', 'avg_vote', 'Genre']]
      .drop_duplicates()
      .sort_values(by=['avg_vote'], ascending=False)
      .head(WEIGHT_CLUSTER_GENRE)
  )

  #join back to recover the full rows that carry one of those votes
  matches = pd.merge(top_votes, clustered, how="inner", on=['avg_vote', 'Cluster', 'Genre'])
  #stamp every returned row with the time of this call
  matches[UNIQUE_TS_COLUMN_NAME] = time.time()

  return matches


                  

In [None]:
#Returns every movies_analysis_format_wCluster row matching the given cluster and
#director whose avg_vote is one of the WEIGHT_CLUSTER_DIRECTOR highest distinct
#avg_vote values for that combination. Tied votes can return extra rows.

def getTopForClusterByDirector(Cluster, Director):
  clustered = movies_analysis_format_wCluster

  selected = clustered.Cluster.isin([Cluster]) & clustered.imdb_director_id.isin([Director])

  #distinct key/vote combinations, best vote first, limited to the configured count
  top_votes = (
      clustered.loc[selected, ['Cluster', 'avg_vote', 'imdb_director_id']]
      .drop_duplicates()
      .sort_values(by=['avg_vote'], ascending=False)
      .head(WEIGHT_CLUSTER_DIRECTOR)
  )

  #join back to recover the full rows that carry one of those votes
  matches = pd.merge(top_votes, clustered, how="inner", on=['avg_vote', 'Cluster', 'imdb_director_id'])
  #stamp every returned row with the time of this call
  matches[UNIQUE_TS_COLUMN_NAME] = time.time()

  return matches

In [None]:
#Returns every movies_analysis_format_wCluster row matching the given cluster,
#director and actor whose avg_vote is one of the WEIGHT_CLUSTER_DIRECTOR_ACTOR
#highest distinct avg_vote values for that combination. Tied votes can return extra rows.

def getTopForClusterByDirectorActor(Cluster, Director, Actor):
  clustered = movies_analysis_format_wCluster

  selected = (
      clustered.Cluster.isin([Cluster])
      & clustered.imdb_director_id.isin([Director])
      & clustered.imdb_actor_id.isin([Actor])
  )

  #distinct key/vote combinations, best vote first, limited to the configured count
  top_votes = (
      clustered.loc[selected, ['Cluster', 'avg_vote', 'imdb_director_id', 'imdb_actor_id']]
      .drop_duplicates()
      .sort_values(by=['avg_vote'], ascending=False)
      .head(WEIGHT_CLUSTER_DIRECTOR_ACTOR)
  )

  #join back to recover the full rows that carry one of those votes
  matches = pd.merge(top_votes, clustered, how="inner", on=['avg_vote', 'Cluster', 'imdb_director_id', 'imdb_actor_id'])
  #stamp every returned row with the time of this call
  matches[UNIQUE_TS_COLUMN_NAME] = time.time()

  return matches

In [None]:
#Returns every movies_analysis_format_wCluster row matching the given cluster,
#genre and actor whose avg_vote is one of the WEIGHT_CLUSTER_GENRE_ACTOR
#highest distinct avg_vote values for that combination. Tied votes can return extra rows.

def getTopForClusterByGenreActor(Cluster, Genre, Actor):
  clustered = movies_analysis_format_wCluster

  selected = (
      clustered.Cluster.isin([Cluster])
      & clustered.Genre.isin([Genre])
      & clustered.imdb_actor_id.isin([Actor])
  )

  #distinct key/vote combinations, best vote first, limited to the configured count
  top_votes = (
      clustered.loc[selected, ['Cluster', 'avg_vote', 'Genre', 'imdb_actor_id']]
      .drop_duplicates()
      .sort_values(by=['avg_vote'], ascending=False)
      .head(WEIGHT_CLUSTER_GENRE_ACTOR)
  )

  #join back to recover the full rows that carry one of those votes
  matches = pd.merge(top_votes, clustered, how="inner", on=['avg_vote', 'Cluster', 'Genre', 'imdb_actor_id'])
  #stamp every returned row with the time of this call
  matches[UNIQUE_TS_COLUMN_NAME] = time.time()

  return matches

In [None]:
#Returns every movies_analysis_format_wCluster row matching the given cluster,
#genre and director whose avg_vote is one of the WEIGHT_CLUSTER_GENRE_DIRECTOR
#highest distinct avg_vote values for that combination. Tied votes can return extra rows.

def getTopForClusterByGenreDirector(Cluster, Genre, Director):
  clustered = movies_analysis_format_wCluster

  selected = (
      clustered.Cluster.isin([Cluster])
      & clustered.Genre.isin([Genre])
      & clustered.imdb_director_id.isin([Director])
  )

  #distinct key/vote combinations, best vote first, limited to the configured count
  top_votes = (
      clustered.loc[selected, ['Cluster', 'avg_vote', 'Genre', 'imdb_director_id']]
      .drop_duplicates()
      .sort_values(by=['avg_vote'], ascending=False)
      .head(WEIGHT_CLUSTER_GENRE_DIRECTOR)
  )

  #join back to recover the full rows that carry one of those votes
  matches = pd.merge(top_votes, clustered, how="inner", on=['avg_vote', 'Cluster', 'Genre', 'imdb_director_id'])
  #stamp every returned row with the time of this call
  matches[UNIQUE_TS_COLUMN_NAME] = time.time()

  return matches

In [None]:
def magicRecommender(dfInput,orig_imdb_title_id):
  """Rank candidate movies by how many times they occur in dfInput.

  dfInput holds one row per candidate hit and must have an imdb_title_id
  column. Rows for orig_imdb_title_id (the movie the request was made for)
  are removed first. The NUM_RECOMMENDED_MOVIES most frequent titles are then
  joined with data_movies and returned with a 'weighting' column holding the
  occurrence count.
  """
  #remove the movie we were called with
  is_other_movie = ~dfInput.imdb_title_id.isin([orig_imdb_title_id])

  #occurrence count per title, most frequent first, limited to the configured size
  weighted = (
      dfInput[is_other_movie]
      .groupby(['imdb_title_id'])
      .size()
      .to_frame('weighting')
      .sort_values(by=['weighting'], ascending=False)
      .head(NUM_RECOMMENDED_MOVIES)
  )

  #attach the human-readable movie details
  detailed = pd.merge(weighted, data_movies, how="inner", on=['imdb_title_id'])
  return detailed[['imdb_title_id', 'weighting','original_title','year', 'genre', 'director', 'actors', 'avg_vote']]


In [None]:
#Three main dataFrames needed
#
#all movies in model format (no cluster column)
#Dataframe: movies_analysis_format
#
#all movies in model format with cluster (for separation)
#produced by running the model-format rows through the model
#Dataframe: movies_analysis_format_wCluster
#
#all movies in original format
#data_movies (filter for content)
#data_movies (remove bad record)
#Dataframe: data_movies

#main function needs to receive a movie ID as input. return as dataframe
def getRecommendations(imdb_title_id):
  """Recommend movies for the movie identified by imdb_title_id.

  Every model-format row of the movie is assigned a cluster with
  predict_model. For each of those rows the getTopFor* helpers collect
  candidate rows from the same cluster (overall, and by genre/actor, genre,
  director, director/actor and genre/director). magicRecommender then ranks
  the candidates by how often they were collected.

  Returns the DataFrame produced by magicRecommender. If the movie is not in
  movies_analysis_format, a notice is displayed and an empty DataFrame with
  the same columns is returned.
  """
  #get dataframe of movie rows from movies_analysis_format
  tMovieForAnalysis = get_movie_details_for_analysis(imdb_title_id)
  if tMovieForAnalysis.empty:
    display("Could not find movie in file")
    #nothing to cluster: return an empty frame shaped like a normal result
    return pd.DataFrame(columns=['imdb_title_id', 'weighting', 'original_title',
                                 'year', 'genre', 'director', 'actors', 'avg_vote'])

  #get cluster movie is put into (use engine)
  reqMovie = predict_model(kmeans, data=tMovieForAnalysis)

  #collect all candidate frames and concatenate once at the end
  #(DataFrame.append is deprecated and re-copies the data on every call)
  candidateFrames = []

  #loop through all rows of the movie; it could be in multiple clusters
  for _, row in reqMovie.iterrows():
    cluster = row['Cluster']
    genre = row['Genre']
    actor = row['imdb_actor_id']
    director = row['imdb_director_id']

    candidateFrames.extend([
        #top of the cluster
        getTopForCluster(cluster),
        #top by genre/actor in cluster
        getTopForClusterByGenreActor(cluster, genre, actor),
        #top by genre in cluster
        getTopForClusterGenre(cluster, genre),
        #top by director in cluster
        getTopForClusterByDirector(cluster, director),
        #top by director/actor in cluster
        getTopForClusterByDirectorActor(cluster, director, actor),
        #top by genre/director in cluster
        getTopForClusterByGenreDirector(cluster, genre, director),
    ])

  retMovies = pd.concat(candidateFrames)

  #group em all and do some magic
  return magicRecommender(retMovies, imdb_title_id)



# Recommender Testing

In [None]:
junkDF = result_analysis[result_analysis.imdb_actor_id.isin(['nm0000136'])]
junkDF = junkDF[['imdb_title_id']]
junkDF.groupby(['imdb_title_id']).size()


In [None]:
#Need to know some of the movies that are in this dataset for testing
moviesToTest = data_movies_active['imdb_title_id'].sample(n=10).to_frame()
moviesToTest.shape
moviesToTest = pd.merge(moviesToTest, data_movies_short, how='inner', left_on='imdb_title_id', right_on='imdb_title_id')
moviesToTest = moviesToTest[['imdb_title_id',
                             'original_title',
                             'year',
                             'genre',
                             'director',
                             'actors',
                             'avg_vote'
                            ]]
#moviesToTest = moviesToTest.sort_values(by=['avg_vote'], ascending=False )
moviesToTest.head(10)

In [None]:
recommendedMovies = getRecommendations('tt2047890')
#display(recommendedMovies.info())
recommendedMovies.head(10)

# Scratchpad

In [None]:
import time
time.time()

In [None]:
gInv1 = data_movies_short[['genre']]
gInv1 = gInv1.loc[gInv1.genre.str.contains('Drama')]

gInv1 = gInv1.groupby(['genre']).size().to_frame('size')
gInv1 = gInv1.sort_values(by=['size'], ascending=False )
display(gInv1.info())
gInv1.head(10)

In [None]:
tempDF = data_movies.groupby(['country']).size().to_frame('size')
tempDF = tempDF.sort_values(by=['size'], ascending=False )
tempDF.head(10)


In [None]:
result_analysis.groupby(['Country']).size()

In [None]:
type(kmeans)

In [None]:
result_analysis.hist(bins = 30, figsize = (12,10), grid = False)
plt.show()


In [None]:
#find the coordinates of the centre of the cluster
kmeans.cluster_centers_

In [None]:
kmeans.predict

In [None]:
result_analysis.head(5)

In [None]:
#Functions needed
#get name from imdb_name_id
#print movie details
data_result = assign_model(kmeans,verbose=True)
data_result

In [None]:
def get_person_name(imdb_name_id, df):
  """Return the 'name' of the first row of df whose imdb_name_id matches.

  df needs 'imdb_name_id' and 'name' columns. An IndexError is raised when
  no row matches.
  """
  matching_names = df.loc[df['imdb_name_id'] == imdb_name_id, 'name']
  return matching_names.values[0]


In [None]:
#data_movies_short.head(5)
data_movies_director=data_movies_short['director'].to_frame()
data_movies_director = data_movies_director.groupby(['director']).size().to_frame('weighting')
data_movies_director = data_movies_director.sort_values(by=['weighting'],ascending=False)
data_movies_director.head(30)

In [None]:
#data_movies_short.head(5)
data_movies_actor = movies_analysis_format_wCluster ['imdb_actor_id'].to_frame()
data_movies_actor = data_movies_actor.groupby(['imdb_actor_id']).size().to_frame('weighting')
data_movies_actor = data_movies_actor.sort_values(by=['weighting'],ascending=False)
data_movies_actor = pd.merge(data_movies_actor, data_names,how='inner',left_on='imdb_actor_id', right_on='imdb_name_id')
data_movies_actor.head(30)