In [1]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
#Fix random seed for reproducibility
np.random.seed(321)

#Set global scikit-learn configuration 
from sklearn import set_config

#Display estimators as a diagram
set_config(display='diagram')

#Set max columns for dataframes
pd.set_option('display.max_columns',100)

In [2]:
#Load in combined data set
FOLDER = 'Data/'
df = pd.read_csv(FOLDER+'combined_tmdb_data.csv.gz',lineterminator='\n')
df.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.075,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-22,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
1,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,Earth is in a state of constant war and two co...,0.704,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-11-15,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.45,10.0,
2,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,"After falling prey to underworld, four friends...",0.887,/yB5wRu4uyXXwZA3PEj8cITu0xt3.jpg,[],"[{'iso_3166_1': 'IN', 'name': 'India'}]",2000-04-14,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
3,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,Two neighbors become intimate after discoverin...,26.988,/iYypPT4bhqXfq1b6EnmxvRt6b2Y.jpg,"[{'id': 539, 'logo_path': '/iPLtePguIzOPNtAWfT...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2000-09-29,14204632.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.111,2305.0,PG
4,tt0118852,0.0,/vceiGZ3uavAEHlTA7v0GjQsGVKe.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,49511.0,en,Chinese Coffee,"When Harry Levine, an aging, unsuccessful Gree...",2.984,/nZGWnSuf1FIuzyEuMRZHHZWViAp.jpg,"[{'id': 67930, 'logo_path': None, 'name': 'Cha...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-02,0.0,99.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,0.0,6.642,53.0,R


In [3]:
#Preview info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2616 entries, 0 to 2615
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2616 non-null   object 
 1   adult                  2606 non-null   float64
 2   backdrop_path          1447 non-null   object 
 3   belongs_to_collection  211 non-null    object 
 4   budget                 2606 non-null   float64
 5   genres                 2606 non-null   object 
 6   homepage               170 non-null    object 
 7   id                     2606 non-null   float64
 8   original_language      2606 non-null   object 
 9   original_title         2606 non-null   object 
 10  overview               2562 non-null   object 
 11  popularity             2606 non-null   float64
 12  poster_path            2365 non-null   object 
 13  production_companies   2606 non-null   object 
 14  production_countries   2606 non-null   object 
 15  rele

## Clean Data

In [4]:
#Drop unnecessary columns
drop_cols = ['backdrop_path','original_title','overview',
             'poster_path','status','tagline','id','homepage',
             'production_countries','video','production_companies','spoken_languages',
             'original_language']
df = df.drop(columns=drop_cols)
df.head()

Unnamed: 0,imdb_id,adult,belongs_to_collection,budget,genres,popularity,release_date,revenue,runtime,title,vote_average,vote_count,certification
0,tt0113026,0.0,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",2.075,2000-09-22,0.0,86.0,The Fantasticks,5.5,22.0,
1,tt0113092,0.0,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",0.704,2000-11-15,0.0,100.0,For the Cause,5.45,10.0,
2,tt0116391,0.0,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",0.887,2000-04-14,0.0,152.0,Gang,4.0,1.0,
3,tt0118694,0.0,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",26.988,2000-09-29,14204632.0,99.0,In the Mood for Love,8.111,2305.0,PG
4,tt0118852,0.0,,0.0,"[{'id': 18, 'name': 'Drama'}]",2.984,2000-09-02,0.0,99.0,Chinese Coffee,6.642,53.0,R


In [5]:
# Change values in belongs to collection column to true/false
df['belongs_to_collection'] = df['belongs_to_collection'].notna()
df['belongs_to_collection'].value_counts()

False    2405
True      211
Name: belongs_to_collection, dtype: int64

In [6]:
# Function to get list of genres from each movie
def get_genre_name(x):
    x = x.replace("'",'"')
    x = json.loads(x)
    
    genres = []
    for genre in x:
        genres.append(genre['name'])
    return genres

In [7]:
# Test function
get_genre_name(df.loc[3,'genres'])

['Drama', 'Romance']

In [8]:
# Use function to explode genres to new columns
df['genres_list'] = df['genres'].apply(get_genre_name)
df_explode = df.explode('genres_list')
df_explode.head()

AttributeError: 'float' object has no attribute 'replace'