# IMDb Dataset Creation Part 2

Here we will focus a bit more on the people involved in the films. Here we will be generating data from what we already have that our model could use to get better results.

Here we will be making use of the datasets previously created.

In [1]:
import warnings, requests, gzip, io, gc
import multiprocessing
from joblib import Parallel, delayed
warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import math
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
number_of_cores = os.cpu_count()
number_of_cores

12

In [4]:
movie_dataset = pd.read_csv("imdb_movie_dataset.csv")
people_dataset = pd.read_csv("personnel_information.csv")

In [5]:
movie_dataset  = movie_dataset.fillna(value=np.nan)
movie_dataset.isna().sum()

Unnamed: 0                  0
tconst                      0
primaryTitle                1
isAdult                     0
releaseYear                 0
runtimeMinutes              0
Action                      0
Adult                       0
Adventure                   0
Animation                   0
Biography                   0
Comedy                      0
Crime                       0
Documentary                 0
Drama                       0
Family                      0
Fantasy                     0
Film-Noir                   0
Game-Show                   0
History                     0
Horror                      0
Music                       0
Musical                     0
Mystery                     0
News                        0
Reality-TV                  0
Romance                     0
Sci-Fi                      0
Sport                       0
Talk-Show                   0
Thriller                    0
War                         0
Western                     0
\N        

In [6]:
movie_dataset.drop('production_designer',axis=1, inplace=True)
movie_dataset['Short'] = np.where(movie_dataset['runtimeMinutes']>50, 0, 1)
movie_dataset['region'] = movie_dataset['region'].convert_dtypes(convert_string=True)
movie_dataset['region'] = movie_dataset['region'].str.replace(r"\N," ,'',regex=False)
movie_dataset['region'] = movie_dataset['region'].str.replace(r",\N" ,'',regex=False)
movie_dataset.head()

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,isAdult,releaseYear,runtimeMinutes,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western,\N,Short,region,averageRating,numVotes,actor,actress,cinematographer,composer,director,editor,producer,self,writer
0,1,tt0013274,Istoriya grazhdanskoy voyny,False,2021,94,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"FR,RU,SUHH,GR,XWW",6.8,62,,,,,"nm0412842,nm0895048",,nm13054604,,\N
1,2,tt0015414,La tierra de los toros,False,2000,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"ES,FR,XWW,FR",5.2,16,,,,,nm0615736,,,"nm0147437,nm0615736",\N
2,3,tt0035423,Kate & Leopold,False,2001,118,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"IT,FR,ES,PT,ZA,PL,BG,GB,JP,DE,EE,CO,LT,ID,CA,P...",6.4,88083,"nm0413168,nm0000630,nm0005227",nm0000212,nm0238698,nm0448843,nm0003506,nm0107463,nm0465298,,"nm0737216,nm0003506"
3,4,tt0062336,The Tango of the Widower and Its Distorting Mi...,False,2020,70,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"RU,XWW,GB,HK,TW,US,FR,CL",6.4,183,"nm0815612,nm0016013","nm1860495,nm0739834",nm0093680,nm0005948,"nm0749914,nm0765384",,nm1131208,,"nm0749914,nm1146177"
4,5,tt0069049,The Other Side of the Wind,False,2018,122,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"AU,AR,ES,GR,FR,KR,VE,CA,JP,VN,PL,DE,HU,PT,CA,B...",6.7,7891,"nm0001379,nm0000953","nm0462648,nm0001782",nm0004372,nm0006166,nm0000080,nm0613657,"nm0550881,nm1475059",,"nm0000080,nm0462648"


In [7]:
movie_dataset["region"].fillna("", inplace=True)
movie_dataset["region"].isna().sum()

0

In [8]:
def check_region(region, target):
    try:
        return 1 if target in region else 0
    except TypeError:
        return 0

In [9]:
movie_dataset['region_US'] = movie_dataset["region"].apply(lambda x: check_region(x, 'US'))
movie_dataset['region_UK'] = movie_dataset["region"].apply(lambda x: check_region(x, 'UK'))
movie_dataset['region_AU'] = movie_dataset["region"].apply(lambda x: check_region(x, 'AU'))
movie_dataset['region_IN'] = movie_dataset["region"].apply(lambda x: check_region(x, 'IN'))
movie_dataset['region_JP'] = movie_dataset["region"].apply(lambda x: check_region(x, 'JP'))
movie_dataset['region_other'] = movie_dataset['region'].apply(lambda x: any(e not in ['US', 'UK','AU','IN','JP'] for e in x)).astype(int)
movie_dataset.head(10)

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,isAdult,releaseYear,runtimeMinutes,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western,\N,Short,region,averageRating,numVotes,actor,actress,cinematographer,composer,director,editor,producer,self,writer,region_US,region_UK,region_AU,region_IN,region_JP,region_other
0,1,tt0013274,Istoriya grazhdanskoy voyny,False,2021,94,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"FR,RU,SUHH,GR,XWW",6.8,62,,,,,"nm0412842,nm0895048",,nm13054604,,\N,0,0,0,0,0,1
1,2,tt0015414,La tierra de los toros,False,2000,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"ES,FR,XWW,FR",5.2,16,,,,,nm0615736,,,"nm0147437,nm0615736",\N,0,0,0,0,0,1
2,3,tt0035423,Kate & Leopold,False,2001,118,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"IT,FR,ES,PT,ZA,PL,BG,GB,JP,DE,EE,CO,LT,ID,CA,P...",6.4,88083,"nm0413168,nm0000630,nm0005227",nm0000212,nm0238698,nm0448843,nm0003506,nm0107463,nm0465298,,"nm0737216,nm0003506",1,0,1,0,1,1
3,4,tt0062336,The Tango of the Widower and Its Distorting Mi...,False,2020,70,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"RU,XWW,GB,HK,TW,US,FR,CL",6.4,183,"nm0815612,nm0016013","nm1860495,nm0739834",nm0093680,nm0005948,"nm0749914,nm0765384",,nm1131208,,"nm0749914,nm1146177",1,0,0,0,0,1
4,5,tt0069049,The Other Side of the Wind,False,2018,122,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"AU,AR,ES,GR,FR,KR,VE,CA,JP,VN,PL,DE,HU,PT,CA,B...",6.7,7891,"nm0001379,nm0000953","nm0462648,nm0001782",nm0004372,nm0006166,nm0000080,nm0613657,"nm0550881,nm1475059",,"nm0000080,nm0462648",1,0,1,0,1,1
5,6,tt0070596,Socialist Realism,False,2023,78,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"CL,GB,XWW,XWW,US",7.5,49,"nm0031652,nm0250185,nm15107981,nm0539221",,nm0618088,nm0005948,"nm0749914,nm0765384",,nm1131208,,"nm0749914,nm0667610,nm0765384",1,0,0,0,0,1
6,7,tt0077684,Hist贸rias de Comb贸ios em Portugal,False,2022,46,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"PT,PT,PT",0.0,0,,,,,nm0127908,,,,\N,0,0,0,0,0,1
7,8,tt0082328,Embodiment of Evil,False,2008,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"BR,XWW,GR,US,RU,KR,DE,PL,FR,BR,AR,US",5.9,1760,"nm0596261,nm0883465,nm0835629,nm1141101",,,nm0009494,nm0596261,,"nm0347775,nm0347776,nm2052281,nm0755345",,"nm0707963,nm0596261",1,0,0,0,0,1
8,9,tt0083283,Victor Seastrom,False,2021,65,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"US,US,SE",6.7,67,,,nm0085022,"nm0489225,nm0635059,nm0902553,nm0909104",nm0921402,,nm0286873,"nm0430746,nm0000005",nm0921402,1,0,0,0,0,1
9,10,tt0088751,The Naked Monster,False,2005,100,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,"US,GB,US",5.3,345,"nm0864851,nm0933983,nm0329491",nm0828288,"nm0937864,nm1848380","nm0006100,nm0006300","nm0628399,nm0078540",,,,nm0628399,1,0,0,0,0,1


In [10]:
movie_dataset.isna().sum()

Unnamed: 0              0
tconst                  0
primaryTitle            1
isAdult                 0
releaseYear             0
runtimeMinutes          0
Action                  0
Adult                   0
Adventure               0
Animation               0
Biography               0
Comedy                  0
Crime                   0
Documentary             0
Drama                   0
Family                  0
Fantasy                 0
Film-Noir               0
Game-Show               0
History                 0
Horror                  0
Music                   0
Musical                 0
Mystery                 0
News                    0
Reality-TV              0
Romance                 0
Sci-Fi                  0
Sport                   0
Talk-Show               0
Thriller                0
War                     0
Western                 0
\N                      0
Short                   0
region                  0
averageRating           0
numVotes                0
actor       

In [11]:
actor_split = movie_dataset['actor'].str.split(',', expand=True)
actor_split = actor_split.iloc[:, :2]
actor_split.rename(columns={0: 'actor_1', 1: 'actor_2'}, inplace=True)
# actor_split
actor_split.isna().sum()

actor_1     79073
actor_2    111085
dtype: int64

In [12]:
actress_split = movie_dataset['actress'].str.split(',', expand=True)
actress_split = actress_split.iloc[:, :2]
actress_split.rename(columns={0: 'actress_1', 1: 'actress_2'}, inplace=True)
# actress_split
actress_split.isna().sum()

actress_1    107425
actress_2    171206
dtype: int64

In [13]:
director_split = movie_dataset['director'].str.split(',', expand=True)
director_split = director_split.iloc[:, :2]
director_split.rename(columns={0: 'director_1', 1: 'director_2'}, inplace=True)
director_split.drop(['director_2'],axis=1, inplace=True)
# director_split
director_split.isna().sum()

director_1    0
dtype: int64

In [14]:
writer_split = movie_dataset['writer'].str.split(',', expand=True)
writer_split = writer_split.iloc[:, :2]
writer_split.rename(columns={0: 'writer_1', 1: 'writer_2'}, inplace=True)
writer_split.drop(['writer_2'],axis=1, inplace=True)
# writer_split
writer_split.isna().sum()

writer_1    0
dtype: int64

In [15]:
cinema_split = movie_dataset['cinematographer'].str.split(',', expand=True)
cinema_split = cinema_split .iloc[:, :1]
cinema_split .rename(columns={0: 'cinematographer_1'}, inplace=True)
# cinema_split
cinema_split.isna().sum()

cinematographer_1    130827
dtype: int64

In [16]:
movie_dataset.drop(['self'], axis = 1, inplace = True)

In [17]:
# self_split= movie_dataset['self'].str.split(',', expand=True)
# self_split = self_split .iloc[:, :1]
# self_split .rename(columns={0: 'self_1'}, inplace=True)
# self_split
# self_split.isna().sum()

In [18]:
producer_split = movie_dataset['producer'].str.split(',', expand=True)
producer_split = producer_split.iloc[:, :1]
producer_split.rename(columns={0: 'producer_1'}, inplace=True)
# producer_split
producer_split.isna().sum()

producer_1    123131
dtype: int64

In [19]:
composer_split = movie_dataset['composer'].str.split(',', expand=True)
composer_split = composer_split.iloc[:, :1]
composer_split.rename(columns={0: 'composer_1',}, inplace=True)
# composer_split
composer_split.isna().sum()

composer_1    132799
dtype: int64

In [20]:
editor_split = movie_dataset['editor'].str.split(',', expand=True)
editor_split = editor_split.iloc[:, :1]
editor_split.rename(columns={0: 'editor_1',}, inplace=True)
# editor_split
editor_split.isna().sum()

editor_1    167304
dtype: int64

In [21]:
# merge the 8 dataframes on index using inner join
movie_dataset = pd.merge(movie_dataset, actor_split, left_index=True, right_index=True).merge(actress_split, left_index=True, right_index=True).merge(director_split, left_index=True, right_index=True).merge(writer_split, left_index=True, right_index=True).merge(cinema_split, left_index=True, right_index=True).merge(producer_split, left_index=True, right_index=True).merge(composer_split, left_index=True, right_index=True).merge(editor_split, left_index=True, right_index=True)
movie_dataset  = movie_dataset.fillna(value=np.nan)
movie_dataset.drop(['actor', 'actress','director', 'writer','producer','composer','cinematographer','editor'], axis = 1, inplace =True)
movie_dataset.isna().sum()

Unnamed: 0                0
tconst                    0
primaryTitle              1
isAdult                   0
releaseYear               0
runtimeMinutes            0
Action                    0
Adult                     0
Adventure                 0
Animation                 0
Biography                 0
Comedy                    0
Crime                     0
Documentary               0
Drama                     0
Family                    0
Fantasy                   0
Film-Noir                 0
Game-Show                 0
History                   0
Horror                    0
Music                     0
Musical                   0
Mystery                   0
News                      0
Reality-TV                0
Romance                   0
Sci-Fi                    0
Sport                     0
Talk-Show                 0
Thriller                  0
War                       0
Western                   0
\N                        0
Short                     0
region              

In [22]:
movie_dataset.head()

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,isAdult,releaseYear,runtimeMinutes,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western,\N,Short,region,averageRating,numVotes,region_US,region_UK,region_AU,region_IN,region_JP,region_other,actor_1,actor_2,actress_1,actress_2,director_1,writer_1,cinematographer_1,producer_1,composer_1,editor_1
0,1,tt0013274,Istoriya grazhdanskoy voyny,False,2021,94,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"FR,RU,SUHH,GR,XWW",6.8,62,0,0,0,0,0,1,,,,,nm0412842,\N,,nm13054604,,
1,2,tt0015414,La tierra de los toros,False,2000,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"ES,FR,XWW,FR",5.2,16,0,0,0,0,0,1,,,,,nm0615736,\N,,,,
2,3,tt0035423,Kate & Leopold,False,2001,118,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"IT,FR,ES,PT,ZA,PL,BG,GB,JP,DE,EE,CO,LT,ID,CA,P...",6.4,88083,1,0,1,0,1,1,nm0413168,nm0000630,nm0000212,,nm0003506,nm0737216,nm0238698,nm0465298,nm0448843,nm0107463
3,4,tt0062336,The Tango of the Widower and Its Distorting Mi...,False,2020,70,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"RU,XWW,GB,HK,TW,US,FR,CL",6.4,183,1,0,0,0,0,1,nm0815612,nm0016013,nm1860495,nm0739834,nm0749914,nm0749914,nm0093680,nm1131208,nm0005948,
4,5,tt0069049,The Other Side of the Wind,False,2018,122,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"AU,AR,ES,GR,FR,KR,VE,CA,JP,VN,PL,DE,HU,PT,CA,B...",6.7,7891,1,0,1,0,1,1,nm0001379,nm0000953,nm0462648,nm0001782,nm0000080,nm0000080,nm0004372,nm0550881,nm0006166,nm0613657


In [23]:
people_dataset.head()

Unnamed: 0.1,Unnamed: 0,index,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,pi_mean,pi_median,pi_std
0,0,0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0072308,tt0050419,tt0031983",,,
1,1,1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0075213,tt0117057",,,
2,2,2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189",,,
3,3,3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0072562,tt0078723,tt0080455",,,
4,4,4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0069467,tt0050986,tt0050976,tt0083922",,,


In [24]:

people_dataset['knownForTitles'] = people_dataset['knownForTitles'].convert_dtypes(convert_string=True)

people_dataset.info()

# people_dataset.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1127583 entries, 0 to 1127582
Data columns (total 11 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Unnamed: 0         1127583 non-null  int64  
 1   index              1127583 non-null  int64  
 2   nconst             1127583 non-null  object 
 3   primaryName        1127583 non-null  object 
 4   birthYear          1127583 non-null  object 
 5   deathYear          1127583 non-null  object 
 6   primaryProfession  1033360 non-null  object 
 7   knownForTitles     1127583 non-null  string 
 8   pi_mean            727666 non-null   float64
 9   pi_median          727666 non-null   float64
 10  pi_std             317116 non-null   float64
dtypes: float64(3), int64(2), object(5), string(1)
memory usage: 94.6+ MB


In [25]:
people_dataset.dropna(subset=['primaryProfession','pi_mean','pi_median','pi_std'], inplace=True)
cols = [0,1]
people_dataset.drop(people_dataset.columns[cols], axis=1, inplace=True)
people_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 313221 entries, 28 to 1127582
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   nconst             313221 non-null  object 
 1   primaryName        313221 non-null  object 
 2   birthYear          313221 non-null  object 
 3   deathYear          313221 non-null  object 
 4   primaryProfession  313221 non-null  object 
 5   knownForTitles     313221 non-null  string 
 6   pi_mean            313221 non-null  float64
 7   pi_median          313221 non-null  float64
 8   pi_std             313221 non-null  float64
dtypes: float64(3), object(5), string(1)
memory usage: 23.9+ MB


In [26]:
people_dataset['knownForTitles'] = people_dataset['knownForTitles'].str.split(',')
# people_dataset.dropna(subset=['knownForTitles'], inplace=True)
people_dataset= people_dataset.explode(column='knownForTitles')
people_dataset = people_dataset.rename(columns={'knownForTitles': 'tconst'})
people_dataset.info()
# people_dataset

<class 'pandas.core.frame.DataFrame'>
Index: 1186037 entries, 28 to 1127582
Data columns (total 9 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   nconst             1186037 non-null  object 
 1   primaryName        1186037 non-null  object 
 2   birthYear          1186037 non-null  object 
 3   deathYear          1186037 non-null  object 
 4   primaryProfession  1186037 non-null  object 
 5   tconst             1186037 non-null  object 
 6   pi_mean            1186037 non-null  float64
 7   pi_median          1186037 non-null  float64
 8   pi_std             1186037 non-null  float64
dtypes: float64(3), object(6)
memory usage: 90.5+ MB


In [27]:
people_dataset.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,tconst,pi_mean,pi_median,pi_std
28,nm0000035,James Horner,1953,2015,"music_department,soundtrack,composer",tt0177971,7.2,7.2,0.989949
28,nm0000035,James Horner,1953,2015,"music_department,soundtrack,composer",tt0499549,7.2,7.2,0.989949
28,nm0000035,James Horner,1953,2015,"music_department,soundtrack,composer",tt0120338,7.2,7.2,0.989949
28,nm0000035,James Horner,1953,2015,"music_department,soundtrack,composer",tt0120746,7.2,7.2,0.989949
60,nm0000083,Alan Miller,\N,\N,"editor,writer,director",tt27504185,6.8,6.8,0.565685


In [28]:
movie_dataset.head()

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,isAdult,releaseYear,runtimeMinutes,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western,\N,Short,region,averageRating,numVotes,region_US,region_UK,region_AU,region_IN,region_JP,region_other,actor_1,actor_2,actress_1,actress_2,director_1,writer_1,cinematographer_1,producer_1,composer_1,editor_1
0,1,tt0013274,Istoriya grazhdanskoy voyny,False,2021,94,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"FR,RU,SUHH,GR,XWW",6.8,62,0,0,0,0,0,1,,,,,nm0412842,\N,,nm13054604,,
1,2,tt0015414,La tierra de los toros,False,2000,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"ES,FR,XWW,FR",5.2,16,0,0,0,0,0,1,,,,,nm0615736,\N,,,,
2,3,tt0035423,Kate & Leopold,False,2001,118,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"IT,FR,ES,PT,ZA,PL,BG,GB,JP,DE,EE,CO,LT,ID,CA,P...",6.4,88083,1,0,1,0,1,1,nm0413168,nm0000630,nm0000212,,nm0003506,nm0737216,nm0238698,nm0465298,nm0448843,nm0107463
3,4,tt0062336,The Tango of the Widower and Its Distorting Mi...,False,2020,70,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"RU,XWW,GB,HK,TW,US,FR,CL",6.4,183,1,0,0,0,0,1,nm0815612,nm0016013,nm1860495,nm0739834,nm0749914,nm0749914,nm0093680,nm1131208,nm0005948,
4,5,tt0069049,The Other Side of the Wind,False,2018,122,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"AU,AR,ES,GR,FR,KR,VE,CA,JP,VN,PL,DE,HU,PT,CA,B...",6.7,7891,1,0,1,0,1,1,nm0001379,nm0000953,nm0462648,nm0001782,nm0000080,nm0000080,nm0004372,nm0550881,nm0006166,nm0613657


In [29]:
# Chris Hemsworth = nm1165110
# Tom Cruise = nm0000129
# Matt Damon = nm0000354
# Robert Downey Jr. = nm0000375

# Angelina Jolie = nm0001401
# Scarlett Johansson = nm0424060
# Emma Stone = nm1297015
# Anya Taylor-Joy = nm5896355
# Lily Collins = nm2934314

# Missing: Jackie Chan, Bruce Willis, Alan Rickman etc.

In [30]:
movie_dataset.shape

(260603, 54)

In [31]:
def get_movie_popularity_index(movies_df, people_df):
    movie_pi_indexes = []

    people_columns = [
        "actor_1",
        "actor_2",
        "actress_1",
        "actress_2",
        "director_1",
        "writer_1",
        "cinematographer_1",
        "producer_1",
        "composer_1",
        "editor_1"
    ]
    for index,row in movies_df.iterrows():
        movie_pi_mean_mean = 0
        movie_pi_median_mean = 0
        movie_people_ids = [x for x in row[people_columns].to_numpy().flatten() if (not pd.isnull(x) and x!="\\N")]
        pi_dataframe = people_df[["pi_mean","pi_median"]].loc[people_df["nconst"].isin(movie_people_ids)]

        # Important to note
        # https://stackoverflow.com/questions/25039328/specifying-skip-na-when-calculating-mean-of-the-column-in-a-data-frame-created
        movie_pi_mean_mean = pi_dataframe["pi_mean"].mean()
        movie_pi_median_mean = pi_dataframe["pi_median"].mean()
        movie_dataset.loc[index, "movie_pi_mean"] = movie_pi_mean_mean
        movie_dataset.loc[index, "movie_pi_median"] = movie_pi_median_mean
        print("Row number: " + str(index) + " done")



In [32]:
def parallel_get_movie_popularity_index(movies_df, people_df):
    split_movie_dataframe_list = np.array_split(movies_df, number_of_cores)
    Parallel(n_jobs=-1, verbose=20, max_nbytes=None)(delayed(get_movie_popularity_index)(split_movie_dataframe, people_df) for split_movie_dataframe in split_movie_dataframe_list)


In [33]:
parallel_get_movie_popularity_index(movie_dataset,people_dataset)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


In [None]:
# def get_movie_popularity_index_mean(row, people_df):
#     movie_pi_indexes = []
#     people_columns = [
#         "actor_1",
#         "actor_2",
#         "actress_1",
#         "actress_2",
#         "director_1",
#         "writer_1",
#         "cinematographer_1",
#         "producer_1",
#         "composer_1",
#         "editor_1"
#     ]
#     movie_pi_mean_mean = 0

#     movie_people_ids = [x for x in row[people_columns].to_numpy().flatten() if (not pd.isnull(x) and x!="\\N")]
#     pi_dataframe = people_df[["pi_mean"]].loc[people_df["nconst"].isin(movie_people_ids)]

#     # Important to note
#     # https://stackoverflow.com/questions/25039328/specifying-skip-na-when-calculating-mean-of-the-column-in-a-data-frame-created
#     movie_pi_mean_mean = pi_dataframe["pi_mean"].mean()
    
#     return movie_pi_mean_mean

# def get_movie_popularity_index_median(row, people_df):
#     movie_pi_indexes = []
#     people_columns = [
#         "actor_1",
#         "actor_2",
#         "actress_1",
#         "actress_2",
#         "director_1",
#         "writer_1",
#         "cinematographer_1",
#         "producer_1",
#         "composer_1",
#         "editor_1"
#     ]
#     movie_pi_median_mean = 0

#     movie_people_ids = [x for x in row[people_columns].to_numpy().flatten() if (not pd.isnull(x) and x!="\\N")]
#     pi_dataframe = people_df[["pi_median"]].loc[people_df["nconst"].isin(movie_people_ids)]

#     # Important to note
#     # https://stackoverflow.com/questions/25039328/specifying-skip-na-when-calculating-mean-of-the-column-in-a-data-frame-created
#     movie_pi_median_mean = pi_dataframe["pi_median"].mean()
    
#     return movie_pi_median_mean

In [None]:
movie_dataset.head()

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,isAdult,releaseYear,runtimeMinutes,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western,\N,Short,region,averageRating,numVotes,region_US,region_UK,region_AU,region_IN,region_JP,region_other,actor_1,actor_2,actress_1,actress_2,director_1,writer_1,cinematographer_1,producer_1,composer_1,editor_1
0,1,tt0013274,Istoriya grazhdanskoy voyny,False,2021,94,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"FR,RU,SUHH,GR,XWW",6.8,62,0,0,0,0,0,1,,,,,nm0412842,\N,,nm13054604,,
1,2,tt0015414,La tierra de los toros,False,2000,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"ES,FR,XWW,FR",5.2,16,0,0,0,0,0,1,,,,,nm0615736,\N,,,,
2,3,tt0035423,Kate & Leopold,False,2001,118,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"IT,FR,ES,PT,ZA,PL,BG,GB,JP,DE,EE,CO,LT,ID,CA,P...",6.4,88083,1,0,1,0,1,1,nm0413168,nm0000630,nm0000212,,nm0003506,nm0737216,nm0238698,nm0465298,nm0448843,nm0107463
3,4,tt0062336,The Tango of the Widower and Its Distorting Mi...,False,2020,70,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"RU,XWW,GB,HK,TW,US,FR,CL",6.4,183,1,0,0,0,0,1,nm0815612,nm0016013,nm1860495,nm0739834,nm0749914,nm0749914,nm0093680,nm1131208,nm0005948,
4,5,tt0069049,The Other Side of the Wind,False,2018,122,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"AU,AR,ES,GR,FR,KR,VE,CA,JP,VN,PL,DE,HU,PT,CA,B...",6.7,7891,1,0,1,0,1,1,nm0001379,nm0000953,nm0462648,nm0001782,nm0000080,nm0000080,nm0004372,nm0550881,nm0006166,nm0613657


In [None]:
# movie_dataset["movie_pi_mean"] = movie_dataset.apply(lambda x: get_movie_popularity_index_mean(x, people_dataset), axis=1)
# movie_dataset["movie_pi_median"] = movie_dataset.apply(lambda x: get_movie_popularity_index_median(x, people_dataset), axis=1)

KeyboardInterrupt: 

In [None]:
movie_dataset.head()

In [None]:
movie_dataset.to_csv('imdb_movie_dataset1.csv')