In [1]:
# Install & Import Libraries
import pandas as pd
import numpy as np

import os
from datetime import datetime
import warnings

In [2]:
# Set Other Configs
pd.options.display.float_format = '{:,.2f}'.format
warnings.filterwarnings(action="ignore")

In [3]:
# Set Path Configs
ROOT_PATH = os.getcwd()
DATA_PATH = os.path.join(ROOT_PATH, 'data')
IMAGE_DATA_PATH = os.path.join(DATA_PATH, 'images')
RAW_DATA_PATH = os.path.join(DATA_PATH, 'input')
PROCESSED_DATA_PATH = os.path.join(DATA_PATH, 'processed')

### Read Movies Data

In [4]:
# df_movies_data = pd.read_csv(os.path.join(RAW_DATA_PATH, "MovieGenre.csv"), encoding='latin-1')
df_movies_data = pd.read_csv(os.path.join(RAW_DATA_PATH, "MovieGenre.csv.gz"))
df_movies_data

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.30,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.90,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.60,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.70,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.90,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...,...,...
40103,83168,http://www.imdb.com/title/tt83168,Tanya's Island (1980),4.30,Drama,https://images-na.ssl-images-amazon.com/images...
40104,82875,http://www.imdb.com/title/tt82875,Pacific Banana (1981),4.70,Comedy,https://images-na.ssl-images-amazon.com/images...
40105,815258,http://www.imdb.com/title/tt815258,Werewolf in a Womens Prison (2006),4.50,Horror,https://images-na.ssl-images-amazon.com/images...
40106,79142,http://www.imdb.com/title/tt79142,Xiao zi ming da (1979),6.50,Action|Comedy,https://images-na.ssl-images-amazon.com/images...


In [5]:
df_movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40108 entries, 0 to 40107
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   imdbId      40108 non-null  int64  
 1   Imdb Link   40108 non-null  object 
 2   Title       40108 non-null  object 
 3   IMDB Score  40060 non-null  float64
 4   Genre       39963 non-null  object 
 5   Poster      39383 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 1.8+ MB


In [6]:
df_movies_data['imdbId'] = df_movies_data['imdbId'].astype('int32')
df_movies_data['imdbId'].dtype

dtype('int32')

In [7]:
df_movies_data.sort_values(by=['imdbId'], ascending=True, inplace=True)

In [8]:
duplicated_movie_list = df_movies_data[df_movies_data['imdbId'].duplicated()]['imdbId'].to_list()
print("Duplicate Movies Count:", len(duplicated_movie_list))

Duplicate Movies Count: 593


In [9]:
df_movies_data[(df_movies_data['imdbId'].isin(duplicated_movie_list))].sort_values(by=['imdbId'], ascending=True)

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
12409,11358,http://www.imdb.com/title/tt11358,Just Pals (1920),6.60,Comedy|Drama|Western,https://images-na.ssl-images-amazon.com/images...
13003,11358,http://www.imdb.com/title/tt11358,Just Pals (1920),6.60,Comedy|Drama|Western,https://images-na.ssl-images-amazon.com/images...
12748,11439,http://www.imdb.com/title/tt11439,The Mark of Zorro (1920),7.30,Adventure|Romance|Western,https://images-na.ssl-images-amazon.com/images...
13342,11439,http://www.imdb.com/title/tt11439,The Mark of Zorro (1920),7.30,Adventure|Romance|Western,https://images-na.ssl-images-amazon.com/images...
13344,12752,http://www.imdb.com/title/tt12752,The Three Musketeers (1921),7.00,Action|Adventure|Romance,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...,...,...
12707,1151309,http://www.imdb.com/title/tt1151309,Bigger Stronger Faster* (2008),7.60,Documentary|Sport,https://images-na.ssl-images-amazon.com/images...
12521,1166827,http://www.imdb.com/title/tt1166827,Zeitgeist (2007),8.20,Documentary|History,https://images-na.ssl-images-amazon.com/images...
13115,1166827,http://www.imdb.com/title/tt1166827,Zeitgeist (2007),8.20,Documentary|History,https://images-na.ssl-images-amazon.com/images...
13304,1219671,http://www.imdb.com/title/tt1219671,Allan Quatermain and the Temple of Skulls (2008),2.60,Action|Adventure|Romance,https://images-na.ssl-images-amazon.com/images...


In [10]:
df_movies_data.drop_duplicates(keep='first', inplace=True)

### Read Movie Image

In [11]:
image_list = os.listdir(IMAGE_DATA_PATH)

In [12]:
df_images_data = pd.DataFrame(image_list, columns=['ImgName'])

df_images_data['ImgId'] = df_images_data['ImgName'].apply(lambda x: x.split('.')[0])
# df_images_data['ImgId'] = df_images_data['ImgName'].str.split('.', expand=True)[0]

df_images_data

Unnamed: 0,ImgName,ImgId
0,10040.jpg,10040
1,10057.jpg,10057
2,10071.jpg,10071
3,10155.jpg,10155
4,10195.jpg,10195
...,...,...
992,9899.jpg,9899
993,9932.jpg,9932
994,9937.jpg,9937
995,9968.jpg,9968


In [13]:
df_images_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 997 entries, 0 to 996
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ImgName  997 non-null    object
 1   ImgId    997 non-null    object
dtypes: object(2)
memory usage: 15.7+ KB


In [14]:
df_images_data['ImgId'] = df_images_data['ImgId'].astype('int32')
df_images_data['ImgId'].dtype

dtype('int32')

In [15]:
df_movies_data.shape, df_images_data.shape

((39515, 6), (997, 2))

In [16]:
df_movies_data['imdbId'].nunique(), df_images_data['ImgId'].nunique()

(39515, 997)

In [17]:
df_master_data = pd.merge(left=df_movies_data, right=df_images_data, how='inner', left_on=['imdbId'], right_on=['ImgId']).reset_index(drop=True)

In [18]:
df_master_data.columns = df_master_data.columns.str.replace(' ', '')
df_master_data.rename(columns={'imdbId':'ImdbId'}, inplace=True)
df_master_data

Unnamed: 0,ImdbId,ImdbLink,Title,IMDBScore,Genre,Poster,ImgName,ImgId
0,2461,http://www.imdb.com/title/tt2461,Richard III (1912),5.70,Drama,https://images-na.ssl-images-amazon.com/images...,2461.jpg,2461
1,2544,http://www.imdb.com/title/tt2544,TrÌ_dgÌ´rdsmÌ_staren (1912),6.40,Drama,https://images-na.ssl-images-amazon.com/images...,2544.jpg,2544
2,2795,http://www.imdb.com/title/tt2795,Death's Marathon (1913),6.20,Short|Drama|Romance,https://images-na.ssl-images-amazon.com/images...,2795.jpg,2795
3,2844,http://www.imdb.com/title/tt2844,FantÌ«mas: In the Shadow of the Guillotine (1913),6.90,Crime|Drama,https://images-na.ssl-images-amazon.com/images...,2844.jpg,2844
4,2985,http://www.imdb.com/title/tt2985,The House of Darkness (1913),6.10,Short|Drama,https://images-na.ssl-images-amazon.com/images...,2985.jpg,2985
...,...,...,...,...,...,...,...,...
992,25580,http://www.imdb.com/title/tt25580,Now and Forever (1934),6.60,Drama|Romance,https://images-na.ssl-images-amazon.com/images...,25580.jpg,25580
993,25586,http://www.imdb.com/title/tt25586,Of Human Bondage (1934),7.30,Drama|Romance,https://images-na.ssl-images-amazon.com/images...,25586.jpg,25586
994,25590,http://www.imdb.com/title/tt25590,The Old Fashioned Way (1934),7.60,Comedy,https://images-na.ssl-images-amazon.com/images...,25590.jpg,25590
995,25601,http://www.imdb.com/title/tt25601,One Night of Love (1934),6.00,Music|Romance,https://images-na.ssl-images-amazon.com/images...,25601.jpg,25601


In [19]:
df_master_data = df_master_data.join(df_master_data['Genre'].str.split('|', expand=True))
df_master_data.rename(columns={0:'Genre1', 1:'Genre2', 2:'Genre3'}, inplace=True)
# df_master_data

In [20]:
df_master_data['Genre1'] = np.where(df_master_data['Genre1'] == 'Short', df_master_data['Genre2'], df_master_data['Genre1'])
df_master_data

Unnamed: 0,ImdbId,ImdbLink,Title,IMDBScore,Genre,Poster,ImgName,ImgId,Genre1,Genre2,Genre3
0,2461,http://www.imdb.com/title/tt2461,Richard III (1912),5.70,Drama,https://images-na.ssl-images-amazon.com/images...,2461.jpg,2461,Drama,,
1,2544,http://www.imdb.com/title/tt2544,TrÌ_dgÌ´rdsmÌ_staren (1912),6.40,Drama,https://images-na.ssl-images-amazon.com/images...,2544.jpg,2544,Drama,,
2,2795,http://www.imdb.com/title/tt2795,Death's Marathon (1913),6.20,Short|Drama|Romance,https://images-na.ssl-images-amazon.com/images...,2795.jpg,2795,Drama,Drama,Romance
3,2844,http://www.imdb.com/title/tt2844,FantÌ«mas: In the Shadow of the Guillotine (1913),6.90,Crime|Drama,https://images-na.ssl-images-amazon.com/images...,2844.jpg,2844,Crime,Drama,
4,2985,http://www.imdb.com/title/tt2985,The House of Darkness (1913),6.10,Short|Drama,https://images-na.ssl-images-amazon.com/images...,2985.jpg,2985,Drama,Drama,
...,...,...,...,...,...,...,...,...,...,...,...
992,25580,http://www.imdb.com/title/tt25580,Now and Forever (1934),6.60,Drama|Romance,https://images-na.ssl-images-amazon.com/images...,25580.jpg,25580,Drama,Romance,
993,25586,http://www.imdb.com/title/tt25586,Of Human Bondage (1934),7.30,Drama|Romance,https://images-na.ssl-images-amazon.com/images...,25586.jpg,25586,Drama,Romance,
994,25590,http://www.imdb.com/title/tt25590,The Old Fashioned Way (1934),7.60,Comedy,https://images-na.ssl-images-amazon.com/images...,25590.jpg,25590,Comedy,,
995,25601,http://www.imdb.com/title/tt25601,One Night of Love (1934),6.00,Music|Romance,https://images-na.ssl-images-amazon.com/images...,25601.jpg,25601,Music,Romance,


In [21]:
df_master_data_genre_grpby = df_master_data.groupby(by=['Genre1'], as_index=False).agg(Cnt=('ImdbId','count')).sort_values(by=['Cnt'], ascending=False)
df_master_data_genre_grpby['CntRatio'] = df_master_data_genre_grpby['Cnt'] / df_master_data.shape[0] * 100
df_master_data_genre_grpby['CumSum'] = df_master_data_genre_grpby['Cnt'].cumsum()
df_master_data_genre_grpby['CumRatio'] = df_master_data_genre_grpby['CumSum'] / df_master_data.shape[0] * 100
df_master_data_genre_grpby

Unnamed: 0,Genre1,Cnt,CntRatio,CumSum,CumRatio
7,Drama,357,35.81,357,35.81
4,Comedy,317,31.8,674,67.6
5,Crime,84,8.43,758,76.03
1,Adventure,53,5.32,811,81.34
0,Action,38,3.81,849,85.16
2,Animation,23,2.31,872,87.46
3,Biography,20,2.01,892,89.47
6,Documentary,17,1.71,909,91.17
15,Romance,16,1.6,925,92.78
9,Fantasy,14,1.4,939,94.18


In [22]:
selected_genre_list = df_master_data_genre_grpby['Genre1'].to_list()
selected_genre_list = selected_genre_list[0:4]
selected_genre_list

['Drama', 'Comedy', 'Crime', 'Adventure']

In [23]:
df_master_data = df_master_data[(df_master_data['Genre1'].isin(selected_genre_list))][['ImgId', 'ImgName', 'Title', 'Genre1', 'Genre2', 'Genre3', 'IMDBScore', 'ImdbLink', 'Poster']].reset_index(drop=True)
df_master_data

Unnamed: 0,ImgId,ImgName,Title,Genre1,Genre2,Genre3,IMDBScore,ImdbLink,Poster
0,2461,2461.jpg,Richard III (1912),Drama,,,5.70,http://www.imdb.com/title/tt2461,https://images-na.ssl-images-amazon.com/images...
1,2544,2544.jpg,TrÌ_dgÌ´rdsmÌ_staren (1912),Drama,,,6.40,http://www.imdb.com/title/tt2544,https://images-na.ssl-images-amazon.com/images...
2,2795,2795.jpg,Death's Marathon (1913),Drama,Drama,Romance,6.20,http://www.imdb.com/title/tt2795,https://images-na.ssl-images-amazon.com/images...
3,2844,2844.jpg,FantÌ«mas: In the Shadow of the Guillotine (1913),Crime,Drama,,6.90,http://www.imdb.com/title/tt2844,https://images-na.ssl-images-amazon.com/images...
4,2985,2985.jpg,The House of Darkness (1913),Drama,Drama,,6.10,http://www.imdb.com/title/tt2985,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...,...,...,...,...,...
806,25555,25555.jpg,Nana (1934),Drama,,,6.10,http://www.imdb.com/title/tt25555,https://images-na.ssl-images-amazon.com/images...
807,25580,25580.jpg,Now and Forever (1934),Drama,Romance,,6.60,http://www.imdb.com/title/tt25580,https://images-na.ssl-images-amazon.com/images...
808,25586,25586.jpg,Of Human Bondage (1934),Drama,Romance,,7.30,http://www.imdb.com/title/tt25586,https://images-na.ssl-images-amazon.com/images...
809,25590,25590.jpg,The Old Fashioned Way (1934),Comedy,,,7.60,http://www.imdb.com/title/tt25590,https://images-na.ssl-images-amazon.com/images...


In [24]:
df_master_data.to_csv(os.path.join(PROCESSED_DATA_PATH, "MasterData.csv.gz"), compression='gzip', index=False)