## Data Cleaning: IMDB Name Basics

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Read the raw, gzip-compressed IMDB name table (path is relative to this notebook).
imdb_name_basics = pd.read_csv(
    filepath_or_buffer='../../data/01_raw/imdb.name.basics.csv.gz'
)
# Preview the first five rows.
imdb_name_basics.head(n=5)

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


### 1. Split the comma-separated values in the 'known_for_titles' column into separate columns

In [3]:
# Break each comma-separated 'known_for_titles' string into one column per title ID.
# With expand=True the result is a DataFrame whose columns are labelled 0, 1, 2, ...
knownfor_expand = imdb_name_basics['known_for_titles'].str.split(pat=",", expand=True)

In [4]:
# Attach every split column to the main frame as known_for_titles_1, _2, ...
# Iterating over the columns that the split actually produced avoids a KeyError
# when fewer than six exist and avoids silently discarding any beyond the sixth.
for ordinal, split_column in enumerate(knownfor_expand.columns, start=1):
    imdb_name_basics[f'known_for_titles_{ordinal}'] = knownfor_expand[split_column]

In [5]:
# Preview the frame now that the per-title columns have been added.
imdb_name_basics.head(n=5)

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles,known_for_titles_1,known_for_titles_2,known_for_titles_3,known_for_titles_4,known_for_titles_5,known_for_titles_6
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553",tt0837562,tt2398241,tt0844471,tt0118553,,
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940",tt0896534,tt6791238,tt0287072,tt1682940,,
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898",tt1470654,tt0363631,tt0104030,tt0102898,,
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387",tt0114371,tt2004304,tt1618448,tt1224387,,
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256",tt0452644,tt0452692,tt3458030,tt2178256,,


### 2. Reshape the dataframe from wide to long so that the key column changes from 'primary_name' to 'tconst'

In [6]:
# Working copy without the columns that the reshape does not need.
# drop() returns a new frame, so imdb_name_basics itself is left unchanged.
imdb_names_small = imdb_name_basics.drop(
    columns=['birth_year', 'death_year', 'known_for_titles']
)

In [7]:
# Wide -> long: produce one row per (person, known-for title) pair.
# The columns to unpivot are discovered by prefix instead of being listed by
# hand, so the reshape uses exactly the known_for_titles_N columns that exist
# in the frame, in their existing order.
title_columns = [
    column for column in imdb_names_small.columns
    if column.startswith('known_for_titles_')
]
imdb_names_melt = pd.melt(
    imdb_names_small,
    id_vars=['primary_name', 'nconst', 'primary_profession'],
    value_vars=title_columns,
)


In [8]:
# Preview the wide (pre-melt) frame for reference.
imdb_names_small.head(n=5)

Unnamed: 0,nconst,primary_name,primary_profession,known_for_titles_1,known_for_titles_2,known_for_titles_3,known_for_titles_4,known_for_titles_5,known_for_titles_6
0,nm0061671,Mary Ellen Bauder,"miscellaneous,production_manager,producer",tt0837562,tt2398241,tt0844471,tt0118553,,
1,nm0061865,Joseph Bauer,"composer,music_department,sound_department",tt0896534,tt6791238,tt0287072,tt1682940,,
2,nm0062070,Bruce Baum,"miscellaneous,actor,writer",tt1470654,tt0363631,tt0104030,tt0102898,,
3,nm0062195,Axel Baumann,"camera_department,cinematographer,art_department",tt0114371,tt2004304,tt1618448,tt1224387,,
4,nm0062798,Pete Baxter,"production_designer,art_department,set_decorator",tt0452644,tt0452692,tt3458030,tt2178256,,


In [9]:
# Tidy the melted frame: discard melt's 'variable' bookkeeping column, remove
# incomplete rows, and renumber the remaining rows from zero.
# Note: dropna() is called without `subset`, so a row is removed when ANY of its
# columns is missing, not only the melted title value.
imdb_names_melt = (
    imdb_names_melt
    .drop(columns='variable')
    .dropna()
    .reset_index(drop=True)
)

In [10]:
# Give melt's generic 'value' column a descriptive name.
imdb_names_melt = imdb_names_melt.rename(columns={'value': 'movie_titles'})

In [11]:
# Final naming: the title column becomes 'tconst' and is promoted to the index.
imdb_names_melt = (
    imdb_names_melt
    .rename(columns={'movie_titles': 'tconst'})
    .set_index('tconst')
)
imdb_names_melt.head(n=5)

Unnamed: 0_level_0,primary_name,nconst,primary_profession
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0837562,Mary Ellen Bauder,nm0061671,"miscellaneous,production_manager,producer"
tt0896534,Joseph Bauer,nm0061865,"composer,music_department,sound_department"
tt1470654,Bruce Baum,nm0062070,"miscellaneous,actor,writer"
tt0114371,Axel Baumann,nm0062195,"camera_department,cinematographer,art_department"
tt0452644,Pete Baxter,nm0062798,"production_designer,art_department,set_decorator"


In [12]:
# Persist the cleaned long-format table; the 'tconst' index is written as the
# first column. The target file name carries no extension.
imdb_names_melt.to_csv(
    path_or_buf='../../data/02_intermediate/imdb_names_basics_clean',
    index=True,
)