### ETL Project Dependencies

#### Required: pip install imdbpy

In [1]:
#!pip install imdbpy

In [2]:
# dependencies
from imdb import IMDb

# create a shared IMDb client instance; a later cell uses it for an ad-hoc title search
ia = IMDb()

### Set up MongoDB connection and client

#### Note: make sure `mongod` is running in a terminal

In [3]:
import pymongo 

In [4]:
# connection string for a MongoDB server on this machine (localhost, port 27017)
conn = 'mongodb://localhost:27017'
# client handle through which the databases on that server are accessed
client = pymongo.MongoClient(conn)

#### Create a new database

In [5]:
# handle to the database named "IMDB" on the connected server
db = client.IMDB

#### List all 2019 movies from CSV for use by IMDB API calls

In [6]:
# dependencies
import pandas as pd
import os
import csv

In [7]:
# store the relative path of the CSV export of 2019 movies in a variable
data_file = "../Data/all_2019_movies.csv"
# read the file with the pandas library; the text is decoded as ISO-8859-1
data_file_df = pd.read_csv(data_file, encoding="ISO-8859-1")
# display the first rows of the frame (not just the header) as a quick sanity check
data_file_df.head()

Unnamed: 0,Position,Const,Created,Modified,Description,Title,URL,Title Type,IMDb Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors
0,1,tt6823368,2018-09-03,2019-07-07,Release date - January 18th\nStudio - Universa...,Glass,https://www.imdb.com/title/tt6823368/,movie,6.7,129.0,2019.0,"Drama, Sci-Fi, Thriller",150546.0,2019-01-07,M. Night Shyamalan
1,2,tt6811018,2018-09-03,2019-07-07,Release date - January 25th\nStudio - Twentiet...,The Kid Who Would Be King,https://www.imdb.com/title/tt6811018/,movie,6.0,120.0,2019.0,"Action, Adventure, Family, Fantasy",9512.0,2019-01-16,Joe Cornish
2,3,tt5941692,2018-11-18,2019-07-07,Release date - February 1st\nStudio - Sony Pic...,Miss Bala,https://www.imdb.com/title/tt5941692/,movie,5.6,104.0,2019.0,"Action, Crime, Drama, Thriller",4612.0,2019-01-25,Catherine Hardwicke
3,4,tt3513498,2018-09-03,2019-07-07,Release date - February 8th\nStudio - Warner B...,The Lego Movie 2: The Second Part,https://www.imdb.com/title/tt3513498/,movie,6.7,107.0,2019.0,"Animation, Action, Adventure, Comedy, Family, ...",36374.0,2019-02-06,Mike Mitchell
4,5,tt7634968,2018-09-03,2019-07-07,Release date - February 8th\nStudio - Paramoun...,What Men Want,https://www.imdb.com/title/tt7634968/,movie,5.1,117.0,2019.0,"Comedy, Fantasy, Romance",12466.0,2019-01-10,Adam Shankman


In [8]:
def last_name(name):
    """Return the last whitespace-separated token of ``name``.

    Args:
        name: A full name such as ``"M. Night Shyamalan"``. Values that are
            not strings (for example ``None`` or a float ``NaN`` standing in
            for a missing value) are treated as "no name".

    Returns:
        The final token of the name, or ``None`` when there is no token
        (empty string, whitespace only, or a non-string value).
    """
    if not isinstance(name, str):
        return None
    # split() without an argument splits on any run of whitespace and ignores
    # leading/trailing blanks, so "Joe Cornish " still yields "Cornish".
    tokens = name.split()
    return tokens[-1] if tokens else None

assert last_name("Joe Cornish") != "Joe Cornish", "You just gave me back the whole name"
assert last_name("Joe Cornish") == "Cornish", "That is not the last name"
assert last_name("M. Night Shyamalan") == "Shyamalan", "You didn't handle cases where there are three names"
assert last_name("Cher") == "Cher", "You didn't handle cases where there is one name"
assert last_name("") is None
assert last_name("Joe Cornish ") == "Cornish", "You didn't handle trailing whitespace"
assert last_name(None) is None, "You didn't handle a missing value"


In [9]:
#Functions that obtain the last name of the directors
#data_file_df['Directors_last'] = data_file_df['Directors'].map(lambda x: x.split(" ")[-1])
#data_file_df['Directors_last'] = data_file_df['Directors'].map(last_name)
#data_file_df['Const_Clean'] = data_file_df['Const'].map(lambda x: x[2:])
#data_file_df.head()

In [10]:
#Helper that strips the leading two-character prefix from a "Const" value of the dataframe above
def const_cleanup(const_value):
    """Return ``const_value`` with its first two characters removed.

    Example: ``'tt7634968'`` becomes ``'7634968'``.
    """
    prefix_length = 2
    cleaned = const_value[prefix_length:]
    return cleaned

assert const_cleanup('tt7634968') == '7634968'

In [11]:
#titles_df["Year"] = titles_df['Title'].map(lambda x: my_result_search(x,'cover url'))

In [12]:
# NOTE: despite the "_df" suffix this is not a DataFrame: .values.tolist() on the
# one-column "Title" selection yields a list of one-element lists ([[title], ...])
titles_df = data_file_df.loc[:, ["Title"]].values.tolist()

In [13]:
#my_list holds the movie names from the csv file: the first (only) item of each row in titles_df
my_list = [row[0] for row in titles_df]
#my_list

In [14]:
#movies = db.movies.find()

In [15]:
#movie_titles = []
#for i in range(len(titles_df)):
#    movie_titles.append(titles_df[i][0])

#### Loop through the title list, pull their title data and push to MongoDB database

In [16]:
import my_functions as nv

In [17]:
from imdb import IMDb


def my_result_search(title_name: str, movie_key: str = 'long imdb title', client=None) -> object:
    """Look up one field of the first IMDb search hit for a movie title.

    Args:
        title_name: Movie title to search for.
        movie_key: Key to read from the first search result, for example
            ``'title'`` or ``'year'``. Defaults to ``'long imdb title'``.
        client: Optional object exposing ``search_movie(title)``. When it is
            omitted a new ``IMDb()`` client is created for this call; passing
            an existing client lets many lookups share one instance.

    Returns:
        The value stored under ``movie_key`` in the first search result (not
        necessarily a string), or the string ``"NA"`` when the lookup fails:
        no results, missing key, or an error raised by the search.
    """
    try:
        if client is None:
            client = IMDb()
        return client.search_movie(title_name)[0][movie_key]
    except Exception:
        # Best-effort lookup: every ordinary failure maps to the "NA" sentinel.
        # Unlike a bare ``except:`` this lets KeyboardInterrupt and SystemExit
        # propagate, so a long batch of searches can still be interrupted.
        return "NA"

In [18]:
def multi_result_search(my_list: list, our_key) -> list:
    """Look up one or more IMDb fields for every title in a list.

    Args:
        my_list: Movie titles to search for.
        our_key: Either a single search key or a list of search keys.

    Returns:
        A list with one dictionary per title, in input order. Each dictionary
        maps every requested key to the value ``my_result_search`` returned
        for that title and key.
    """
    # Normalise to a list of keys so both input forms share one code path.
    keys = our_key if isinstance(our_key, list) else [our_key]
    return [
        {key: my_result_search(title, key) for key in keys}
        for title in my_list
    ]

In [19]:
# search-result fields to collect for every title; passed as the key list to multi_result_search
all_keys = ['title','year']

In [60]:
#We pop the movie 'The Hunt' because there is no year key for the movie
# Look the position up at removal time instead of hard-coding it, so the right
# title is removed even if the CSV order or the earlier removals change.
my_list.pop(my_list.index('The Hunt'))

'The Hunt'

In [20]:
#We pop 'Once Upon a Time ... in Hollywood' because mongo does not allow for '...'
# NOTE(review): MongoDB restricts '.' in field names, not in values; confirm
# whether this exclusion is still needed now that titles are stored as values.
# Look the position up at removal time instead of hard-coding it, so the right
# title is removed even if the CSV order or the earlier removals change.
my_list.pop(my_list.index('Once Upon a Time ... in Hollywood'))

'Once Upon a Time ... in Hollywood'

In [66]:
#movie does not have 'year' in the database
# Look the position up at removal time instead of hard-coding it, so the right
# title is removed even if the CSV order or the earlier removals change.
my_list.pop(my_list.index('Eli'))

'Eli'

In [65]:
# inspect which fields the first search hit for 'Eli' exposes (the output below has no 'year')
ia.search_movie('Eli')[0].keys()

['title',
 'kind',
 'cover url',
 'canonical title',
 'long imdb title',
 'long imdb canonical title',
 'smart canonical title',
 'smart long imdb canonical title',
 'full-size cover url']

In [21]:
#run the functions we defined; the output is a list of dictionaries, one per title
#if 'nv.multi_result_search()' doesn't work, run the 'multi_result_search' cell from above and remove 'nv' 
new_movies_list = multi_result_search(my_list, all_keys)

In [69]:
# movie_keys = ['title','year']
# for title_name in my_list:
#         print(title_name)
#         print(ia.search_movie(title_name)[0]['year'])

In [22]:
# display the collected records: one dictionary with 'title' and 'year' per movie
new_movies_list

[{'title': 'Glass', 'year': 2019},
 {'title': 'The Kid Who Would Be King', 'year': 2019},
 {'title': 'Miss Bala', 'year': 2019},
 {'title': 'The Lego Movie 2: The Second Part', 'year': 2019},
 {'title': 'What Men Want', 'year': 2019},
 {'title': 'Alita: Battle Angel', 'year': 2019},
 {'title': 'Fighting with My Family', 'year': 2019},
 {'title': "Isn't It Romantic", 'year': 2019},
 {'title': 'Happy Death Day 2U', 'year': 2019},
 {'title': 'How to Train Your Dragon: The Hidden World', 'year': 2019},
 {'title': 'A Madea Family Funeral', 'year': 2019},
 {'title': 'Shazam!', 'year': 2019},
 {'title': 'The Kid', 'year': 2019},
 {'title': 'Wonder Park', 'year': 2019},
 {'title': 'The Hummingbird Project', 'year': 2018},
 {'title': 'The Aftermath', 'year': 2019},
 {'title': 'Us', 'year': 2019},
 {'title': 'Hotel Mumbai', 'year': 2018},
 {'title': 'Dumbo', 'year': 2019},
 {'title': 'The Beach Bum', 'year': 2019},
 {'title': 'Shazam!', 'year': 2019},
 {'title': 'Pet Sematary', 'year': 2019},
 {

In [71]:
#Reference the collection 'movie_info_db' of the IMDB database
#(PyMongo creates collections lazily, on the first insert)
db.movie_info_db

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'IMDB'), 'movie_info_db')

In [72]:
#Insert into database
# One bulk insert instead of one round trip per document. insert_many()
# rejects an empty list, so skip the call when there is nothing to store.
if new_movies_list:
    db.movie_info_db.insert_many(new_movies_list)

# find() returns a lazy cursor; iterate over it (or wrap it in list()) to see the documents
db.movie_info_db.find()

<pymongo.cursor.Cursor at 0x11855dba8>