#                                              IMDB Web Scrape

## Define Functions to Scrape and Parse IMDB for Top 2000 Horror Movies

In [378]:
import requests
import time
from bs4 import BeautifulSoup as BS

In [379]:
def remove_comma_int(comma_str):
    """
    Helper function.
    Returns: An int parsed from a numerical string that may contain commas. Meant to remove commas from imdb rank
    '1,000' --> 1000 and '1,234,567' --> 1234567
    Param comma_str: [str] is a numerical string that may or may not contain commas
    """
    # str.replace strips every comma (not only the first one), so values of a
    # million or more are handled as well as plain strings without commas.
    return int(comma_str.replace(',', ''))

        
def get_page(url,headers):
    """
    Returns: List of IMDB html tags, each representing one movie block from the specified url.
             An empty list is returned when the request fails or when the page does not
             contain the expected listing container, so callers can keep going.
    Param url: [str] IMDB url to search on.
    Param headers: [dict] headers to pass into requests.get (e.g. a user-agent).
    Raises: KeyboardInterrupt is reported and then re-raised so the user can stop the scrape.
    """
    try:
        page = requests.get(url, headers=headers, timeout=5)
    except requests.ConnectionError as e:
        print("OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n")
        print(str(e))
        return []
    except requests.Timeout as e:
        print("OOPS!! Timeout Error")
        print(str(e))
        return []
    except requests.RequestException as e:
        print("OOPS!! General Error")
        print(str(e))
        return []
    except KeyboardInterrupt:
        print("Someone closed the program")
        raise

    # A non-200 status is reported but the body is still parsed, as before;
    # the container check below covers pages that lack the movie listing.
    if page.status_code != 200:
        print(page.status_code)

    soup = BS(page.content, 'html.parser')

    listing = soup.find(class_='lister list detail sub-list')
    if listing is None:
        # No listing container on this page: nothing to return.
        return []
    return listing.find_all(class_='lister-item mode-advanced')

def all_pages(url, headers, total=2000, delay=0.5):
    """
    Runs get_page multiple times, once per results page, and collects all movie blocks.
    Returns: [list] of IMDB html tags accumulated from every page, in page order.
    Param url: [str] Kept for backward compatibility; it is not used, because every page
               url is built from the fixed horror search base url below.
    Param headers: [dict] headers forwarded to get_page for each request.
    Param total: [int] Page start offsets are generated while they are below this value
                 (default 2000 gives offsets 1, 51, ..., 1951).
    Param delay: [float] Seconds to sleep after each page request.
    """
    base_url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&genres=horror&'
    page_size = 50  # step between consecutive start offsets
    results = []
    for start in range(1, total, page_size):
        if start == 1:
            # The first page is requested without a start offset.
            page_url = base_url
        else:
            page_url = base_url + 'start={}&ref_=adv_nxt'.format(start)
        results.extend(get_page(page_url, headers))
        time.sleep(delay)
    return results

def parse_data(movies):
    """
    Parses through the raw html movie blocks from the data call to return usable results.
    Returns: List of dictionaries, one per movie, with the keys
             imdb_rank [int], imdb_id [str], title [str], genre [list of strs],
             metacritic_score [float or None], imdb_score [float or None],
             votes [str or None], director [str or None], length [str or None].
             Optional fields are None when the tag is missing or its text cannot be converted.
    Param movies: [list] of IMDB html tags, each element representing individual movies.
    """
    movies_details = []
    for movie in movies:
        movie_details = {}

        # Required fields: a movie block without these raises, as before.
        movie_details['imdb_rank'] = remove_comma_int(movie.find('span',{'class':'lister-item-index unbold text-primary'}).text[:-1]) # Popularity rank within Horror genre [int]
        movie_details['imdb_id'] = movie.find('div',{'class':'ribbonize'})['data-tconst'] # IMDB id [str]
#         movie_details['url'] = 'www.imdb.com/' + movie.find('a')['href'] # Movie's url [str]
        movie_details['title'] = movie.find('img')['alt'] # Title [str]
        movie_details['genre'] = movie.find('span',{'class':'genre'}).text.strip().split(', ') # Genres [list of strs]

        # Optional fields. Matching on the single 'metascore' class picks up the
        # score span whatever additional rating class it carries.
        movie_details['metacritic_score'] = _value_or_none(
            lambda: float(movie.find('span', class_='metascore').text.strip())) # Metacritic score [float]
        movie_details['imdb_score'] = _value_or_none(
            lambda: float(movie.find('strong').text)) # IMDB score [float]
        movie_details['votes'] = _value_or_none(
            lambda: movie.find_all('span', {'name':'nv'})[0]['data-value']) # No. of votes [str]
        movie_details['director'] = _value_or_none(
            lambda: movie.find('p',{'class':''}).find('a').text) # Director name [str]
        movie_details['length'] = _value_or_none(
            lambda: movie.find('span',{'class':'runtime'}).text.split()[0]) # Movie length in minutes [str]
        movies_details.append(movie_details)
    return movies_details


def _value_or_none(extract):
    """
    Private helper for parse_data.
    Returns: The result of calling extract(), or None when the lookup fails because a tag or
             attribute is missing, a list is empty, or the text cannot be converted.
    Param extract: zero-argument callable that pulls one field out of a movie block.
    """
    try:
        return extract()
    except (AttributeError, TypeError, ValueError, IndexError, KeyError):
        return None


## Commands for Storing Data in Local Memory

In [332]:
# Search url for the horror listing and the user-agent header sent with every request.
url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&genres=horror&view=advanced'
headers = {'user-agent': 'Safari/13.0.2 (Macintosh; Intel Mac OS X 10_15)'}
# Scrape all result pages and keep the raw movie tags in a variable for the parsing step.
saved_all_pages = all_pages(url, headers)

In [366]:
# Turn the saved raw movie tags into a list of per-movie dictionaries.
saved_parsed_data = parse_data(saved_all_pages)

In [367]:
# Spot-check a single parsed record.
saved_parsed_data[1060]

{'imdb_rank': 1061,
 'imdb_id': 'tt5114154',
 'title': 'The Keeping Hours',
 'genre': ['Drama', 'Fantasy', 'Horror'],
 'metacritic_score': None,
 'imdb_score': 6.2,
 'votes': '2442',
 'director': 'Karen Moncrieff',
 'length': '91'}

# Create Tables

### Import necessary packages and then connect to movies db

In [426]:
## Connect to DB server on AWS
import mysql.connector 
import marco
import requests

# Open the MySQL connection; host, user and password are read from the local `marco` module.
# No database is selected here.
cnx = mysql.connector.connect(
    host = marco.host,
    user = marco.user,
    passwd = marco.password
)
# Module-level cursor shared by the statements and insert functions that follow.
cursor = cnx.cursor()

In [427]:
from mysql.connector import errorcode
db_name = 'movies'
def create_database(cursor, database):
    """
    Creates a database whose default character set is utf8.
    If the statement fails with a mysql.connector.Error, the error is printed and exit(1) is called.
    Param cursor: cursor object whose execute method runs the statement.
    Param database: [str] name of the database to create.
    """
    statement = f"CREATE DATABASE {database} DEFAULT CHARACTER SET 'utf8'"
    try:
        cursor.execute(statement)
    except mysql.connector.Error as failure:
        print(f"Failed creating database: {failure}")
        exit(1)

# Select the target database; when it is missing, create it and select it.
try:
    cursor.execute("USE {}".format(db_name))
except mysql.connector.Error as err:
    # NOTE: this message is printed for every connector error, before the error code is checked.
    print("Database {} does not exists.".format(db_name))
    # ER_BAD_DB_ERROR is the one case handled by creating the database.
    if err.errno == errorcode.ER_BAD_DB_ERROR:
        create_database(cursor, db_name)
        print("Database {} created successfully.".format(db_name))
        # Point the open connection at the newly created database.
        cnx.database = db_name
    else:
        # Any other error is printed and exit(1) is called.
        print(err)
        exit(1)

### Create imdb table

In [342]:
# DDL for the imdb table: one row per movie, with imdb_id as the primary key.
# NOTE(review): the rank column is spelled `imbd_rank`; the INSERT statement in
# insert_imdb uses the same spelling, so rename both together if this is corrected.
create_imdb_query = """
CREATE TABLE imdb (
    imdb_id varchar(20) NOT NULL UNIQUE,
    imbd_rank int(10) NOT NULL UNIQUE,
    imdb_score float(10),
    metacritic_score float(20),
    votes int(40),
    director varchar(50),
    length int(20),
    CONSTRAINT imdb_id PRIMARY KEY (imdb_id)
    );

"""
# Run the statement on the shared cursor.
cursor.execute(create_imdb_query)

### Create genres table

In [435]:
# DDL for the genres table: one row per (imdb_id, genre) pair, with imdb_id
# declared as a foreign key to the imdb table.
# The comma after the genre column is required: column definitions and table
# constraints are separate, comma-delimited items in CREATE TABLE.
create_genres_query = """
CREATE TABLE genres (
    imdb_id varchar(20) NOT NULL,
    genre varchar(15) NOT NULL,
    CONSTRAINT imdb_id FOREIGN KEY (imdb_id) REFERENCES imdb (imdb_id)
    );
"""
# Run the statement on the shared cursor.
cursor.execute(create_genres_query)

### Could in theory create both tables at once

In [None]:
#cursor.executemany([create_imdb_query,create_genres_query])

# Insert Data into imdb Table

In [376]:
def insert_imdb(parsed_movies):
    """
    Writes one row per movie into the imdb table and commits the transaction.
    Uses the module-level cursor and cnx objects.
    Param parsed_movies: [list] List of dicts, each holding the keys listed in row_keys below.
    """
    # Dictionary keys, in the same order as the columns named in the INSERT statement.
    row_keys = ('imdb_id',
                'imdb_rank',
                'imdb_score',
                'metacritic_score',
                'votes',
                'director',
                'length')

    rows = []
    for record in parsed_movies:
        rows.append(tuple(record[key] for key in row_keys))

    insert_statement = "INSERT INTO imdb (imdb_id, imbd_rank, imdb_score, metacritic_score, votes, director, length) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    cursor.executemany(insert_statement, rows)
    cnx.commit()

In [377]:
# Load the parsed movies into the imdb table.
insert_imdb(saved_parsed_data)

# Format genre Data Structure

In [436]:
def make_genre_tuples(movies):
    """
    Helper function
    Returns: List of tuples in the form ('imdb_id', 'genre'), one tuple per genre of every movie,
             in the same order as the input. A movie with an empty genre list contributes nothing.
    Param movies: [list] Parsed list of movie dictionaries at least containing keys ['imdb_id', 'genre']
    """
    genre_tuples = []
    for movie in movies:
        imdb_id = movie['imdb_id']
        # Pair the movie's id with each of its genres directly; no index bookkeeping needed.
        genre_tuples.extend((imdb_id, genre) for genre in movie['genre'])
    return genre_tuples
            
    
    
def insert_genres(parsed_movies):
    """
    Writes one (imdb_id, genre) row per movie genre into the genres table and commits the transaction.
    Uses the module-level cursor and cnx objects.
    Param parsed_movies: [list] List of movie dicts; make_genre_tuples flattens them into rows.
    """
    genre_rows = make_genre_tuples(parsed_movies)
    cursor.executemany(
        "INSERT INTO genres (imdb_id, genre) VALUES (%s, %s)",
        genre_rows,
    )
    cnx.commit()

In [437]:
# Load the (imdb_id, genre) pairs of the parsed movies into the genres table.
insert_genres(saved_parsed_data)

In [383]:
# cursor.close()
# cnx.close()