#                                              IMDB Web Scrape

# 1) DEFINE FUNCTIONS

In [1]:
import mysql.connector 
import marco #document with my AWS credentials
import requests
import time
from bs4 import BeautifulSoup as BS




# DEFINE FUNCTIONS TO SCRAPE AND PARSE IMDB FOR TOP MOVIES



        
def get_page(url,headers):
    """
    Downloads one IMDB search-results page and extracts its movie blocks.
    Returns: List of IMDB html tags, one per movie block found at the specified url. An empty list is
    returned when the request fails, the response status is not 200, or the results container is missing.
    Param url: [str] IMDB url to search on.
    Param headers: [dict] headers to pass into requests.get (e.g. a user-agent) so IMDB accepts the request.
    """
    try:
        page = requests.get(url, headers=headers, timeout=5)
    except requests.ConnectionError as e:
        print("OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n")
        print(str(e))
        return []
    except requests.Timeout as e:
        print("OOPS!! Timeout Error")
        print(str(e))
        return []
    except requests.RequestException as e:
        print("OOPS!! General Error")
        print(str(e))
        return []
    except KeyboardInterrupt:
        # Report the interruption, then let it propagate so the user can really stop the run.
        print("Someone closed the program")
        raise

    if page.status_code != 200:
        # An error page does not contain the results list, so there is nothing to parse.
        print(page.status_code)
        return []

    soup = BS(page.content, 'html.parser')

    # The movie blocks live inside a single container; without it the page has no results.
    container = soup.find(class_='lister list detail sub-list')
    if container is None:
        return []

    return container.find_all(class_='lister-item mode-advanced')

def all_pages(genre, headers,no_results):
    """
    Runs get_page(url, headers) once per results page and concatenates the movie blocks.
    Returns: List of IMDB html tags collected from every requested page. Whole pages are fetched, so the
    list may hold more than no_results entries.
    Param genre: [str] genre to search on from IMDB advanced search.
    Param headers: [dict] headers to pass into requests.get (forwarded unchanged to get_page).
    Param no_results: [int] number of results we want to obtain.
    """
    base_url = f'https://www.imdb.com/search/title/?title_type=feature,tv_movie&genres={genre}&'
    results = []
    start = 1
    # `<=` so that the page containing result number `no_results` is fetched as well
    # (e.g. no_results=1 needs the first page, no_results=51 needs the second one).
    while start <= no_results:
        if start == 1:
            url = base_url
        else:
            url = base_url + 'start={}&ref_=adv_nxt'.format(start)
        results.extend(get_page(url, headers))
        # Pause between requests so we do not hammer the server.
        time.sleep(0.5)
        # Each request advances the start offset by 50 results.
        start += 50
    return results


def remove_comma_int(comma_str):
    """
    Helper for parse_data(movies).
    Returns: An int from a numerical string that may contain thousands separators. Meant to remove
    commas from imdb rank, e.g. '1,000' --> 1000 and '1,000,000' --> 1000000.
    Param comma_str: [str] is a numerical string that may or may not contain commas
    """
    # Strip every comma, not just the first one, so values of a million and above also parse.
    return int(comma_str.replace(',', ''))

def get_directors_stars(movie):
    """
    Helper for parse_data(movies) {see function below} to format directors and stars nicely.
    Returns: A tuple in the form (['Director1','Director 2'], ['Star1', 'Star2', etc.]). Either list is
    empty when the corresponding section is absent from the movie block.
    Param movie: Single IMDB html tag for a movie
    """

    # Directors and stars share one class-less <p>; its text looks like
    # "Director: A | Stars: B, \nC" with the sections separated by '|'.
    credits_tag = movie.find('p', class_='')
    if credits_tag is None:
        return ([], [])

    directors = []
    stars = []
    for section in credits_tag.text.split('|'):
        # Split "Label: names" on the first colon; a section without a colon has an empty name part.
        label, _, names = section.partition(':')
        label = label.strip().lower()
        # Names are separated by ', \n'; strip each one and drop empty leftovers.
        people = [name.strip() for name in names.strip().split(', \n') if name.strip()]
        # Match on the label prefix so singular and plural labels are both recognised.
        if label.startswith('director'):
            directors = people
        elif label.startswith('star'):
            stars = people

    return (directors, stars)

def parse_data(movies):
    """
    Parses through the raw html movie blocks from the data call to return usable results.
    Returns: List of dictionaries, one per movie, with the keys 'imdb_rank' [int], 'imdb_id' [str],
    'title' [str], 'genres' [list of strs], 'metascore' [int], 'imdb_score' [float], 'votes' [int],
    'directors' [list of strs], 'stars' [list of strs] and 'length' [str]. The keys from 'metascore'
    onward are None when the information is missing from the movie block.
    Param movies: [list] of IMDB html tags, each element representing individual movies.
    """
    # Errors that mean "this piece of information is not present / not parseable" for a movie:
    # a failed find() returns None (AttributeError/TypeError), a missing attribute is a KeyError,
    # an empty find_all() is an IndexError and a non-numeric text is a ValueError.
    missing_info = (AttributeError, IndexError, KeyError, TypeError, ValueError)

    movies_details = []
    for movie in movies:
        movie_details = {}

        # Mandatory fields: these are expected on every movie block, so failures are not suppressed.
        rank_text = movie.find('span',{'class':'lister-item-index unbold text-primary'}).text[:-1] # drop the trailing character after the rank
        movie_details['imdb_rank'] = remove_comma_int(rank_text) # Popularity rank within the genre [int]
        movie_details['imdb_id'] = movie.find('div',{'class':'ribbonize'})['data-tconst'] # IMDB id [str]
        movie_details['title'] = movie.find('h3', {'class':'lister-item-header'}).find('a').text # Title [str]
        movie_details['genres'] = movie.find('span',{'class':'genre'}).text.strip().split(', ') # Genres [list of strs]

        # Optional fields: we don't want the function to break if one is missing, so we assign None instead.
        try:
            # Match any span carrying the 'metascore' CSS class, whatever its second class is.
            movie_details['metascore'] = int(movie.find('span', class_='metascore').text.strip()) # Metacritic score [int]
        except missing_info:
            movie_details['metascore'] = None
        try:
            movie_details['imdb_score'] = float(movie.find('div',{'class':'ratings-bar'}).find('strong').text) # IMDB score [float]
        except missing_info:
            movie_details['imdb_score'] = None
        try:
            movie_details['votes'] = int(movie.find_all('span', {'name':'nv'})[0]['data-value']) # No. of votes [int]
        except missing_info:
            movie_details['votes'] = None
        try:
            # Parse the credits once and reuse the result for both keys.
            directors, stars = get_directors_stars(movie)
        except missing_info:
            directors, stars = None, None
        movie_details['directors'] = directors # Director(s) name(s) [list of strs]
        movie_details['stars'] = stars # Star(s) name(s) [list of strs]
        try:
            movie_details['length'] = movie.find('span',{'class':'runtime'}).text.split()[0] # Movie length in minutes [str]
        except missing_info:
            movie_details['length'] = None

        movies_details.append(movie_details)
    return movies_details




# DEFINE FUNCTIONS TO CREATE MYSQL TABLES




def create_imdb_table(name):
    """
    Creates the main imdb table (one row per movie, keyed by imdb_id) through the module-level cursor.
    Param name: [str] the name we want to assign to this table. It is interpolated directly into the
    statement, so it must be a trusted identifier.
    """
    # Table names cannot be bound as query parameters, hence the f-string.
    cursor.execute(f"""
    CREATE TABLE {name} (
        imdb_id varchar(20) NOT NULL UNIQUE,
        imdb_rank int(10) NOT NULL UNIQUE,
        imdb_score float(10),
        metascore int(20),
        votes int(40),
        length int(20),
        CONSTRAINT imdb_id PRIMARY KEY (imdb_id)
        );

    """)

def create_other_table(table_name,column_name):
    """
    Creates MySQL table with name {table_name} through the module-level cursor. Must be for genres,
    directors, or stars: the table pairs an imdb_id with one value of the given column.
    Param table_name: [str] Table name
    Param column_name: [str] Must be either 'genre', 'director', or 'star'.
    Raises: AssertionError if column_name is not one of the allowed values.
    """
    assert column_name in ['genre', 'director', 'star'], 'The field is not "genre", "director", or "star"'
    # varchar(255) rather than varchar(15): many director/star names are longer than 15 characters
    # and would otherwise be rejected or truncated on insert.
    create_query = f"""
    CREATE TABLE {table_name} (
        imdb_id varchar(20) NOT NULL,
        {column_name} varchar(255)
        );
    """
    #     print(create_query) # Uncomment this and comment out the statement below to see the modularity of this fn.

    cursor.execute(create_query)
    
    
    
    

# DEFINE FUNCTIONS TO INSERT DATA INTO MYSQL TABLES




    
def insert_imdb_data(parsed_movies,table):
    """
    Writes one row per parsed movie into the given imdb table and commits, using the module-level
    cursor and connection.
    Param parsed_movies: [list] List of dicts, each holding the keys 'imdb_id', 'imdb_rank',
    'imdb_score', 'metascore', 'votes' and 'length'.
    Param table: [str] Name of table to insert data into
    """
    # Dict keys in the same order as the columns of the INSERT statement below.
    columns = ('imdb_id', 'imdb_rank', 'imdb_score', 'metascore', 'votes', 'length')

    rows = []
    for entry in parsed_movies:
        rows.append(tuple(entry[column] for column in columns))

    insert_statement = f"INSERT INTO {table} (imdb_id, imdb_rank, imdb_score, metascore, votes, length) VALUES (%s, %s, %s, %s, %s, %s)"
    cursor.executemany(insert_statement, rows)
    cnx.commit()
    
def make_tuples(movies,key):
    """
    Helper for insert_other_data().
    Returns: List of tuples in the form (imdb_id, value), one per element of movie[key]. Empty
    elements are stored as None; movies whose movie[key] is None contribute no tuples.
    Param movies: [list] Parsed list of movie dictionaries at least containing keys ['imdb_id', key]
    Param key: [str] key from movies that we want to pair. For genres, will be 'genres'. For directors, will be 'directors'
    """
    list_of_tuples = []
    for movie in movies:
        values = movie[key]
        # parse_data stores None when the information could not be extracted; nothing to pair then.
        if values is None:
            continue
        # One tuple per element, mapping empty values to None.
        for value in values:
            list_of_tuples.append((movie['imdb_id'], value if value else None))

    return list_of_tuples

def insert_other_data(parsed_movies, table, column_name):
    """
    Writes (imdb_id, value) rows for genres, directors, or stars into the given table and commits,
    using the module-level cursor and connection.
    Param parsed_movies: [list] List of dicts from the parse_data function.
    Param table: [str] Name of the table we want to insert into
    Param column_name: [str] Must be either 'genre', 'director', or 'star'.
    """
    allowed_columns = ('genre', 'director', 'star')
    assert column_name in allowed_columns, 'The field is not "genre", "director", or "star"'

    # The parsed dicts use the plural form of the column name as their key.
    plural_key = column_name + 's'
    rows = make_tuples(parsed_movies, plural_key)

    cursor.executemany(f"INSERT INTO {table} (imdb_id, {column_name}) VALUES (%s, %s)", rows)
    cnx.commit()

    


# 2) WRAP EVERYTHING INTO ONE FUNCTION THAT PERFORMS THE WHOLE PROCESS

In [2]:
# WRAP EVERYTHING INTO ONE FUNCTION THAT PERFORMS THE WHOLE PROCESS


def full_script(genre, no_results=2000):
    """
    Runs the whole process for one genre: scrape IMDB, parse the results, create the four MySQL
    tables (imdb_, genres_, directors_ and stars_ followed by the genre) and insert the data.
    Relies on the module-level MySQL cursor and connection used by the create/insert functions.
    Param genre: [str] genre to search on; also used as the suffix of the table names.
    Param no_results: [int] number of results we want to obtain (defaults to 2000).
    """

    # Headers for webscraping request
    headers = {'user-agent': 'Safari/13.0.2 (Macintosh; Intel Mac OS X 10_15)'}

    # Call the page request functions - get desired number of results, parse, and save.
    saved_all_pages = all_pages(f'{genre}', headers, no_results)
    saved_parsed_data = parse_data(saved_all_pages)

    # Create tables in MySQL
    create_imdb_table(f'imdb_{genre}')
    create_other_table(f'genres_{genre}','genre')
    create_other_table(f'directors_{genre}','director')
    create_other_table(f'stars_{genre}','star')

    # Insert and commit data into MySQL tables
    insert_imdb_data(saved_parsed_data,f'imdb_{genre}')
    insert_other_data(saved_parsed_data,f'stars_{genre}','star')
    insert_other_data(saved_parsed_data,f'genres_{genre}','genre')
    insert_other_data(saved_parsed_data,f'directors_{genre}','director')
    

# 3) Run all of the commands

In [12]:
# Connect to movies database (credentials come from the local `marco` module)
cnx = mysql.connector.connect(
    host=marco.host,
    user=marco.user,
    passwd=marco.password,
    database='movies',
)
cursor = cnx.cursor()

try:
    # Run full script to scrape, save, parse, create tables, and insert data into tables
    full_script('drama')
finally:
    # Close connection to movies database, even when the scrape or an insert fails
    cursor.close()
    cnx.close()
    

# Just for when things get hairy

In [None]:
def drop_table_set(genre=None):
    """
    Drops the set of four tables (imdb, genres, stars, directors) from the movies database.
    Opens its own connection and always closes it again.
    Param genre: [str] optional genre suffix. When given, the tables dropped are
    imdb_{genre}, genres_{genre}, stars_{genre} and directors_{genre}, matching the names used when
    the tables are created; when omitted, the unsuffixed names are dropped.
    """
    suffix = f'_{genre}' if genre else ''
    tables = [f'{base}{suffix}' for base in ('imdb', 'genres', 'stars', 'directors')]

    # Connect to movies database
    cnx = mysql.connector.connect(
        host=marco.host,
        user=marco.user,
        passwd=marco.password,
        database='movies',
    )
    try:
        cursor = cnx.cursor()
        try:
            # Delete tables; IF EXISTS keeps a missing table from aborting the remaining drops.
            for table in tables:
                cursor.execute(f"DROP TABLE IF EXISTS {table}")
        finally:
            cursor.close()
    finally:
        # Close connection to movies database
        cnx.close()
    
        

In [None]:
# Clean-up cell: drop the table set by calling drop_table_set() with no arguments.
drop_table_set()

In [11]:
# Manually close the module-level cursor and connection.
cursor.close()
cnx.close()
    