## FILMAFFINITY - Data extraction
---

In [1]:
# Imports

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

In [2]:
# The Scraper class

class FilmaffinityScraper():

    """A small scraper to extract data from Filmaffinity's movie database.

    The scraper drives a Chrome browser through Selenium: it fills in the
    advanced search form, walks the result pages and appends each movie's
    information and rating to a text file (one movie per record, records
    separated by a blank line).
    """

    def __init__(self, path="./chromedriver", wait_time=3):
        """Start a Chrome session and prepare the explicit wait.

        Args:
            path: Location of the chromedriver executable.
            wait_time: Timeout, in seconds, given to the explicit wait.
        """
        self.path = path
        # NOTE(review): the driver path is passed positionally; recent
        # Selenium releases may expect a Service object instead -- confirm
        # against the Selenium version this notebook is run with.
        self.driver = webdriver.Chrome(self.path)
        self.wait_time = wait_time
        self.wait = WebDriverWait(self.driver, self.wait_time)

    def __repr__(self):
        """Return the repr format of the scraper object."""
        return f"FilmaffinityScraper(path = '{self.path}', wait_time = {self.wait_time})"

    def close(self):
        """Quit the browser session opened by the constructor."""
        self.driver.quit()

    def _wait_for(self, by, value):
        """Wait until the element is present in the page, then return it.

        Raises:
            TimeoutException: if the element does not show up in time.
        """
        self.wait.until(EC.presence_of_element_located((by, value)))
        return self.driver.find_element(by, value)

    def get_advanced_search(self):
        """Load Filmaffinity's advanced search page."""
        url = "https://www.filmaffinity.com/es/advsearch.php"
        self.driver.get(url)

    def search_by_text(self, text="", title=True, director=False, cast=False,
                       script=False, photo=False, music=False, producer=False):
        """Filter Filmaffinity's database by text.

        Args:
            text: Text typed in the search box; nothing is typed when empty.
            title, director, cast, script, photo, music, producer: Whether
                the text should be looked up in that field.

        Raises:
            TimeoutException: if the text options never appear in the page.
        """
        # Explicit mapping from checkbox value to the requested state.
        wanted = {
            "title": title,
            "director": director,
            "cast": cast,
            "script": script,
            "photo": photo,
            "music": music,
            "producer": producer,
        }

        text_search = self._wait_for(By.ID, "text-option-container")

        if text:
            text_input = text_search.find_element(By.NAME, "stext")
            text_input.clear()
            text_input.send_keys(text)

        for category, enabled in wanted.items():
            # The form is assumed to start with only "title" ticked, so a
            # checkbox is clicked only when the requested state differs from
            # that initial state.
            starts_checked = category == "title"
            if bool(enabled) != starts_checked:
                text_search.find_element(By.CSS_SELECTOR, f"input[value={category}]").click()

    def select_country(self, country_code):
        """Filter Filmaffinity's database by country (option value)."""
        country = Select(self._wait_for(By.ID, "country"))
        country.select_by_value(country_code)

    def select_genre(self, genre_code):
        """Filter Filmaffinity's database by genre (option value)."""
        genre = Select(self._wait_for(By.NAME, "genre"))
        genre.select_by_value(genre_code)

    def select_from_to_year(self, fromyear_code, toyear_code):
        """Filter Filmaffinity's database by period.

        Both arguments are matched against the visible text of the
        dropdown options, so they must be strings.
        """
        fromyear = Select(self._wait_for(By.NAME, "fromyear"))
        fromyear.select_by_visible_text(fromyear_code)

        toyear = Select(self._wait_for(By.NAME, "toyear"))
        toyear.select_by_visible_text(toyear_code)

    def search_selection(self):
        """Submit the search by sending RETURN to the search button."""
        search = self._wait_for(By.ID, "adv-search-button")
        search.send_keys(Keys.RETURN)

    def get_urls(self):
        """Return the movie urls listed in the current result page.

        Returns:
            A list of href values; empty when the page lists no movies.
        """
        try:
            self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.mc-title [href]")))
        except TimeoutException:
            # Deliberate best effort: a page without results is not an
            # error, the lookup below simply returns an empty list.
            pass
        movies = self.driver.find_elements(By.CSS_SELECTOR, "div.mc-title [href]")
        return [movie.get_attribute("href") for movie in movies]

    def extract_movie_info(self):
        """Return the text of the movie information list in the current page.

        Raises:
            TimeoutException: if the information list never appears.
        """
        return self._wait_for(By.CSS_SELECTOR, "dl.movie-info").text

    def extract_movie_rating(self):
        """Return the movie's rating text, or "-1" when the page has none."""
        try:
            movie_rating = self.driver.find_element(By.CSS_SELECTOR, "div#movie-rat-avg")
        except NoSuchElementException:
            return "-1"
        else:
            return movie_rating.text

    def write_movie_info(self, file_name):
        """Append the current movie's info and rating to a file.

        The record is written in a single line (line breaks replaced by
        "***") and followed by a blank line that separates records.
        """
        movie_info = self.extract_movie_info()
        movie_rating = self.extract_movie_rating()
        info = (movie_info + "\nRating\n" + movie_rating).replace("\n", "***")
        # Explicit UTF-8: the scraped text contains accented characters and
        # the platform default encoding may not be able to represent them.
        with open(file_name, "a", encoding="utf-8") as file:
            file.write(info)
            file.write("\n\n")

    def go_next_page(self):
        """Move to the next result page.

        Returns:
            True when a ">>" link was found in the pager and activated,
            False when there is no pager or no such link.
        """
        try:
            pager = self.driver.find_element(By.CSS_SELECTOR, "div.pager")
            next_page = pager.find_element(By.PARTIAL_LINK_TEXT, ">>")
        except NoSuchElementException:
            return False
        else:
            next_page.send_keys(Keys.RETURN)
            return True

    def _scrape_result_pages(self, file_name):
        """Write every movie of the current results, page after page."""
        while True:
            for url in self.get_urls():
                self.driver.get(url)
                self.write_movie_info(file_name)
                # Return to the result list before opening the next movie.
                self.driver.back()

            if not self.go_next_page():
                break

    def short_scrape(self, country, gender, fromyear, toyear, file_name):
        """Scrape a small selection (less than 500 movies) in one search.

        Args:
            country: Country code passed to select_country.
            gender: Genre code passed to select_genre (the parameter name is
                kept as is for backward compatibility).
            fromyear, toyear: First and last year of the period.
            file_name: File the movies are appended to.
        """
        self.get_advanced_search()
        self.select_country(country)
        self.select_genre(gender)
        self.select_from_to_year(str(fromyear), str(toyear))
        self.search_selection()

        self._scrape_result_pages(file_name)

        print("Everything was scraped >:)")

    def long_scrape(self, country, gender, fromyear, toyear, file_name):
        """Scrape a big selection (more than 500 movies) one year at a time.

        Args:
            country: Country code passed to select_country.
            gender: Genre code passed to select_genre (the parameter name is
                kept as is for backward compatibility).
            fromyear, toyear: First and last year of the period (integers,
                both included).
            file_name: File the movies are appended to.
        """
        self.get_advanced_search()
        self.select_country(country)
        self.select_genre(gender)

        # One search per year keeps every individual selection small.
        for number in range(fromyear, toyear + 1):
            year = str(number)
            self.select_from_to_year(year, year)
            self.search_selection()
            self._scrape_result_pages(file_name)

        print("Everything was scraped >:)")

    def verify(self, file_name):
        """Count the number of movies scraped in a file.

        Records are separated by a blank line; empty chunks (such as the
        one left after the last separator) are not counted.
        """
        with open(file_name, "r", encoding="utf-8") as file:
            everything = file.read()
        return sum(1 for record in everything.split("\n\n") if record)

In [3]:
# An example.

In [4]:
# Instantiate the scraper with its default arguments (driver located at
# ./chromedriver, 3-second explicit wait) and print its repr.

scraper = FilmaffinityScraper()
print(scraper)

FilmaffinityScraper(path = './chromedriver', wait_time = 3)


In [4]:
# Available genres: genre name (in Spanish) mapped to its short code.
# The codes look like the values expected by select_genre (the example
# search uses "AN") -- verify against the site's genre dropdown.

GENRES = {"Acción": "AC", 
          "Animación": "AN", 
          "Aventuras": "AV", 
          "Bélico": "BE", 
          "Ciencia ficción": "C-F", 
          "Cine negro": "F-N",
          "Comedia": "CO", 
          "Documental": "DO", 
          "Drama": "DR", 
          "Fantástico": "FAN", 
          "Infantil": "INF",
          "Intriga": "INT", 
          "Musical": "MU", 
          "Romance": "RO", 
          "Serie de TV": "TV_SE", 
          "Terror": "TE", 
          "Thriller": "TH",
          "Western": "WE",
         }

In [5]:
# A small search: open the advanced search form, filter by text, country,
# genre ("AN" is the Animación code in GENRES) and period, submit it and
# collect the movie urls listed in the current result page.

scraper.get_advanced_search()
scraper.search_by_text(text="cenicienta")
scraper.select_country("ES")
scraper.select_genre("AN")
scraper.select_from_to_year("1950", "2020")
scraper.search_selection()
urls = scraper.get_urls()

In [7]:
# Open the first url of the selection and append that movie's info and
# rating to a file. This needs the previous search to have returned at
# least one url (urls[0] raises IndexError on an empty list).

scraper.driver.get(urls[0])
scraper.write_movie_info("movie_scraping.txt")

In [8]:
# short_scrape runs a single search over the whole period; it is the faster
# option when we want to scrape fewer than 500 movies.

scraper.short_scrape("ES", "WE", 2008, 2020, "west_short.txt")

Everything was scraped >:)


In [9]:
# long_scrape repeats the search once per year, which automates a big
# extraction (more than 500 movies).

scraper.long_scrape("ES", "WE", 2008, 2020, "west_long.txt")

Everything was scraped >:)


In [11]:
# Verify that both scrapers wrote the same number of movies to their files.

scraper.verify("west_short.txt") == scraper.verify("west_long.txt")

True