### Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### What do I want?
    Get number of characters for each movie in order to compare budgets.
    Puntuación vs.
    Cuanto genera pixar al año?
    Directores de los cortos de pixar

## Scrapping step by step

### Common part

In [2]:
base_url = 'https://www.pixar.com'
endpoint = '/feature-films-launch'
request_url = f'{base_url}{endpoint}'
res = requests.get(request_url)
soup = BeautifulSoup(res.content, 'html.parser')
movies = soup.find_all('div', {'class':'slide'})
movies[0]

<div class="slide" data-animation-role="image" data-type="image">
<div class="margin-wrapper">
<a class="image-slide-anchor content-fit" href="/elemental">
<noscript><img alt="Elemental" src="https://images.squarespace-cdn.com/content/v1/51cdafc4e4b09eb676a64e68/1677261445468-VNTJ4L41VES1YMLW5R9G/elemental.jpg"/></noscript><img alt="Elemental" class="thumb-image" data-image="https://images.squarespace-cdn.com/content/v1/51cdafc4e4b09eb676a64e68/1677261445468-VNTJ4L41VES1YMLW5R9G/elemental.jpg" data-image-dimensions="720x1053" data-image-focal-point="0.5,0.5" data-image-id="63f8fa843cdaf815c13436de" data-load="false" data-src="https://images.squarespace-cdn.com/content/v1/51cdafc4e4b09eb676a64e68/1677261445468-VNTJ4L41VES1YMLW5R9G/elemental.jpg" data-type="image"/>
</a>
<div class="image-slide-title">Elemental</div>
</div>
</div>

In [3]:
link = movies[0].find('a')
link = link.get('href')
link

'/elemental'

In [4]:
name = movies[0].getText()
name.strip()

'Elemental'

In [5]:
endpoint = '/elemental'
request_url = f'{base_url}{endpoint}'
res = requests.get(request_url)
soup = BeautifulSoup(res.content, 'html.parser')

### Characters for each movie

In [6]:
characters = soup.find('section', {'id':'lightyear_character_main-copy'})
characters
blocks = characters.find_all('h2')
blocks

[<h2 style="white-space:pre-wrap;">Ember Lumen</h2>,
 <h2 style="white-space:pre-wrap;">Wade Ripple</h2>,
 <h2 style="white-space:pre-wrap;">Bernie Lumen</h2>,
 <h2 style="white-space:pre-wrap;">Cinder Lumen</h2>,
 <h2 style="white-space:pre-wrap;">Clod</h2>,
 <h2 style="white-space:pre-wrap;">Brook Ripple</h2>,
 <h2 style="white-space:pre-wrap;">Gale</h2>,
 <h2 style="white-space:pre-wrap;">Fern</h2>]

### Assets for each movie

In [7]:
assets = soup.find('section', {'id':'lightyear_world_design-copy'})
assets
blocks = assets.find_all('h2')
blocks

[<h2 style="white-space:pre-wrap;">Building The World</h2>,
 <h2 style="white-space:pre-wrap;">Firetown</h2>,
 <h2 style="white-space:pre-wrap;">The Water District</h2>,
 <h2 style="white-space:pre-wrap;">Cyclone Stadium</h2>,
 <h2 style="white-space:pre-wrap;">Graphics</h2>]

## Encapsulation

In [8]:
def get_html(url):
    '''
    Returns de html parsed.
    '''
    res = requests.get(url)
    return BeautifulSoup(res.content, 'html.parser')

In [9]:
def get_section_elements(m_html, num_sec):
    '''
    Returns a list of characters, assets (depends on the html number section).
    '''
    section = m_html.find_all('section')
    characters = section[num_sec].find_all('h2')
    return [i.getText() for i in characters]

def get_movies_dict(movies, num_sec):
    '''
    Returns a list of dictionaries.
    '''
    movies_dict_list = []
    for m in movies:
        name = m.getText()
        name = name.strip()
        link = m.find('a')
        link = link.get('href')
        request_url = f'{base_url}{link}'
        m_html = get_html(request_url)
        characters = get_section_elements(m_html, num_sec)
        movies_dict_list.append({name: characters})
    return movies_dict_list

def get_pixar_movie_list(url):
    soup = get_html(url)
    return soup.find_all('div', {'class':'slide'})


In [10]:
base_url = 'https://www.pixar.com'
endpoint = '/feature-films-launch'
request_url = f'{base_url}{endpoint}'

### Characters scrapping

In [11]:
characters_section = 2
movies = get_pixar_movie_list(request_url)
characters_dict = get_movies_dict(movies, characters_section)
characters_dict

[{'Elemental': ['Ember Lumen',
   'Wade Ripple',
   'Bernie Lumen',
   'Cinder Lumen',
   'Clod',
   'Brook Ripple',
   'Gale',
   'Fern']},
 {'Lightyear': ['The Human Side of Buzz',
   'Buzz Lightyear',
   'Izzy Hawthorne',
   'Sox',
   'Maurice “Mo” Morrison',
   'Darby Steel',
   'Zurg',
   'Alisha Hawthorne',
   'Zyclops',
   'Commander Burnside',
   'Airman DÍaz',
   'I.V.A.N.']},
 {'Turning Red': ['The Inner 13-Year Old',
   'Meilin Lee',
   'Red Panda Mei',
   'Ming Lee',
   'Jin Lee',
   'Miriam Mendelsohn',
   'Priya Mangal',
   'Abby Park',
   '4*Town',
   'Tyler Nguyen-Baker',
   'Grandma',
   'Sun Yee']},
 {'Luca': ['Just Add Water',
   'Luca Paguro',
   'Alberto Scorfano',
   'Giulia Marcovaldo',
   'Ercole Visconti',
   'Massimo Marcovaldo',
   'Daniela Paguro',
   'Lorenzo Paguro',
   'Grandma Paguro',
   'Uncle Ugo',
   'Machiavelli']},
 {'Soul': ['Joe Gardner',
   '22',
   'Dez',
   'Dorothea Williams ',
   'Libba Gardner',
   'Moonwind',
   'The Counselors',
   'Terry

### Assets scrapping

In [12]:
assets_section = 4
movies = get_pixar_movie_list(request_url)
assets_dict = get_movies_dict(movies, assets_section)
assets_dict

[{'Elemental': ['Building The World',
   'Firetown',
   'The Water District',
   'Cyclone Stadium',
   'Graphics']},
 {'Lightyear': ['“Warm CGI”',
   "T'kani Prime",
   'Turnip',
   'Star Command',
   'The Ships']},
 {'Turning Red': ['“Chunky Cute”',
   'Toronto',
   'Chinatown & the Lee Family Temple',
   'Home & Bedroom',
   'Middle School',
   'The Concert']},
 {'Luca': ['Italy, Summertime and Sea Monsters',
   'Portorosso',
   'Underwater',
   'Island']},
 {'Soul': ['Building The Performances',
   'New York City',
   'Barbershop',
   'The Half Note',
   'The Great Before',
   'The Astral Plane',
   'Personality Pavilions',
   'The Hall of Everything']},
 {'Onward': ['Familiar Fantasy', 'Trust Bridge', 'Homes', 'Suburbia']},
 {'Toy Story 4': ['A Toy’s World View', 'The Antique Store', 'Carnival']},
 {'Incredibles 2': ['Graphic Design',
   'The Parr Home',
   'Screenslaver’s Lair',
   'The Elasticycle',
   'DevTech',
   'Hydroliner',
   'Municiberg']},
 {'Coco': ['Research',
   'The 

## Export to csv

In [13]:
def build_df_structure(list_):
    movie_names = []
    list_of_lists = []
    for i in list_:
        for key, value in i.items():
            movie_names.append(key)
            list_of_lists.append(value)
    return [movie_names, list_of_lists]

### Characters

In [14]:
list_ = build_df_structure(characters_dict)
df_structure = {
    'film': list_[0],
    'characters': list_[1]
}
characters_df = pd.DataFrame(df_structure)
# characters_df.to_csv('../data/characters.csv')
characters_df.sample(3)

Unnamed: 0,film,characters
18,WALL-E,"[Pantomime, Live Action, WALL•E, EVE, M-O , Au..."
20,Cars,"[Cars as Characters, Lightning McQueen, Mater,..."
12,Inside Out,"[The Many Particles of Joy, Joy, Sadness, Ange..."


### Assets

In [15]:
list_ = build_df_structure(assets_dict)
df_structure = {
    'film': list_[0],
    'assets': list_[1]
}
assets_df = pd.DataFrame(df_structure)
# assets_df.to_csv('../data/assets.csv')
assets_df.sample(3)

Unnamed: 0,film,assets
23,"Monsters, Inc.","[The Door, The Apartment, Boo's Room, The Fact..."
10,Finding Dory,"[Water and Light, Great Barrier Reef, The Mari..."
19,Ratatouille,"[A World of Extremes, Old School, Gusteau’s, L..."


## Merge DataFrames

In [16]:
merged_df = pd.merge(characters_df, assets_df, on='film', how='inner')
merged_df.sample(3)

Unnamed: 0,film,characters,assets
21,The Incredibles,"[The Superpowers, Mr. Incredible, Elastigirl, ...","[The Future, The Sound Of The Incredibles, New..."
24,Toy Story 2,"[Woody, Buzz Lightyear, Jessie, Bullseye, Mrs....","[Al's Apartment, Al's Toy Barn, Woody's Roundu..."
16,Toy Story 3,"[Reintroductions, Woody, Buzz Lightyear, Lotso...","[Western Opening, The Landfill, Andy's Room, B..."


## Clean Merged DataFrame

In [17]:
def clean_string_list(list_):
    str_ = ", ".join(list_)
    return str_.strip()

merged_df['characters'] = merged_df['characters'].apply(clean_string_list)
merged_df['assets'] = merged_df['assets'].apply(clean_string_list)


In [18]:
merged_df.sample(3)

Unnamed: 0,film,characters,assets
14,Brave,"The Story of Brave, Merida , Queen Elinor, Kin...","The Stories of Scotland, Castle DunBroch, The ..."
3,Luca,"Just Add Water, Luca Paguro, Alberto Scorfano,...","Italy, Summertime and Sea Monsters, Portorosso..."
15,Cars 2,"What Would Mater Do?, Mater, Lightning McQueen...","The International World of Cars 2, (Not) Lost ..."


In [19]:
merged_df.to_csv('../data/characters_assets.csv', index=False)