# Analysis Demonstration

### (things to talk about: how we ran the analysis, problems we ran into and how we solved them, how our functions could be used, things that surprised us, interesting conclusions, etc.)

## Import Functions

In [26]:
# Packages
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import math

# Our Functions
from scrape_data import scrape_imdb, scrape_rotten_tomatoes, get_html_text
from clean_data import clean_rotten_tomatoes, clean_imdb


## Use Our Functions to Scrape and Clean Data
Our functions allow you to scrape data from IMDb and Rotten Tomatoes lists of movies. The two lists we use for our data are https://editorial.rottentomatoes.com/guide/disney-100-essential-movies/ and https://www.imdb.com/list/ls089035876/?sort=release_date,desc&st_dt=&mode=detail&page=1 (the IMDb list continues for five pages of Disney movies). After scraping, we save the raw data to a CSV file, which we then pass to our cleaning functions. The raw data and the final cleaned datasets are both included in the data folder.

In [27]:
# Scrape from Rotten Tomatoes

# Source page for the Rotten Tomatoes data: the Disney essential-movies guide.
webpage = "https://editorial.rottentomatoes.com/guide/disney-100-essential-movies/"

# Run the project scraper on that page and keep the result in memory ...
rotten_tomatoes = scrape_rotten_tomatoes(webpage)

# ... then persist the untouched scrape so the cleaning step can start from disk.
rotten_tomatoes.to_csv(
    'data/rotten_tomatoes_raw.csv',
    index=False,
)


KeyboardInterrupt



In [None]:
# Show the Rotten Tomatoes DataFrame currently held in `rotten_tomatoes`
# (a bare last expression is rendered by Jupyter as the cell output).
rotten_tomatoes

Unnamed: 0,title,year,score,actors,director
0,\nThe Little Mermaid (2023) 67%\n,(2023),67%,"\nStarring: Halle Bailey, Jonah Hauer-King, Ja...",\nDirected By: Rob Marshall
1,\nTurning Red (2022) 95%\n,(2022),95%,"\nStarring: Rosalie Chiang, Sandra Oh, Ava Mor...",\nDirected By: Domee Shi
2,\nRaya and the Last Dragon (2021) 93%\n,(2021),93%,"\nStarring: Kelly Marie Tran, Awkwafina, Izaac...","\nDirected By: Don Hall, Carlos López Estrada"
3,\nEncanto (2021) 92%\n,(2021),92%,"\nStarring: Stephanie Beatriz, María Cecilia B...","\nDirected By: Jared Bush, Byron Howard"
4,\nOnward (2020) 88%\n,(2020),88%,"\nStarring: Tom Holland, Chris Pratt, Julia Lo...",\nDirected By: Dan Scanlon
...,...,...,...,...,...
95,\nBambi (1942) 91%\n,(1942),91%,"\nStarring: Hardie Albright, Stan Alexander, P...",\nDirected By: David Hand
96,\nDumbo (1941) 95%\n,(1941),95%,"\nStarring: Herman Bing, Billy Bletcher, Edwar...",\nDirected By: Ben Sharpsteen
97,\nFantasia (1940) 95%\n,(1940),95%,"\nStarring: Deems Taylor, Leopold Stokowski, W...","\nDirected By: James Algar, Samuel Armstrong, ..."
98,\nPinocchio (1940) 100%\n,(1940),100%,"\nStarring: Don Brodie, Walter Catlett, Franki...","\nDirected By: Ben Sharpsteen, Hamilton Luske"


In [None]:
# Scrape from IMDB

# The IMDb list is paginated; each URL below differs only in its trailing page number.
webpages = ["https://www.imdb.com/list/ls089035876/?sort=release_date,desc&st_dt=&mode=detail&page=1", 
            "https://www.imdb.com/list/ls089035876/?sort=release_date,desc&st_dt=&mode=detail&page=2",
            "https://www.imdb.com/list/ls089035876/?sort=release_date,desc&st_dt=&mode=detail&page=3", 
            "https://www.imdb.com/list/ls089035876/?sort=release_date,desc&st_dt=&mode=detail&page=4",
            "https://www.imdb.com/list/ls089035876/?sort=release_date,desc&st_dt=&mode=detail&page=5"]

# Scrape every page first, then concatenate once. This avoids re-copying the
# accumulated frame on each iteration and avoids seeding the concat with an
# empty DataFrame.
imdb = pd.concat([scrape_imdb(page_url) for page_url in webpages])

# Write into the same 'data/' folder as the Rotten Tomatoes files, so that the
# cleaning cell's pd.read_csv('data/imdb_raw.csv') finds this output.
imdb.to_csv('data/imdb_raw.csv', index = False)

In [None]:
# Clean Rotten Tomatoes Dataframe

# Reload the raw scrape from disk and pass it straight through the project's
# Rotten Tomatoes cleaning function.
rotten_tomatoes = clean_rotten_tomatoes(
    pd.read_csv('data/rotten_tomatoes_raw.csv')
)

# Save the cleaned table next to the raw one.
rotten_tomatoes.to_csv('data/rotten_tomatoes.csv', index=False)

In [29]:
# Clean IMDB Dataframe

imdb = pd.read_csv('data/imdb_raw.csv')

# Rows removed by hand before cleaning:
#   19  - has no reviews or data aside from name
#   170 - a DVD containing episodes from different shows and not a movie
# NOTE(review): these labels are row positions in the raw CSV as read above;
# if the list is re-scraped and its order changes they may point at different
# titles - verify before re-running.
imdb = imdb.drop([19, 170], axis='index')

imdb = clean_imdb(imdb)

# Write to the same 'data/' folder this cell reads from (and that the Rotten
# Tomatoes cells use), rather than a differently-rooted path.
imdb.to_csv('data/imdb.csv', index = False)