### Plan of action : 
1. Scrape all Disney movies from 1937 to 2023 using Beautiful Soup and Requests 



2. Collect relevant movie attributes (e.g. title, cast, director, genre) and use an API like TMDb to obtain IMDb scores and audience ratings.




3. Clean and preprocess the data by removing duplicates, filling in missing values, standardizing data types, and converting text data into numerical or categorical data as needed.




4. Analyze the data using Matplotlib and Seaborn to find patterns such as box office trends, cast member success rates, and rating correlations.

### Importing libraries 

In [375]:
#web scraping tools
from bs4 import BeautifulSoup as bs 
from selenium import webdriver
import requests 

# Data manipulation and analysis
import pandas as pd 

#  Data visualization
import seaborn as sns 
import matplotlib.pyplot as plt 

# Data storing
import pickle 
import json

# String manipulation 
import re 

#
from tqdm import tqdm


### First step before starting project

In [12]:
#website link : 
website = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films#Released'

# requesting data from website
response = requests.get(website)

# creating soup object to parse the HTML content of a web page
soup = bs(response.text,'lxml')

---
## Task 1 :   Get the list of all disney movies 

In [83]:
# Code to get total list of movies till December 9, 2022 

all_table = soup.find_all('table', class_ = 'wikitable sortable')[:-2]

lst = []
for eachtable in all_table :
    
    movie_list = eachtable.find_all('i')
    
    for eachmovie in movie_list : 
        lst.append(eachmovie.get_text())

print(f"total number of movies : {len(lst)} ")
print()
print('first 5 movies : ')
print(lst[:5])


total number of movies : 510 

first 5 movies : 
['Snow White and the Seven Dwarfs', 'Pinocchio', 'Fantasia', 'The Reluctant Dragon', 'Dumbo']


---
## Task 2 : Get accessed to first movie : 'Snow White and the Seven Dwarfs'

In [161]:
movie_link = 'https://en.wikipedia.org/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)'

response = requests.get(movie_link)

movie_soup = bs(response.text,'lxml')

#### We need to accessed to info box of the website which contains all the details about the movie : 

In [328]:
# Extracting main table 
table = movie_soup.find('table', attrs={'class', 'infobox vevent'})

# save all the columns in the list
col_headers = [] 

# save each movie detail
movie_dict = {} 
movie = []

# function to extract content
def get_header_details(details) : 
    
    # if there is list 
    if details.find('ul') : 
        return [d.get_text(strip = True) for d  in details.find_all('li')]
    else :

        return details.get_text(strip = True)


for i,each_row  in enumerate(table.find_all('tr')) :     
    
    if i == 0  : 
        col_headers.append('title')
        movie_dict['title'] = each_row.get_text()
    
    # for image 
    elif i == 1 :
        continue
#         col_headers.append('movie_poster')
#         movie_dict['movie_poster']= each_row.find('a').img['src']
        
    else : 
            
        col_headers.append(each_row.find('th').get_text(strip = True))
        movie_dict[col_headers[-1]] = get_header_details(each_row.find('td'))

movie.append(movie_dict)



In [335]:
pd.DataFrame(movie) #first movie row 

Unnamed: 0,title,Directed by,Written by,Based on,Produced by,Starring,Music by,Productioncompany,Distributed by,Release dates,Running time,Country,Language,Budget,Box office
0,Snow White and the Seven Dwarfs,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...",Snow Whiteby TheBrothers Grimm,Walt Disney,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Frank Churchill, Paul Smith, Leigh Harline]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937(1937-12-21)(Carthay Circle ...",83 minutes,United States,English,$1.49 million,$418 million


--- 

## Task 3 : Grabbing info box for all movies : 

#### Function to extract each movie details

In [612]:
# function to extract content
def get_header_details(details) : 

    # if there is list 
    if details.find('ul') : 
        return [d.get_text(strip = True) for d  in details.find_all('li')]
    
    # if there is a tag
    elif details.find('a') :
        return [d.get_text() for d in details.find_all('a')]
    
    else :

        return details.get_text(strip = True)
    

    
# removing sup  and span tag   
def clear_tags(table) :
    for tag in table.find_all(['sup','span']):

        tag.decompose()

        
# Extracting details of each movie
def  get_infobox(movie_soup) : 
    
    # Extracting main table 
    table = movie_soup.find('table', attrs={'class', 'infobox vevent'})

    # save each movie detail
    movie_dict = {} 
    
    clear_tags(table)

    for i,each_row  in enumerate(table.find_all('tr')) :     

        if i == 0  : 
            col_headers.append('title')
            movie_dict['title'] = each_row.get_text()

        # for image 
        elif i == 1 :
            continue


        else : 
            # if there is 'th' tag
            if each_row.find('th') :



                if 'Directed' in each_row.find('th').get_text()   : 
                    movie_dict['Director'] =  get_header_details(each_row.find('td'))



                elif 'Written' in each_row.find('th').get_text()   : 
                    movie_dict['Writers'] =  get_header_details(each_row.find('td'))

                elif 'Produced' in each_row.find('th').get_text()   : 

                    movie_dict['Producers'] =  get_header_details(each_row.find('td'))

                elif 'Starring' in each_row.find('th').get_text()   : 

                    movie_dict['Starring'] =  get_header_details(each_row.find('td'))

                elif 'Production' in each_row.find('th').get_text()   : 

                    movie_dict['Production company'] =  get_header_details(each_row.find('td'))

                elif 'Release' in each_row.find('th').get_text()   : 

                    movie_dict['Release_date'] =  get_header_details(each_row.find('td'))

                elif 'Running' in each_row.find('th').get_text()   : 

                    movie_dict['Running_time'] =  get_header_details(each_row.find('td'))

                elif 'Country' in each_row.find('th').get_text()   : 

                    movie_dict['Country'] =  get_header_details(each_row.find('td'))

                elif 'language' in each_row.find('th').get_text()   : 

                    movie_dict['language'] =  get_header_details(each_row.find('td'))

                elif 'Budget' in each_row.find('th').get_text()   : 

                    movie_dict['Budget'] =  get_header_details(each_row.find('td'))

                elif 'Box' in each_row.find('th').get_text()   : 

                    movie_dict['Box_office'] =  get_header_details(each_row.find('td'))

    return movie_dict
    
    
    

#### Code to get all movie links 

In [None]:
#------------------------------------------------------------------------------------------

#website link : 
website = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films#Released'

# requesting data from website
response = requests.get(website)

# creating soup object to parse the HTML content of a web page
soup = bs(response.text,'lxml')

#------------------------------------------------------------------------------------------

main_web_link = 'https://en.wikipedia.org'

movie_links =  []
all_movies_list = []

all_tables = soup.select('table.wikitable.sortable i a')

# Store movie links in list 
for eachmovie in all_tables : 
    
    # we only need movies which released 
    if eachmovie['href'] == '/wiki/Chang_Can_Dunk' : 
        break
    
    half_path = eachmovie['href']
    full_path = main_web_link + half_path
    movie_links.append(full_path)

    
    
# Loop through each movie and get details
for eachmovie in tqdm(movie_links):

    # parsing html content from the page
    response = requests.get(eachmovie)
    movie_soup = bs(response.text,'lxml')

    try  : 

        #function to get info box of each movie ( all the details)
        eachmovie_dict = get_infobox(movie_soup)

        all_movies_list.append(eachmovie_dict)

    except Exception as e : 
        print(eachmovie)
        print(e)
        


#### List contaning all movies list which need to be cleaned : 

In [None]:
all_movies_list

#### Saving in pandas data frame :

In [620]:
disney_movies_unclean_df = pd.DataFrame(all_movies_list)

disney_movies_unclean_df.head(2)

Unnamed: 0,title,Director,Writers,Producers,Starring,Production company,Release_date,Running_time,Country,Budget,Box_office
0,Snow White and the Seven Dwarfs,"[David Hand, William Cottrell, Wilfred Jackson, Larry Morey, Perce Pearce, Ben Sharpsteen]","[Ted Sears, Richard Creedon, Otto Englander, Dick Rickard, Earl Hurd, Merrill De Maris, Dorothy Ann Blank, Webb Smith]",[Walt Disney],"[Adriana Caselotti, Lucille La Verne, Harry Stockwell, Roy Atwell, Pinto Colvig, Otis Harlan, Scotty Mattraw, Billy Gilbert, Eddie Collins, Moroni Olsen, Stuart Buchanan]",[Walt Disney Productions],"[December 21, 1937(Carthay Circle Theatre), February 4, 1938(United States)]",83 minutes,United States,$1.49 million,$418 million
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts, Norman Ferguson, Jack Kinney, Wilfred Jackson, T. Hee]",,[Walt Disney],"[Cliff Edwards, Dickie Jones, Christian Rub, Walter Catlett, Charles Judels, Evelyn Venable, Frankie Darro]",[Walt Disney Productions],"[February 7, 1940(Center Theatre), February 23, 1940(United States)]",88 minutes,United States,$2.6 million,$164 million


#### Overview of disney movies data set (unclean) : 

1. No of columns 
2. No of rows 

In [644]:
print(f"No of columns : {disney_movies_unclean_df.shape[0]}")
print(f"No of rows : {disney_movies_unclean_df.shape[1]} \n")

print('*' * 50)

print('Names of columns : ')

for col in disney_movies_unclean_df.columns : 
    print(col)

print('*' * 50)

No of columns : 509
No of rows : 11 

**************************************************
Names of columns : 
title
Director
Writers
Producers
Starring
Production company
Release_date
Running_time
Country
Budget
Box_office
**************************************************


#### Saving unclean dataset in .csv file

In [None]:
disney_movies_unclean_df.to_csv('disney_movies_unclean.csv',index= 'False')

---
# Task 4 : Data cleaning (Most imp) 
---

* Columns __[Release_date,Running_time,Budget,Box_office]__ needs to be cleaned for data analysis
* We need to convert columns  in to its right format 

In [674]:
disney_movies_unclean_df.head(1)


Unnamed: 0,title,Director,Writers,Producers,Starring,Production company,Release_date,Running_time,Country,Budget,Box_office
0,Snow White and the Seven Dwarfs,"[David Hand, William Cottrell, Wilfred Jackson, Larry Morey, Perce Pearce, Ben Sharpsteen]","[Ted Sears, Richard Creedon, Otto Englander, Dick Rickard, Earl Hurd, Merrill De Maris, Dorothy Ann Blank, Webb Smith]",[Walt Disney],"[Adriana Caselotti, Lucille La Verne, Harry Stockwell, Roy Atwell, Pinto Colvig, Otis Harlan, Scotty Mattraw, Billy Gilbert, Eddie Collins, Moroni Olsen, Stuart Buchanan]",[Walt Disney Productions],"[December 21, 1937(Carthay Circle Theatre), February 4, 1938(United States)]",83 minutes,United States,$1.49 million,$418 million


### 4.1 Release Date :  Data cleaning and formating 

* Converting date in to datetime object 
* Removing text part of the date 
* formating dates 

In [754]:
# first 5 rows of release date column : 

disney_movies_unclean_df['Release_date'].head(5).to_frame()

Unnamed: 0,Release_date
0,"[December 21, 1937(Carthay Circle Theatre), February 4, 1938(United States)]"
1,"[February 7, 1940(Center Theatre), February 23, 1940(United States)]"
2,"[November 13, 1940]"
3,"[June 27, 1941]"
4,"[October 23, 1941(New York City), October 31, 1941(U.S.)]"


In [720]:
#filling missing values with 'N/A'

disney_movies_unclean_df['Release_date'].fillna('N/A',inplace = True)

In [737]:
# datetime library 
from datetime import datetime
import numpy as np


#function to clean date  
def clean_date(dt) : 
    return dt.split('(')[0].replace('\xa0','')


# Stored unclean date in to list 
unclean = list(disney_movies_unclean_df['Release_date'])


string_dates = []

for dt in unclean : 
    
    if isinstance(dt,list) : 
        dt = dt[0]

    date_str = clean_date(dt)
    
    date_str = date_str.replace(' ','')
    
    
#     string_dates.append(date_str.replace(' ',''))


In [738]:
# first 5 rows of string_dates 
string_dates[:5]

['December21,1937',
 'February7,1940',
 'November13,1940',
 'June27,1941',
 'October23,1941']

####  These are few different formats of date we need to handle : 
 * December21,1937
 * 26October1953
 * 13March1952
 * 1948–1960
 * 2009
 * September1988
 * WinniethePoohandTiggerToo (Needs to be replace with 'N/A')

In [765]:
datetime.strptime('1948',"%Y").date()

datetime.date(1948, 1, 1)

In [783]:
date_formats = ["%B%d,%Y" , "%d%B%Y" , "%Y", "%B%Y"]
clean_dates = []
for dt in string_dates : 
    
    for fr in date_formats : 
        try :

            if datetime.strptime(dt,fr).date() : 
                clean_dates.append(datetime.strptime(dt,fr).date())
                break
        
        except : 
            pass
        
    else : clean_dates.append('N/A')
    
    