### Plan of action : 
1. Scrape all Disney movies from 1937 to 2023 using Beautiful Soup and Requests 



2. Collect relevant movie attributes (e.g. title, cast, director, genre) and use an API like TMDb to obtain IMDb scores and audience ratings.




3. Clean and preprocess the data by removing duplicates, filling in missing values, standardizing data types, and converting text data into numerical or categorical data as needed.




4. Analyze the data using Matplotlib and Seaborn to find patterns such as box office trends, cast member success rates, and rating correlations.

### Importing libraries 

In [1255]:
#web scraping tools
from bs4 import BeautifulSoup as bs 
from selenium import webdriver
import requests 

# Data manipulation and analysis
import pandas as pd 

#  Data visualization
import seaborn as sns 
import matplotlib.pyplot as plt 

# Data storing
import pickle 
import json

# String manipulation 
import re 

# To check progress
from tqdm import tqdm

# for encoding dic as url
import urllib 


### First step before starting project

In [12]:
#website link : 
website = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films#Released'

# requesting data from website
response = requests.get(website)

# creating soup object to parse the HTML content of a web page
soup = bs(response.text,'lxml')

---
## Task 1 :   Get the list of all disney movies 

In [83]:
# Code to get total list of movies till December 9, 2022 

all_table = soup.find_all('table', class_ = 'wikitable sortable')[:-2]

lst = []
for eachtable in all_table :
    
    movie_list = eachtable.find_all('i')
    
    for eachmovie in movie_list : 
        lst.append(eachmovie.get_text())

print(f"total number of movies : {len(lst)} ")
print()
print('first 5 movies : ')
print(lst[:5])


total number of movies : 510 

first 5 movies : 
['Snow White and the Seven Dwarfs', 'Pinocchio', 'Fantasia', 'The Reluctant Dragon', 'Dumbo']


--- 

## Task 2 : Grabbing info box for all movies : 

#### Function to extract each movie details

In [612]:
# function to extract content
def get_header_details(details) : 

    # if there is list 
    if details.find('ul') : 
        return [d.get_text(strip = True) for d  in details.find_all('li')]
    
    # if there is a tag
    elif details.find('a') :
        return [d.get_text() for d in details.find_all('a')]
    
    else :

        return details.get_text(strip = True)
    

    
# removing sup  and span tag   
def clear_tags(table) :
    for tag in table.find_all(['sup','span']):

        tag.decompose()

        
# Extracting details of each movie
def  get_infobox(movie_soup) : 
    
    # Extracting main table 
    table = movie_soup.find('table', attrs={'class', 'infobox vevent'})

    # save each movie detail
    movie_dict = {} 
    
    clear_tags(table)

    for i,each_row  in enumerate(table.find_all('tr')) :     

        if i == 0  : 
            col_headers.append('title')
            movie_dict['title'] = each_row.get_text()

        # for image 
        elif i == 1 :
            continue


        else : 
            # if there is 'th' tag
            if each_row.find('th') :



                if 'Directed' in each_row.find('th').get_text()   : 
                    movie_dict['Director'] =  get_header_details(each_row.find('td'))



                elif 'Written' in each_row.find('th').get_text()   : 
                    movie_dict['Writers'] =  get_header_details(each_row.find('td'))

                elif 'Produced' in each_row.find('th').get_text()   : 

                    movie_dict['Producers'] =  get_header_details(each_row.find('td'))

                elif 'Starring' in each_row.find('th').get_text()   : 

                    movie_dict['Starring'] =  get_header_details(each_row.find('td'))

                elif 'Production' in each_row.find('th').get_text()   : 

                    movie_dict['Production company'] =  get_header_details(each_row.find('td'))

                elif 'Release' in each_row.find('th').get_text()   : 

                    movie_dict['Release_date'] =  get_header_details(each_row.find('td'))

                elif 'Running' in each_row.find('th').get_text()   : 

                    movie_dict['Running_time'] =  get_header_details(each_row.find('td'))

                elif 'Country' in each_row.find('th').get_text()   : 

                    movie_dict['Country'] =  get_header_details(each_row.find('td'))

                elif 'language' in each_row.find('th').get_text()   : 

                    movie_dict['language'] =  get_header_details(each_row.find('td'))

                elif 'Budget' in each_row.find('th').get_text()   : 

                    movie_dict['Budget'] =  get_header_details(each_row.find('td'))

                elif 'Box' in each_row.find('th').get_text()   : 

                    movie_dict['Box_office'] =  get_header_details(each_row.find('td'))

    return movie_dict
    
    
    

#### Code to get all movie links 

In [None]:
#------------------------------------------------------------------------------------------

#website link : 
website = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films#Released'

# requesting data from website
response = requests.get(website)

# creating soup object to parse the HTML content of a web page
soup = bs(response.text,'lxml')

#------------------------------------------------------------------------------------------

main_web_link = 'https://en.wikipedia.org'

movie_links =  []
all_movies_list = []

all_tables = soup.select('table.wikitable.sortable i a')

# Store movie links in list 
for eachmovie in all_tables : 
    
    # we only need movies which released 
    if eachmovie['href'] == '/wiki/Chang_Can_Dunk' : 
        break
    
    half_path = eachmovie['href']
    full_path = main_web_link + half_path
    movie_links.append(full_path)

    
    
# Loop through each movie and get details
for eachmovie in tqdm(movie_links):

    # parsing html content from the page
    response = requests.get(eachmovie)
    movie_soup = bs(response.text,'lxml')

    try  : 

        #function to get info box of each movie ( all the details)
        eachmovie_dict = get_infobox(movie_soup)

        all_movies_list.append(eachmovie_dict)

    except Exception as e : 
        print(eachmovie)
        print(e)
        


#### List contaning all movies list which need to be cleaned : 

In [None]:
all_movies_list

#### Saving in pandas data frame :

In [620]:
disney_movies_unclean_df = pd.DataFrame(all_movies_list)

disney_movies_unclean_df.head(2)

Unnamed: 0,title,Director,Writers,Producers,Starring,Production company,Release_date,Running_time,Country,Budget,Box_office
0,Snow White and the Seven Dwarfs,"[David Hand, William Cottrell, Wilfred Jackson, Larry Morey, Perce Pearce, Ben Sharpsteen]","[Ted Sears, Richard Creedon, Otto Englander, Dick Rickard, Earl Hurd, Merrill De Maris, Dorothy Ann Blank, Webb Smith]",[Walt Disney],"[Adriana Caselotti, Lucille La Verne, Harry Stockwell, Roy Atwell, Pinto Colvig, Otis Harlan, Scotty Mattraw, Billy Gilbert, Eddie Collins, Moroni Olsen, Stuart Buchanan]",[Walt Disney Productions],"[December 21, 1937(Carthay Circle Theatre), February 4, 1938(United States)]",83 minutes,United States,$1.49 million,$418 million
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts, Norman Ferguson, Jack Kinney, Wilfred Jackson, T. Hee]",,[Walt Disney],"[Cliff Edwards, Dickie Jones, Christian Rub, Walter Catlett, Charles Judels, Evelyn Venable, Frankie Darro]",[Walt Disney Productions],"[February 7, 1940(Center Theatre), February 23, 1940(United States)]",88 minutes,United States,$2.6 million,$164 million


> This is our unclean Dataset of disney movies

#### Overview of disney movies data set (unclean) : 

1. No of columns 
2. No of rows 

In [644]:
print(f"No of columns : {disney_movies_unclean_df.shape[0]}")
print(f"No of rows : {disney_movies_unclean_df.shape[1]} \n")

print('*' * 50)

print('Names of columns : ')

for col in disney_movies_unclean_df.columns : 
    print(col)

print('*' * 50)

No of columns : 509
No of rows : 11 

**************************************************
Names of columns : 
title
Director
Writers
Producers
Starring
Production company
Release_date
Running_time
Country
Budget
Box_office
**************************************************


#### Saving unclean dataset in .csv file

In [None]:
disney_movies_unclean_df.to_csv('disney_movies_unclean.csv',index= 'False')

---
# Task 3 : Data cleaning (Most imp) 
---

* Columns __[Release_date,Running_time,Budget,Box_office]__ needs to be cleaned for data analysis
* We need to convert columns  in to its right format 

In [674]:
disney_movies_unclean_df.head(1)


Unnamed: 0,title,Director,Writers,Producers,Starring,Production company,Release_date,Running_time,Country,Budget,Box_office
0,Snow White and the Seven Dwarfs,"[David Hand, William Cottrell, Wilfred Jackson, Larry Morey, Perce Pearce, Ben Sharpsteen]","[Ted Sears, Richard Creedon, Otto Englander, Dick Rickard, Earl Hurd, Merrill De Maris, Dorothy Ann Blank, Webb Smith]",[Walt Disney],"[Adriana Caselotti, Lucille La Verne, Harry Stockwell, Roy Atwell, Pinto Colvig, Otis Harlan, Scotty Mattraw, Billy Gilbert, Eddie Collins, Moroni Olsen, Stuart Buchanan]",[Walt Disney Productions],"[December 21, 1937(Carthay Circle Theatre), February 4, 1938(United States)]",83 minutes,United States,$1.49 million,$418 million


### 3.1 Release Date :  Data cleaning and formating 

* Converting date in to datetime object 
* Removing text part of the date 
* formating dates 

In [754]:
# first 5 rows of release date column : 

disney_movies_unclean_df['Release_date'].head(5).to_frame()

Unnamed: 0,Release_date
0,"[December 21, 1937(Carthay Circle Theatre), February 4, 1938(United States)]"
1,"[February 7, 1940(Center Theatre), February 23, 1940(United States)]"
2,"[November 13, 1940]"
3,"[June 27, 1941]"
4,"[October 23, 1941(New York City), October 31, 1941(U.S.)]"


In [737]:
# datetime library 
from datetime import datetime
import numpy as np

#filling missing values with 'N/A'

disney_movies_unclean_df['Release_date'].fillna('N/A',inplace = True)

#function to clean date  
def clean_date(dt) : 
    return dt.split('(')[0].replace('\xa0','')


# Stored unclean date in to list 
unclean = list(disney_movies_unclean_df['Release_date'])


string_dates = []

for dt in unclean : 
    
    if isinstance(dt,list) : 
        dt = dt[0]

    date_str = clean_date(dt)
    
    date_str = date_str.replace(' ','')
    
    string_dates.append(date_str.replace(' ',''))


In [738]:
# first 5 rows of string_dates 
string_dates[:5]

['December21,1937',
 'February7,1940',
 'November13,1940',
 'June27,1941',
 'October23,1941']

####  These are few different formats of date we need to handle : 
 * December21,1937
 * 26October1953
 * 13March1952
 * 1948–1960
 * 2009
 * September1988
 * WinniethePoohandTiggerToo (Needs to be replace with 'N/A')

In [792]:
date_formats = ["%B%d,%Y" , "%d%B%Y" , "%Y", "%B%Y"]
clean_dates = []
for dt in string_dates : 
    
    for fr in date_formats : 
        try :

            if datetime.strptime(dt,fr).date() : 
                clean_dates.append(datetime.strptime(dt,fr).date())
                break
        
        except : 
            pass
        
    else : clean_dates.append('N/A')
    
disney_movies_unclean_df['Cleaned_release_date'] = clean_dates
disney_movies_unclean_df[['Release_date','Cleaned_release_date']].head(5)

Unnamed: 0,Release_date,Cleaned_release_date
0,"[December 21, 1937(Carthay Circle Theatre), February 4, 1938(United States)]",1937-12-21
1,"[February 7, 1940(Center Theatre), February 23, 1940(United States)]",1940-02-07
2,"[November 13, 1940]",1940-11-13
3,"[June 27, 1941]",1941-06-27
4,"[October 23, 1941(New York City), October 31, 1941(U.S.)]",1941-10-23


>  We have succesfully clean and formatted released dates !!!

*** 
### 3.2 Running time : Data cleaning and formating

* Converting running time in to integer 
 * Some Edge case : 
   * 84\xa0minutes
   * ['Los Angeles', 'New York City', "Director's cut"] -- Some string values in column


In [848]:
disney_movies_unclean_df['Running_time'].head().to_frame()

Unnamed: 0,Running_time
0,83 minutes
1,88 minutes
2,126 minutes
3,74 minutes
4,64 minutes


In [849]:
list(disney_movies_unclean_df['Running_time'].str.split(' ').str[0])

cleaned_time = []
def running_time_cleaning(running_time):

    #handling edge case 
    if isinstance(running_time,list) : 
        return 'N/A'

    elif  running_time == '84\xa0minutes' : 
        return int(running_time[:2])

    elif pd.isna(running_time) : 
        return 'N/A'

    else : 
        time = running_time.split(' ')[0]
        return int(time)
        
    


# Creating new column 
disney_movies_unclean_df['Cleaned Running_time'] = disney_movies_unclean_df['Running_time'].apply(running_time_cleaning)

In [850]:
disney_movies_unclean_df[['Cleaned Running_time','Running_time']].head()

Unnamed: 0,Cleaned Running_time,Running_time
0,83,83 minutes
1,88,88 minutes
2,126,126 minutes
3,74,74 minutes
4,64,64 minutes


> We have succesfully clean and formatted Running time !!!

### 3.3 Budget and box office : Data cleaning and formating : 
   
* We need to structure both the columns in to standarize form for quick analysis
* As both column will play imp role in analysis
* We will use regular expression for complex string matching


In [1253]:
disney_movies_unclean_df[['Budget', 'Box_office']].head()

Unnamed: 0,Budget,Box_office
0,$1.49 million,$418 million
1,$2.6 million,$164 million
2,$2.28 million,$76.4–$83.3 million (United States and Canada)
3,"$600,000","$960,000 (worldwide rentals)"
4,"$950,000",">$1.3 million (est. United States/Canada rentals, 1941)"


#### Steps for cleaning and formatting budget and box office column  :

value in (million, billion, thousand)
##### Different formatting we need to handle
*   $ 418 million -  numeric value + units

*   $164\xa0million - garbage value

*   $ 1 million  -  

*   $1.3 million (est. United States/Canada rentals, 1941) - float value + unit

*    $ 799,000

*  ['$ 2.4 million (1951, domestic)', '$ 3.5 million (1974, domestic)']  -- list

*  ['rentals'],['YY] -erroes

* ['₽2.196 billion', '$27 million'] --extract value + million from list


*  est., 86.85 crore, -- value in cr

*  $ 175million -join string

*   175–200 million dash problem

*  $2,150,000 (US) -- single string with text

*  $2,550,000 (US/ Canada)

*  $462million

*  64900000.00000001    -- rounding off

*  est.(US$ 9.9 million) 

*  $1.025billion 
*  est.- ( Some text)

In [1247]:
#pattern for value & unit : 
value_pattern = r'\$[\d.,]+'
unit_pattern = r'million|billion|thousand|crore'


#dict for currency_conversion : 
currency_dict ={'thousand': 1000,'million':1000000, 'billion':100000000,'crore' : 1000000,'single_string' :1}


#function for currency conversion : 
def currency_conversion(currency_value,unit) : 
    
    currency_value = re.sub(r'[\$,]','',currency_value)
    clean_currency = float(currency_value) * currency_dict.get(unit)
    return round(clean_currency,2)


def currency_cleaning(currency_string) : 
    
    if isinstance(currency_string,list) : 
        #print(currency_string)
        
        for li in currency_string :
            if re.search(value_pattern,li) and re.search(unit_pattern,li) :
                currency_value = re.search(value_pattern,li).group()
                unit = re.search(unit_pattern,li).group()
                
                clean_value = currency_conversion(currency_value,unit)
                return clean_value
                
                break
        else : 
            return 'N/A'
        
    elif pd.isna(currency_string) : 
        return 'N/A'

    
    #handling single string
    else : 
    
    
       # for single string
        try :
            #handling garbage value
            currency_string = currency_string.replace('\xa0','')
            
            currency_value = re.search(value_pattern,currency_string).group()

            if re.search(unit_pattern,currency_string) : 
                unit = re.search(unit_pattern,currency_string).group()

            else : 
                unit = 'single_string'    
            
            #function to get value in (number $)
            clean_value = currency_conversion(currency_value,unit)

            return clean_value

        except Exception as e : 
            return e
        
# Adding new column in dataset : 
disney_movies_unclean_df['Cleaned Budget'] = disney_movies_unclean_df['Budget'].apply(currency_cleaning)
disney_movies_unclean_df['Cleaned Box office'] = disney_movies_unclean_df['Box_office'].apply(currency_cleaning)

In [1250]:
disney_movies_unclean_df[['Box_office','Cleaned Box office','Budget','Cleaned Budget']]

Unnamed: 0,Box_office,Cleaned Box office,Budget,Cleaned Budget
0,$418 million,418000000.0,$1.49 million,1490000.0
1,$164 million,164000000.0,$2.6 million,2600000.0
2,$76.4–$83.3 million (United States and Canada),76400000.0,$2.28 million,2280000.0
3,"$960,000 (worldwide rentals)",960000.0,"$600,000",600000.0
4,">$1.3 million (est. United States/Canada rentals, 1941)",1300000.0,"$950,000",950000.0
...,...,...,...,...
504,,,$40 million,40000000.0
505,,,,
506,$73.6 million,73600000.0,$135–180 million,135000000.0
507,,,,


> We have succesfully cleaned and formatted Box office and budget values !!!

# Task 4 : Fetching IMDB score and key metrics using API's

* The OMDB API is a tool that allows developers to get information about movies in an automated way. 
* This can be useful for building websites or apps that display information about movies, like their titles, release dates, and plots.



### Steps for fetching movie data from OMDB  :

1. Create Account on https://www.omdbapi.com/

2. Generate API key : https://www.omdbapi.com/apikey.aspx

3. Send all data requests to : http://www.omdbapi.com/?apikey=[yourkey]&



#### To fetch details using ID or title

***
        <-- +----------------+-----------+------------------------+---------------+----------------------------------------+
        | By ID or Title |           |                        |               |                                        |
        +----------------+-----------+------------------------+---------------+----------------------------------------+
        | Parameter      | Required  | Valid Options          | Default Value | Description                            |
        | i              | Optional* |                        | <empty>       | A valid IMDb ID (e.g. tt1285016)       |
        | t              | Optional* |                        | <empty>       | Movie title to search for.             |
        | type           | No        | movie, series, episode | <empty>       | Type of result to return.              |
        | y              | No        |                        | <empty>       | Year of release.                       |
        | plot           | No        | short, full            | short         | Return short or full plot.             |
        | r              | No        | json, xml              | json          | The data type to return.               |
        | callback       | No        |                        | <empty>       | JSONP callback name.                   |
        | v              | No        |                        | 1             | API version (reserved for future use). |
        +----------------+-----------+------------------------+---------------+----------------------------------------+ -->
*** 
    
####  There are three options to make request:

- Search(s=): Retrieves you all possible options.
- Title(t=): A movie title.
- ID(i=): a valid IMDB ID (e.g. tt1234567).

There are also more options we could include in our request like the data type to return (JSON / XML), the type of content (movie, series, or episode), year of the movie, etc.

Things we  will fetch through APi and attached to our dataset  : 

1. Year
2. Genre
4. imdb rating 
5. imdb votes
6. boxoffice 
7. Poster


In [1279]:
import warnings

# Ignore warning messages
warnings.filterwarnings('ignore')

In [None]:
# base url 
url = 'http://www.omdbapi.com/?'

# Creating new columns from api data
disney_movies_unclean_df['Year'] = 0
disney_movies_unclean_df['Genre'] = 0
disney_movies_unclean_df['imdbRating'] = 0
disney_movies_unclean_df['imdbVotes'] = 0
disney_movies_unclean_df['Poster'] = 0
disney_movies_unclean_df['Boxoffice_omdb'] = 0

# Function to make short url : 

def create_short_url(img_url):
    # initialize the pyshorteners client
    s = pyshorteners.Shortener()

    # shorten the image URL
    short_url = s.tinyurl.short(img_url)

    return short_url


# function to fetch movie detail from omdb website 
def get_movie_details(title) :
    parameters = {'apikey' : '6da02554', 't' : title}
    response = requests.get(url,params= parameters)
    movie_data = response.json()
    #print(movie_data)
    return movie_data



for i,title in tqdm(enumerate(list(disney_movies_unclean_df['title']))) :

    try : 
        each_movie_data = get_movie_details(title)

        # Get movie details 
        disney_movies_unclean_df['Year'].iloc[i] = each_movie_data.get('Year','N/A')
        disney_movies_unclean_df['Genre'].iloc[i] = each_movie_data.get('Genre','N/A')
        disney_movies_unclean_df['imdbRating'].iloc[i] = each_movie_data.get('imdbRating','N/A')
        disney_movies_unclean_df['imdbVotes'].iloc[i] = each_movie_data.get('imdbVotes','N/A')
        disney_movies_unclean_df['Poster'].iloc[i] = create_short_url(each_movie_data.get('Poster','N/A'))
        disney_movies_unclean_df['Boxoffice_omdb'].iloc[i] = each_movie_data.get('BoxOffice','N/A')
    
    except Exception as e : 
        print('Error ===>', eachmovie,'== ',e)


In [1341]:
# Printing new columns that are added : 

disney_movies_unclean_df[['Year','Genre','imdbRating','imdbVotes','Poster','Boxoffice_omdb']].iloc[:5]

Unnamed: 0,Year,Genre,imdbRating,imdbVotes,Poster,Boxoffice_omdb
0,1937,"Animation, Adventure, Family",7.6,203307,https://tinyurl.com/2ostnkqc,"$184,925,486"
1,1940,"Animation, Adventure, Comedy",7.5,150712,https://tinyurl.com/2njyxz8t,"$84,254,167"
2,1940,"Animation, Family, Fantasy",7.7,98591,https://tinyurl.com/2l6q69hd,"$76,408,097"
3,1941,"Animation, Comedy, Family",6.8,3505,https://tinyurl.com/2mq342nz,"$872,000"
4,1941,"Animation, Adventure, Drama",7.2,134418,https://tinyurl.com/2qq4d3tt,


#### 4.1 Data cleaning : imdbVotes Column :  Converting in to integer format

In [1346]:

def clean_votes(votes) : 
    if votes == 'N/A' : 
        return 'N/A'
    else  : return int(votes.replace(',','').strip())

disney_movies_unclean_df['imdbVotes']= disney_movies_unclean_df['imdbVotes'].apply(clean_votes)

#### 4.2 Data cleaning : box office Column : 

In [None]:
def clean_boxoffice_omdb(value) :
    print(re.sub(r'[\$,]+','',value))
    
disney_movies_unclean_df['Boxoffice_omdb in $'] = disney_movies_unclean_df['Boxoffice_omdb'].apply(clean_boxoffice_omdb)

--- 
# Task 5 : Preparing  Final Data Set for analysis

In [None]:
# Saving into clean data set : 
disney_movies_clean_df = disney_movies_unclean_df[['title',
                          'Cleaned_release_date',
                          'Year','Genre','Director','Starring','Production company',
                          'Country', 'Cleaned_release_date', 'Cleaned Running_time',
                          'Cleaned Budget','Boxoffice_omdb', 'Cleaned Box office', 'imdbRating',
                          'imdbVotes', 'Poster', 
                         ]]


# Renaming Columns : 
disney_movies_clean_df.rename({'Cleaned_release_date':'Release Date',
                               'Cleaned Running_time' :'Running_time',
                                'Cleaned Budget' : 'Budget',
                                 'Cleaned Box office' :'Boxoffice'
                               },axis = 1,inplace = True)

In [1404]:
# Changing dtype of the columns : 


# Coverting Dates in to Date time dtype 
disney_movies_clean_df['Release Date'] = pd.to_datetime(disney_movies_clean_df['Release Date'],errors = 'coerce')


# Converting in to float dtype : 
columns_list = ['Running_time',
                'Budget','Boxoffice_omdb','Boxoffice', 
                'imdbRating', 'imdbVotes']


for col in columns_list : 
    disney_movies_clean_df[col] = pd.to_numeric(disney_movies_clean_df[col],errors = 'coerce')


In [1412]:
# Printing first row of cleaned dataset : 
disney_movies_clean_df[:1]

Unnamed: 0,title,Release Date,Year,Genre,Director,Starring,Production company,Country,Running_time,Budget,Boxoffice_omdb,Boxoffice,imdbRating,imdbVotes,Poster
0,Snow White and the Seven Dwarfs,1937-12-21,1937,"Animation, Adventure, Family","[David Hand, William Cottrell, Wilfred Jackson, Larry Morey, Perce Pearce, Ben Sharpsteen]","[Adriana Caselotti, Lucille La Verne, Harry Stockwell, Roy Atwell, Pinto Colvig, Otis Harlan, Scotty Mattraw, Billy Gilbert, Eddie Collins, Moroni Olsen, Stuart Buchanan]",[Walt Disney Productions],United States,83.0,1490000.0,,418000000.0,7.6,203307.0,https://tinyurl.com/2ostnkqc


In [1436]:
# Basics info of the dataset : 
disney_movies_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509 entries, 0 to 508
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   title               509 non-null    object        
 1   Release Date        505 non-null    datetime64[ns]
 2   Year                509 non-null    object        
 3   Genre               509 non-null    object        
 4   Director            507 non-null    object        
 5   Starring            475 non-null    object        
 6   Production company  509 non-null    object        
 7   Country             444 non-null    object        
 8   Running_time        504 non-null    float64       
 9   Budget              300 non-null    float64       
 10  Boxoffice           380 non-null    float64       
 11  imdbRating          483 non-null    float64       
 12  imdbVotes           483 non-null    float64       
 13  Poster              509 non-null    object        

In [1430]:

def clean_boxoffice_omdb(value):
    
    if value == 0 :
        return 'N/A'
    
    elif re.sub(r'[\$,]+','',str(value)) :
        return re.sub(r'[\$,]+','',value)

    else : return 'N/A'
    

   
  
    
disney_movies_clean_df['Boxoffice_omdb_new']= disney_movies_unclean_df['Boxoffice_omdb'].apply(clean_boxoffice_omdb)



# Task 5 : Analytics Report : 