In [4]:
# !pip install gazpacho
from gazpacho import get, Soup
import pandas as pd

In [6]:
# Helper functions that parse data scraped from the web

def parse(movie):
    """Extract the chart fields from one row of the lifetime-gross table.

    Parameters
    ----------
    movie
        An element exposing ``find(tag, attrs)``; in this notebook it is one
        ``<tr>`` taken from the chart page. The row is expected to contain
        ``td`` cells with the classes ``rank``, ``title``, ``money`` (x3),
        ``percent`` (x2) and ``year``.

    Returns
    -------
    tuple of str
        ``(rank, title, world_wide_gross, domestic_gross, domestic_per,
        foreign_gross, foreign_per, year)`` as raw, unconverted cell text.

    Raises
    ------
    AttributeError, TypeError, IndexError
        If an expected cell is missing or there are fewer ``money`` /
        ``percent`` cells than indexed below.
    """
    rank = movie.find("td", {"class" : "rank"}).text
    title = movie.find("td", {"class" : "title"}).text

    # Search the row once per repeated class instead of once per field.
    # NOTE(review): indexing assumes find() returns a list when several cells
    # share a class -- confirm against the gazpacho version in use.
    money_cells = movie.find("td", {"class" : "money"})
    percent_cells = movie.find("td", {"class" : "percent"})

    # Cell order follows the table columns: worldwide, domestic, foreign.
    world_wide_gross = money_cells[0].text
    domestic_gross = money_cells[1].text
    foreign_gross = money_cells[2].text

    domestic_per = percent_cells[0].text
    foreign_per = percent_cells[1].text

    year = movie.find("td", {"class" : "year"}).text

    return rank, title, world_wide_gross, domestic_gross, domestic_per, foreign_gross, foreign_per, year


def parse_link_left(movie_left):
  """Read the international share and gross from a movie's performance summary.

  ``movie_left`` must expose ``find(tag, attrs)``. For each of the span
  classes ``percent`` and ``money`` the second match (index 1) is taken.

  Returns ``[international_percent, international_gross]`` as raw text.
  """
  extracted = []
  for css_class in ("percent", "money"):
    matches = movie_left.find("span", {"class" : css_class})
    # Index 1 is the entry the caller labels "international".
    extracted.append(matches[1].text)

  return extracted


def parse_link_right(movie_right):
  """Read the distributor and opening figure from a movie's summary values.

  ``movie_right`` must expose ``find(tag)``. All ``span`` elements are
  collected with a single search; the ones at index 1 and index 3 are used.

  Returns ``[domestic_distributor, domestic_opening]`` as raw text.

  Raises ``IndexError``/``TypeError`` when fewer than four spans are found.
  """
  # One search is enough; the original repeated the identical lookup per field.
  spans = movie_right.find("span")

  # NOTE(review): presumably spans alternate label/value, so 1 and 3 are the
  # values for the first two summary entries -- verify against the page markup.
  domestic_distributor = spans[1].text
  domestic_opening = spans[3].text

  return [domestic_distributor, domestic_opening]


In [8]:
urls = ['https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?area=XWW']
columns_main = []
rows_main = []

columns_sub = ['international_percent', 'international_gross', 'domestic_distributor', 'domestic_opening']
rows_sub = []


# Scrape every chart page listed in `urls`, then follow each movie's link to
# its detail page. The lists are reset above, so re-running this cell starts
# from a clean state.

for url in urls:

  html = get(url)
  soup = Soup(html)

  movies = soup.find("tr")
  header = movies.pop(0)  # the first <tr> is the header row, not a movie
  # movies = movies[:100]

  # Column names are derived from the header only once, from the first page.
  if not columns_main:
    column_name = header.find('span')

    for col in column_name:
      new_col = col.text.lower().replace(" ", "_").replace("%", "percent")
      columns_main.append(new_col)


  for movie in movies:
    main_row = parse(movie)

    # NOTE(review): {"href": "title"} presumably relies on gazpacho matching
    # attribute values partially (href *contains* "title") -- confirm.
    movie_url = movie.find('a', {"href" : "title"}).attrs["href"]
    movie_html = get("https://www.boxofficemojo.com" + movie_url)
    # Use a separate name so the chart page's `soup` is not overwritten.
    movie_soup = Soup(movie_html)

    movie_left = movie_soup.find("div", {"class" : "a-section a-spacing-none mojo-performance-summary-table"})
    movie_right = movie_soup.find("div", {"class" : "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile"})


    row = parse_link_left(movie_left) + parse_link_right(movie_right)

    # Append both halves only after the detail page parsed successfully, so
    # rows_main and rows_sub always have the same length even if a request or
    # parse step raises (or the cell is interrupted) part-way through.
    rows_main.append(main_row)
    rows_sub.append(row)


In [10]:
# Create DataFrame

df_main = pd.DataFrame(rows_main, columns=columns_main)
df_sub = pd.DataFrame(rows_sub, columns=columns_sub)

# Both frames carry the default positional index, so axis=1 pairs each chart
# row with the detail row scraped for the same movie.
result = pd.concat([df_main, df_sub], axis=1)

result.index.name = 'index'

# Not the last expression of the cell, so this is computed but not displayed.
result.isnull().sum()

# NOTE(review): the row at position 95 is dropped without a stated reason
# (presumably a record that could not be cleaned) -- confirm it is still the
# right row whenever the chart is re-scraped. Raises IndexError if fewer than
# 96 rows were collected.
result.drop(result.index[95], inplace=True)

col_to_num = ['worldwide_lifetime_gross', 'domestic_lifetime_gross', 'foreign_lifetime_gross', 'domestic_opening', 'international_gross']
col_to_float = ['domestic_percent', 'foreign_percent', 'international_percent']

for col in col_to_num:
  # Whole-cell match: a lone '-' marks a missing figure.
  result[col] = result[col].replace('-', '0')
  # Strip '$', ',' and any other non-digit. Use an explicit 64-bit type:
  # plain `int` can be 32-bit on some platforms and grosses exceed 2**31.
  result[col] = result[col].replace(r'\D+', '', regex=True).astype('int64')

for col in col_to_float:
  result[col] = result[col].replace('-', '0')
  # regex=True makes this a substring substitution. Without it, replace()
  # only matches cells that are exactly '<', so a value like '<0.1%' would
  # pass through unchanged and break the float conversion below.
  # '<0.1%' therefore becomes '00.1%' -> 0.1 (the upper bound is kept).
  result[col] = result[col].replace('<', '0', regex=True)
  result[col] = result[col].replace(r'%', '', regex=True).astype(float)

result['year'] = result['year'].astype(int)
result['rank'] = result['rank'].astype(int)


#result.dtypes
result.to_csv('movies.csv')