<a href="https://colab.research.google.com/github/nimitsajal/Count-Down-Timer/blob/master/Movie_Data_Set_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Imports**


In [1]:
from bs4 import BeautifulSoup as bs
from datetime import datetime as dt
import requests
import re

## **TASK 1**
*Grab all the info from the side panel (infobox) of the webpage and store it in a dictionary - https://en.wikipedia.org/wiki/Toy_Story_3*

In [2]:
# Loading the Webpage

# The timeout stops the cell from blocking forever on a stalled connection, and
# raise_for_status() fails loudly instead of parsing an HTTP error page as if it
# were the article.
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3", timeout=30)
r.raise_for_status()
webpage = bs(r.content)
# print(webpage.prettify())

In [3]:
# Grabbing the Side Info Panel and storing them in a dictionary

movie_info = {}
panel = webpage.find("table", attrs={"class": "infobox vevent"}).find_all("tbody")[0].find_all("tr")
for index, tr in enumerate(panel):
  if index == 0:
    # The first row of the infobox is used as the title.
    movie_info["Title"] = tr.get_text(" ", strip=True)
    continue

  # A row is a key/value pair only when it has both a header cell and a data
  # cell. Rows missing either one are skipped here instead of raising
  # AttributeError on `None.get_text(...)`; this replaces the hard-coded skip
  # of row 1.
  header = tr.find("th")
  cell = tr.find("td")
  if header is None or cell is None:
    continue

  key = header.get_text(" ", strip=True)
  if tr.find("li"):
    # Rows that contain <li> items become a list with one string per item.
    value = [item.get_text(" ", strip=True).replace("\xa0", "") for item in tr.find_all("li")]
  else:
    value = cell.get_text(" ", strip=True).replace("\xa0", "")

  movie_info[key] = value
movie_info

{'Box office': '$1.067billion [1]',
 'Budget': '$200million [1]',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Country': 'United States',
 'Directed by': 'Lee Unkrich',
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Edited by': 'Ken Schretzmann',
 'Language': 'English',
 'Music by': 'Randy Newman',
 'Produced by': 'Darla K. Anderson',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Release date': ['June12,2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June18,2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Screenplay by': 'Michael Arndt',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Title': 'Toy Story 3'}

## **TASK 2**
*Grab the side-panel (infobox) info of every movie linked from the main Disney movies webpage and store each movie's info in a dictionary - https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films*

In [4]:
# Loading the Webpage

# The timeout stops the cell from blocking forever on a stalled connection, and
# raise_for_status() fails loudly instead of parsing an HTTP error page as if it
# were the film list.
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films", timeout=30)
r.raise_for_status()
webpage = bs(r.content)
# print(webpage.prettify())

In [5]:
# Grabbing the links to individual movie pages

baseLink = "https://en.wikipedia.org/"
movie_infos = []
failed_links = []  # (href, exception) for every page that could not be scraped

links = webpage.select("table.wikitable.sortable tbody tr td i a")

for index, link in enumerate(links):
  # Start every page from an empty record. Previously the dict was only reset
  # inside the try block, so a page that failed early re-appended the previous
  # movie's data (or, for the first link, whatever `movie_info` held before).
  movie_info = {}
  try:
    # Join with exactly one "/" so an href that already starts with "/" does
    # not produce "...org//wiki/...".
    actualLink = baseLink.rstrip("/") + "/" + link["href"].lstrip("/")

    # Loading the Webpage
    r = requests.get(actualLink, timeout=30)
    r.raise_for_status()
    moviePage = bs(r.content)

    # Grabbing the Side Info Panel and storing them in a dictionary
    panel = moviePage.find("table", attrs={"class": "infobox vevent"})
    if panel is None:
      raise ValueError("no 'infobox vevent' table on page")
    body = panel.find("tbody")
    rows = (body if body is not None else panel).select("tr")

    for row_index, tr in enumerate(rows):
      if row_index == 0:
        # The first row of the infobox is used as the title.
        movie_info["Title"] = tr.get_text(" ", strip=True)
        continue

      # Only rows with both a header cell and a data cell are key/value pairs;
      # anything else is skipped rather than aborting the page.
      header = tr.find("th")
      cell = tr.find("td")
      if header is None or cell is None:
        continue

      key = header.get_text(" ", strip=True)
      if tr.find("li"):
        value = [item.get_text(" ", strip=True).replace("\xa0", "") for item in tr.find_all("li")]
      else:
        value = cell.get_text(" ", strip=True).replace("\xa0", "")

      movie_info[key] = value

  except Exception as e:
    # Best effort: keep going, but remember what failed instead of hiding it.
    E = e  # last error, kept for interactive inspection
    failed_links.append((link.get("href"), e))

  # Rows collected before a failure are kept; pages that yielded nothing are
  # left out of the data set.
  if movie_info:
    movie_infos.append(movie_info)

for movies in movie_infos:
  print(movies.get("Release date"))

['May19,1937 ( 1937-05-19 )']
['December21,1937 ( 1937-12-21 ) ( Carthay Circle Theatre )']
['February7,1940 ( 1940-02-07 ) ( Center Theatre ) [1]', 'February23,1940 ( 1940-02-23 ) (United States) [2]']
['November13,1940 ( 1940-11-13 )']
['June27,1941 ( 1941-06-27 ) [1]']
['October23,1941 ( 1941-10-23 ) (New York City) [1]', 'October31,1941 ( 1941-10-31 ) (U.S.)']
['August9,1942 ( 1942-08-09 ) (World Premiere – London)', 'August13,1942 ( 1942-08-13 ) (Premiere – New York City)', 'August21,1942 ( 1942-08-21 ) (U.S.) [1]']
['August24,1942 ( 1942-08-24 ) (World Premiere – Rio de Janeiro)', 'February6,1943 ( 1943-02-06 ) (U.S. Premiere – Boston)', 'February19,1943 ( 1943-02-19 ) (U.S.) [1]']
['July17,1943 ( 1943-07-17 )']
['December21,1944 ( 1944-12-21 ) (Mexico City)', 'February3,1945 ( 1945-02-03 ) (US) [1]']
['April20,1946 ( 1946-04-20 ) (New York City premiere) [1]', 'August15,1946 ( 1946-08-15 ) (U.S.) [1]']
['November12,1946 ( 1946-11-12 ) (Premiere: Atlanta, Georgia) [1]', 'November

## **TASK 3**
*Clean-up Process*

*   Clean-up references like [1]
*   Convert running time to Integer
*   Convert Dates into DateTime Object
*   Convert Budget & Box Office Amounts to Integers
*   Split up Long Strings

In [6]:
# Clean-up helpers (date and money conversion), followed by a second pass
# over the individual movie pages that applies them

def removeParenthesis(date):
  """Return the part of ``date`` that comes before the first " (".

  Args:
    date: Text such as "June 18, 2010 (United States)".

  Returns:
    The text before the first " (" (the whole string when there is none),
    or None when ``date`` is not a string.
  """
  # An explicit type check replaces the former bare `except:`, which also
  # swallowed unrelated errors such as KeyboardInterrupt.
  if not isinstance(date, str):
    return None
  return date.split(" (")[0]

def convertDate(date):
  """Parse a scraped release-date value into a ``datetime``.

  Args:
    date: A date string, or a list of date strings (only the first entry is
      used). Text from the first " (" onwards is dropped before parsing.

  Returns:
    The parsed ``datetime``, or None when the input is missing, empty, not a
    string, or matches none of the known formats.
  """
  if isinstance(date, list):
    if not date:
      return None
    date = date[0]
  if not isinstance(date, str):
    return None

  # Drop suffixes like " (United States)"; no format below can match them.
  cleaned = removeParenthesis(date)
  if not cleaned:
    return None
  cleaned = cleaned.strip()

  formats = ["%B %d, %Y", "%B%d,%Y", "%d %B %Y", "%d %B, %Y", "%d%B%Y", "%d%B,%Y"]
  for date_format in formats:
    try:
      return dt.strptime(cleaned, date_format)
    except ValueError:
      # Not this format; try the next one. (The old `return None` sat inside
      # the loop, so only the first format was ever attempted.)
      continue
  return None

# One decimal number, optionally signed and optionally prefixed with "$".
_MONEY_NUMBER_RX = re.compile(
  r"[-+]? [$]? (?: (?: \d* \. \d+ ) | (?: \d+ \.? ) )(?: [Ee] [+-]? \d+ ) ?",
  re.VERBOSE,
)

# Unit words and their multipliers, checked in this order, case-insensitively.
_MONEY_UNITS = (("million", 1000000), ("billion", 1000000000), ("thousand", 1000))


def convertToMoney(temp):
  """Convert a scraped dollar amount such as "$1.067 billion" to a number.

  Args:
    temp: The field text, or a list of texts (only the first entry is used).

  Returns:
    * an int for amounts with a unit word ("$200million" -> 200000000) and
      for plain integer amounts ("$1,234" -> 1234);
    * a float for plain non-integer amounts ("$12.5" -> 12.5);
    * None for amounts given in crore (not converted) and for an empty list;
    * the input unchanged when it is not a string, contains no "$", or
      contains a "$" but no number.

  For a range written with an en dash ("$40–50 million") the lower bound is
  returned.
  """
  if isinstance(temp, list):
    if not temp:
      return None
    temp = temp[0]
  if not isinstance(temp, str):
    return temp
  if "crore" in temp:
    return None
  if "$" not in temp:
    # Not a dollar amount: hand the text back untouched (commas included).
    return temp

  text = temp.replace(",", "")
  # For an en-dash range only the lower bound is parsed.
  lower_bound = text.split("–")[0]

  matches = _MONEY_NUMBER_RX.findall(lower_bound)
  if not matches:
    return temp
  # Prefer the number that actually carries the "$" sign, e.g. the second
  # figure in "£1 million ($2.5 million)".
  dollar_matches = [match for match in matches if "$" in match]
  num = (dollar_matches[0] if dollar_matches else matches[0]).replace("$", "")

  # A range states its unit once, after the upper bound, so fall back to the
  # whole text when the lower bound has no unit word of its own.
  # NOTE(review): this assumes both bounds share that unit - confirm on real data.
  for scope in (lower_bound.lower(), text.lower()):
    for word, multiplier in _MONEY_UNITS:
      if word in scope:
        # round() rather than int() so float error cannot drop a unit.
        return int(round(multiplier * float(num)))

  try:
    return int(num)
  except ValueError:
    return float(num)

# Second pass over the individual movie pages, cleaning the values on the way in.
baseLink = "https://en.wikipedia.org/"
movie_infos = []
failed_links = []  # (href, exception) for every page that could not be scraped

links = webpage.select("table.wikitable.sortable tbody tr td i a")

# Unit words removed from running times, in the order they are replaced.
MINUTE_WORDS = (" minutes", " Minutes", " min", "minutes", "Minutes", "min")

def _stripMinutes(text):
  """Remove the minute unit words from a running-time string."""
  for word in MINUTE_WORDS:
    text = text.replace(word, "")
  return text

for index, link in enumerate(links):
  # Start every page from an empty record. Previously the dict was only reset
  # inside the try block, so a page that failed early re-appended the previous
  # movie's data.
  movie_info = {}
  try:
    # Join with exactly one "/" so an href that already starts with "/" does
    # not produce "...org//wiki/...".
    actualLink = baseLink.rstrip("/") + "/" + link["href"].lstrip("/")

    # Loading the Webpage
    r = requests.get(actualLink, timeout=30)
    r.raise_for_status()
    moviePage = bs(r.content)

    # removing the "[1]" Tag & "extra (date)"
    for tag in moviePage.find_all(["sup", "span"]):
      tag.decompose()

    # Grabbing the Side Info Panel and storing them in a dictionary
    panel = moviePage.find("table", attrs={"class": "infobox vevent"})
    if panel is None:
      raise ValueError("no 'infobox vevent' table on page")
    body = panel.find("tbody")
    rows = (body if body is not None else panel).select("tr")

    for row_index, tr in enumerate(rows):
      if row_index == 0:
        # The first row of the infobox is used as the title.
        movie_info["Title"] = tr.get_text(" ", strip=True)
        continue

      # Only rows with both a header cell and a data cell are key/value pairs;
      # anything else is skipped rather than aborting the page.
      header = tr.find("th")
      cell = tr.find("td")
      if header is None or cell is None:
        continue

      key = header.get_text(" ", strip=True)
      # Minute words are stripped from the running time only. Applying
      # .replace("min", "") to every field mangled unrelated text
      # (e.g. "Benjamin" -> "Benja").
      is_running_time = key == "Running time"

      if cell.find("li"):
        value = [item.get_text(" ", strip=True).replace("\xa0", "") for item in cell.find_all("li")]
        if is_running_time:
          value = [_stripMinutes(item) for item in value]
      elif cell.find("br"):
        # Read the strings from the data cell only; iterating the whole row
        # also picked up the header text as the first "value".
        value = list(cell.stripped_strings)
      else:
        text = cell.get_text(" ", strip=True).replace("\xa0", "")
        if is_running_time:
          text = _stripMinutes(text)
        try:
          value = convertToMoney(text)
        except (ValueError, IndexError):
          # An unparseable amount keeps its raw text instead of aborting
          # the rest of the page.
          value = text
      movie_info[key] = value

  except Exception as error:
    # Best effort: keep going, but remember what failed instead of hiding it.
    failed_links.append((link.get("href"), error))

  # Rows collected before a failure are kept; pages that yielded nothing are
  # left out of the data set.
  if movie_info:
    movie_infos.append(movie_info)

# Store the parsed date next to the raw value (the conversion result used to be
# thrown away). The raw "Release date" entry is left untouched.
for movies in movie_infos:
  temp = movies.get("Release date")
  movies["Release date (datetime)"] = convertDate(temp) if temp else None

In [7]:
from datetime import datetime as dt

def removeParenthesis(date):
  """Return the text before the first "(" with trailing whitespace removed."""
  head, _, _ = date.partition("(")
  return head.rstrip()

def convertDate(date):
  """Parse a scraped release-date value into a ``datetime``.

  Args:
    date: A date string, or a list of date strings (only the first entry is
      used). The string is passed through ``removeParenthesis`` and stripped
      of surrounding whitespace before parsing.

  Returns:
    The parsed ``datetime``, or None when the input is missing, empty, not a
    string, or matches none of the known formats.
  """
  if isinstance(date, list):
    if not date:
      return None
    date = date[0]
  if not isinstance(date, str):
    return None

  date = removeParenthesis(date).strip()

  # "%B%d,%Y" covers values such as "June12,2010", which "%B%d%Y" cannot match
  # because of the comma.
  formats = ["%B %d, %Y", "%B%d,%Y", "%B%d%Y", "%d %B %Y", "%d %B, %Y", "%d%B%Y", "%d%B,%Y"]

  for date_format in formats:
    try:
      return dt.strptime(date, date_format)
    except ValueError:
      # Not this format; try the next one. (The old `return None` sat inside
      # the loop, so only the first format was ever attempted.)
      continue

  return None

# Quick check of convertDate: the input is a one-element list whose date string
# has a trailing space; the cell prints whatever convertDate returns for it.
date = ['January 28, 1965 ']

print(convertDate(date))

1965-01-28 00:00:00
