# Data Scraping from Wikipedia - American Movies List from 2005 - 2021

In [1]:
import os
import time
import re
import json
import numpy as np
import pandas as pd
import requests
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# Root of the English Wikipedia site; page paths and relative hrefs are
# appended to it to form absolute URLs.
base_url = 'https://en.wikipedia.org'
# Path prefix of the per-year film list pages; the four-digit year is
# appended to it when the yearly links are built.
movies_base = '/wiki/List_of_American_films_of_'

In [3]:
def create_json(json_dict, file_name):
    """Write ``json_dict`` as indented JSON to ``./<file_name>.json``.

    The object is serialized to a string first (4-space indentation), so
    the target file is only opened once serialization has succeeded. An
    existing file with the same name is overwritten.

    Args:
        json_dict: Any JSON-serializable object (dict, list, ...).
        file_name: File name without the ``.json`` extension; it is
            resolved relative to the current working directory.
    """
    serialized = json.dumps(json_dict, indent=4)
    target_path = F"./{file_name}.json"
    with open(target_path, "w") as handle:
        handle.write(serialized)

In [4]:
def get_all_page_links(url, start_year=2005, end_year=2021):
    """Build the URLs of the yearly film list pages.

    Args:
        url: Unused. It is kept only so existing callers keep working;
            the links are always built from the module-level
            ``base_url`` and ``movies_base``.
        start_year: First year to include (inclusive). Defaults to 2005.
        end_year: Last year to include (inclusive). Defaults to 2021.

    Returns:
        A list of URL strings, one per year, in ascending year order.
        The list is empty when ``end_year`` is smaller than
        ``start_year``.
    """
    prefix = base_url + movies_base
    # range() excludes its stop value, hence the +1 to make end_year inclusive.
    return [prefix + str(year) for year in range(start_year, end_year + 1)]

In [5]:
def read_html_from_url(url):
    """Download ``url`` and parse the response with BeautifulSoup.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The BeautifulSoup document built from the response using the
        ``lxml`` parser.

    Raises:
        Whatever ``urlopen`` raises for an unreachable or failing URL
        (for example ``urllib.error.URLError``).
    """
    # The response body is consumed while the soup is constructed, so the
    # connection can be released as soon as parsing is done instead of
    # being left open for the garbage collector.
    with urlopen(url) as response:
        return BeautifulSoup(response, 'lxml')

In [6]:
def extract_all_tables_from_html(bs_html, class_name=None):
    """Return every ``<table>`` element found in a parsed page.

    Args:
        bs_html: Parsed document (or element) exposing ``findAll``.
        class_name: Optional CSS class to filter on. Any falsy value
            (``None`` or an empty string) disables the filter.

    Returns:
        Whatever ``bs_html.findAll`` returns for the requested tables.
    """
    # Guard clause: without a usable class name, collect all tables.
    if not class_name:
        return bs_html.findAll('table')
    return bs_html.findAll('table', class_=class_name)

In [7]:
def extract_plot_from_page(url):
    """Return the text of the "Plot" section of the page at ``url``.

    The page is fetched, the element with id ``Plot`` is located, and
    the text of the consecutive ``<p>`` siblings that follow its
    enclosing ``<h2>`` is concatenated.

    Args:
        url: Absolute URL of the movie page.

    Returns:
        The concatenated paragraph text. It is an empty string when the
        heading is not directly followed by a paragraph.

    Raises:
        AttributeError: If the page has no element with id ``Plot`` or
            that element is not nested inside an ``<h2>``.
    """
    bs_movie_page = read_html_from_url(url)
    plot_header = bs_movie_page.find(id='Plot').find_parent('h2')

    paragraphs = []
    node = plot_header.find_next_sibling()
    # find_next_sibling() returns None once the siblings run out; without
    # the None check a plot that ends the page would raise and be lost.
    while node is not None and node.name == 'p':
        paragraphs.append(node.text)
        node = node.find_next_sibling()
    return ''.join(paragraphs)

In [72]:
def get_month_from_column(col, prev_month):
    """Read the month label from a table cell, falling back to the last one.

    Literal ``<br>`` markers are removed from the cell text first, then
    newlines. If nothing is left, ``prev_month`` is returned instead.

    Args:
        col: Cell object exposing its text through ``.text``.
        prev_month: Month carried over from an earlier row.

    Returns:
        The cleaned cell text, or ``prev_month`` when the cell is empty.
    """
    cell_text = col.text
    # Order matters: "<br>" is stripped before newlines, as in a chained replace.
    for junk in ("<br>", "\n"):
        cell_text = cell_text.replace(junk, "")
    return cell_text or prev_month

In [73]:
def get_date_from_column(col, prev_date):
    """Read the day value from a table cell, falling back to the last one.

    Newlines are removed from the cell text. If nothing is left,
    ``prev_date`` is returned instead.

    Args:
        col: Cell object exposing its text through ``.text``.
        prev_date: Date carried over from an earlier row.

    Returns:
        The cell text without newlines, or ``prev_date`` when the cell
        is empty.
    """
    without_newlines = "".join(col.text.split("\n"))
    if without_newlines:
        return without_newlines
    return prev_date

In [74]:
def get_title_and_link_from_column(col):
    """Return the movie title and absolute URL held in a title cell.

    The link is taken from the first ``<a>`` inside the cell's first
    ``<i>`` element; its ``href`` is appended to ``base_url``.

    Args:
        col: Table cell containing the italicised movie link.

    Returns:
        A ``(title, url)`` tuple.
    """
    anchor = col.i.a
    return anchor['title'], base_url + anchor['href']

In [282]:
def extract_movie_info_from_table(table, year):
    """Collect one record per movie row of a release table.

    Every row after the first is inspected. The number of ``<b>`` tags
    in a row decides which leading cells (month, date) the row carries;
    values missing from a row are inherited from the previous rows. For
    each movie the linked page is fetched to extract its plot.

    Args:
        table: Parsed ``<table>`` element with a ``<tbody>``.
        year: Value stored unchanged under the ``"Year"`` key.

    Returns:
        A tuple ``(movies_list, failed_rows, failed_count)``:
        the list of movie dicts (keys ``Year``, ``Month``, ``Date``,
        ``Title``, ``URL``, ``Plot``), the rows that could not be
        processed, and the number of such rows.
    """
    movies_list = []
    failed_rows = []

    rows = table.tbody.find_all('tr')

    # Month and date persist across rows: a row without its own value
    # reuses the most recent one.
    month = ''
    date = ''

    # The first row is skipped (presumably the table header).
    for row in rows[1:]:
        try:
            row_with_th = row.find_all('th')
            if row_with_th:
                month = get_month_from_column(row_with_th[0], month)

            cols = row.find_all('td')
            bold_count = len(row.find_all('b'))

            if bold_count == 2:
                if month != '':
                    date = get_date_from_column(cols[0], date)
                    title, url = get_title_and_link_from_column(cols[1])
                else:
                    # No month known yet, so it is read from the first cell.
                    month = get_month_from_column(cols[0], month)
                    date = get_date_from_column(cols[1], date)
                    title, url = get_title_and_link_from_column(cols[2])
            elif bold_count == 1:
                date = get_date_from_column(cols[0], date)
                title, url = get_title_and_link_from_column(cols[1])
            elif bold_count == 0:
                title, url = get_title_and_link_from_column(cols[0])
            else:
                # Unknown layout: fail the row explicitly rather than
                # silently reusing the previous row's title and URL,
                # which would emit a duplicate record.
                raise ValueError(
                    f"unexpected number of <b> tags in row: {bold_count}"
                )

            print(month, date, title, url)

            plot = extract_plot_from_page(url)

            movies_list.append({
                "Year": year,
                "Month": month,
                "Date": date,
                "Title": title,
                "URL": url,
                "Plot": plot,
            })
        except Exception:
            # Best-effort scrape: a row that cannot be parsed or fetched
            # is recorded and skipped. Catching Exception (not a bare
            # except) keeps Ctrl-C / SystemExit able to stop the run.
            failed_rows.append(row)

    return movies_list, failed_rows, len(failed_rows)

In [None]:
# Accumulators shared across all yearly pages.
movies_list, failed_rows = [], []
failed_count = 0

# Walk every yearly list page, scrape each sortable wikitable on it, and
# merge the per-table results into the accumulators above.
movie_urls = get_all_page_links(base_url + movies_base)
for url in movie_urls:
    html = read_html_from_url(url)
    # The yearly URLs end with the four-digit year.
    year = url[-4:]
    print(year)
    tables = extract_all_tables_from_html(html, class_name="wikitable sortable")
    for table in tables:
        movies, failed, failedcount = extract_movie_info_from_table(table, year)
        movies_list += movies
        failed_rows += failed
        failed_count += failedcount

# Persist the scraped movies; the failed rows are intentionally not dumped.
create_json(movies_list, "movie_list")
# create_json(failed_rows, "failed_drows")

In [284]:
failed_count

464

In [285]:
len(movies_list)

3985