In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re


julia_roberts_wiki_url = 'https://en.wikipedia.org/wiki/Julia_Roberts_filmography'
page = urlopen(julia_roberts_wiki_url)
soup = BeautifulSoup(page)
table=soup.find('table', class_='wikitable plainrowheaders sortable')
base_url = 'https://en.wikipedia.org/'
awards_prefix = 'https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_'



years=[]
names=[]
countries=[]
awards=[]

# get all available links for all movie pages
movies_url=[]
for row in table.findAll("tr"):
    headers=row.findAll('th')
    if(len(headers)==1):
        link = headers[0].find(href=True)
        if(link is not None):
            movies_url.append(base_url + link.get('href'))
            

#Description: get soup object from given url
#params: url : string
#return value: BeautifulSoup objects addressed to the url
def getSoupFromUrl(url):
    page = urlopen(url)
    return BeautifulSoup(page)


#Description: extract year of birth from born details
#params: born details : string []
#return value: year of birth
def getYearOfBirth(born_details):
    dataRegex = re.compile(r'\d\d\d\d')
    for text in born_details:
        match = dataRegex.search(text)
        if match is not None:
          group = match.group()
          return group
    #year not found   
    return 'NA'



#Description: extract place of birth from wikipedia table row
#params: row html object 
#return value:  place : string 
def getPlaceOfBirth(row):
    div = row.find('div', class_='birthplace')
    if div is not None:
        return div.find(text=True)
    else:
        #msometimes there is only link to the country
        country = row.find(href=True)
        if(country is not None):
            return country.find(text=True)
        else:
            #Country not found
            return "NA"



#Description: get  number of awards
#params: name : string 
#return value:  number 
def getNumOfAwards(name):
    actor_awards = awards_prefix + name.replace(' ', '_')
    try:
        awards_soup = BeautifulSoup(urlopen(actor_awards), "lxml")
        awards_list = awards_soup.find_all("td", class_ = "yes table-yes2", text = re.compile('Won'))
        num_of_awards = len(awards_list)
    except:
        num_of_awards = 'NA'
    return num_of_awards
 
#Description: insert year , place, country to the global array from actor url 
#params: actor url
#return value:  none   
def setArraysWithActorsData(actor_url):
    try:
        soup = getSoupFromUrl(actor_url)
        table = soup.find('table', class_='infobox')
        if table is not None:
            year=False
            country=False
            name=False
            #find only the name of the actor
            first_row = table.findAll("tr")[0]
            header = first_row.findAll('th')
            name = header[0].find(text=True)
            if name is None:
                #name not found
                name='NA'

            #find born details
            for row in table.findAll("tr"):
                if row.find(text=True)=='Born':
                    born_details =row.findAll('td')[0].findAll(text=True)
                    year = getYearOfBirth(born_details)
                    country = getPlaceOfBirth(row)
                    a_awards = getNumOfAwards(name)
                    break
                    
            if name and year and country and a_awards:
                years.append(year)
                countries.append(country)
                names.append(name)
                awards.append(a_awards)
           
    except: 
      pass


#Description: get actors urls from ul html object and update global arrays
#params: ul object
#return value:  none 
def extract_data_from_ul(ul):
    actor_urls=[]
    for li in ul.findAll('li'):
        actor_url = li.find(href=True)
        if actor_url is not None :
            actor_urls.append(base_url + actor_url.get('href'))
    for actor_url in actor_urls:
        setArraysWithActorsData(actor_url)
        

    
#Description: get actors urls from table html object and update global arrays
#params: actor url
#return value:  none  
def extract_data_from_table(table):
    actor_urls=[]
    for row in table.findAll("tr"):
        cells=row.findAll('td')
        if(len(cells)==3 or len(cells)==2):
            actor_url = cells[0].find(href=True)
            if actor_url is not None:
                actor_urls.append(base_url + actor_url.get('href'))
            else:
                years.append("NA")
                countries.append("NA")
                names.append(cells[0].find(text=True))
        
    if len(actor_urls)==0:
       print('cells lenght problem')

    for actor_url in actor_urls:
        setArraysWithActorsData(actor_url)

    
for url in movies_url:
    actor_urls=[]
    try:
        soup = getSoupFromUrl(url)
        cast_span_tag = soup.find('span', class_='mw-headline', text=re.compile('(cast)|(Cast)'))
        if cast_span_tag is None:
            continue
        header_tag= cast_span_tag.parent
        actors = header_tag.findNext(['ul','table'])
        if(actors.name=='ul'):
            extract_data_from_ul(actors)
        else:
            extract_data_from_table(actors)
    except:
#       print("Problem: " + url)

    
    

    

    
    
    
    
    
df=pd.DataFrame(years,columns=['Year'])
df['Name']=names
df['Country']=countries
df['Awards']=awards

df
   

   







 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


===== https://en.wikipedia.org//wiki/Firehouse_(1987_film)
===== https://en.wikipedia.org//wiki/Satisfaction_(film)
===== https://en.wikipedia.org//wiki/Mystic_Pizza
===== https://en.wikipedia.org//wiki/Blood_Red_(film)
===== https://en.wikipedia.org//wiki/Steel_Magnolias
===== https://en.wikipedia.org//wiki/Pretty_Woman
===== https://en.wikipedia.org//wiki/Flatliners
===== https://en.wikipedia.org//wiki/Sleeping_with_the_Enemy
===== https://en.wikipedia.org//wiki/Dying_Young
===== https://en.wikipedia.org//wiki/Hook_(film)
===== https://en.wikipedia.org//wiki/The_Player_(1992_film)
===== https://en.wikipedia.org//wiki/The_Pelican_Brief_(film)
===== https://en.wikipedia.org//wiki/I_Love_Trouble_(1994_film)
===== https://en.wikipedia.org//wiki/Pr%C3%AAt-%C3%A0-Porter_(film)
===== https://en.wikipedia.org//wiki/Something_to_Talk_About_(film)
===== https://en.wikipedia.org//wiki/Mary_Reilly_(film)
===== https://en.wikipedia.org//wiki/Michael_Collins_(film)
===== https://en.wikipedia.org//

Unnamed: 0,Year,Name,Country,Awards
0,1953,Peter Onorati,"Boonton, New Jersey",na
1,1961,Peter Mackenzie,Boston,na
2,1964,Craig Mitchell,"Hempstead, New York",na
3,1967,Julia Roberts,"Smyrna, Georgia",23
4,1966,Justine Bateman,Rye,na
5,1952,Liam Neeson,Ballymena,5
6,1967,Trini Alvarado,New York City,na
7,1964,Scott Coffey,"Honolulu, Hawaii",na
8,1963,Britta Phillips,"Boyne City, Michigan",na
9,1967,Julia Roberts,"Smyrna, Georgia",23
