In [6]:
# Importing the libraries
import numpy as np
import pandas as pd
import time
import re



In [7]:
# Function find_net_worth
def find_net_worth (star):
    """Scrape a star's net worth figure from wealthygorilla.com.

    The star name is turned into the page URL by replacing spaces with '-'
    and appending '-net-worth', then the page is fetched and the first run of
    digits found in the first ``<th class="column-2">`` element is returned.

    Parameters
    ----------
    star : str
        Star name, e.g. "brad pitt".

    Returns
    -------
    int
        * the parsed figure when the page is found and parsed;
        * -1 when the server answers with a status code other than 200;
        * 0 when the request fails (including a timeout) or the page cannot
          be parsed.

    Raises
    ------
    AttributeError
        If ``star`` is not a string (the URL is built before the guarded
        section, as in the original code).

    Notes
    -----
    NOTE(review): only the first integer in the cell text is kept, so the
    unit is whatever the page prints (the caller treats it as millions of
    dollars -- TODO confirm). A value such as "1.5 Billion" would presumably
    be read as 1; verify against real pages.
    """
    # Kept function-local, as in the original cell.
    import requests
    from bs4 import BeautifulSoup

    # Replace space with '-' in star name, add trailer as expected by url
    # example: url = "https://wealthygorilla.com/brad-pitt-net-worth"
    url = "https://wealthygorilla.com/" + star.replace(" ", "-") + "-net-worth"

    try:
        # Without a timeout a single stalled connection would block the whole
        # batch run; a timeout raises and is reported as 0 like any other failure.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return -1
        results_page = BeautifulSoup(response.content, 'lxml')
        cell = results_page.find('th', attrs={'class': 'column-2'})
        # cell is None when the element is missing; the resulting
        # AttributeError is handled below together with "no digits found".
        value_text = cell.text.strip()
        return int(re.findall(r'[0-9]+', value_text)[0])
    except Exception:
        # Best-effort lookup: any failure maps to 0. Unlike a bare `except:`,
        # this lets KeyboardInterrupt through so a long run can be stopped.
        return 0

In [8]:
# Function addstarsCat
def addstarsCat (starpd, net_worth_lookup=None):
    """Add 'category' and 'net_worth' columns to a DataFrame of movies.

    Each distinct value of the 'star' column is looked up once, and the
    result is broadcast to every row featuring that star.

    Category tiers (based on the looked-up net worth value):
        0 : value < 1   (includes the 0 / -1 "not found" results)
        1 : 1 <= value < 50
        2 : value >= 50

    NOTE(review): the original comments mentioned BCG-matrix labels and a
    100 threshold for the top tier, but the code only ever used the cut-offs
    1 and 50 and the numeric tiers 0/1/2. That behaviour is kept here --
    confirm which cut-off is actually intended.

    Parameters
    ----------
    starpd : pandas.DataFrame
        Must contain a 'star' column. It is modified IN PLACE.
    net_worth_lookup : callable, optional
        Function mapping a star name to a numeric net worth. Defaults to
        ``find_net_worth`` (one web request per star); pass a stub or a
        cached lookup to run offline.

    Returns
    -------
    pandas.DataFrame
        The same object as ``starpd``, with float columns 'category' and
        'net_worth' appended (or overwritten). Rows whose 'star' is missing
        get NaN in both columns.
    """
    if net_worth_lookup is None:
        net_worth_lookup = find_net_worth

    low_cutoff, top_cutoff = 1, 50

    def tier(worth):
        # Map a net worth value to its category number.
        if worth < low_cutoff:
            return 0
        if worth < top_cutoff:
            return 1
        return 2

    # One lookup per distinct star, in order of first appearance. Missing
    # names are skipped: they cannot be turned into a URL.
    worth_by_star = {
        star: net_worth_lookup(star)
        for star in starpd['star'].dropna().unique()
    }
    cat_by_star = {star: tier(worth) for star, worth in worth_by_star.items()}

    # A single vectorised pass per column instead of rewriting the whole
    # column once per star. Cast to float to keep the dtype the columns had
    # when they were initialised with NaN.
    starpd['category'] = starpd['star'].map(cat_by_star).astype(float)
    starpd['net_worth'] = starpd['star'].map(worth_by_star).astype(float)

    return starpd

In [9]:
# Load the box-office dataset; the path is relative to the notebook's working directory.
# The frame is later passed (as a copy) to addstarsCat, which reads its 'star' column.
myboxoffice = pd.read_csv('box_office_predictions.csv')
# Quick preview, left disabled: myboxoffice.head()

In [5]:
# Enrich the box-office data with star net worth and category, print it, and save it to CSV.
#
# addstarsCat adds its columns to the frame it receives and returns that same frame,
# so a deep copy is made here to leave myboxoffice untouched (useful while debugging
# on a partial frame). The copy could probably be dropped once the whole frame is
# always processed.
# Tip from the author: copy myboxoffice[:10] instead to process only the first 10
# entries (about 5 seconds instead of about 25 minutes for the full frame).
newboxoffice = myboxoffice.copy(deep=True)
# The timer covers the lookups, the print and the CSV write below.
start_time = time.time()
# newpd refers to the same object as newboxoffice (addstarsCat returns its argument).
newpd = addstarsCat(newboxoffice)
print(newpd)
# Written to the current working directory, with a header row of column names.
newpd.to_csv("Box_office_with_stars.csv", header=True)
total_time_in_sec = time.time()-start_time
print(total_time_in_sec)


         budget  country           director   genre      gross  \
0     237000000       UK      James Cameron  Action  760507625   
1     200000000      USA      James Cameron   Drama  658672302   
2     150000000      USA    Colin Trevorrow  Action  652270625   
3     220000000      USA        Joss Whedon  Action  623357910   
4     185000000      USA  Christopher Nolan  Action  534858444   
...         ...      ...                ...     ...        ...   
5995          0       UK    Dominic Anciano  Comedy       1143   
5996          0  Ireland     Ian Fitzgibbon  Action        828   
5997          0   Norway       Petter NÃ¦ss  Action        547   
5998          0       UK     Sheree Folkson  Comedy        542   
5999          0      USA   Steven R. Monroe   Crime        441   

                               name rating  runtime  score               star  \
0                     Avatar (2009)  PG-13      162    7.8    Sam Worthington   
1                    Titanic (1997)  PG-13   