In [1]:

import requests, pandas, pathlib, datetime
import uuid, pydash, numpy
from IPython.display import clear_output
from bs4 import BeautifulSoup

def clean_tags(row, col):
    """Return the text of ``row[col]`` with HTML markup stripped.

    The cell value is coerced to ``str``, parsed by BeautifulSoup with the
    ``lxml`` parser, and the parsed document's ``.text`` is returned.
    """
    markup = str(row[col])
    parsed = BeautifulSoup(markup, "lxml")
    return parsed.text

def retrieve(page, timeout=30):
    """Download a poll index page and return its table rows as plain text.

    Parameters
    ----------
    page : str or str-convertible
        URL of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block the
        notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` fragment and one ``DATA_n`` column per ``<td>``
        fragment, followed by an ``ADDR`` column holding the last path
        segment of the first ``https://`` link found in the first cell.
        Every cell has been passed through ``clean_tags``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the expected start marker is missing from the page.
    """
    response = requests.get(str(page), timeout=timeout)
    # Fail loudly on an error page instead of hitting an IndexError on the marker split below.
    response.raise_for_status()
    data = str(BeautifulSoup(response.text, 'html.parser'))
    # Keep only the markup after the "A" group heading and before the "Latest from the BFI" heading.
    data = data.split('<h4 id="group-A">A</h4>')[1].split('<h3>Latest from the BFI</h3>')[0]
    data = pandas.DataFrame([data], columns=['DATA'])
    # One row per <tr> fragment ...
    data['DATA'] = data['DATA'].str.split('<tr>')
    data = data.explode('DATA').reset_index(drop=True)
    # ... and one column per <td> fragment.
    data['DATA'] = data['DATA'].str.split('<td>')
    data = data['DATA'].apply(pandas.Series)
    data = data.rename(columns=lambda x: 'DATA_' + str(x))
    # Column 0 is whatever preceded the first <td> of each row; drop it.
    # Copy explicitly so the column writes below act on an independent frame
    # rather than on a selection of the previous one.
    data = data[[x for x in data.columns.values[1:]]].copy()
    # Text after the first 'https://' up to the closing quote, reduced to its last '/' segment.
    data['ADDR'] = data[data.columns.values[0]].str.split('https://').str[1].str.split('"').str[0].str.split('/').str[-1]
    for x in data.columns.values:
        data[x] = data[x].str.replace('\n', '')
    data = data.dropna(how='all')
    # Strip the remaining markup cell by cell.
    for x in data.columns.values:
        data[x] = data.apply(clean_tags, col=x, axis=1)
    return data

def year_extract(row, year_list, col):
    """Return the year token(s) from ``year_list`` that occur in ``row[col]``.

    Each entry of ``year_list`` is tested as a substring of the title. All
    matching entries are concatenated in ``year_list`` order and every
    parenthesis is removed from the result. An empty string is returned when
    nothing matches.
    """
    text = row[col]
    found = [token for token in year_list if token in text]
    joined = ''.join(found)
    return joined.replace('(', '').replace(')', '')

def title_clean(row, year_list, col, articles=('The', 'An', 'A')):
    """Remove year tokens from a title and move a trailing article to the front.

    Parameters
    ----------
    row : mapping
        Row (or dict) holding the title under ``col``.
    year_list : iterable of str
        Substrings to delete from the title, e.g. ``'(1958)'``.
    col : hashable
        Key of the title within ``row``.
    articles : iterable of str, optional
        Articles recognised in the ``"Title, Article"`` form. They are checked
        in order, each against the title as rewritten so far. The default
        covers the English articles.

    Returns
    -------
    str
        The cleaned title, e.g. ``'Godfather, The (1972)'`` becomes
        ``'The Godfather'``.
    """
    title = row[col]
    for y in year_list:
        # Strip after every removal so the whitespace left next to a token disappears.
        title = title.replace(y, '').strip()
    for article in articles:
        suffix = f', {article}'
        if title.endswith(suffix):
            title = f'{article} {title[:-len(suffix)]}'
    return title

def split_column(data, col, sym):
    """Split a delimited text column into one row per value.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified.
    col : hashable
        Name of the column holding delimited strings.
    sym : str
        Delimiter handed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every row is repeated once per piece of
        ``data[col]``, with surrounding whitespace stripped from each piece.
        Repeated rows keep their original index label.
    """
    # Work on a copy: assigning the split lists straight onto ``data`` would
    # overwrite the caller's column as a side effect.
    result = data.copy()
    result[col] = result[col].str.split(sym)
    result = result.explode(col)
    result[col] = result[col].str.strip()
    return result

def apply_uuid(data, col):
    """Attach a ``<col>_ID`` column holding one random UUID per distinct value.

    Every distinct non-null value of ``data[col]`` gets its own ``uuid.uuid4()``.
    The ids are left-merged back onto ``data``, so rows sharing a value share
    an id and rows with a missing value get a missing id.
    """
    id_col = f'{col}_ID'
    lookup = data[[col]].dropna().drop_duplicates()
    lookup[id_col] = [uuid.uuid4() for _ in range(len(lookup))]
    return data.merge(lookup, on=col, how='left')
    
voter_data = retrieve('https://www.bfi.org.uk/films-tv-people/sightandsoundpoll2012/voters')
# retrieve() appends the link slug as the last column; it is used here as the voter id.
voter_data.columns = ['VOTER', 'ROLE', 'COUNTRY', 'GENDER', 'VOTER_ID']

# Reduce any gender cell that mentions a known label to that label alone.
for label in ('Female', 'Male'):
    mentions_label = voter_data['GENDER'].str.contains(label, na=False)
    voter_data.loc[mentions_label, 'GENDER'] = label

# Blank cells become an explicit 'Unspecified' marker.
for field in ('GENDER', 'ROLE', 'COUNTRY'):
    voter_data.loc[voter_data[field].isin(['']), field] = 'Unspecified'

# Keep only the first ';'-separated role, title-cased.
voter_data['ROLE'] = voter_data['ROLE'].str.split(';').str[0].str.title()
# One row per country when a voter lists several separated by '/'.
voter_data = split_column(voter_data, 'COUNTRY', '/')

# voter_data = voter_data.loc[voter_data.VOTER.str.contains('Turnour', na=False)]
print(datetime.datetime.now())
print(len(voter_data))
voter_data.head()


2020-09-29 19:38:34.879298
1274


Unnamed: 0,VOTER,ROLE,COUNTRY,GENDER,VOTER_ID
1,Gulnara Abikeyeva,Programmer,Kazakhstan,Female,558
2,Lenny Abrahamson,Director,Ireland,Male,1033
3,Mehmet Açar,Critic,Turkey,Male,316
4,Newton Aduaka,Director,Nigerla,Male,1090
5,Eva af Geijerstam,Critic,Sweden,Female,624


In [2]:

# One row per distinct voter id.
vote_data = pandas.DataFrame({'VOTER_ID': list(voter_data['VOTER_ID'].unique())})
# Start timestamp that vote_extract() reads to estimate a finish time.
commencer = datetime.datetime.now()
def vote_extract(row, timeout=30):
    """Download one voter's ballot page and return the film ids it links to.

    Meant to be used with ``vote_data.apply(..., axis=1)``. It reads the
    notebook globals ``vote_data`` (for the total row count) and ``commencer``
    (the start time) to print a progress line with an estimated finish time.

    Parameters
    ----------
    row : pandas.Series
        Row with a ``VOTER_ID`` entry. ``row.name`` is used as the zero-based
        position for the progress estimate.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so one stalled request cannot hang the
        whole scrape. Defaults to 30 seconds.

    Returns
    -------
    str
        Comma-joined path fragments that follow ``films-tv-people/`` in each
        link inside the ``sas-poll`` table, each cut at its closing quote.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the page contains no ``<table class="sas-poll">``.
    """
    position = row.name + 1
    total = len(vote_data)
    elapsed = datetime.datetime.now() - commencer
    # Linear extrapolation: average time per processed row times the total row count.
    time_to_finish = (((elapsed / position) * total) + commencer).strftime("%Y-%m-%d %H:%M:%S")
    print(f'Processing: {position} of {total}; eta {time_to_finish}.')
    # wait=True defers the clear until the next output arrives, so the line just printed stays visible.
    clear_output(wait=True)
    page = f"https://www.bfi.org.uk/films-tv-people/sightandsoundpoll2012/voter/{row['VOTER_ID']}"
    response = requests.get(str(page), timeout=timeout)
    # Report an HTTP error as such rather than as an IndexError on the table split below.
    response.raise_for_status()
    data = str(BeautifulSoup(response.text, 'html.parser'))
    data = data.split('<table class="sas-poll">')[1].split('</table>')[0]
    # Everything after each 'films-tv-people/' occurrence, cut at the closing quote of the href.
    data = data.split('films-tv-people/')[1:]
    return ','.join(x.split('"')[0] for x in data)

# Fetch every voter's ballot (vote_extract issues one HTTP request per row),
# then expand the comma-joined film ids to one row per voter/film pair.
vote_data = split_column(
    vote_data.assign(FILM_ID=vote_data.apply(vote_extract, axis=1)),
    'FILM_ID',
    ',',
)
print(datetime.datetime.now())
print(len(vote_data))
vote_data.head()


2020-09-29 20:24:23.565590
11841


Unnamed: 0,VOTER_ID,FILM_ID
0,558,4ce2b7fd6185b
0,558,4ce2b7ee0f12e
0,558,4ce2b78f168f3
0,558,4ce2b81295544
0,558,4ce2b6bbbf1e2


In [3]:

film_data = retrieve('https://www.bfi.org.uk/films-tv-people/sightandsoundpoll2012/films')
# retrieve() appends the link slug as the last column; it is used here as the film id.
film_data.columns = ['FILM', 'DIRECTOR', 'COUNTRY', 'FILM_ID']

# Parenthesised year tokens such as '(1958)', looked up in and then removed from each title.
year_tokens = [f'({x})' for x in range(1850,2050)]
film_data['YEAR'] = film_data.apply(year_extract, year_list=year_tokens, col='FILM', axis=1)
film_data['FILM'] = film_data.apply(title_clean, year_list=year_tokens, col='FILM', axis=1)
# Keep rows whose id contains a '4' or a '5' (the pattern is a regex alternation).
film_data = film_data.loc[film_data['FILM_ID'].str.contains('4|5')]

# One row per country for films listing several separated by ','; blanks become missing values.
film_data = split_column(film_data, 'COUNTRY', ',')
film_data.loc[film_data['COUNTRY'].isin(['']), 'COUNTRY'] = numpy.nan
film_data = film_data.assign(GENDER='Unspecified', ROLE='Director')

# Keep only films that appear on at least one scraped ballot.
film_data = film_data.loc[film_data['FILM_ID'].isin(list(vote_data['FILM_ID']))]
print(datetime.datetime.now())
print(len(film_data))
film_data.head()


2020-09-29 20:24:33.291150
3225


Unnamed: 0,FILM,DIRECTOR,COUNTRY,FILM_ID,YEAR,GENDER,ROLE
1,A nos Amours,Maurice Pialat,France,4ce2b69a3415f,1983,Unspecified,Director
2,A Place in the Sun,George Stevens,USA,4ce2b6b321437,1951,Unspecified,Director
3,A Valparaíso,Joris Ivens,Chile,4ce2b6a36c8ca,1963,Unspecified,Director
3,A Valparaíso,Joris Ivens,France,4ce2b6a36c8ca,1963,Unspecified,Director
4,A.I. Artificial Intelligence,Steven Spielberg,USA,4ce2b850c45d7,2001,Unspecified,Director


In [4]:

# Stack the three tables; columns a table lacks are left empty for its rows.
stacked_data = pandas.concat([voter_data, vote_data, film_data])
# Give every distinct value of these columns its own id column (<name>_ID).
for entity in ('COUNTRY', 'DIRECTOR', 'GENDER', 'ROLE'):
    stacked_data = apply_uuid(stacked_data, entity)

output_columns = [
    'YEAR', 'FILM', 'FILM_ID', 'COUNTRY', 'COUNTRY_ID', 'VOTER', 'VOTER_ID',
    'DIRECTOR', 'DIRECTOR_ID', 'GENDER', 'GENDER_ID', 'ROLE', 'ROLE_ID',
]
stacked_data = stacked_data[output_columns]
# The CSV goes to the '1-scrape' folder located next to the current working directory's parent.
output_path = pathlib.Path.cwd().resolve().parents[0] / '1-scrape' / 'sightandsound-data.csv'
stacked_data.to_csv(output_path, index=False)
print(datetime.datetime.now())
print(len(stacked_data))
stacked_data.head()


2020-09-29 20:24:33.519360
16340


Unnamed: 0,YEAR,FILM,FILM_ID,COUNTRY,COUNTRY_ID,VOTER,VOTER_ID,DIRECTOR,DIRECTOR_ID,GENDER,GENDER_ID,ROLE,ROLE_ID
0,,,,Kazakhstan,86fe85da-add8-4970-9b60-740ac00074c2,Gulnara Abikeyeva,558,,,Female,ca894567-3c6f-461d-a696-c8f5ed4a3553,Programmer,62113a53-d867-4086-b2ce-14b59cca8e65
1,,,,Ireland,f534b851-6753-424d-9aa4-b5b1e77e5017,Lenny Abrahamson,1033,,,Male,790d9f39-1b04-46b6-b219-de70134e291f,Director,2c9720aa-8afb-4799-a836-e9a6a463e355
2,,,,Turkey,6c077823-74da-40b9-8be4-b0cd57bd46b1,Mehmet Açar,316,,,Male,790d9f39-1b04-46b6-b219-de70134e291f,Critic,8d7a5d46-a4b0-4b90-97e3-a7dac1b986be
3,,,,Nigerla,603477b6-e5f7-4e36-815f-95819849a606,Newton Aduaka,1090,,,Male,790d9f39-1b04-46b6-b219-de70134e291f,Director,2c9720aa-8afb-4799-a836-e9a6a463e355
4,,,,Sweden,09941d9c-a83d-43ba-9592-c4f19bd27129,Eva af Geijerstam,624,,,Female,ca894567-3c6f-461d-a696-c8f5ed4a3553,Critic,8d7a5d46-a4b0-4b90-97e3-a7dac1b986be
