Utility for gathering tweets. Thanks to <a href='https://tardis.fandom.com/wiki/Doctor_Who:_Lockdown'>the Doctor Who wiki</a>.

In [1]:
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the wiki overview page and keep its first <table>, which the
# following cell parses row by row.
source = 'https://tardis.fandom.com/wiki/Doctor_Who:_Lockdown!'
# A timeout keeps "Run All" from hanging indefinitely on a stalled connection.
response = requests.get(source, timeout=30)
# Stop here on an HTTP error rather than parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
ep_table = soup.find('table')
if ep_table is None:
    # Without this check the failure would only surface in the next cell
    # as an AttributeError on None.
    raise ValueError('No <table> element found at ' + source)

In [3]:
# Build `info`: one entry per table row, keyed by the episode name, holding
# that row's hashtag, list of commentators and date string.
info = {}
table_rows = ep_table.find_all('tr')
for table_row in table_rows[1:]:  # the first <tr> is skipped (presumably the header row)
    # Row text split on newlines; the first two pieces and the last one are dropped.
    fields = table_row.text.split('\n')[2:-1]
    title = fields[0]
    tag = fields[1]

    # Names arrive glued together; a '#' is inserted wherever a lowercase
    # letter is directly followed by an uppercase one, then the string is
    # split on that marker.
    marked_names = re.sub('([a-z])([A-Z])', '\\1#\\2', fields[2])
    names = marked_names.split('#')

    # Keep only the text up to and including the first 4-digit run that is
    # immediately followed by another letter or digit.
    marked_date = re.sub('([0-9]{4})([0-9]|[a-z]|[A-Z])', '\\1#\\2', fields[3])
    air_date = marked_date.split('#')[0]

    info[title] = {
        'hashtag': tag,
        'commentators': names,
        'date': air_date,
    }

In [4]:
# Turn the scraped dict into a table: outer keys (episode names) become the
# index, inner keys become the columns. The last line previews the first rows.
scraping_df = pd.DataFrame.from_dict(data=info, orient='index')
scraping_df.head()

Unnamed: 0,hashtag,commentators,date
The Day of the Doctor,#SaveTheDay,[Steven Moffat],21 March 2020
Rose,#TripofaLifetime,"[Russell T Davies, Mark Benton]",26 March 2020
Vincent and the Doctor,#TheUltimateGinger,"[Richard Curtis, Matt Smith, Tony Curran, Kare...",30 March 2020
The Eleventh Hour,#FishCustard,"[Steven Moffat, Matt Smith, Karen Gillan, Arth...",3 April 2020
The Doctor's Wife,#BiggerOnTheInside,"[Neil Gaiman, Richard Clark, Michael Sheen]",11 April 2020


#### We distinguish crew and actors

In [5]:
# Collect crew names by visiting the wiki page of every scraped episode.
# The set starts with two known names and grows by three entries per episode.
crew = {'Russell T Davies', 'Steven Moffat'}
base_url = 'https://tardis.fandom.com/wiki/'
for episode in info:
    # For keys of the form "A / B" only the part before the slash is looked up.
    ep_name = re.split(r'[\s]/[\s]', episode)[0]
    ep_url = base_url + ep_name.replace(' ', '_') + '_(TV_story)'

    # Timeout and status check: fail fast with a clear error instead of
    # hanging or parsing an error page.
    response = requests.get(ep_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    crew_table = soup.find('section', class_='pi-item pi-group pi-border-color')
    if crew_table is None:
        raise ValueError('No crew section found at ' + ep_url)

    # Split the section text once and pick the three fixed positions.
    # NOTE(review): indices 4, 8 and 12 depend on the page layout; which crew
    # roles they correspond to cannot be told from this notebook - confirm.
    crew_lines = crew_table.text.split('\n')
    crew |= {crew_lines[4], crew_lines[8], crew_lines[12]}

In [6]:
# Normalise the scraped crew entries into a plain list of names.
# An entry of the form "X, Y" is reduced to "Y"; any other entry is kept as is.
crew_list = []
for entry in crew:
    parts = re.split(r',[\s]', entry)
    # Decide on the split result itself: an entry containing a comma that is
    # not followed by whitespace yields a single part and is kept unchanged
    # instead of raising IndexError.
    crew_list.append(parts[1] if len(parts) > 1 else entry)

In [7]:
# Split each episode's commentators into actors and crew, producing two lists
# aligned with the rows of `scraping_df`.
actors_col = []
crew_col = []
# Set for fast membership tests; the order inside each row's list is preserved.
crew_names = set(crew_list)
# Loop variables are named so they do not overwrite the notebook-level `crew`
# set (re-running the `crew_list` cell would otherwise use a per-row list)
# or IPython's `_` last-output variable.
for episode_commentators in scraping_df['commentators']:
    actors_col.append([name for name in episode_commentators if name not in crew_names])
    crew_col.append([name for name in episode_commentators if name in crew_names])

In [8]:
# Final table: the combined `commentators` column is replaced by the separate
# `actors` and `crew` columns. The last line previews the last rows.
df_info = (
    scraping_df
    .drop(columns=['commentators'])
    .assign(actors=actors_col, crew=crew_col)
)
df_info.tail()

Unnamed: 0,hashtag,date,actors,crew
Dalek,#TheMetaltron,30 April 2020,"[Nicholas Briggs, Barnaby Edwards]",[Robert Shearman]
The Girl in the Fireplace,#Clockdown,6 May 2020,[Sophia Myles],"[Steven Moffat, Russell T Davies]"
The Zygon Invasion / The Zygon Inversion,#TruthOrConsequences,10 May 2020,[Ingrid Oliver],[Peter Harness]
The Fires of Pompeii,#VolcanoDay,17 May 2020,"[Tracey Childs, Francesca Fowler, Francois Pan...",[James Moran]
Listen,#FearIsASuperpower,20 May 2020,[],"[Steven Moffat, Douglas Mackinnon]"


In [9]:
# Write the final table to disk as JSON.
output_path = Path('info/dw_lockdown_info.json')
# Create the target folder first so the export also works on a fresh checkout
# where `info/` does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_info.to_json(output_path)