#MSW Interactions Parser

by [Michele Lacchia](https://www.linkedin.com/in/michele-lacchia/) 
(Github: @rubik)

In [1]:
import tqdm
!pip install parsel
import parsel
!pip install requests
import requests
import re



In [2]:
r = requests.get('https://www.imdb.com/title/tt0086765/episodes?season=6')

In [3]:
s = parsel.Selector(r.text)

In [4]:
e = s.css('[itemprop=episodes]')[0]

In [5]:
e.css('a[itemprop=name]::text').get()

'Appointment in Athens'

In [6]:
e.css('a[itemprop=name]::attr(href)').get().split('/')[2]

'tt0653460'

In [7]:
SEASON_URL = 'https://www.imdb.com/title/tt0086765/episodes?season={i}'

In [8]:
def extract_episodes(season):
    r = requests.get(SEASON_URL.format(i=season))
    if r.status_code != 200:
        return []
    s = parsel.Selector(r.text)
    episodes = []
    for episode in s.css('[itemprop=episodes]'):
        episodes.append({
            'n': episode.css('meta[itemprop=episodeNumber]::attr(content)').get(),
            'date': episode.css('.airdate::text').get().strip(),
            'title': episode.css('a[itemprop=name]::text').get(),
            'id': episode.css('a[itemprop=name]::attr(href)').get().split('/')[2],
        })
    return episodes

In [9]:
extract_episodes(6)

[{'date': '24 Sep. 1989',
  'id': 'tt0653460',
  'n': '1',
  'title': 'Appointment in Athens'},
 {'date': '1 Oct. 1989',
  'id': 'tt0653607',
  'n': '2',
  'title': 'Seal of the Confessional'},
 {'date': '8 Oct. 1989',
  'id': 'tt0653645',
  'n': '3',
  'title': 'The Grand Old Lady'},
 {'date': '15 Oct. 1989',
  'id': 'tt0653642',
  'n': '4',
  'title': 'The Error of Her Ways'},
 {'date': '29 Oct. 1989',
  'id': 'tt0653535',
  'n': '5',
  'title': 'Jack and Bill'},
 {'date': '5 Nov. 1989', 'id': 'tt0653482', 'n': '6', 'title': 'Dead Letter'},
 {'date': '12 Nov. 1989',
  'id': 'tt0653587',
  'n': '7',
  'title': 'Night of the Tarantula'},
 {'date': '19 Nov. 1989',
  'id': 'tt0653698',
  'n': '8',
  'title': 'When the Fat Lady Sings'},
 {'date': '26 Nov. 1989',
  'id': 'tt0653628',
  'n': '9',
  'title': 'Test of Wills'},
 {'date': '3 Dec. 1989', 'id': 'tt0653472', 'n': '10', 'title': 'Class Act'},
 {'date': '17 Dec. 1989',
  'id': 'tt0653681',
  'n': '11',
  'title': 'Town Father'},
 {'

In [10]:
CAST_URL = 'https://www.imdb.com/title/{title_id}/fullcredits'

In [11]:
def extract_cast(title_id):
    r = requests.get(CAST_URL.format(title_id=title_id))
    if r.status_code != 200:
        return []
    s = parsel.Selector(r.text)
    cast = []
    for row in s.css('.cast_list tr'):
        cols = row.css('td')
        if len(cols) < 4:
            continue
        actor_href = cols[1].css('a::attr(href)').get()
        character_name = (cols[3].css('a::text').get() or cols[3].css('::text').get()).strip()
        character_name = character_name.replace('\n ', '').replace('    ', ' ')
        cast.append({
            'actor_id': actor_href.split('/')[2],
            'actor_name': cols[1].css('a::text').get().strip(),
            'character_name': character_name,
        })
    return cast

In [12]:
extract_cast('tt0653661')

[{'actor_id': 'nm0001450',
  'actor_name': 'Angela Lansbury',
  'character_name': 'Jessica Fletcher'},
 {'actor_id': 'nm0046635',
  'actor_name': 'Vincent Baggetta',
  'character_name': 'Antonio Carboni'},
 {'actor_id': 'nm0129955',
  'actor_name': 'Joseph Cali',
  'character_name': 'Priest'},
 {'actor_id': 'nm0137230',
  'actor_name': 'Len Cariou',
  'character_name': "Michael Hagarty /    Monsignore O'Shaugnessy"},
 {'actor_id': 'nm0225191',
  'actor_name': 'George DiCenzo',
  'character_name': 'Mario Carboni (as George Dicenzo)'},
 {'actor_id': 'nm0004984',
  'actor_name': 'Deidre Hall',
  'character_name': 'Claudia Carboni'},
 {'actor_id': 'nm0592188',
  'actor_name': 'Robert Miranda',
  'character_name': 'Gino Carboni'},
 {'actor_id': 'nm0644707',
  'actor_name': 'Ian Ogilvy',
  'character_name': 'Peter Baines'},
 {'actor_id': 'nm0822062',
  'actor_name': 'John Standing',
  'character_name': 'Chief Daniel Trent'},
 {'actor_id': 'nm0872141',
  'actor_name': 'Daniel Trent',
  'chara

In [13]:
def extract_all_seasons():
    seasons = {}
    for i in range(1, 13):
        print(f'Season {i}')
        seasons[i] = []
        for ep in tqdm.tqdm(extract_episodes(i)):
            ep['cast'] = extract_cast(ep['id'])
            seasons[i].append(ep)
    return seasons

In [14]:
seasons = extract_all_seasons()

Season 1


100%|██████████| 22/22 [00:12<00:00,  1.71it/s]


Season 2


100%|██████████| 22/22 [00:12<00:00,  1.77it/s]


Season 3


100%|██████████| 22/22 [00:12<00:00,  1.70it/s]


Season 4


100%|██████████| 22/22 [00:12<00:00,  1.74it/s]


Season 5


100%|██████████| 22/22 [00:12<00:00,  1.72it/s]


Season 6


100%|██████████| 22/22 [00:12<00:00,  1.82it/s]


Season 7


100%|██████████| 22/22 [00:11<00:00,  1.86it/s]


Season 8


100%|██████████| 22/22 [00:12<00:00,  1.78it/s]


Season 9


100%|██████████| 22/22 [00:12<00:00,  1.79it/s]


Season 10


100%|██████████| 21/21 [00:11<00:00,  1.79it/s]


Season 11


100%|██████████| 21/21 [00:11<00:00,  1.78it/s]


Season 12


100%|██████████| 24/24 [00:13<00:00,  1.81it/s]


In [15]:
seasons

{1: [{'cast': [{'actor_id': 'nm0001450',
     'actor_name': 'Angela Lansbury',
     'character_name': 'Jessica Fletcher'},
    {'actor_id': 'nm0058532',
     'actor_name': 'Eddie Barth',
     'character_name': 'Bernie'},
    {'actor_id': 'nm0115017',
     'actor_name': 'Jessica Browne',
     'character_name': 'Kitty Donovan'},
    {'actor_id': 'nm0176622',
     'actor_name': 'Bert Convy',
     'character_name': 'Peter Brill'},
    {'actor_id': 'nm0248983',
     'actor_name': 'Herb Edelman',
     'character_name': 'George'},
    {'actor_id': 'nm0004282',
     'actor_name': 'Anne Francis',
     'character_name': 'Louise McCallum'},
    {'actor_id': 'nm0395649',
     'actor_name': 'Michael Horton',
     'character_name': 'Grady Fletcher'},
    {'actor_id': 'nm0642078',
     'actor_name': "Tricia O'Neil",
     'character_name': 'Ashley Vickers'},
    {'actor_id': 'nm0665838',
     'actor_name': 'Dennis Patrick',
     'character_name': 'Dexter Baxendale'},
    {'actor_id': 'nm0820566',
    

In [16]:
import collections

In [17]:
interactions = collections.defaultdict(lambda: collections.defaultdict(int))

In [18]:
for s in seasons.values():
    for ep in s:
        for c1 in ep['cast']:
            if c1['character_name'] == 'Loretta Speigel':                  # some manual corrections for IMDb misspellings
                c1['character_name'] = 'Loretta Spiegel'
            elif c1['character_name'] == 'Victoria Griffin' or c1['character_name'] == 'Victoria Brandon': 
                c1['character_name'] = 'Victoria Brandon Griffin'
            elif c1['character_name'] == 'Donna Marie Mayberry' or c1['character_name'] == 'Donna Mayberry': 
                c1['character_name'] = 'Donna Mayberry Fletcher'
            elif c1['character_name'] == 'Bennet J. Devlin' or c1['character_name'] == 'Ben Devlin': 
                    c1['character_name'] = 'Ben Devlin'
            for c2 in ep['cast']:                                            
                if c2['character_name'] == 'Loretta Speigel':
                    c2['character_name'] = 'Loretta Spiegel'
                elif c2['character_name'] == 'Victoria Griffin' or c2['character_name'] == 'Victoria Brandon': 
                    c2['character_name'] = 'Victoria Brandon Griffin'
                elif c2['character_name'] == 'Donna Marie Mayberry' or c2['character_name'] == 'Donna Mayberry': 
                    c2['character_name'] = 'Donna Mayberry Fletcher'
                elif c2['character_name'] == 'Bennet J. Devlin' or c2['character_name'] == 'Ben Devlin': 
                    c2['character_name'] = 'Ben Devlin'
                n1, n2 = c1['character_name'], c2['character_name']
                if n1 != n2:
                    interactions[n1][n2] += 1

In [19]:
interactions

defaultdict(<function __main__.<lambda>>,
            {'Jessica Fletcher': defaultdict(int,
                         {'Bernie': 1,
                          'Kitty Donovan': 1,
                          'Peter Brill': 1,
                          'George': 2,
                          'Louise McCallum': 1,
                          'Grady Fletcher': 11,
                          'Ashley Vickers': 1,
                          'Dexter Baxendale': 1,
                          'Doctor': 13,
                          'Chief Roy Gunderson': 1,
                          'Preston Giles': 2,
                          'Caleb McCallum': 1,
                          'Eleanor Thompson': 1,
                          'Lois Hoey': 1,
                          'Muriel': 1,
                          'Davis': 1,
                          'Eggman': 1,
                          'Talk Show Host': 1,
                          'Daniel': 1,
                          'Cellini': 1,
                          'Mar

In [20]:
interactions_big = {
    n1: {n2: v for n2, v in int_n1.items() if v > 2 and 'uncredited' not in n2 and '#' not in n2}
    for n1, int_n1 in interactions.items()
    if 'uncredited' not in n1 and '#' not in n1
}

In [21]:
interactions_big

{'Jessica Fletcher': {'Alex': 3,
  'Bartender': 4,
  'Bellman': 3,
  'Cabbie': 4,
  'Capt. Ethan Cragg': 3,
  'Cashier': 4,
  'Charlie': 4,
  'Charlie Garrett': 5,
  'Clerk': 11,
  'Coroner': 4,
  'Customer': 3,
  'Delivery Man': 3,
  'Dennis Stanton': 9,
  'Deputy': 11,
  'Deputy Andy Broom': 23,
  'Deputy Floyd': 14,
  'Desk Clerk': 7,
  'Desk Sergeant': 3,
  'Detective': 4,
  'Detective Henderson': 5,
  'Director': 3,
  'Doctor': 13,
  'Donna Mayberry Fletcher': 4,
  'Doorman': 4,
  'Dr. Seth Hazlitt': 52,
  'Driver': 6,
  'Elderly Lady': 3,
  'Eve Simpson': 10,
  'Female Reporter': 3,
  'First Man': 3,
  'Grady Fletcher': 11,
  'Guard': 11,
  'Harry McGraw': 6,
  'Harry Pierce': 3,
  'Hotel Clerk': 3,
  'Housekeeper': 4,
  'Howard Griffin': 3,
  'Ideal Molloy': 3,
  "Jean O'Neill": 4,
  'Lieutenant Gabriel Caceras': 3,
  'Loretta Spiegel': 3,
  'Lt. Catalano': 5,
  'Maid': 5,
  "Maitre D'": 3,
  'Male Reporter': 3,
  'Man': 9,
  'Mayor Sam Booth': 7,
  'Medical Examiner': 3,
  'Mic

In [22]:
import csv

In [23]:
with open('fletcher-interactions-2.csv', 'w', newline='') as o:
    w = csv.writer(o)
    for n1, int_n1 in interactions_big.items():
        for n2, v in int_n1.items():
            for _ in range(v):
                w.writerow([n1, n2])