# Analysing Star Wars

by `Nurseiit Abdimomyn`

id `20172001`

at `nurs@unist.ac.kr`

In [1]:
import csv # here we go again :P
import networkx as nx
from operator import itemgetter
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the rows of episodes 1-6 into `episodes`, keyed by episode number.
# Each value is the list of CSV rows (lists of strings) without the header row.
episodes = {}

for i in range(1, 7):
    # newline='' is what the csv module documents for files handed to
    # csv.reader; without it, line breaks inside quoted fields are not
    # interpreted correctly.
    # NOTE(review): no encoding is passed, so the platform default is used --
    # confirm it matches the encoding of the data files.
    with open('data/StarWars_ep%d.csv' % i, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # ignore header (no-op on an empty file)
        ep = list(reader)
    episodes[i] = ep

In [3]:
# Lookup table from a variant spelling / alias of a character name to the
# canonical name used for that character. Keys are matched exactly, including
# case and surrounding whitespace (note the trailing space in "ACKBAR ").
AKA = {
    "EMPEROR": "PALPATINE",
    "DARTH SIDIOUS": "PALPATINE",
    "PALAPATINE": "PALPATINE",
    "BOUSHH": "LEIA",
    "ACKBAR ": "ACKBAR",
    "DARTH VADER": "ANAKIN",
    "VADER": "ANAKIN",
    "ANAKN": "ANAKIN",
    "ANAKNI": "ANAKIN",
    "ANKAIN": "ANAKIN",
    "ANAKINN": "ANAKIN",
    "HAN/PILOT": "HAN",
    "LUKE'S VOICE": "LUKE",
    "LURE": "LUKE",
    "QUI -GON": "QUI-GON",
    "QU-IG0N": "QUI-GON",
    "GUI-GON": "QUI-GON",
    "THREEPIO": "C-3PO",
    "G-3PO": "C-3PO",
    "PADMÉ": "PADME",
    "PAMDE": "PADME",
    "CAPTAIN PANAKA": "CAPT. PANAKA",
    "AUNT BERU": "BERU",
    "MAD AMEDDA": "MAS AMEDDA",
    "PR-4": "PK-4",
    "TC14": "TC-14",
    "BEN'S VOICE": "BEN",
    "RED LEADER'S VOICE": "RED LEADER",
    "READ LEADER": "RED LEADER",
    "RED TEN'S VOICE": "RED TEN",
    "RED NINE'S VOICE": "RED NINE",
    "WINGMAN'S VOICE": "WINGMAN",
    "BERU (V.O.)": "BERU",
    "BOBA": "BOBA FETT",
    "JANGO": "JANGO FETT",
    "CLIEGG (O.S.)": "CLIEGG",
    "ZAM": "ZAM WESSEL",
}

# Name-column values that are filtered out of an episode instead of being
# treated as characters.
invalid_names = ["FADE OUT:", "THE END"]

def normalise_names(_episode):
    """Return the rows of an episode with canonical character names.

    The name is read from column 5 of each row. A name that is a key of
    ``AKA`` is replaced by its canonical form; rows whose (normalised) name
    is listed in ``invalid_names`` are dropped.

    The input rows are never modified: every returned row is a fresh list,
    so the raw data stays intact and the function can be re-run safely.

    Args:
        _episode: iterable of rows (indexable sequences with at least six
            columns).

    Returns:
        A new list of new row lists, in the original order.

    Raises:
        IndexError: if a row has fewer than six columns.
    """
    episode = []
    for event in _episode:
        raw_name = event[5]
        actor = AKA.get(raw_name, raw_name)
        if actor in invalid_names:
            continue
        # Copy before patching the name so the caller's row is left untouched.
        row = list(event)
        row[5] = actor
        episode.append(row)
    return episode

In [4]:
def _cooccurrence_graph(episode):
    """Build the undirected character co-occurrence graph of one episode.

    Rows are read positionally: ``event[1]`` as the scene number,
    ``event[2]`` as the sequence number and ``event[5]`` as the character
    name (meanings taken from how the columns are used here -- TODO confirm
    against the CSV header). Two characters are linked when they occur in
    the same scene.
    """
    # Keyed by sequence number: rows that repeat a sequence number overwrite
    # each other, so only the last such row contributes.
    latest_by_sequence = {}
    for event in episode:
        latest_by_sequence[int(event[2])] = (int(event[1]), event[5])

    cast_by_scene = {}
    for scene, actor in latest_by_sequence.values():
        cast_by_scene.setdefault(scene, set()).add(actor)

    # The graph is undirected, so each unordered pair is emitted once
    # (name-ordered) instead of in both directions.
    edges = {
        (first, second)
        for cast in cast_by_scene.values()
        for first in cast
        for second in cast
        if first < second
    }

    graph = nx.Graph()
    # Insert nodes and edges in sorted order: iteration order of a set of
    # strings can differ between interpreter runs (hash randomisation), and
    # the graph keeps insertion order.
    graph.add_nodes_from(sorted({actor for _, actor in latest_by_sequence.values()}))
    graph.add_edges_from(sorted(edges))
    return graph


def statistics(episode, ep_str, top_n=5):
    """Print network statistics of the character graph of one episode.

    Names are normalised with ``normalise_names`` first, then a graph linking
    characters that share a scene is built and summarised: node/edge counts,
    average degree, the ``top_n`` characters by betweenness centrality and by
    degree, the average clustering coefficient and the density.

    Args:
        episode: rows of one episode, as stored in ``episodes``.
        ep_str: label printed after "Episode" in the heading.
        top_n: how many characters to list in each ranking (default 5).

    Returns:
        None. The results are printed.
    """
    episode = normalise_names(episode)
    Q = _cooccurrence_graph(episode)

    def _rank_key(item):
        # Highest score first; equal scores are ordered by name so the
        # rankings are identical on every run.
        return (-item[1], item[0])

    betweenness = nx.betweenness_centrality(Q)
    sorted_betweenness = sorted(betweenness.items(), key=_rank_key)

    sorted_degree = sorted(Q.degree(), key=_rank_key)

    average_clustering = nx.average_clustering(Q)

    n_nodes = Q.number_of_nodes()
    n_edges = Q.number_of_edges()

    print('Episode %s' % ep_str)
    # nx.info() was removed in NetworkX 3.0; print the same summary lines
    # directly so the output does not depend on the installed version.
    print('Name: %s' % Q.name)
    print('Type: %s' % type(Q).__name__)
    print('Number of nodes: %d' % n_nodes)
    print('Number of edges: %d' % n_edges)
    if n_nodes > 0:
        # Every edge of an undirected graph adds 2 to the total degree.
        print('Average degree: %8.4f' % (2.0 * n_edges / n_nodes))
    print('Top %d by Betweennes:\n' % top_n, sorted_betweenness[:top_n])
    print('Top %d by Degree:\n' % top_n, sorted_degree[:top_n])
    print('Average Clustering: ', average_clustering)
    print('Density: ', nx.density(Q))

In [6]:
# Report the character-network statistics for episode 1 (printed label: 'I').
statistics(episode=episodes[1], ep_str='I')

Episode I
Name: 
Type: Graph
Number of nodes: 61
Number of edges: 221
Average degree:   7.2459
Top 5 by Betweennes:
 [('QUI-GON', 0.28706958923060616), ('ANAKIN', 0.16576299406807876), ('AMIDALA', 0.16325085303898865), ('JAR JAR', 0.1566198090986227), ('PADME', 0.11644378299039317)]
Top 5 by Degree:
 [('QUI-GON', 35), ('ANAKIN', 29), ('JAR JAR', 24), ('AMIDALA', 23), ('PADME', 20)]
Average Clustering:  0.7606364098085769
Density:  0.12076502732240438


In [10]:
# Report the character-network statistics for episode 2 (printed label: 'II').
statistics(episode=episodes[2], ep_str='II')

Episode II
Name: 
Type: Graph
Number of nodes: 61
Number of edges: 204
Average degree:   6.6885
Top 5 by Betweennes:
 [('OBI-WAN', 0.3829453550216262), ('PADME', 0.3263046087622359), ('ANAKIN', 0.23107680925477533), ('YODA', 0.09360567869042448), ('MACE WINDU', 0.050818484716789784)]
Top 5 by Degree:
 [('PADME', 38), ('OBI-WAN', 35), ('ANAKIN', 31), ('YODA', 21), ('MACE WINDU', 16)]
Average Clustering:  0.7896364012977173
Density:  0.11147540983606558


In [11]:
# Report the character-network statistics for episode 4 (printed label: 'IV').
statistics(episode=episodes[4], ep_str='IV')

Episode IV
Name: 
Type: Graph
Number of nodes: 59
Number of edges: 140
Average degree:   4.7458
Top 5 by Betweennes:
 [('LUKE', 0.27923187530002896), ('LEIA', 0.15895431254586856), ('ANAKIN', 0.14454855882523288), ('C-3PO', 0.08336743144389529), ('TROOPER', 0.07856534515957576)]
Top 5 by Degree:
 [('LUKE', 28), ('C-3PO', 18), ('LEIA', 16), ('ANAKIN', 16), ('BEN', 13)]
Average Clustering:  0.5006791950710195
Density:  0.08182349503214495


In [12]:
# Report the character-network statistics for episode 5 (printed label: 'V').
statistics(episode=episodes[5], ep_str='V')

Episode V
Name: 
Type: Graph
Number of nodes: 49
Number of edges: 108
Average degree:   4.4082
Top 5 by Betweennes:
 [('ANAKIN', 0.2924159912191827), ('LUKE', 0.27472982100641674), ('HAN', 0.17043439716312053), ('C-3PO', 0.1425384160756501), ('LEIA', 0.11263297872340423)]
Top 5 by Degree:
 [('HAN', 18), ('C-3PO', 17), ('LUKE', 16), ('LEIA', 15), ('ANAKIN', 15)]
Average Clustering:  0.5530159682920788
Density:  0.09183673469387756
