# Analysing Star Wars

by `Nurseiit Abdimomyn`

id `20172001`

at `nurs@unist.ac.kr`

In [1]:
import csv # here we go again :P
import networkx as nx
from operator import itemgetter
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the events of all six episodes, keyed by episode number (1-6).
# Each value is the list of CSV rows of that episode without the header row.
episodes = {}

for i in range(1, 7):
    # newline='' hands newline handling to the csv module, as its
    # documentation recommends for file objects passed to csv.reader
    # (otherwise newlines embedded in quoted fields may be misread).
    with open('data/StarWars_ep%d.csv' % i, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no-op on an empty file)
        ep = list(reader)
        episodes[i] = ep

In [3]:
# Maps every alternative spelling, typo or alias of a character name that
# should be merged to the single canonical name used in the analysis.
AKA = {
    'EMPEROR': 'PALPATINE',
    'DARTH SIDIOUS': 'PALPATINE',
    'PALAPATINE': 'PALPATINE',
    'BOUSHH': 'LEIA',
    'ACKBAR ': 'ACKBAR',
    'DARTH VADER': 'ANAKIN',
    'VADER': 'ANAKIN',
    'ANAKN': 'ANAKIN',
    'ANAKNI': 'ANAKIN',
    'ANKAIN': 'ANAKIN',
    'ANAKINN': 'ANAKIN',
    'HAN/PILOT': 'HAN',
    'LUKE\'S VOICE': 'LUKE',
    'LURE': 'LUKE',
    'QUI -GON': 'QUI-GON',
    'QU-IG0N': 'QUI-GON',
    'GUI-GON': 'QUI-GON',
    'THREEPIO': 'C-3PO',
    'G-3PO': 'C-3PO',
    'PADMÉ': 'PADME',
    'PAMDE': 'PADME',
    'CAPTAIN PANAKA': 'CAPT. PANAKA',
    'AUNT BERU': 'BERU',
    'MAD AMEDDA': 'MAS AMEDDA',
    'PR-4': 'PK-4',
    'TC14': 'TC-14',
    'BEN\'S VOICE': 'BEN',
    'RED LEADER\'S VOICE': 'RED LEADER',
    'READ LEADER': 'RED LEADER',
    'RED TEN\'S VOICE': 'RED TEN',
    'RED NINE\'S VOICE': 'RED NINE',
    'WINGMAN\'S VOICE': 'WINGMAN',
    'BERU (V.O.)': 'BERU',
    'BOBA': 'BOBA FETT',
    'JANGO': 'JANGO FETT',
    'CLIEGG (O.S.)': 'CLIEGG',
    'ZAM': 'ZAM WESSEL'
}

# Values found in the name column that are not characters at all; events
# carrying one of them are dropped.
invalid_names = ['FADE OUT:', 'THE END']

def normalise_names(_episode):
    """Return the events of an episode with canonical character names.

    The element at index 5 of every event is treated as the character name.
    A name that appears as a key in ``AKA`` is replaced by its canonical
    form, and events whose resulting name is listed in ``invalid_names``
    are left out.

    The input is never modified: every returned event is a fresh list, so
    the rows stored by the caller keep their original spelling.

    Args:
        _episode: iterable of event rows (sequences with at least six
            elements).

    Returns:
        A new list with one list per kept event, in input order.
    """
    episode = []
    for event in _episode:
        actor = AKA.get(event[5], event[5])
        if actor in invalid_names:
            continue
        row = list(event)  # copy, so the caller's row is not overwritten
        row[5] = actor
        episode.append(row)
    return episode

In [4]:
def statistics(episode, ep_str):
    """Print statistics of the character co-occurrence graph of an episode.

    Names are first canonicalised with ``normalise_names``. An undirected
    graph is then built with one node per character and an edge between
    every two characters that appear in the same scene. The function prints
    a summary of the graph, the five nodes with the highest betweenness
    centrality, the five nodes with the highest degree, the average
    clustering coefficient and the density.

    Args:
        episode: iterable of event rows; index 1 is read as the scene
            number, index 2 as the sequence number (both converted with
            ``int``) and index 5 as the character name.
        ep_str: label printed in the 'Episode ...' header line.

    Returns:
        None. The results are only printed.
    """
    episode = normalise_names(episode)

    # Keyed by sequence number: if a sequence number occurs more than once,
    # only its last event is kept.
    by_sequence = {}
    for event in episode:
        by_sequence[int(event[2])] = (int(event[1]), event[5])

    # scene number -> set of characters appearing in that scene
    scenes_with_actors = {}
    for scene, actor in by_sequence.values():
        scenes_with_actors.setdefault(scene, set()).add(actor)

    # Nodes and edges are sorted before insertion. Set iteration order of
    # strings can differ between interpreter sessions, which would make the
    # graph's internal order, and therefore the printed result, vary from
    # run to run.
    all_actors = sorted({actor for _, actor in by_sequence.values()})

    # `_from < _to` emits every undirected pair exactly once and excludes
    # self-loops.
    actor_edges = sorted({
        (_from, _to)
        for cast in scenes_with_actors.values()
        for _from in cast
        for _to in cast
        if _from < _to
    })

    Q = nx.Graph()
    Q.add_nodes_from(all_actors)
    Q.add_edges_from(actor_edges)

    def by_score_then_name(pair):
        # Highest score first; ties are broken alphabetically so the top-5
        # lists are reproducible.
        return (-pair[1], pair[0])

    betweennes = nx.betweenness_centrality(Q)
    sorted_betweennes = sorted(betweennes.items(), key=by_score_then_name)

    sorted_degree = sorted(Q.degree(), key=by_score_then_name)

    average_clustering = nx.average_clustering(Q)

    print('Episode %s' % ep_str)
    # nx.info is not available in newer NetworkX releases; fall back to the
    # graph's own string summary there instead of failing.
    describe = getattr(nx, 'info', None)
    print(describe(Q) if describe is not None else Q)
    print('Top 5 by Betweennes:\n', sorted_betweennes[:5])
    print('Top 5 by Degree:\n', sorted_degree[:5])
    print('Average Clustering: ', average_clustering)
    print('Density: ', nx.density(Q))

In [5]:
# Print the graph statistics for episode 6, labelled 'VI' in the output.
statistics(episodes[6], 'VI')

Episode VI
Name: 
Type: Graph
Number of nodes: 45
Number of edges: 109
Average degree:   4.8444
Top 5 by Betweennes:
 [('LANDO', 0.2677059022724984), ('LUKE', 0.213659930626104), ('C-3PO', 0.13529991854516166), ('HAN', 0.10522592277349146), ('LEIA', 0.08693839634643016)]
Top 5 by Degree:
 [('LANDO', 19), ('LUKE', 16), ('C-3PO', 16), ('LEIA', 15), ('HAN', 14)]
Average Clustering:  0.546926280347333
Density:  0.1101010101010101
