In [33]:
from collections import defaultdict
import pandas as pd
import networkx as nx
from netwulf import visualize as _visualize

# Directory holding the per-movie script CSVs. Later cells build file paths by
# plain string concatenation, so the trailing slash is required.
movie_data_path = '../data/movies/'

In [34]:
def edges(df, nodes, directed=True):
    """Count how often each (from, to) pair occurs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``'from'`` and ``'to'`` columns; each row is one interaction.
    nodes : iterable of hashable
        Allowed sources. Rows whose ``'from'`` value is not in ``nodes`` are
        skipped. Only the source is filtered; ``'to'`` is not checked against
        ``nodes``.
    directed : bool, default True
        If False, ``(a, b)`` and ``(b, a)`` are counted under one key whose
        endpoints are in sorted order.

    Returns
    -------
    list of tuple
        ``(from, to, weight)`` triples, in order of first occurrence.
    """
    # Set lookup instead of scanning a list per row. It also avoids
    # ``pd.NA in some_list``, which raises TypeError because ``pd.NA == x``
    # has no truth value.
    allowed_sources = set(nodes)

    edge_weights = defaultdict(int)
    for f, t in zip(df['from'], df['to']):
        # Skip rows with a missing endpoint (None / NaN / pd.NA) or an unknown source.
        if pd.isna(f) or pd.isna(t) or f not in allowed_sources:
            continue
        edge = (f, t) if directed else tuple(sorted([f, t]))
        edge_weights[edge] += 1
    return [(f, t, w) for (f, t), w in edge_weights.items()]

def network(df, nodes, directed=True):
    """Build a weighted networkx graph from the interaction rows in ``df``.

    Every entry of ``nodes`` is added as a node up front, so nodes without
    any edge still appear in the graph. Edge weights come from ``edges``;
    if an edge is already present its weight is increased rather than
    overwritten.

    Returns an ``nx.DiGraph`` when ``directed`` is true, else an ``nx.Graph``.
    """
    graph_cls = nx.DiGraph if directed else nx.Graph
    graph = graph_cls()
    graph.add_nodes_from(nodes)

    for src, dst, count in edges(df, nodes, directed):
        if not graph.has_edge(src, dst):
            graph.add_edge(src, dst, weight=count)
            continue
        graph[src][dst]['weight'] += count

    return graph

def visualize(df, nodes, directed=True, **kwargs):
    """Build the interaction graph and hand it to netwulf's ``visualize``.

    ``df``, ``nodes`` and ``directed`` are passed to ``network``; any extra
    keyword arguments are forwarded unchanged to netwulf. Returns whatever
    netwulf's ``visualize`` returns.
    """
    return _visualize(network(df, nodes, directed), **kwargs)


In [35]:
# Gather every name that appears as a speaker ('from') or an addressee ('to')
# in the six movie scripts. Duplicates are kept so the list can be counted later.
characters = []
for movie_num in range(1, 7):
    movie = pd.read_csv(f'{movie_data_path}star_wars_{movie_num}_data.csv', sep=';')

    # The literal string 'nan' in either column is treated as a missing value.
    movie = movie.replace({'from': {'nan': pd.NA}, 'to': {'nan': pd.NA}})

    for column in ('from', 'to'):
        characters += movie[column].dropna().to_list()

unique_characters = list(set(characters))


In [None]:
# One row per name occurrence, then the number of occurrences per character,
# most frequent first.
df_characters = pd.DataFrame({'character': characters})

df_counts = (
    df_characters
    .groupby('character')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

TypeError: '>' not supported between instances of 'method' and 'int'

In [None]:
# pd.DataFrame({'movie_name': unique_characters}).to_csv('../data/movies/movie_characters_unedited.csv', sep=';')

In [None]:
# Compile the six per-movie dialogue tables into one script table and save it.
#
# The previous version seeded the result with an empty frame whose column list
# named 'where' twice and omitted 'to', then grew it with one concat per movie.
# Here the per-movie frames are collected in a list and concatenated once, so
# the column set is exactly `script_columns` + 'movie_num'.
script_columns = ['from', 'to', 'text', 'where', 'time']

movie_frames = []
for movie_num in range(1, 7):
    current_movie_path = f'star_wars_{movie_num}_data.csv'
    current_movie = pd.read_csv(movie_data_path + current_movie_path, sep=';')

    current_movie = (
        current_movie
        # The literal string 'nan' in 'from'/'to' is treated as a missing value.
        .replace({'from': {'nan': pd.NA}, 'to': {'nan': pd.NA}})
        .loc[:, script_columns]
        # assign() returns a new frame, so no column is set on a slice.
        .assign(movie_num=movie_num)
    )
    movie_frames.append(current_movie)

df_script = pd.concat(movie_frames, ignore_index=True)

# Normalise names to lower case so they match the lower-cased node list.
df_script['from'] = df_script['from'].str.lower()
df_script['to'] = df_script['to'].str.lower()
# Lower-cased dialogue with everything except a-z and whitespace removed.
df_script['text_processed'] = df_script['text'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)

df_script.to_csv(f'{movie_data_path}moviescripts_compiled.csv', index=False)

In [None]:
# Display the compiled script table (one row per line of dialogue).
df_script

Unnamed: 0,from,to,text,where,time,movie_num,text_processed
0,qui-gon,captain,Captain.,INT. REPUBLIC CRUISER - COCKPIT,9,1,captain
1,captain,qui-gon,"Yes, sir?",INT. REPUBLIC CRUISER - COCKPIT,18,1,yes sir
2,qui-gon,captain,Tell them we wish to board at once.,INT. REPUBLIC CRUISER - COCKPIT,72,1,tell them we wish to board at once
3,captain,qui-gon,"Yes, sir.",INT. REPUBLIC CRUISER - COCKPIT,18,1,yes sir
4,captain,nute,With all due respect the ambassadors for the s...,INT. REPUBLIC CRUISER - COCKPIT,126,1,with all due respect the ambassadors for the s...
...,...,...,...,...,...,...,...
5508,leia,,He wasn't. I can feel it.,136 EXT ENDOR FOREST,,6,he wasnt i can feel it
5509,han,,"You love him, don't you?",136 EXT ENDOR FOREST,,6,you love him dont you
5510,leia,,Yes.,136 EXT ENDOR FOREST,,6,yes
5511,han,,All right. I understand. Fine. When he comes b...,136 EXT ENDOR FOREST,,6,all right i understand fine when he comes back...


In [None]:
# Save the character names (ordered by how often they occur) as a one-column CSV.
# index=False is the documented way to omit the index; the former index=None
# only worked because None is falsy.
df_counts['character'].to_csv('character_list.csv', index=False)

In [None]:
# Reload the saved character list and lower-case it so the names match the
# lower-cased 'from'/'to' columns of df_script.
character_list = pd.read_csv('character_list.csv')
nodes = character_list['character'].str.lower().to_list()
nodes

['anakin',
 'obi-wan',
 'luke',
 'padme',
 'qui-gon',
 'threepio',
 'han',
 'leia',
 'yoda',
 'palpatine',
 'han solo',
 'darth vader',
 'jar jar',
 'amidala',
 'lando',
 'mace windu',
 'nute',
 'artoo',
 'shmi',
 'watto',
 'capt. panaka',
 'c-3po',
 'bail organa',
 'emperor',
 'darth sidious',
 'red leader',
 'count dooku',
 'biggs',
 'public',
 'boss nass',
 'wedge',
 'owen',
 'piett',
 'jabba',
 'tarkin',
 'general grievous',
 'rune',
 'ric olie',
 'speaker b',
 'lama su',
 'trooper',
 'bibble',
 'jango fett',
 'speaker a',
 'jocasta nu',
 'x_x',
 'gold leader',
 'ackbar',
 'darth sldious',
 'rieekan',
 'officer',
 'taun we',
 'dexter jettster',
 'pilot',
 'sebulba',
 'mas amedda',
 'boba fett',
 'captain',
 'valorum',
 'clone commander cody',
 'commander',
 'queen jamillia',
 'oom-9',
 'captain typho',
 'kitster',
 'mon mothma',
 'controller',
 'tey how',
 'ki-adi',
 'intercom voice',
 'senat',
 'red ten',
 'guard',
 'jobal',
 'nute gunray',
 'gold five',
 'ruwee',
 'sola',
 'deck 

In [None]:
# Undirected graph over all characters; edge weight = number of script rows
# between the two characters, regardless of who is speaking.
G = network(df=df_script, nodes=nodes, directed=False)

# Open the graph in netwulf.
_visualize(G)

The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


(None, None)