# HW3 - Social Network Analysis

## Import modules

In [1]:
from collections import defaultdict
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import pandas as pd
from tqdm import tqdm

## Load the data

In [2]:
# Load the semicolon-separated casts table. Malformed rows are skipped with a
# warning: `on_bad_lines='warn'` is the replacement for the removed
# `error_bad_lines=False` keyword (whose companion `warn_bad_lines` defaulted
# to True), so the skip-and-warn behaviour is preserved on current pandas.
casts = pd.read_csv('./../data/casts.csv', on_bad_lines='warn', sep=';')

## Convert the `casts` data to a graph

In [22]:
def extractEntities(ne_chunked):
    """Collect the named entities contained in a chunked sequence.

    Only the items of ``ne_chunked`` that are ``nltk.tree.Tree`` instances are
    considered; every other item is ignored.

    Args:
        ne_chunked: Iterable whose items are either ``nltk.tree.Tree`` subtrees
            or arbitrary other objects. Each subtree's ``leaves()`` must yield
            two-element pairs, of which the first element is used as the word.

    Returns:
        dict mapping the entity text (the subtree's words joined by single
        spaces) to the subtree's label. When the same text occurs more than
        once, the label of the last occurrence is kept.
    """
    subtrees = (item for item in ne_chunked if isinstance(item, nltk.tree.Tree))
    return {
        " ".join(token for token, _ in subtree.leaves()): subtree.label()
        for subtree in subtrees
    }

G = nx.Graph()

# Print casts CSV column headers
print(casts.columns.tolist())

# Group movies by actors. The row filter is a plain lexicographic string
# comparison on the 'actor_type' column.
# NOTE(review): presumably the 'AA15' bound only restricts the graph to a small
# sample of the data - confirm the intent.
movies_by_actor = casts[casts['actor_type'] < 'AA15'].groupby('actor')['movie']
movies_by_actor_dict = movies_by_actor.apply(list).to_dict()

# Every actor becomes a node, including actors who share no movie with anyone.
G.add_nodes_from(movies_by_actor_dict)

# Invert the mapping to movie -> actors. Connecting the members of each cast
# yields the same edges as testing every pair of actors for a shared movie,
# without the quadratic all-pairs scan over actors.
actors_by_movie = defaultdict(list)
for actor, movies in movies_by_actor_dict.items():
    # dict.fromkeys drops repeated movies (an actor listed several times in
    # the same movie) while keeping a deterministic order.
    for movie in dict.fromkeys(movies):
        actors_by_movie[movie].append(actor)

# Two actors are linked when they appear together in at least one movie.
for cast in tqdm(actors_by_movie.values(), total = len(actors_by_movie)):
    G.add_edges_from(combinations(cast, 2))

nx.write_gexf(G, './../results/actors_casts.gexf')

100%|██████████| 18/18 [00:00<00:00, 6610.41it/s]

['actor_type', 'movie', 'actor', 'role_type', 'role']





## Dataset general statistics

In [23]:
# Basic size statistics of the actor graph.
n = G.number_of_nodes()
e = G.number_of_edges()

print('Number of nodes: ', n)
print('Number of edges: ', e)
# nx.density computes e / (n*(n-1)/2) for an undirected graph but returns 0
# instead of raising ZeroDivisionError when the graph has fewer than two nodes.
print('Density: ', nx.density(G))

# Each component is a set of actors connected through shared movies.
components = list(nx.connected_components(G))

print('Components cnt: ', len(components))

Number of nodes:  18
Number of edges:  42
Density:  0.27450980392156865
Components cnt:  7


## Visualisation

In [None]:
# Filter the data

# Number of rows in `casts` recorded for each movie.
movies_casts_cnt = casts['movie'].value_counts().to_dict()
print(len(movies_casts_cnt))

# Keep only the movies that have fewer than 6 rows in `casts`.
movies_filtered = [
    title
    for title, cast_size in movies_casts_cnt.items()
    if cast_size < 6
]
print(len(movies_filtered))