# Network Analysis

## Grupo

- María Cruz 		2220279
- Pedro Guedes 	1812061
- Rodrigo Motta 	1811524
- Tatiana Reimer 	1720679

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations as comb
from networkx.algorithms import bipartite

In [127]:
name_basics = "name.basics.tsv"
nb_columns = ['nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession']

nb_df = pd.read_csv(name_basics,sep='\t',header=0,usecols=nb_columns)
nb_df = nb_df[~nb_df['birthYear'].str.contains("N", na=False)]
nb_df.dropna()
nb_df.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor"


In [4]:
title_akas = "title.akas.tsv"
ta_columns = ['titleId', 'ordering', 'title', 'region', 'language', 'types', 'attributes', 'isOriginalTitle']
convert_dict = {'titleId': str, 'ordering': int, 'title': str, 
                'region': str, 'language': str, 'types': str, 'attributes': str, 'isOriginalTitle': str}

ta_df = pd.read_csv(title_akas,sep='\t',header=0,usecols=ta_columns,dtype=convert_dict)
ta_df = ta_df[ta_df['isOriginalTitle'] == '1']
ta_df.rename(columns={'titleId':'tconst'}, inplace=True)
ta_df.dropna()
ta_df.head()

Unnamed: 0,tconst,ordering,title,region,language,types,attributes,isOriginalTitle
6,tt0000001,7,Carmencita,\N,\N,original,\N,1
8,tt0000002,1,Le clown et ses chiens,\N,\N,original,\N,1
21,tt0000003,6,Pauvre Pierrot,\N,\N,original,\N,1
25,tt0000004,1,Un bon bock,\N,\N,original,\N,1
34,tt0000005,11,Blacksmith Scene,\N,\N,original,\N,1


In [124]:
title_basics = "title.basics.tsv"
tb_columns = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'startYear', 'isAdult', 'endYear', 'runtimeMinutes', 'genres']
convert_dict = {'tconst': str, 'titleType': str, 'primaryTitle': str, 'originalTitle': str, 
                'startYear': str, 'isAdult': object,'endYear': str, 'runtimeMinutes': str, 'genres': object}

tb_df = pd.read_csv(title_basics,sep='\t',header=0,usecols=tb_columns,dtype=convert_dict)
tb_df.dropna()
tb_df = tb_df[~tb_df['genres'].str.contains('N', na=False)]
tb_df = tb_df[~tb_df['startYear'].str.contains("N", na=False)]
tb_df = tb_df[~tb_df['isAdult'].str.contains("N", na=False)]
tb_df = tb_df[tb_df['isAdult'].astype(int) == 0]
tb_df = tb_df[tb_df['titleType'] == "movie"]
tb_df = tb_df[tb_df['startYear'].astype(int) >= 2002]
tb_df = tb_df[tb_df['startYear'].astype(int) <= 2022]
tb_df = (tb_df.drop(columns='genres').join(tb_df['genres'].str.get_dummies(sep=',')))
tb_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,Action,Adult,...,Mystery,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,\N,\N,1,0,...,0,0,0,0,0,0,0,0,0,0
13079,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,\N,133,0,0,...,0,0,0,0,0,0,0,0,0,0
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,\N,70,0,0,...,0,0,0,0,0,0,0,0,0,0
66309,tt0067683,movie,Workers '71: Nothing About Us Without Us,Robotnicy 1971 - Nic o nas bez nas,0,2006,\N,47,0,0,...,0,0,0,0,0,0,0,0,0,0
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
title_principals = "title.principals.tsv"
tp_columns = ['tconst', 'ordering', 'nconst', 'category', 'job', 'characters']

tp_df = pd.read_csv(title_principals,sep='\t',header=0,usecols=tp_columns)
tp_df = tp_df[tp_df['category'] == "actor"]
tp_df.drop('ordering', axis=1, inplace=True) 
tp_df.drop('category', axis=1, inplace=True) 
tp_df.drop('job', axis=1, inplace=True) 
tp_df.drop('characters', axis=1, inplace=True) 
tp_df.dropna()
tp_df.head()

Unnamed: 0,tconst,nconst
11,tt0000005,nm0443482
12,tt0000005,nm0653042
16,tt0000007,nm0179163
17,tt0000007,nm0183947
21,tt0000008,nm0653028


In [6]:
title_ratings = "title.ratings.tsv"
tr_columns = ['tconst', 'averageRating', 'numVotes']

tr_df = pd.read_csv(title_ratings,sep='\t',header=0,usecols=tr_columns)
tr_df.dropna()
tr_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1914
1,tt0000002,5.8,259
2,tt0000003,6.5,1720
3,tt0000004,5.6,172
4,tt0000005,6.2,2537


### Merge

#### title.akas + title.basics + title.ratings

In [7]:
'''
ta_tb_df = pd.merge(ta_df, tb_df, how='inner',on='tconst')
movies = pd.merge(ta_tb_df, tr_df, how='inner',on='tconst')
movies.head()
'''

"\nta_tb_df = pd.merge(ta_df, tb_df, how='inner',on='tconst')\nmovies = pd.merge(ta_tb_df, tr_df, how='inner',on='tconst')\nmovies.head()\n"

In [8]:
'''
actors = nb_df
actors.head()
'''

'\nactors = nb_df\nactors.head()\n'

In [128]:
bipartite_df = pd.merge(tp_df, tb_df, how='inner',on='tconst')
bipartite_df = bipartite_df.drop(bipartite_df.columns[2:],axis = 1)
bipartite_df.head()

Unnamed: 0,tconst,nconst
0,tt0011801,nm0459029
1,tt0011801,nm0681726
2,tt0011801,nm0726256
3,tt0011801,nm0776458
4,tt0011801,nm0666006


In [129]:
v = bipartite_df["nconst"].value_counts()
v = v.to_frame().reset_index()
v.rename(columns={'nconst' :'value', 'index' :'nconst'}, inplace=True)
v.head()

Unnamed: 0,nconst,value
0,nm0103977,339
1,nm0000616,206
2,nm0695177,184
3,nm0043199,167
4,nm0019382,159


In [130]:
res = pd.merge(v, nb_df, how='inner',on='nconst')
res.head(50)

Unnamed: 0,nconst,value,primaryName,birthYear,deathYear,primaryProfession
0,nm0103977,339,Brahmanandam,1956,\N,"actor,music_department,soundtrack"
1,nm0000616,206,Eric Roberts,1956,\N,"actor,producer,soundtrack"
2,nm0695177,184,Prakash Raj,1965,\N,"actor,producer,director"
3,nm0019382,159,Mohammad Ali,1968,\N,actor
4,nm0621937,134,Nassar,1958,\N,"actor,writer,director"
5,nm0080238,128,Tanikella Bharani,1954,\N,"actor,writer,director"
6,nm0154164,123,Soumitra Chatterjee,1935,2020,"actor,miscellaneous"
7,nm0457410,120,Ravi Kishan,1971,\N,"actor,producer"
8,nm3372956,115,Mihir Das,1959,2022,actor
9,nm0430803,114,Mohan Joshi,1945,\N,actor


In [131]:
df3 = res[~(res['value'] <= 5)]  
df3.head()

Unnamed: 0,nconst,value,primaryName,birthYear,deathYear,primaryProfession
0,nm0103977,339,Brahmanandam,1956,\N,"actor,music_department,soundtrack"
1,nm0000616,206,Eric Roberts,1956,\N,"actor,producer,soundtrack"
2,nm0695177,184,Prakash Raj,1965,\N,"actor,producer,director"
3,nm0019382,159,Mohammad Ali,1968,\N,actor
4,nm0621937,134,Nassar,1958,\N,"actor,writer,director"


In [133]:
bipartite_df = pd.merge(bipartite_df, df3, how='inner',on='nconst')
bipartite_df = bipartite_df.drop(bipartite_df.columns[2:],axis = 1)
bipartite_df.head()

Unnamed: 0,tconst,nconst
0,tt0062336,nm0016013
1,tt0100275,nm0016013
2,tt0484037,nm0016013
3,tt1434618,nm0016013
4,tt2597342,nm0016013


In [134]:
G = nx.Graph()
G.add_nodes_from(bipartite_df['tconst'], bipartite=0)
G.add_nodes_from(bipartite_df['nconst'], bipartite=1)

In [135]:
G.add_edges_from(zip(bipartite_df['tconst'], bipartite_df['nconst']))

In [136]:
nx.is_bipartite(G)

True

In [137]:
G.number_of_nodes()

71697

In [138]:
nx.is_connected(G)

False

In [139]:
left_or_top = bipartite_df['tconst']
pos = nx.bipartite_layout(G, left_or_top)

In [54]:
'''
nx.draw(G, pos=pos, node_size=100, alpha=1, linewidths=10)
plt.show()
'''

'\nnx.draw(G, pos=pos, node_size=100, alpha=1, linewidths=10)\nplt.show()\n'

In [140]:
actors_nodes = set(n for n,d in G.nodes(data=True) if d['bipartite']==1)
movies_nodes = set(G) - actors_nodes

In [141]:
A = bipartite.projected_graph(G, actors_nodes)

In [142]:
A.number_of_nodes()

7400

In [143]:
nx.is_connected(A)

False

In [144]:
degree_centrality = nx.degree_centrality(A)
degree_centrality

{'nm0531195': 0.002162454385727801,
 'nm4626627': 0.0013515339910798758,
 'nm1346931': 0.0012163805919718882,
 'nm0000437': 0.007298283551831329,
 'nm0643805': 0.0012163805919718882,
 'nm0276998': 0.000946073793755913,
 'nm3136459': 0.000946073793755913,
 'nm1775614': 0.0014866873901878633,
 'nm0766233': 0.0067576699553993785,
 'nm1209657': 0.0033788349776996893,
 'nm0361717': 0.0012163805919718882,
 'nm0731575': 0.0004054601973239627,
 'nm2108643': 0.0025679145830517637,
 'nm0610832': 0.0013515339910798758,
 'nm1478079': 0.0005406135964319503,
 'nm0608012': 0.0008109203946479254,
 'nm0855039': 0.0020273009866198137,
 'nm1307343': 0.0006757669955399379,
 'nm0368745': 0.00013515339910798757,
 'nm3508701': 0.0,
 'nm0636562': 0.0008109203946479254,
 'nm1328859': 0.0012163805919718882,
 'nm0155913': 0.0005406135964319503,
 'nm4045010': 0.0006757669955399379,
 'nm1113662': 0.0010812271928639006,
 'nm0754396': 0.0012163805919718882,
 'nm0957193': 0.0014866873901878633,
 'nm0952498': 0.001892

In [145]:
most_influential = list()
for node in sorted(degree_centrality, key=degree_centrality.get, reverse=True):
    most_influential.append(node)
    print(node, degree_centrality[node])

nm0000616 0.01770509528314637
nm0695177 0.016488714691174484
nm0001744 0.014055953507230707
nm0149822 0.012974726314366806
nm0103977 0.012839572915258819
nm0000514 0.012704419516150832
nm0896573 0.01216380591971888
nm0621937 0.011893499121502907
nm0256628 0.011893499121502907
nm0001803 0.01108257872685498
nm0451600 0.010677118529531017
nm0000168 0.010677118529531017
nm0000367 0.010271658332207055
nm0004109 0.010271658332207055
nm0700875 0.010271658332207055
nm0474774 0.010136504933099068
nm1078422 0.010136504933099068
nm0457410 0.01000135153399108
nm0000353 0.01000135153399108
nm0410902 0.009731044735775105
nm0000246 0.009731044735775105
nm0000579 0.009731044735775105
nm0006763 0.009595891336667118
nm0000115 0.00946073793755913
nm0000821 0.00946073793755913
nm0007106 0.009325584538451143
nm0008346 0.009325584538451143
nm0000800 0.009190431139343154
nm0945189 0.009055277740235167
nm0712546 0.00892012434112718
nm0000151 0.008784970942019191
nm0430803 0.008784970942019191
nm0465503 0.0086

In [146]:
mi = most_influential[:20]
mi_df = pd.DataFrame (mi, columns = ['nconst'])
mi_df.head()

Unnamed: 0,nconst
0,nm0000616
1,nm0695177
2,nm0001744
3,nm0149822
4,nm0103977


In [147]:
res = pd.merge(mi_df, nb_df, how='inner',on='nconst')
res.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession
0,nm0000616,Eric Roberts,1956,\N,"actor,producer,soundtrack"
1,nm0695177,Prakash Raj,1965,\N,"actor,producer,director"
2,nm0001744,Tom Sizemore,1961,\N,"actor,producer,writer"
3,nm0149822,Mithun Chakraborty,1950,\N,"actor,music_department,writer"
4,nm0103977,Brahmanandam,1956,\N,"actor,music_department,soundtrack"


In [149]:
print(list(nx.connected_components(A)))

[{'nm0531195', 'nm4626627', 'nm1346931', 'nm0000437', 'nm0643805', 'nm0276998', 'nm3136459', 'nm1775614', 'nm0766233', 'nm1209657', 'nm0361717', 'nm0731575', 'nm2108643', 'nm0610832', 'nm1478079', 'nm0608012', 'nm0855039', 'nm1307343', 'nm0368745', 'nm0636562', 'nm1328859', 'nm0155913', 'nm4045010', 'nm1113662', 'nm0754396', 'nm0957193', 'nm0062359', 'nm0952498', 'nm1670601', 'nm0302304', 'nm0073162', 'nm0000195', 'nm3065061', 'nm0665555', 'nm0038673', 'nm0109277', 'nm3606001', 'nm0659241', 'nm0227317', 'nm0087237', 'nm1368482', 'nm0223464', 'nm0301178', 'nm0000664', 'nm1698608', 'nm0925220', 'nm0661845', 'nm0123309', 'nm1002641', 'nm1125989', 'nm0239498', 'nm0893449', 'nm0206359', 'nm0001166', 'nm0000191', 'nm2748640', 'nm0050437', 'nm0719637', 'nm0530564', 'nm0849916', 'nm5354037', 'nm0223506', 'nm3271373', 'nm0219939', 'nm0346962', 'nm0004286', 'nm0177933', 'nm0082972', 'nm0645760', 'nm1140345', 'nm0333702', 'nm0795169', 'nm0299078', 'nm3067749', 'nm0372117', 'nm0018745', 'nm0001487

In [150]:
print(nx.number_connected_components(A))

244


In [151]:
#A.remove_nodes_from(list(nx.isolates(A)))

In [156]:
closeness_centrality = nx.closeness_centrality(A)
closeness_centrality

{'nm0531195': 0.22098114741944733,
 'nm4626627': 0.18514120236009307,
 'nm1346931': 0.20472466835210698,
 'nm0000437': 0.26551135344855514,
 'nm0643805': 0.2084366303349446,
 'nm0276998': 0.158975369523875,
 'nm3136459': 0.1770464476017459,
 'nm1775614': 0.21985369258567466,
 'nm0766233': 0.22373042305838442,
 'nm1209657': 0.19319649435314584,
 'nm0361717': 0.19656645845019408,
 'nm0731575': 0.207157298051036,
 'nm2108643': 0.19925899304173228,
 'nm0610832': 0.20891329661361033,
 'nm1478079': 0.2068196231508079,
 'nm0608012': 0.2178682420421001,
 'nm0855039': 0.23375253235099328,
 'nm1307343': 0.1936935084878287,
 'nm0368745': 0.17782300741603851,
 'nm3508701': 0.0,
 'nm0636562': 0.21816654913994474,
 'nm1328859': 0.1884304742922352,
 'nm0155913': 0.19761531180929204,
 'nm4045010': 0.17954718227830097,
 'nm1113662': 0.1988426292405016,
 'nm0754396': 0.17364084534796403,
 'nm0957193': 0.21866787335205579,
 'nm0952498': 0.20393857827538886,
 'nm1670601': 0.22424299521306423,
 'nm0062359'

In [159]:
most_influential2 = list()
for node in sorted(closeness_centrality, key=closeness_centrality.get, reverse=True):
    most_influential2.append(node)
    print(node, closeness_centrality[node])

nm0000514 0.29245008006402445
nm0000353 0.28375385056489544
nm0000616 0.2825835385523371
nm0000800 0.28011122141209993
nm0001698 0.27851709250975465
nm0000104 0.2783021436995802
nm0001426 0.2779183253859556
nm0000579 0.2763266979528117
nm0000172 0.27607061309291503
nm0000553 0.27492959655471144
nm0001803 0.27454401649863613
nm0000115 0.27391838147123354
nm0001744 0.2736886025301576
nm0000246 0.2736011690655684
nm0001667 0.27337192178669606
nm0000418 0.27293632138549034
nm0124930 0.2726973323677598
nm0000518 0.2726322261644966
nm0005068 0.27192889648118607
nm0290556 0.2716270407445276
nm0396812 0.2709931762741749
nm0000151 0.27050102553357674
nm0000620 0.27042628969608323
nm0005351 0.26965996834618877
nm0002076 0.2696493555724335
nm0000323 0.2690775036617824
nm0001993 0.26864493709770876
nm0305558 0.26845546884021493
nm0654110 0.26818226380695026
nm0000168 0.26812978811646243
nm0000146 0.2681088035899027
nm0000532 0.2680668443890592
nm0000174 0.2678363033399775
nm0000134 0.2675957067544

In [160]:
mi2 = most_influential2[:20]
mi2_df = pd.DataFrame (mi2, columns = ['nconst'])
mi2_df.head()
res2 = pd.merge(mi2_df, nb_df, how='inner',on='nconst')
res2.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession
0,nm0000514,Michael Madsen,1957,\N,"actor,producer,writer"
1,nm0000353,Willem Dafoe,1955,\N,"actor,soundtrack,producer"
2,nm0000616,Eric Roberts,1956,\N,"actor,producer,soundtrack"
3,nm0000800,Armand Assante,1949,\N,"actor,producer,soundtrack"
4,nm0001698,John Savage,1949,\N,"actor,producer,soundtrack"


In [161]:
'''
import scipy.sparse
import scipy.sparse.csgraph

T = nx.adjacency_matrix(A).tolil()
D = scipy.sparse.csgraph.floyd_warshall( T, directed=False, unweighted=False)

n = D.shape[0]
closeness_centrality = {}
for r in range(0, n):
    
    cc = 0.0
    
    possible_paths = list(enumerate(D[r, :]))
    shortest_paths = dict(filter( \
        lambda x: not x[1] == np.inf, possible_paths))
    
    total = sum(shortest_paths.values())
    n_shortest_paths = len(shortest_paths) - 1.0
    if total > 0.0 and n > 1:
        s = n_shortest_paths / (n - 1)
        cc = (n_shortest_paths / total) * s
    closeness_centrality[r] = cc
'''

'\nimport scipy.sparse\nimport scipy.sparse.csgraph\n\nT = nx.adjacency_matrix(A).tolil()\nD = scipy.sparse.csgraph.floyd_warshall( T, directed=False, unweighted=False)\n\nn = D.shape[0]\ncloseness_centrality = {}\nfor r in range(0, n):\n    \n    cc = 0.0\n    \n    possible_paths = list(enumerate(D[r, :]))\n    shortest_paths = dict(filter(         lambda x: not x[1] == np.inf, possible_paths))\n    \n    total = sum(shortest_paths.values())\n    n_shortest_paths = len(shortest_paths) - 1.0\n    if total > 0.0 and n > 1:\n        s = n_shortest_paths / (n - 1)\n        cc = (n_shortest_paths / total) * s\n    closeness_centrality[r] = cc\n'