## EE232E Project 2
### Problem 1

In [1]:
from igraph import *
from collections import defaultdict

In [2]:
def combine(outfile, threshold):
    outfile = open("./project_2_data/" + outfile, "w")

    with open("./project_2_data/actor_movies.txt", "rb") as infile:
        for line in infile:
            arr = line.split("\t\t")
            if len(arr) > threshold:
                outfile.write(line)
    with open("./project_2_data/actress_movies.txt", "rb") as infile:
        for line in infile:
            arr = line.strip().split("\t\t")
            if len(arr) > threshold:
                outfile.write(line)
    outfile.close()

### Problem 2
Construct a weighted directed graph $G(V, E)$ from the list, while:

$V = \text{all actors/actresses in the list.} $

$S_i = \{m | i \in V, m \text{ is a movie in which } i \text{ has acted.}$

$ E = \{(i,j)|i,j ∈ V,S_i \cap S_j \neq ∅\} \text{ and for each directed Edge } i → j, \text{ a weight is assigned as } \frac{|S_i \cap S_j|}{|S_i|}.$

In [3]:
# combine actor and actress into one file
combine("combined_10.txt", 10)

In [5]:
# create dictionaries
actor2movie = defaultdict(list)
movie2actor = defaultdict(list)

with open("./project_2_data/combined_10.txt", "r") as infile:
    for line in infile:
        arr = line.strip().translate(None, "&$ ").split("\t\t")
        for i in range(1, len(arr)):
            actor2movie[arr[0]].append(arr[i])
            movie2actor[arr[i]].append(arr[0])


In [6]:
import pickle
# use pickle to save the actor2movie dict and movie2actor dict
pickle.dump(actor2movie, open("actor2movie.pkl", "wb" ))
pickle.dump(movie2actor, open("movie2actor.pkl", "wb" ))

# uncomment this to load pickle file
# actor2movie = pickle.load(open("actor2movie.pkl", "rb" ))
# movie2actor = pickle.load(open("movie2actor.pkl", "rb" ))



In [5]:
# create edge list
count = 0

outfile = open("./edgelist.txt", "w")

for i, movies in actor2movie.iteritems():
    # for an actor i, get the list of actors that appear in the same movie
    i2k = defaultdict(int)
    for j in movies:
        for k in movie2actor[j]:
            i2k[k]=i2k[k]+1
    for k, w in i2k.iteritems():
        weight = float(w) / len(movies)
        # g.add_edge(i, k, weight=weight)
        line = i + '\t' + k + '\t' + str(weight) + '\n'
        outfile.write(line)
        count = count + 1
print(count)
outfile.close()

25903265


In [6]:
g = Graph.Read_Ncol('edgelist.txt', directed=True)
print(g.vcount())
print(g.ecount())

113129
25903265


### Problem 3
Run the Page Rank algorithm on the network. And list the top 10 actors according to page rank.
We also listed the top 10 actors ranked by the number of movie he/she has acted in.

In [7]:
page_rank = g.pagerank(vertices=None, directed=True)

In [8]:
sorted_pr = sorted(range(len(page_rank)), key=lambda k: page_rank[k], reverse=True)

In [9]:
for actor in g.vs[sorted_pr[0:10]]["name"]:
    print actor, len(actor2movie[actor]), page_rank[g.vs.find(actor).index]

Roberts,Eric(I) 298 0.000126941657071
Jeremy,Ron 637 0.000106443168928
Trejo,Danny 241 0.000106343502328
Flowers,Bess 828 0.000100092467495
Riehle,Richard 197 9.70123730473e-05
David,Keith(I) 178 9.01901768746e-05
Harris,Sam(II) 600 8.93933290134e-05
Kaufman,Lloyd 301 8.7001456383e-05
Madsen,Michael(I) 218 8.6824507889e-05
Jackson,SamuelL. 159 8.47608644893e-05


In [10]:
# sort by number of movies appeared in
sorted_actor2movie = sorted(actor2movie, key=lambda k: len(actor2movie[k]), reverse=True)
for actor in sorted_actor2movie[:10]:
    print actor, len(actor2movie[actor]), page_rank[g.vs.find(actor).index]

Blanc,Mel 1065 1.06620164461e-05
Brahmanandam 973 3.60927886012e-05
Onoe,Matsunosuke 927 8.13233828772e-06
Flowers,Bess 828 0.000100092467495
Hack,Herman 668 3.77710838846e-05
Phelps,Lee(I) 647 7.08439653641e-05
Jeremy,Ron 637 0.000106443168928
Cobb,Edmund 633 5.63713647626e-05
O'Connor,Frank(I) 623 6.78001730158e-05
Kapoor,Shakti 618 4.86308269766e-05


### Problem 4
Construct a movie network according to the set of actors/actresses, with weight assigned as the jaccard index of the actor sets of 2 movies. Now we have an undirected network instead.

In [19]:
# helper function that computes jaccard index of two set
def jaccard_index(first_set, second_set):
    """ Computes jaccard index of two sets
        Arguments:
          first_set(set):
          second_set(set):
        Returns:
          index(float): Jaccard index between two sets; it is 
            between 0.0 and 1.0
    """
    # If both sets are empty, jaccard index is defined to be 1
    index = 1.0
    if first_set or second_set:
        index = (float(len(first_set.intersection(second_set))) / len(first_set.union(second_set)))

    return index


In [26]:
combine("combined_0.txt", 5)

In [27]:
# create dictionaries
actor2movie_m = defaultdict(list)
movie2actor_m = defaultdict(list)

with open("./project_2_data/combined_0.txt", "r") as infile:
    for line in infile:
        arr = line.strip().translate(None, "&$ ").split("\t\t")
        for i in range(1, len(arr)):
            actor2movie_m[arr[0]].append(arr[i])
            movie2actor_m[arr[i]].append(arr[0])

In [28]:
# remove movies that have less than 5 actors
threshold = 5
movie2actor_m_less = {}
for movie, actors in movie2actor_m.iteritems():
    if len(actors) >= threshold:
        movie2actor_m_less[movie] = actors


In [14]:
pickle.dump(movie2actor_m_less, open("movie2actor_m_less.pkl", "wb" ))
pickle.dump(actor2movie_m, open("actor2movie_m.pkl", "wb" ))


In [33]:
# create edge list for movie with at least 5 actors/actresses
e_count = 0
v_count = 0
outfile = open("./movie_edgelist.txt", "w")

processed_movies = set()
for i, i_actors in movie2actor_m_less.iteritems():
    for j, j_actors in movie2actor_m_less.iteritems():
        # TODO: check whether they have intersections
        if j in processed_movies or i == j:
            continue
        # add edge
        e_count = e_count + 1
    processed_movies.add(i)
        
#     # actors: set()
#     # i2k: set()   set of movie that share actors with i
#     i2k = set()
#     for j in actors:
#         for k in actor2movie_m[j]:
#             if k in movie2actor_m_less:
#                 i2k.add(k)
#     for k in i2k:
#         # weight = jaccard_index(movie2actor_m_less[i], movie2actor_m_less[k])
#         # line = i + '\t' + k + '\t' + str(weight) + '\n'
#         # outfile.write(line)
#         e_count = e_count + 1
#     v_count = v_count + 1
outfile.close()

KeyboardInterrupt: 

In [30]:
print v_count

253752


In [None]:
print e_count