## EE232E Project 2
### Problem 1

In [23]:
from igraph import *
from collections import defaultdict
import string
import pickle
from tqdm import tqdm_notebook as timer

In [25]:
# Merge the actor and actress lists into a single ASCII-only file.
infiles = ["./project_2_data/actor_movies.txt", "./project_2_data/actress_movies.txt"]
printable = set(string.printable)
# Hard-coded line counts of the two inputs; only used as progress-bar totals.
fl = {"./project_2_data/actor_movies.txt": 2167653, "./project_2_data/actress_movies.txt": 1182813}
# `with` guarantees the output file is flushed and closed even if decoding
# or writing fails part-way through.
with open("./project_2_data/" + "combined.txt", "w") as outfile:
    for f in infiles:
        with open(f, "rb") as infile:
            for line in timer(infile, total=fl[f], desc="actor list"):
                # Decode the raw bytes as latin1, then drop every character
                # that is not in string.printable before encoding to ASCII.
                line = u"".join(
                    ch for ch in line.decode('latin1') if ch in printable
                ).encode('ascii')
                outfile.write(line)







### Problem 2
Construct a weighted directed graph $G(V, E)$ from the list, where:

$V = \text{all actors/actresses in the list.} $

$S_i = \{m \mid i \in V,\ m \text{ is a movie in which } i \text{ has acted}\}$

$E = \{(i,j) \mid i,j \in V,\ S_i \cap S_j \neq \emptyset\}$, and each directed edge $i \to j$ is assigned the weight $\frac{|S_i \cap S_j|}{|S_i|}$.

In [None]:
# combine actor and actress into one file
# NOTE(review): `combine` is not defined in the visible part of this notebook.
# Presumably it writes ./project_2_data/combined_10.txt (the file read when the
# actor dictionaries are built) and the second argument is a minimum movie
# count per actor -- confirm against its definition.
combine("combined_10.txt", 10)

In [None]:
def clean_string(s):
    """Strip bracketed annotations and quote characters from a string.

    Every ``(...)`` and ``{...}`` span and every single or double quote is
    removed, then leading and trailing whitespace is trimmed.

    The bracket patterns are greedy: when a string contains several
    parenthesised (or braced) groups, everything from the first opening
    bracket to the last closing bracket is removed as one span.

    Arguments:
      s(str): raw text
    Returns:
      cleaned(str): the text without annotations, quotes or outer whitespace
    """
    # `re` is not imported in the notebook's visible import cell, so import it
    # locally to keep this helper usable on its own.
    import re
    return re.sub(r'\(.*\)|\{.*\}|\'|\"', "", s).strip()

In [None]:
# Build the two lookup tables for the actor network:
#   actor2movie: actor name -> list of movies on that actor's line
#   movie2actor: movie name -> list of actors whose line mentions it
actor2movie = defaultdict(list)
movie2actor = defaultdict(list)

with open("./project_2_data/combined_10.txt", "r") as infile:
    for line in infile:
        # Drop '&', '$' and spaces, then split on the double-tab separator:
        # the first field is the actor, the remaining fields are movies.
        fields = line.strip().translate(None, "&$ ").split("\t\t")
        actor, movies = fields[0], fields[1:]
        for movie in movies:
            actor2movie[actor].append(movie)
            movie2actor[movie].append(actor)


In [None]:
# use pickle to save the actor2movie dict and movie2actor dict
# (each file is opened in a `with` block so it is flushed and closed
# as soon as the dump finishes)
with open("actor2movie.pkl", "wb") as fh:
    pickle.dump(actor2movie, fh)
with open("movie2actor.pkl", "wb") as fh:
    pickle.dump(movie2actor, fh)

# uncomment this to load pickle file
# actor2movie = pickle.load(open("actor2movie.pkl", "rb" ))
# movie2actor = pickle.load(open("movie2actor.pkl", "rb" ))



In [None]:
# create edge list
# One line per directed edge i -> k, written as "i<TAB>k<TAB>weight", where
# weight = (number of movies of i in which k also appears) / (number of movies of i).
count = 0

# `with` closes the edge list even if the loop fails part-way through.
with open("./edgelist.txt", "w") as outfile:
    # The progress-bar total must be passed by keyword: the original passed an
    # undefined name `total` positionally.
    for i, movies in timer(actor2movie.iteritems(), total=len(actor2movie)):
        # for an actor i, count how many of i's movies each actor k appears in
        # (i is in the cast of its own movies, so the edge i -> i is written too)
        i2k = defaultdict(int)
        for j in movies:
            for k in movie2actor[j]:
                i2k[k] += 1
        for k, w in i2k.iteritems():
            weight = float(w) / len(movies)
            outfile.write(i + '\t' + k + '\t' + str(weight) + '\n')
            count = count + 1
# total number of edges written
print(count)

In [None]:
# Load the directed actor graph from the edge list, then print the vertex
# count and the edge count on separate lines.
g = Graph.Read_Ncol('edgelist.txt', directed=True)
print("%d\n%d" % (g.vcount(), g.ecount()))

### Problem 3
Run the PageRank algorithm on the network and list the top 10 actors according to PageRank.
We also list the top 10 actors ranked by the number of movies he/she has acted in.

In [None]:
# PageRank score of every vertex of g, following edge directions.
# NOTE(review): no `weights=` argument is passed, so the edge weights stored
# in edgelist.txt are presumably not used here -- confirm whether weighted
# PageRank was intended.
page_rank = g.pagerank(directed=True)

In [None]:
# Vertex indices ordered from highest to lowest PageRank score.
sorted_pr = sorted(range(len(page_rank)), key=page_rank.__getitem__, reverse=True)

In [None]:
# Top 10 vertices by PageRank: actor name, number of movies, PageRank score.
# sorted_pr already holds vertex indices, so the score is read by index
# instead of looking the vertex up again by name.
for idx in sorted_pr[:10]:
    actor = g.vs[idx]["name"]
    print("%s %s %s" % (actor, len(actor2movie[actor]), page_rank[idx]))

In [None]:
# Rank actors by how many movies they appear in (most first), then print
# name, movie count and PageRank score for the first 10.
sorted_actor2movie = sorted(
    actor2movie,
    key=lambda name: len(actor2movie[name]),
    reverse=True,
)
for actor in sorted_actor2movie[:10]:
    score = page_rank[g.vs.find(actor).index]
    print("%s %s %s" % (actor, len(actor2movie[actor]), score))

### Problem 4
Construct a movie network from the sets of actors/actresses, with each edge weighted by the Jaccard index of the two movies' actor sets. Unlike the actor network, this network is undirected.

In [None]:
# helper: Jaccard index |A & B| / |A | B| of two sets
def jaccard_index(first_set, second_set):
    """ Return the Jaccard index of two sets
        Arguments:
          first_set(set):
          second_set(set):
        Returns:
          index(float): size of the intersection divided by size of the
            union, a value between 0.0 and 1.0; 1.0 when both sets
            are empty
    """
    # Two empty sets are defined to have index 1; this also avoids
    # dividing by an empty union.
    if not first_set and not second_set:
        return 1.0

    shared = first_set.intersection(second_set)
    combined = first_set.union(second_set)
    return float(len(shared)) / len(combined)


In [None]:
# NOTE(review): `combine` is not defined in the visible part of this notebook.
# Presumably it writes ./project_2_data/combined_0.txt (the file read when the
# movie-network dictionaries are built) with no minimum movie count per actor
# -- confirm against its definition.
combine("combined_0.txt", 0)

In [None]:
# Build the lookup tables for the movie network:
#   actor2movie_m: actor name -> list of movies on that actor's line
#   movie2actor_m: movie name -> set of actors whose line mentions it
actor2movie_m = defaultdict(list)
movie2actor_m = defaultdict(set)

with open("./project_2_data/combined_0.txt", "r") as infile:
    for line in infile:
        # Drop '&', '$' and spaces, then split on the double-tab separator:
        # the first field is the actor, the remaining fields are movies.
        fields = line.strip().translate(None, "&$ ").split("\t\t")
        actor, movies = fields[0], fields[1:]
        for movie in movies:
            actor2movie_m[actor].append(movie)
            movie2actor_m[movie].add(actor)

In [None]:
# Keep only the movies whose actor set has at least `threshold` members,
# then print how many movies remain.
threshold = 10
movie2actor_m_less = {
    title: cast
    for title, cast in movie2actor_m.iteritems()
    if len(cast) >= threshold
}

print(len(movie2actor_m_less))

In [None]:
# Save the filtered movie -> actors dict and the actor -> movies dict.
# Each file is opened in a `with` block so it is flushed and closed as soon
# as the dump finishes.
with open("movie2actor_m_less.pkl", "wb") as fh:
    pickle.dump(movie2actor_m_less, fh)
with open("actor2movie_m.pkl", "wb") as fh:
    pickle.dump(actor2movie_m, fh)


In [None]:
# Unfinished scaffold for the movie edge list: it only counts candidate edges
# between the movies kept in movie2actor_m_less and writes nothing to disk.
# (The original comment said "at least 5 actors/actresses", but that dict was
# filtered with threshold = 10.)
# As written, every ordered pair (i, j) is counted -- including i == j and both
# orientations -- so e_count ends up as len(movie2actor_m_less) ** 2.
e_count = 0
# v_count is initialised here but never updated in this cell
v_count = 0
# outfile = open("./movie_edgelist.txt", "w")

# only referenced by the commented-out filter below
processed_movies = set()
for i, i_actors in movie2actor_m_less.iteritems():
    for j, j_actors in movie2actor_m_less.iteritems():
        # TODO: check whether they have intersections
        # Intended filter (still disabled): skip pairs already handled,
        # self-pairs, and movies that share no actor.
        #if j in processed_movies or i == j or i_actors.isdisjoint(j_actors):
        #    continue
        # add edge
        e_count = e_count + 1
    #processed_movies.add(i)
# outfile.close()
print e_count

In [None]:
# Vertex counter from the edge-counting cell; that cell sets it to 0 and
# never increments it.
print(v_count)

In [None]:
# Number of movie pairs counted by the edge-counting cell.
print(e_count)

In [None]:
# Scratch check of set.isdisjoint: the two sets share the element 3, so the
# final expression evaluates to False.
s1 = {1, 2, 3}
print(s1)
s2 = {3, 4, 5}
print(s2)
s1.isdisjoint(s2)

In [None]:
# Number of movies that passed the actor-count threshold (shown as the cell output).
len(movie2actor_m_less)