---

_You are currently looking at **version 1.1** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-social-network-analysis/resources/yPcBs) course resource._

---

# Assignment 1 - Creating and Manipulating Graphs

Eight employees at a small company were asked to choose 3 movies that they would most enjoy watching for the upcoming company movie night. These choices are stored in the file `Employee_Movie_Choices.txt`.

A second file, `Employee_Relationships.txt`, has data on the relationships between different coworkers. 

The relationship score ranges from `-100` (Enemies) to `+100` (Best Friends). A value of zero means the two employees haven't interacted or are indifferent.

Both files are tab delimited.

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from networkx.algorithms import bipartite


# Employee names (the node set used when projecting the bipartite graph)
employees = {
    'Pablo',
    'Lee',
    'Georgia',
    'Vincent',
    'Andy',
    'Frida',
    'Joan',
    'Claude',
}

# Movie titles
movies = {
    'The Shawshank Redemption',
    'Forrest Gump',
    'The Matrix',
    'Anaconda',
    'The Social Network',
    'The Godfather',
    'Monty Python and the Holy Grail',
    'Snakes on a Plane',
    'Kung Fu Panda',
    'The Dark Knight',
    'Mean Girls',
}


# you can use the following function to plot graphs
# make sure to comment it out before submitting to the autograder
def plot_graph(G, weight_name=None):
    '''
    Draw a graph with a spring layout in a new matplotlib figure.

    G: a networkx graph
    weight_name: name of the edge attribute holding the edge weights (if G is
        weighted). When given, every edge is labelled with that attribute and
        drawn with a line width equal to its integer value.
    '''
    # In a notebook, run the `%matplotlib notebook` magic in a cell beforehand
    # if interactive figures are wanted.
    # matplotlib is imported here rather than at the top of the notebook so
    # that it is only needed when a graph is actually plotted.
    import matplotlib.pyplot as plt

    plt.figure()
    pos = nx.spring_layout(G)
    # Freeze the edge order once so that `weights` lines up with `edgelist`
    edges = list(G.edges())

    if weight_name:
        weights = [int(G[u][v][weight_name]) for u, v in edges]
        labels = nx.get_edge_attributes(G, weight_name)
        nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)
        # `edgelist` is the keyword networkx documents for choosing the edges to draw
        nx.draw_networkx(G, pos, edgelist=edges, width=weights)
    else:
        nx.draw_networkx(G, pos, edgelist=edges)

### Question 1

Using NetworkX, load in the bipartite graph from `Employee_Movie_Choices.txt` and return that graph.

*This function should return a networkx graph with 19 nodes and 24 edges*

In [2]:
import networkx as nx
from networkx.algorithms import bipartite


# Pull the whole tab-delimited choices file into memory
with open('Employee_Movie_Choices.txt', 'r') as f:
    text = f.read()

# One entry per line of the file (the header line is still included here)
text = text.splitlines()

# Break each line into its tab-separated fields -> list of [employee, movie] lists
text_splited = [line.split('\t') for line in text]

# Tabulate the fields and drop the first row, which holds the headers
G_df = pd.DataFrame(text_splited, columns=['Employee', 'Movie']).iloc[1:, :]

# Pair each employee with the movie on the same row; every pair is one edge
G_df['employee_movie_tuple'] = list(zip(G_df['Employee'], G_df['Movie']))

# Plain list of (employee, movie) tuples handed to networkx below
edges = list(G_df['employee_movie_tuple'])

# networkx has no dedicated bipartite class: an ordinary Graph is used and
# the side each node belongs to is stored in its `bipartite` attribute
B = nx.Graph()

# Side 0: the employees
B.add_nodes_from(set(G_df['Employee']), bipartite=0)

# Side 1: the movies
B.add_nodes_from(set(G_df['Movie']), bipartite=1)

# Connect every employee to the movies they picked
B.add_edges_from(edges)

# Optional sanity check (expected to be True):
#bipartite.is_bipartite(B)

In [3]:
import networkx as nx
from networkx.algorithms import bipartite


# The graph `B` is already built by the previous cell, so the loading code is
# not repeated here; this cell only inspects the result.
# Displays: (is B bipartite?, number of nodes, number of edges)
bipartite.is_bipartite(B), B.number_of_nodes(), B.number_of_edges()

In [4]:
def answer_one():
    """Build the bipartite employee/movie graph from `Employee_Movie_Choices.txt`.

    The file is read as tab-delimited text. Its first line is treated as the
    header and skipped; every other non-blank line must contain exactly two
    fields (employee, movie) and contributes one edge.

    Returns:
        networkx.Graph: employees carry the node attribute `bipartite=0`,
        movies carry `bipartite=1`, and each (employee, movie) choice is an edge.

    Raises:
        ValueError: if a data line does not split into exactly two fields.
    """
    with open('Employee_Movie_Choices.txt', 'r') as f:
        lines = f.read().splitlines()

    edges = []
    for line in lines[1:]:  # lines[0] is the header row
        if not line.strip():
            # A blank line (e.g. at the end of the file) describes no choice
            continue
        employee, movie = line.split('\t')
        edges.append((employee, movie))

    # There is no separate class for bipartite graphs; the side of each node
    # is recorded in its `bipartite` attribute instead.
    B = nx.Graph()

    # Re-adding a node that already exists is harmless, so the repeated names
    # in the edge list need no de-duplication.
    B.add_nodes_from((employee for employee, _ in edges), bipartite=0)
    B.add_nodes_from((movie for _, movie in edges), bipartite=1)
    B.add_edges_from(edges)

    return B


#plot_graph(answer_one())

### Question 2

Using the graph from the previous question, add a node attribute named `'type'`, where movies have the value `'movie'` and employees have the value `'employee'`, and return that graph.

*This function should return a networkx graph with node attributes `{'type': 'movie'}` or `{'type': 'employee'}`*

In [5]:
def answer_two():
    """Return the graph from `answer_one` with a `'type'` attribute on every node.

    Nodes that `answer_one` placed on side 0 (`bipartite=0`) get
    `type='employee'`; nodes on side 1 (`bipartite=1`) get `type='movie'`.
    The existing `bipartite` attribute and all edges are kept.

    Returns:
        networkx.Graph: the employee/movie graph with `{'type': 'employee'}`
        or `{'type': 'movie'}` added to each node's attributes.
    """
    B = answer_one()

    # Mapping node -> side (0 or 1) as recorded by answer_one
    side_of = nx.get_node_attributes(B, 'bipartite')

    # add_nodes_from on nodes that already exist only updates their attributes
    B.add_nodes_from((node for node, side in side_of.items() if side == 0),
                     type='employee')
    B.add_nodes_from((node for node, side in side_of.items() if side == 1),
                     type='movie')

    return B


#plot_graph(answer_two())

### Question 3

Find a weighted projection of the graph from `answer_two` which tells us how many movies different pairs of employees have in common.

*This function should return a weighted projected graph.*

In [6]:
def answer_three():
    """Project the graph from `answer_two` onto the employee nodes.

    Returns:
        networkx.Graph: the result of `bipartite.weighted_projected_graph`
        over the module-level `employees` set; per the assignment, each edge
        weight is the number of movies the two employees have in common.
    """
    employee_movie_graph = answer_two()
    return bipartite.weighted_projected_graph(employee_movie_graph, employees)


#plot_graph(answer_three())

### Question 4

Suppose you'd like to find out if people that have a high relationship score also like the same types of movies.

Find the Pearson correlation (using `DataFrame.corr()`) between employee relationship scores and the number of movies they have in common. If two employees have no movies in common it should be treated as a 0, not a missing value, and should be included in the correlation calculation.

*This function should return a float.*

In [7]:
# Load Employee_Relationships.txt as a graph whose edges carry `relationship_score`
rel = nx.read_edgelist('Employee_Relationships.txt',
                       data=[('relationship_score', int)])

# One row per employee pair, with the score pulled out of the edge-attribute dict
employee_relation_df = pd.DataFrame(
    [(u, v, attrs['relationship_score']) for u, v, attrs in rel.edges(data=True)],
    columns=['employee_0', 'employee_1', 'relationship_score'])

# One row per edge of the Answer 3 projection, with its `weight` as the
# number of movies the pair has in common
employee_movie_df = pd.DataFrame(
    [(u, v, attrs['weight']) for u, v, attrs in answer_three().edges(data=True)],
    columns=['employee_0', 'employee_1', 'pair_movies_common'])

# The graphs are undirected, so a pair may be listed as (a, b) in one table
# and (b, a) in the other. Build the swapped copy so both orders are present.
reverse_employees_names_df = employee_movie_df.rename(
    columns={'employee_0': 'employee_1', 'employee_1': 'employee_0'}
)[['employee_0', 'employee_1', 'pair_movies_common']]

# Stack original and swapped rows (pd.concat replaces DataFrame.append, which
# no longer exists in pandas 2.x)
employee_movie_df = pd.concat([employee_movie_df, reverse_employees_names_df],
                              ignore_index=True)

# Keep every relationship row (right join); pairs missing from the projection
# share no movie, so their NaN count becomes 0
merged_df = pd.merge(left=employee_movie_df, right=employee_relation_df,
                     on=['employee_0', 'employee_1'], how='right').fillna(0)

# Pearson correlation between movies in common and relationship score
correlation = merged_df['pair_movies_common'].corr(merged_df['relationship_score'])
correlation

In [8]:
def answer_four():
    """Correlate relationship scores with the number of movies in common.

    Every employee pair listed in `Employee_Relationships.txt` contributes one
    observation: its relationship score and the weight of the matching edge in
    the `answer_three` projection. Pairs with no edge in the projection have
    no movie in common and are counted as 0 rather than dropped.

    Returns:
        float: Pearson correlation between `pair_movies_common` and
        `relationship_score`.
    """
    relationships = nx.read_edgelist('Employee_Relationships.txt',
                                     data=[('relationship_score', int)])
    shared_movies = answer_three()

    records = []
    for first, second, attrs in relationships.edges(data=True):
        # Both graphs are undirected, so has_edge matches either name order
        if shared_movies.has_edge(first, second):
            in_common = shared_movies[first][second]['weight']
        else:
            in_common = 0
        records.append((first, second, in_common, attrs['relationship_score']))

    scores_df = pd.DataFrame(records,
                             columns=['employee_0', 'employee_1',
                                      'pair_movies_common', 'relationship_score'])

    # Series.corr computes the Pearson correlation by default
    correlation = scores_df['pair_movies_common'].corr(scores_df['relationship_score'])

    return correlation