In [None]:
import numpy as np
from nose.tools import assert_equal, assert_is_instance
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib as mpl
import warnings
import tweepy as tw
import os
import string
from itertools import combinations
from collections import Counter
import pandas as pd

from nose.tools import assert_true, assert_equal, assert_list_equal
warnings.filterwarnings('ignore')

# Problem 1: Create and Draw a graph

Make an undirected graph with a given integer number of nodes, numbered from zero, and with an edge connecting every even integer node to the next odd integer node (zero counts as an even number). Running the function should also plot the graph using matplotlib: pass the matplotlib.Axes instance to the ax parameter of the networkx drawing function, and set the $\texttt{with_labels}$ parameter of that function to $\texttt{True}$.

In [None]:
def undirected_graph(num_nodes):
    """
    Creates an undirected graph with num_nodes integer nodes, numbered from 0,
    and draws it with matplotlib.
    Every even integer node is connected to the next odd integer node.
    
    Parameters
    ----------
    num_nodes: the number of nodes to have in the graph
    
    Returns
    -------
    g: the networkx undirected graph
    ax: the matplotlib axes object that is used to plot the graph
    """
    
    # YOUR CODE HERE
    
    return g, ax

In [None]:
g,ax=undirected_graph(20)

In [None]:
# The function must hand back a plain (undirected) networkx graph ...
assert_is_instance(g, nx.Graph)

# ... whose twenty nodes are labelled 0 through 19 ...
nodes = np.array(g.nodes())
assert_equal(nodes.size, 20)
assert_equal((nodes.min(), nodes.max()), (0, 19))

# ... joined by ten two-endpoint edges: first endpoints span 0..18,
# second endpoints span 1..19.
edges = np.array(g.edges())
assert_equal(edges.shape[0], 10)
assert_equal(edges.shape[1], 2)
assert_equal((edges[:, 0].min(), edges[:, 0].max()), (0, 18))
assert_equal((edges[:, 1].min(), edges[:, 1].max()), (1, 19))

# The second return value must be the matplotlib Axes the graph was drawn on.
assert_is_instance(ax, mpl.axes.Axes)


# Problem 2: Get the Adjacency or Incidence Matrix

Write a function called $\texttt{get_matrix}$ that returns either the adjacency or the incidence matrix of a graph, in dense form. The function takes two inputs: $\texttt{matrix_type}$, which can take the value either $\texttt{"adjacency"}$ or $\texttt{"incidence"}$, and $\texttt{graph}$, the graph whose matrix is computed.

In [None]:
def get_matrix(matrix_type,graph):
    """
    Get the dense representation of the adjacency or incidence matrix of a graph.
    
    Parameters
    ----------
    matrix_type: a string, the type of matrix to calculate, either "adjacency" or "incidence"
    graph: the graph to calculate the adjacency or incidence matrix of
    
    Returns
    -------
    matrix: the dense representation of the matrix
            (the checks that follow expect a numpy matrix instance)
    """
    
    # YOUR CODE HERE
    
    return matrix

In [None]:
adjacency_matrix = get_matrix("adjacency",g)
incidence_matrix = get_matrix("incidence",g)

In [None]:
# Both results must be numpy matrix objects (np.matrix is the same class as
# np.matrixlib.defmatrix.matrix).
assert_is_instance(incidence_matrix, np.matrix)
assert_is_instance(adjacency_matrix, np.matrix)

# Incidence matrix: one row per node, one column per edge.
assert_equal(incidence_matrix.shape, (20, 10))
# Adjacency matrix: square, one row and one column per node.
assert_equal(adjacency_matrix.shape, (20, 20))

# Problem 3: barbell or lollipop graph 
Write a function called barbell_lollipop. This function takes 3 parameters: the type of graph and 2 integers (which represent the numbers of nodes). If the type of graph is the string 'barbell', create a [barbell graph](https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.generators.classic.barbell_graph.html?highlight=barbell%20graph#networkx.generators.classic.barbell_graph) and pass in the 2 integers as arguments in the same order as they were given to your function. If the type of graph is anything else, create a [lollipop graph](https://networkx.github.io/documentation/latest/reference/generated/networkx.generators.classic.lollipop_graph.html) and pass in the 2 integers as arguments in the same order as they were given to your function. The function should return the graph it creates.

In [None]:
# YOUR CODE HERE

In [None]:
b1 = barbell_lollipop('barbell', 10, 4)
l1 = barbell_lollipop('lollipop', 20, 4)

In [None]:
# Compare against generated edge lists rather than hundreds of hand-typed tuples.
# `list(...)` is required because networkx >= 2.0 returns an EdgeView from
# `edges()`, and a view never compares equal to a plain list; on networkx 1.x
# `edges()` is already a list, so the wrapper is harmless there.

# barbell_graph(10, 4): complete graph on nodes 0..9, the path 9-10-11-12-13-14,
# then a complete graph on nodes 14..23.
expected_barbell_edges = (
    list(combinations(range(10), 2))
    + [(node, node + 1) for node in range(9, 14)]
    + list(combinations(range(14, 24), 2))
)
assert_equal(list(b1.edges()), expected_barbell_edges)

# lollipop_graph(20, 4): complete graph on nodes 0..19 followed by the
# tail 19-20-21-22-23.
expected_lollipop_edges = (
    list(combinations(range(20), 2))
    + [(node, node + 1) for node in range(19, 23)]
)
assert_equal(list(l1.edges()), expected_lollipop_edges)


In [None]:
# Let's see what both graphs look like
nx.draw(b1)

In [None]:
nx.draw(l1)

# Problem 4: Finding members of the karate club
Each node in the graph below belongs to a particular club. In other words, each node has a club attribute in G.
![color_graph.png](color_graph.png)
This snippet of code shows you how to access the club attribute for nodes 5 and 18:
```python
print(G.node[5]['club'])  # prints out Mr.Hi
print(G.node[18]['club'])  # prints out Officer
```
Iterate through all of the nodes in G. If the node is a member of Mr.Hi, assign it the color red (use the string 'r' to denote this). If the node is a member of Officer, assign it the color blue (use the string 'b' to denote this). Store these colors, in the order the nodes are visited, in a list called colors.

In [None]:
G = nx.karate_club_graph()

In [None]:
# YOUR CODE HERE

In [None]:
# Let's see if our graph matches above.
nx.draw_circular(G, node_color=colors)

In [None]:
# Expected colour for each of the 34 entries, in iteration order:
# 'r' marks a Mr.Hi member, 'b' marks an Officer member.
expected_colors = [
    'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'b',  # entries 0-9
    'r', 'r', 'r', 'r', 'b', 'b', 'r', 'r', 'b', 'r',  # entries 10-19
    'b', 'r', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b',  # entries 20-29
    'b', 'b', 'b', 'b',                                # entries 30-33
]
assert_equal(colors, expected_colors)

# Problem 5: Finding the most common hash tags.
We have searched for *#data* and stored the results in a file called tweets.data. Each row in tweets.data contains 1 tweet. For this problem do the following:
Read in all of the tweets using whichever method you prefer (numpy, pandas, built-in Python functions, etc.). Next, go through the tweets and find all of the hashtags. Convert the hashtags to all lowercase letters and sort the list of tags. Create a set that contains only the unique tags. Now make combinations for the pairs of unique tags. Create a Counter variable to keep track of the combinations of tags commonly found together. Iterate through the combinations of tags: if a combination is not yet in the Counter, add it as a new key with a count of 1; if it already exists, increment its count by 1. Store the Counter variable as *cnt*.

In [None]:
# YOUR CODE HERE

In [None]:
# Expected 50 most common hashtag pairs, most frequent first. The top-10 and
# top-20 expectations are prefixes of this list, so it is written out once.
expected_top_50 = [
    (('data', 'marketing'), 73),
    (('analytics', 'data'), 68),
    (('bigdata', 'data'), 48),
    (('ai', 'data'), 44),
    (('data', 'tech'), 32),
    (('data', 'datascience'), 22),
    (('cloud', 'data'), 20),
    (('blockchain', 'data'), 20),
    (('data', 'netflix'), 19),
    (('data', 'phishing'), 19),
    (('netflix', 'phishing'), 19),
    (('analytics', 'bigdata'), 19),
    (('data', 'iot'), 18),
    (('data', 'security'), 18),
    (('data', 'ecommerce'), 18),
    (('data', 'gdpr'), 18),
    (('ecommerce', 'marketing'), 17),
    (('business', 'data'), 16),
    (('data', 'machinelearning'), 13),
    (('cybersecurity', 'data'), 13),
    (('data', 'infosec'), 13),
    (('data', 'science'), 13),
    (('data', 'ml'), 12),
    (('data', 'fintech'), 12),
    (('data', 'journalism…'), 11),
    (('data', 'visual'), 11),
    (('journalism…', 'visual'), 11),
    (('data', '…'), 11),
    (('ai', 'bigdata'), 10),
    (('bigdata', 'ml'), 10),
    (('data', 'startup'), 10),
    (('data', 'startups'), 10),
    (('data', 'digital'), 10),
    (('myths', 'narratives'), 10),
    (('data', 'martech'), 10),
    (('data', 'dataviz'), 10),
    (('data', 'python'), 10),
    (('analytics', 'marketing'), 10),
    (('ai', 'watsonce'), 10),
    (('data', 'watsonce'), 10),
    (('data', 'news'), 9),
    (('bigdata', 'business'), 9),
    (('data', 'myths'), 9),
    (('data', 'narratives'), 9),
    (('big', 'data'), 9),
    (('commvaultgo', 'data'), 9),
    (('breach', 'data'), 9),
    (('bigdata', 'marketing'), 9),
    (('ai', 'ml'), 8),
    (('bigdata', 'datascience'), 8),
]

top_10_counts = cnt.most_common(10)
top_20_counts = cnt.most_common(20)
top_50_counts = cnt.most_common(50)

assert_equal(top_10_counts, expected_top_50[:10])
assert_equal(top_20_counts, expected_top_50[:20])
assert_equal(top_50_counts, expected_top_50)

The code below takes the top 10 pairs and collects all of the unique hashtags that show up in them.

In [None]:
# Collect every hashtag that appears in a top-10 pair. Each entry of
# top_10_counts has the tag pair at index 0; the count at index 1 is not needed.
# The set removes duplicates, and sorting gives the list a reproducible order:
# iterating a bare set of strings can change order between kernel restarts
# because string hashing is randomised per process.
hashtags = sorted({tag for entry in top_10_counts for tag in entry[0]})
print(hashtags)

In [None]:
top_10_counts

# Problem 6: Create a graph from the Twitter hashtag pairs

Write a function called $\texttt{create_graph}$ which creates a graph using the Twitter hashtag pairs. The nodes should represent the Twitter hashtag names, and edges should be drawn between nodes which are paired together. For example, 'data' and 'marketing' should have an edge since they are paired together. The function should return the graph.

In [None]:
def create_graph(list_of_pairs,hashtags):
    '''
    Create a graph whose nodes are hashtags and whose edges join hashtags
    that appear together as a pair.
    
    Inputs
    --------
    
    list_of_pairs: a list of tuples where each tuple is of the form (('name1', 'name2'), #)
    hashtags: a list containing the unique hashtags that show up in list_of_pairs
    
    Returns
    -------
    G: a networkx graph
    '''
    
    # YOUR CODE HERE

In [None]:
g = create_graph(top_10_counts,hashtags)
assert_is_instance(g, nx.Graph)

# Eleven distinct hashtags appear in the top-10 pairs.
nodes = np.array(g.nodes())
assert_equal(nodes.size, 11)
nodes_list = ['analytics',
 'netflix',
 'blockchain',
 'ai',
 'data',
 'bigdata',
 'marketing',
 'datascience',
 'phishing',
 'cloud',
 'tech']

# The graph must contain exactly the expected hashtags (order does not matter).
assert_equal(sorted(g.nodes()), sorted(nodes_list))
assert_true('data' in nodes)

# Count the edges of the hashtag graph itself. The earlier version checked a
# leftover `edges` array computed from the Problem 1 graph, so it never looked
# at the graph built here.
assert_equal(g.number_of_edges(), 10)

In [None]:
# Let's see how the graph looks
nx.draw(g,with_labels=True)