In [1]:
from pprint import pprint

## Problem statement: Identifying key connectors among data scientists

In [2]:
# Data dump from the Data Sciencester network.
# One record per user; the integer "id" values are what the pairs and tuples
# below use to refer to a user.
users = [
    {"id": 0, "name": "Hero"},
    {"id": 1, "name": "Dunn"},
    {"id": 2, "name": "Sue"},
    {"id": 3, "name": "Chi"},
    {"id": 4, "name": "Thor"},
    {"id": 5, "name": "Clive"},
    {"id": 6, "name": "Hicks"},
    {"id": 7, "name": "Devin"},
    {"id": 8, "name": "Kate"},
    {"id": 9, "name": "Klein"},
]
# Friendships as (user id, user id) pairs; each pair appears once here and is
# expanded in both directions when the lookup table is built.
friendship_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
                    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]
# (user id, interest) tuples: one entry per interest listed by a user.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodel"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

# Improve friend lookup by id: map each user id to the list of that user's friends.
# Every known user starts with an empty list, so a user who appears in no
# friendship pair is still present (with zero connections) rather than missing,
# which would otherwise skew the average and the ranking computed from this table.
friendships = {user["id"]: [] for user in users}

for i, j in friendship_pairs:
    # Friendship is mutual, so record the link in both directions.
    # setdefault keeps ids that occur only in friendship_pairs working as before.
    friendships.setdefault(i, []).append(j)
    friendships.setdefault(j, []).append(i)

pprint(friendships)

{0: [1, 2],
 1: [0, 2, 3],
 2: [0, 1, 3],
 3: [1, 2, 4],
 4: [3, 5],
 5: [4, 6, 7],
 6: [5, 8],
 7: [5, 8],
 8: [6, 7, 9],
 9: [8]}


### Determining the average number of connections

In [3]:
# Mean friend-list length across every user in the lookup table.
avg_connections = sum(map(len, friendships.values())) / len(friendships)
print(
    "Average number of connections:",
    round(avg_connections, 2),
)

Average number of connections: 2.4


### Identifying the most connected individual

Ranking users by how many direct connections they have is a measure commonly referred to as degree centrality.

In [4]:
# Pair every user id with its friend count, then order by count, highest first.
# sorted() is stable, so users with equal counts keep their original relative order.
connections_by_id = sorted(
    ((user_id, len(friend_list)) for user_id, friend_list in friendships.items()),
    key=lambda entry: entry[1],
    reverse=True,
)
print(
    "List of connections, sorted from the most connected to the least connected:\n",
    connections_by_id,
    sep="",
)

List of connections, sorted from the most connected to the least connected:
[(1, 3), (2, 3), (3, 3), (5, 3), (8, 3), (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]


## Problem statement: Generating friend suggestions

### Derived from shared connections

In [5]:
from collections import Counter

In [6]:
def friends_of_friends_count(id):
    """Count how often each non-friend is reached through a friend of user `id`.

    Walks every friend of `id` and every friend of that friend, skipping
    `id` itself and anyone already in `id`'s own friend list.

    Returns a Counter mapping each remaining user id to the number of
    two-step paths (friend -> friend of friend) that lead to it.
    Raises KeyError if `id` is not a key of `friendships`.
    """
    own_friends = friendships[id]
    tally = Counter()
    for friend in own_friends:
        for candidate in friendships[friend]:
            if candidate == id:
                continue  # the user itself
            if candidate in own_friends:
                continue  # already one of the user's friends
            tally[candidate] += 1  # user may know this person
    return tally

# For every user in the lookup table, a plain dict of
# {suggested user id: friends-of-friends count}.
suggested_by_common_friends = {
    user_id: dict(mutual_counts)
    for user_id, mutual_counts in zip(
        friendships, map(friends_of_friends_count, friendships)
    )
}

pprint(suggested_by_common_friends)

{0: {3: 2},
 1: {4: 1},
 2: {4: 1},
 3: {0: 2, 5: 1},
 4: {1: 1, 2: 1, 6: 1, 7: 1},
 5: {3: 1, 8: 2},
 6: {4: 1, 7: 2, 9: 1},
 7: {4: 1, 6: 2, 9: 1},
 8: {5: 2},
 9: {6: 1, 7: 1}}


### Derived from common interests

In [7]:
# Invert the (user id, interest) tuples: interest -> ids of the users who listed it.
ids_by_interests = {}

# The loop variable is user_id rather than id: a module-level loop variable
# named id would rebind the built-in id() for every later cell in the notebook.
for user_id, interest in interests:
    ids_by_interests.setdefault(interest, []).append(user_id)

pprint(ids_by_interests)

{'Big Data': [0, 8, 9],
 'C++': [5],
 'Cassandra': [0, 1],
 'HBase': [0, 1],
 'Hadoop': [0, 9],
 'Haskell': [5],
 'Java': [0, 5, 9],
 'Mahout': [7],
 'MapReduce': [9],
 'MongoDB': [1],
 'NoSQL': [1],
 'Postgres': [1],
 'Python': [2, 3, 5],
 'R': [3, 5],
 'Spark': [0],
 'Storm': [0],
 'artificial intelligence': [8],
 'decision trees': [4],
 'deep learning': [8],
 'libsvm': [4],
 'machine learning': [4, 7],
 'mathematics': [6],
 'neural networks': [7, 8],
 'numpy': [2],
 'pandas': [2],
 'probability': [3, 6],
 'programming languages': [5],
 'regression': [3, 4],
 'scikit-learn': [2, 7],
 'scipy': [2],
 'statistics': [3, 6],
 'statsmodel': [2],
 'theory': [6]}


In [8]:
# Group the (user id, interest) tuples by user: user id -> list of that user's interests.
interests_by_id = {}

# The loop variable is user_id rather than id: a module-level loop variable
# named id would rebind the built-in id() for every later cell in the notebook.
for user_id, interest in interests:
    interests_by_id.setdefault(user_id, []).append(interest)

pprint(interests_by_id)

{0: ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra'],
 1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'],
 2: ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodel', 'pandas'],
 3: ['R', 'Python', 'statistics', 'regression', 'probability'],
 4: ['machine learning', 'regression', 'decision trees', 'libsvm'],
 5: ['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages'],
 6: ['statistics', 'probability', 'mathematics', 'theory'],
 7: ['machine learning', 'scikit-learn', 'Mahout', 'neural networks'],
 8: ['neural networks', 'deep learning', 'Big Data', 'artificial intelligence'],
 9: ['Hadoop', 'Java', 'MapReduce', 'Big Data']}


In [9]:
def most_common_interests_with(id):
    """Count how many interests each other user shares with user `id`.

    For every interest listed for `id`, looks up all users with that
    interest and tallies each of them except `id` itself.

    Returns a Counter mapping other user ids to the number of shared interests.
    Raises KeyError if `id` is not a key of `interests_by_id`.
    """
    shared = Counter()
    for interest in interests_by_id[id]:
        for other_id in ids_by_interests[interest]:
            if other_id == id:
                continue  # don't count the user against themselves
            shared[other_id] += 1
    return shared

# Show, for user 0, how many interests each other user shares with them.
pprint(most_common_interests_with(0))

Counter({9: 3, 1: 2, 8: 1, 5: 1})
