# Chapter 1: Introduction

Motivating example: Data science social network.

## Finding Key Connectors

In [1]:
# Each user is a dict holding a numeric id (its position in the list) and a name.
users = [
    {'id': user_id, 'name': name}
    for user_id, name in enumerate([
        'Hero', 'Dunn', 'Sue', 'Chi', 'Thor',
        'Clive', 'Hicks', 'Devin', 'Kate', 'Klein',
    ])
]

In [2]:
# Each (a, b) pair of user ids is one friendship, e.g. users 0 and 1 are friends
friendship_pairs = [
    (0, 1), (0, 2), (1, 2), (1, 3),
    (2, 3), (3, 4), (4, 5), (5, 6),
    (5, 7), (6, 8), (7, 8), (8, 9),
]

In [3]:
# Goal: a lookup table from each user id to the ids of that user's friends

# Every user id starts out mapped to its own fresh, empty list:
friendships = dict((user['id'], []) for user in users)

# A pair (i, j) is a two-way friendship, so record it under both ids:
for i, j in friendship_pairs:
    friendships[i].append(j)  # j becomes a friend of i
    friendships[j].append(i)  # i becomes a friend of j

friendships

{0: [1, 2],
 1: [0, 2, 3],
 2: [0, 1, 3],
 3: [1, 2, 4],
 4: [3, 5],
 5: [4, 6, 7],
 6: [5, 8],
 7: [5, 8],
 8: [6, 7, 9],
 9: [8]}

In [4]:
# Question: what's the average number of connections

# First find total number of connections
def number_of_friends(user):
    """Return the length of the friend list stored for _user_ in `friendships`."""
    return len(friendships[user['id']])

# Add up every user's friend count (each friendship is counted once per side)
total_connections = sum(map(number_of_friends, users))  # 24
print(total_connections)

24


In [5]:
# Average connections per user = total connections / number of users
num_users = len(users)
avg_connections = total_connections / num_users # 2.4 for the data above
print(avg_connections)

2.4


In [6]:
# Rank users from most friends to fewest

# Build (user_id, number_of_friends) pairs and sort them in one step.
# The sort is stable, so users with equal counts keep their original order.
num_friends_by_id = sorted(
    ((user['id'], number_of_friends(user)) for user in users),
    key=lambda pair: pair[1],  # order by the friend count
    reverse=True,              # most friends first
)

print(num_friends_by_id)

[(1, 3), (2, 3), (3, 3), (5, 3), (8, 3), (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]


The ranking above computes the *degree centrality* metric: each user's number of direct connections.

## Data Scientists You May Know

In [7]:
# Idea: find friend of friends

def foaf_ids_bad(user):
    """Naive "friend of a friend" (foaf) lookup.

    Concatenates the friend lists of each of _user_'s friends without
    removing duplicates or filtering anyone out.
    """
    foaf_ids = []
    for friend_id in friendships[user['id']]:
        foaf_ids.extend(friendships[friend_id])
    return foaf_ids

In [15]:
# Naive friend-of-a-friend list for Hero (user id 0, the first entry in users)
hero = users[0]
foaf_ids_bad(hero)

[0, 2, 3, 0, 1, 3]

The result above is bad because it includes Hero himself (user 0) twice, users 1 and 2, who are already his friends, and user 3 twice.

In [17]:
from collections import Counter

def friends_of_friends(user):
    """Count how many of _user_'s friends are also friends with each foaf.

    Returns a Counter keyed by friend-of-a-friend id. _user_ and _user_'s
    direct friends are left out of the result.
    """
    user_id = user['id']
    mutual_counts = Counter()
    for friend_id in friendships[user_id]:        # each of my friends
        for foaf_id in friendships[friend_id]:    # each of their friends
            if foaf_id == user_id:
                continue                          # that's me
            if foaf_id in friendships[user_id]:
                continue                          # already my friend
            mutual_counts[foaf_id] += 1
    return mutual_counts

In [21]:
# Friends-of-friends counts for Chi (users[3], id 3)
print(friends_of_friends(users[3]))

Counter({0: 2, 5: 1})


This says that Chi has two mutual friends with user 0 and one mutual friend with user 5.

In [23]:
# (user_id, interest) pairs, grouped by user.
# NOTE(review): interest strings are compared exactly elsewhere in this
# notebook, so 'Hadoop'/'hadoop' and 'Big Data'/'big data' count as different
# interests -- confirm whether that casing is intended.
interests = [
    (0, 'Hadoop'), (0, 'Big Data'), (0, 'HBase'), (0, 'Java'),
    (0, 'Spark'), (0, 'Storm'), (0, 'Cassandra'),
    (1, 'NoSQL'), (1, 'MongoDB'), (1, 'Cassandra'), (1, 'HBase'),
    (1, 'Postgres'),
    (2, 'Python'), (2, 'scikit-learn'), (2, 'scipy'),
    (2, 'numpy'), (2, 'statsmodels'), (2, 'pandas'),
    (3, 'R'), (3, 'Python'), (3, 'statistics'), (3, 'regression'),
    (3, 'probability'),
    (4, 'machine learning'), (4, 'regression'), (4, 'decision trees'),
    (4, 'libsvm'),
    (5, 'Python'), (5, 'R'), (5, 'Java'), (5, 'C++'),
    (5, 'Haskell'), (5, 'programming languages'),
    (6, 'statistics'), (6, 'probability'), (6, 'mathematics'), (6, 'theory'),
    (7, 'machine learning'), (7, 'scikit-learn'), (7, 'Mahout'),
    (7, 'neural networks'),
    (8, 'neural networks'), (8, 'deep learning'),
    (8, 'big data'), (8, 'artificial intelligence'),
    (9, 'hadoop'), (9, 'Java'), (9, 'MapReduce'), (9, 'Big Data'),  # was misspelled 'MadReduce'
]

In [24]:
# Try to find users who share interests

# build function that finds users with a certain interest
def data_scientist_who_like(target_interest, interest_pairs=None):
    """Find the ids of all users who like the target interest.

    Args:
        target_interest: interest to look for; matched by exact equality,
            so the comparison is case-sensitive.
        interest_pairs: optional iterable of (user_id, interest) pairs to
            search. Defaults to the notebook-level `interests` list.

    Returns:
        A list of user ids, in the order their pairs appear in the input.
    """
    if interest_pairs is None:
        # Bug fix: this used to iterate over `interest`, which is not the
        # list of pairs (it should have been `interests`).
        interest_pairs = interests
    return [user_id
            for user_id, user_interest in interest_pairs
            if user_interest == target_interest]

The function above has to scan the entire list of interests on every call. Let's build dictionary indexes instead.

In [27]:
from collections import defaultdict

# Two lookup tables, filled in a single pass over the (user_id, interest) pairs:
#   user_ids_by_interest: interest -> list of user_ids with that interest
#   interests_by_user_id: user_id  -> list of that user's interests
user_ids_by_interest = defaultdict(list)
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_user_id[user_id].append(interest)

Now it's easy to find who has the most interests in common with a given user:
* iterate over the user's interests
* for each interest, iterate over the other users with that interest
* keep count of how many times we see each other user

In [28]:
def most_common_interests_with(user):
    """Count how many interests each other user shares with _user_.

    Returns a Counter keyed by the other user's id; _user_ is excluded.
    """
    shared_counts = Counter()
    for interest in interests_by_user_id[user['id']]:
        for interested_user_id in user_ids_by_interest[interest]:
            if interested_user_id == user['id']:
                continue  # don't count the user against themselves
            shared_counts[interested_user_id] += 1
    return shared_counts

In [30]:
# Which users share the most interests with Hero?
most_common_interests_with(hero)

Counter({9: 2, 1: 2, 5: 1})

## Salaries and Experience

In [31]:
# (salary, tenure) pairs; tenure is measured in years
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (64000, 4.2),
]

In [32]:
# Group the salaries by exact tenure: tenure (years) -> list of salaries
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# Reduce each group to its mean: tenure (years) -> average salary
average_salary_by_tenure = dict(
    (years, sum(group) / len(group))
    for years, group in salary_by_tenure.items()
)

In [33]:
# Above not useful as no two users have the same tenure. Try bucketing
def tenure_bucket(tenure):
    """Map a tenure to one of three coarse bucket labels.

    Thresholds are checked in ascending order; a tenure below none of them
    falls through to the last label.
    """
    thresholds = (
        (2, "less than two"),
        (5, "between two and five"),
    )
    for upper_bound, label in thresholds:
        if tenure < upper_bound:
            return label
    return "more than five"

In [34]:
# Group the salaries by tenure bucket: bucket label -> list of salaries
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# Reduce each bucket to its mean: bucket label -> average salary
average_salary_by_bucket = dict(
    (label, sum(bucket_salaries) / len(bucket_salaries))
    for label, bucket_salaries in salary_by_tenure_bucket.items()
)

In [35]:
# Display the average salary per tenure bucket
average_salary_by_bucket

{'more than five': 79166.66666666667,
 'less than two': 48000.0,
 'between two and five': 62000.0}

## Topics of Interest

In [37]:
# Find the popular topic words:
#   1. lowercase each interest
#   2. split it into words on whitespace
#   3. count how often each word occurs across all users

words_and_counts = Counter(
    token
    for _user_id, topic in interests
    for token in topic.lower().split()
)

In [39]:
# Print only the words seen more than once, most frequent first
for word, count in words_and_counts.most_common():
    if count <= 1:
        continue
    print(word, count)

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2
