# Chapter 01 - Introduction

In [None]:
# Useful functional library
import cytoolz.curried as toolz

## Finding Key Connectors

User list for entire DataSciencester network

In [None]:
# Every member's `id` is simply their position in this roster of names
users = [
    {'id': user_id, 'name': name}
    for user_id, name in enumerate(
        ['Hero', 'Dunn', 'Sue', 'Chi', 'Thor', 'Clive', 'Hicks', 'Devin', 'Kate', 'Klein']
    )
]

The raw friendship list is a list of tuples containing a pair of IDs

In [None]:
# Each (i, j) tuple means the users with IDs i and j are friends.
# Every friendship is listed once, lower ID first; rows group pairs by their first ID.
friendship_pairs = [
    (0, 1), (0, 2),
    (1, 2), (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6), (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]

This representation is not the easiest with which to work.

Let's create a `dict` mapping user IDs to a list of friend user IDs

In [None]:
# Start every user off with an empty friend list, keyed by user ID
friendships = {member['id']: [] for member in users}

In [None]:
# Populate `friendships` from `friendship_pairs`. Each pair is recorded in both
# directions, so every user's list ends up holding all of that user's friends.
for i, j in friendship_pairs:
    friendships[i].append(j)  # Add j as a friend of user i
    friendships[j].append(i)  # ...and i as a friend of user j (friendships are symmetrical)

In [None]:
friendships

We can now easily ask questions of our `friendships` graph like, "What's the average number of connections?"

In [None]:
# We find the _total_ number of connections by summing up the lengths of all the friends lists
def friend_count(user):
    """Return the number of friend IDs recorded for `user` in `friendships`."""
    return len(friendships[user['id']])

# Add up every user's friend count
total_connections = sum(map(friend_count, users))
total_connections

In [None]:
# Calculate the average number of connections per user by dividing `total_connections` by the number of users
user_count = len(users)
avg_connections = total_connections / user_count
avg_connections

Find the most connected people; that is, the people who have the largest number of friends.

In [None]:
# Pair each user ID with that user's number of friends: (user ID, friend count)
friend_counts_by_id = [
    (member['id'], friend_count(member))
    for member in users
]
friend_counts_by_id

In [None]:
# Reorder the list in place, most-connected users first (ties keep their existing relative order)
friend_counts_by_id[:] = sorted(friend_counts_by_id, key=lambda pair: pair[1], reverse=True)
friend_counts_by_id

We have identified people who are _central_ to the network. Specifically, we have calculated the network metric _degree centrality_.

## Data Scientists You May Know

Let's try simply calculating the friends of the friend of the user.

In [None]:
# "Friend of a friend" - bad implementation
def foaf_ids_bad(user):
    """Naively collect the IDs of the friends of each of `user`'s friends.

    Nothing is filtered out, so the result can include `user` themselves,
    people who are already friends of `user`, and repeated IDs.
    """
    foaf_ids = []
    for friend_id in friendships[user['id']]:  # for each friend of the user
        foaf_ids.extend(friendships[friend_id])  # take all of that friend's friends
    return foaf_ids

In [None]:
# Testing this on "Hero" produces unexpected (erroneous) results
foaf_ids_bad(users[0])

Argh... The result contains 0 (Hero himself) twice, Chi (3) twice, and both Dunn (1) and Sue (2), even though Dunn and Sue are already Hero's friends.

These results make sense because:

- Hero is indeed friends with both of his friends
- Dunn and Sue are already friends (and friendship is symmetric)
- Chi is reachable through two different friends

In [None]:
print(friendships[0])
print(friendships[1])
print(friendships[2])

Knowing that people are friends of a friend in _multiple_ ways seems like useful information.

Perhaps instead of a plain list of friends of a friend, we should return, for each friend of a friend, a count of the mutual friends who connect them to the user?

In [None]:
# Better "friends of a friend" implementation
from collections import Counter

def friends_of_friends(user):
    """Count, for each friend of a friend of `user`, how many mutual friends they share.

    Only people who are neither `user` nor already one of `user`'s friends are counted.

    Note: The argument to `Counter` is a
    [generator expression](https://docs.python.org/3/glossary.html#term-generator-expression).
    A generator expression needs its own parentheses unless it is the sole argument of a call
    (as it is here), and it is lazy: it produces values only when something iterates over it.

    If one wants to inspect the values it yields, one can copy the clauses and paste them inside
    a pair of square brackets; that is, evaluate them as a list comprehension.

    Returns:
        A `Counter` mapping each friend-of-a-friend user ID to the number of `user`'s
        friends who are also friends with that person.
    """
    user_id = user['id']  # alias to lessen typing errors
    friend_ids = friendships[user_id]  # look up my friends once rather than once per candidate
    known_ids = set(friend_ids)  # constant-time "is already my friend" test
    return Counter(
        foaf_id
        for friend_id in friend_ids  # for each of my friends
        for foaf_id in friendships[friend_id]  # find their friends
        if foaf_id != user_id  # who are neither me
        and foaf_id not in known_ids  # nor a friend I already have
    )

In [None]:
# Testing the `Counter` argument: the same filtering as in `friends_of_friends`,
# evaluated as a list comprehension so the individual values are visible
test_id = 3
[foaf_id
 for friend_id in friendships[test_id]  # for each of my friends
 for foaf_id in friendships[friend_id]  # find their friends
 if foaf_id != test_id  # who are neither me
 and foaf_id not in friendships[test_id]]  # nor a friend I already have

In [None]:
print(friends_of_friends(users[3]))

You, a budding member of the DataSciencester staff, in addition to perhaps having an interest in your friends' friends, might also be interested in meeting other staff members with similar interests.

Here are the interests of staff members represented as a list of id-interest pairs.

In [None]:
# (user ID, interest) pairs. Interests are compared by exact, case-sensitive string
# equality, so the same interest must be spelled and capitalized identically for every user.
interests = [
    (0, 'Hadoop'), (0, 'Big Data'), (0, 'HBase'), (0, 'Java'), (0, 'Spark'), (0, 'Storm'), (0, 'Cassandra'),
    (1, 'NoSQL'), (1, 'MongoDB'), (1, 'Cassandra'), (1, 'HBase'), (1, 'Postgres'),
    (2, 'Python'), (2, 'scikit-learn'), (2, 'scipy'), (2, 'numpy'), (2, 'statsmodels'), (2, 'pandas'),
    (3, 'R'), (3, 'Python'), (3, 'statistics'), (3, 'regression'), (3, 'probability'),
    (4, 'machine learning'), (4, 'regression'), (4, 'decision trees'), (4, 'libsvm'),
    (5, 'Python'), (5, 'R'), (5, 'Java'), (5, 'C++'), (5, 'Haskell'), (5, 'programming languages'),
    (6, 'statistics'), (6, 'probability'), (6, 'mathematics'), (6, 'theory'),
    (7, 'machine learning'), (7, 'scikit-learn'), (7, 'Mahout'), (7, 'neural networks'),
    (8, 'neural networks'), (8, 'deep learning'), (8, 'Big Data'), (8, 'artificial intelligence'),
    (9, 'Hadoop'), (9, 'Java'), (9, 'MapReduce'), (9, 'Big Data'),
]

It is easy to build a function that finds users with a specific interest.

In [None]:
def data_scientists_who_like(target_interest):
    """Return the IDs of the users whose interests include `target_interest`.

    Matching is exact string equality; IDs come back in the order their
    pairs appear in `interests`.
    """
    matching_ids = []
    for user_id, interest in interests:
        if interest == target_interest:
            matching_ids.append(user_id)
    return matching_ids

In [None]:
print(data_scientists_who_like('programming languages'))
print(data_scientists_who_like('Python'))
print(data_scientists_who_like('R'))

This code works, but it is not especially efficient: it must search through every ID-interest pair to find like-minded data scientists. With a relatively small number of users or searches, as in this case, it works fine.

To better scale, let's create two indices: one from interests to users...

In [None]:
from collections import defaultdict

# Keys are interests, values are lists of user IDs who share that interest
user_ids_by_interest = defaultdict(list)
for user_id_key, user_interest in interests:
    # `defaultdict(list)` supplies the empty list the first time an interest is seen
    user_ids_by_interest[user_interest].append(user_id_key)

In [None]:
user_ids_by_interest

In [None]:
user_ids_by_interest['probability']

...And create the reverse index, from user IDs to interests

In [None]:
# Keys are user IDs. Values are lists of interests.
interests_by_user_id = defaultdict(list)
for user_id_key, user_interest in interests:
    # `defaultdict(list)` supplies the empty list the first time a user ID is seen
    interests_by_user_id[user_id_key].append(user_interest)

In [None]:
interests_by_user_id

In [None]:
interests_by_user_id[7]

Now it's easy to find who has the most interests in common with a specific user:

- Iterate over the user's interests
- For each interest, iterate over the other users with that interest
- Keep a count of how many times we see each other user

In [None]:
def most_common_interests_with(user):
    """Count how many interests each other user has in common with `user`.

    Returns a `Counter` keyed by user ID; `user`'s own ID is left out.
    """
    shared = Counter()
    for interest in interests_by_user_id[user['id']]:  # each of my interests
        for other_id in user_ids_by_interest[interest]:  # everyone who has that interest
            if other_id != user['id']:  # except me
                shared[other_id] += 1
    return shared

In [None]:
# Remember, the result is a `Counter` (a `dict` subclass) mapping each user ID to its number of shared interests
most_common_interests_with(users[5])

## Salaries and Experience

Anonymized dataset with salaries and tenure as a data scientist (years).

In [None]:
# Each tuple is (yearly salary, tenure in years) for one anonymized data scientist
salaries_and_tenure = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

From plotting the data, it seems clear that salary increases with tenure, but how might we turn this into a "fun fact"?

Let's try looking at the average salary for each tenure.

In [None]:
# Keys are years, values are lists of the salaries for each tenure
salary_by_tenure = defaultdict(list)

# Group salaries under their exact tenure value; `defaultdict(list)` creates each list on first use
for salary_value, tenure_years in salaries_and_tenure:
    salary_by_tenure[tenure_years].append(salary_value)

In [None]:
# Collapse each tenure's salary list into its mean, keyed by the same tenure (in years)
average_salary_by_tenure = {
    years: sum(pay_list) / len(pay_list)
    for years, pay_list in salary_by_tenure.items()
}

In [None]:
average_salary_by_tenure

Turns out that this calculation is not very interesting because no two users have the **same** tenure. We are actually just reporting each anonymized user's individual salary.

Let's try bucketing the tenures.

In [None]:
# Define a function to bucket tenures
def tenure_bucket(tenure):
    """Map a tenure in years to one of three coarse bucket labels.

    Below 2 gives 'less than 2'; from 2 through 5 inclusive gives
    'between 2 and 5'; anything else gives 'more than 5'.
    """
    if tenure < 2:
        return 'less than 2'
    if tenure <= 5:
        return 'between 2 and 5'
    return 'more than 5'

In [None]:
# The keys of our new `defaultdict` are tenure buckets; the values are lists of the yearly salaries in that bucket
salary_by_tenure_bucket = defaultdict(list)

for yearly_salary, tenure_years in salaries_and_tenure:
    bucket = tenure_bucket(tenure_years)  # label for this tenure
    salary_by_tenure_bucket[bucket].append(yearly_salary)

salary_by_tenure_bucket

In [None]:
# Average the salaries in each tenure bucket: the curried `valmap` is given only the
# averaging function, and `pipe` then feeds the bucketed salaries through it
average_salary_by_tenure_bucket = toolz.pipe(
    salary_by_tenure_bucket,
    toolz.valmap(lambda pay_list: sum(pay_list) / len(pay_list)),
)
average_salary_by_tenure_bucket

In [None]:
100 * (average_salary_by_tenure_bucket['more than 5'] - average_salary_by_tenure_bucket['less than 2']) / average_salary_by_tenure_bucket['less than 2']

This calculation gives us our "sound bite": "Data scientists with more than 5 years experience make 65% more money than data scientists with little or no experience."

## Paid Accounts

The VP of Revenue wants to better understand which users pay for accounts and which do not.

In [None]:
# Here is a list of (years of experience, 'paid'/'unpaid') pairs. You notice that there appears to be a
# relationship between years of experience and paid/unpaid.
paid_accounts = [
    (0.7, 'paid'),
    (1.9, 'unpaid'),
    (2.5, 'paid'),
    (4.2, 'unpaid'),
    (6.0, 'unpaid'),
    (6.5, 'unpaid'),
    (7.5, 'unpaid'),
    (8.1, 'unpaid'),
    (8.7, 'paid'),
    (10.0, 'paid'),
]

Users with few years and users with many years of experience tend to pay; other users do not.

How might we predict paid/unpaid based on years experience?

In [None]:
# Here is our "model"
def predict_paid_or_unpaid(years_experience):
    """Predict 'paid' or 'unpaid' from years of experience.

    Only the middle band, 3.0 <= years_experience < 8.5, is predicted
    'unpaid'; everything below or above it is predicted 'paid'.
    """
    in_unpaid_band = 3.0 <= years_experience < 8.5
    return 'unpaid' if in_unpaid_band else 'paid'

# Of course, we totally eyeballed the ranges.

In [None]:
sum([paid_up[1] == predict_paid_or_unpaid(paid_up[0]) for paid_up in paid_accounts]) / len(paid_accounts)

In [None]:
list(toolz.map(lambda paid_up: paid_up[1] == predict_paid_or_unpaid(paid_up[0]), paid_accounts))