# Chapter 1: Introduction

## Finding Key Connectors
List of people and their friendships (page 3)

In [1]:
# data

users = [
    {"id": 0, "name": "Hero"},
    {"id": 1, "name": "Dunn"},
    {"id": 2, "name": "Sue"},
    {"id": 3, "name": "Chi"},
    {"id": 4, "name": "Thor"},
    {"id": 5, "name": "Clive"},
    {"id": 6, "name": "Hicks"},
    {"id": 7, "name": "Devin"},
    {"id": 8, "name": "Kate"},
    {"id": 9, "name": "Klein"},
]

friendship_pairs = [(0, 1), (0, 2), (1, 2), (1, 3),
                    (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]

# Looking up the friend of user 1, we have to look through the friendship_pairs to identify all pairs containing user 1.
# This is not the easiest way of doing it.
# Lets create a dict to make this easier. We still have to go over all the friendships_pairs once, but then we get cheap access.

In [7]:
# initialize the dict with an empty list for each user id
friendships = {user["id"]: [] for user in users}
friendships

# loop over the friendship pairs to populate it
for i, j in friendship_pairs:
    friendships[i].append(j) # add j as friend of user i
    friendships[j].append(i) # add i as friend of user j

friendships

{0: [1, 2],
 1: [0, 2, 3],
 2: [0, 1, 3],
 3: [1, 2, 4],
 4: [3, 5],
 5: [4, 6, 7],
 6: [5, 8],
 7: [5, 8],
 8: [6, 7, 9],
 9: [8]}

In [24]:
# simple calculations

# what is the average number of connections?
# total number of connections
def number_of_friends(user):
    """How many friends does _user_ have?"""
    user_id = user["id"]
    friend_ids = friendships[user_id]
    return len(friend_ids)

total_connections = sum(number_of_friends(user) for user in users)

# divide by number of users
num_users = len(users)
avg_connections = total_connections / num_users
avg_connections


# who are the most connected people?
# create a list (user_id, number_of_friends)
num_friends_by_id = [(user["id"], number_of_friends(user)) for user in users]
num_friends_by_id.sort(key=lambda id_and_friends: id_and_friends[1], reverse=True)
num_friends_by_id


[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

## Data Scientists You May Know
Friends of friends (page 6)

In [36]:
# collect your friends friends
def foaf_ids_bad(user):
    """ foaf is short for "friend of a friend" """
    return [foaf_id 
            for friend_id in friendships[user["id"]]
            for foaf_id in friendships[friend_id]
            #if foaf_id != user["id"]                   # who aren't me
            #and foaf_id not in friendships[user["id"]]
            ]

foaf_ids_bad(users[0])

# knowing that people are friends of friends in multiple ways (user 3) seems like interesting information.
# lets produce a count of mutual friends and exclude people already known to the user
from collections import Counter

def friends_of_friends(user):
    user_id = user["id"]
    return Counter(
        foaf_id
        for friend_id in friendships[user_id]   # for each of my friends
        for foaf_id in friendships[friend_id]   # find their friends
        if foaf_id != user_id                   # who aren't me
        and foaf_id not in friendships[user_id] # and aren't my friends
    )

friends_of_friends(users[3])

Counter({0: 2, 5: 1})

In [18]:
# data on interests of users
interests = {
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"), (0, "Spark"), (0, "Storm"), (0, "Cassandra"), 
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"), (1, "Postgres"), 
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"), (2, "numpy"), (2, "statsmodels"), (2, "pandas"), 
    (3, "R"), (3, "Python"), (3, "statistics"), (3, "regression"), (3, "probability"), 
    (4, "machine learning"), (4, "regression"), (4, "decision trees"), (4, "libsvm"), 
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"), (5, "Haskell"), (5, "programming languages"), 
    (6, "statistics"), (6, "probability"), (6, "mathematics"), (6, "theory"), 
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"), (7, "neural networks"), 
    (8, "neural networks"), (8, "deep learning"), (8, "Big Data"), (8, "artificial intelligence"), 
    (9, "Hadoop"), (9, "Java"), (9, "MapReduce"), (9, "Big Data") 
}

# user with certain interests
def data_scientists_who_like(target_interest):
    """Find the ids of all useres who like the target interest"""
    return [user_id
            for user_id, user_interst in interests
            if user_interst == target_interest]

data_scientists_who_like("Java")

# in above example we have to examine the whole list of interest for every search
# we are proably better off building an index from interests to users
from collections import defaultdict

# keys are interests, values are lists of user_ids with that interest
user_ids_by_interest = defaultdict(list)

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

# Keys are user_ids, values are lists of interests for that user_id
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)

# who has the most interests in common wih a given user?
# iterate over the user's interest
# for each interest, iterate over the other users with that interest
# keep count of how many times we see each other user

def most_common_interest_with(user):
    return Counter(
        interested_user_id
        for interest in interests_by_user_id[user["id"]]
        for interested_user_id in user_ids_by_interest[interest]
        if interested_user_id != user["id"]
    )

most_common_interest_with(users[0])

NameError: name 'users' is not defined

## Salaries and Experience
Salaries and tenure (page 8)

In [8]:
# data
salaries_and_tenures = [(83000, 8.7), (88000, 8.1),
                        (48000, 0.7), (76000, 6),
                        (69000, 6.5), (76000, 7.5),
                        (60000, 2.5), (83000, 10),
                        (48000, 1.9), (63000, 4.2)]

In [10]:
# average salary for every tenure
from collections import defaultdict

# keys are years, values are lists of the salaries for each tenure
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# keys are years, each value is the average salary for that tenure
average_salary_by_tenure = {
    tenure: sum(salaries) / len(salaries)
    for tenure, salaries in salary_by_tenure.items()
}

average_salary_by_tenure


{8.7: 83000.0,
 8.1: 88000.0,
 0.7: 48000.0,
 6: 76000.0,
 6.5: 69000.0,
 7.5: 76000.0,
 2.5: 60000.0,
 10: 83000.0,
 1.9: 48000.0,
 4.2: 63000.0}

In [12]:
# bucket the tenures
def tenure_buckets(tenure):
    if tenure < 2:
        return "less than two"
    elif tenure < 5:
        return "between to and five"
    else:
        return "more than five"
    
# keys are tenure buckets, values are lists of salaries for that bucket
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_buckets(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# compute average salary for each bucket

# keys are tenure buckets, each value is the average salary for that tenure
average_salary_by_tenure_bucket = {
    tenure_bucket: sum(salaries) / len(salaries)
    for tenure_bucket, salaries in salary_by_tenure_bucket.items()
}

average_salary_by_tenure_bucket




{'more than five': 79166.66666666667,
 'less than two': 48000.0,
 'between to and five': 61500.0}

## Paid Accounts
Who pays for their account (page 11)

In [13]:
# people with very little or very much experience tend to pay
def predict_paid_or_unpaid(years_of_experience):
    if years_of_experience < 3.0:
        return "paid"
    elif years_of_experience < 8.5:
        return "unpaid"
    else:
        return "paid"
    
predict_paid_or_unpaid(5.5)

'unpaid'

## Topics Of Interest
Which topic has the most interest (page 11)

In [21]:
# Lowercase each interest
# split into words
# count the result

from collections import Counter

words_and_counts = Counter(
    word
    for user, interest in interests
    for word in interest.lower().split()
)

# get words that accur more than once
for word, count in words_and_counts.most_common():
    if count > 1:
        print(word, count)

java 3
python 3
learning 3
big 3
data 3
regression 2
scikit-learn 2
neural 2
networks 2
statistics 2
cassandra 2
probability 2
hadoop 2
machine 2
hbase 2
r 2
