# Chapter 1 - Introduction

## Who is a data scientist?

A data scientist is someone who extracts insights from messy data.

Examples mentioned were the following:
- OkCupid asking its members questions to find the most appropriate matches. But it also analyzes these results to figure out innocuous-sounding questions you can ask someone to find out how likely someone is to sleep with you on the first date.
- Facebook asking you to list your hometown and current location, to make it easier for your friends to find and connect with you. But it also analyzes these results to identify global migration patterns and where the fanbases of different football teams live.
- Target tracking your purchases and interactions. And it uses the data to predictively model which of its customers are pregnant, to better market baby-related purchases to them.
- The Obama campaign employing data scientists to data-mine and experiment to identify voters who needed extra attention, choosing optimal donor-specific fundraising appeals and programs, and focusing get-out-the-vote efforts where they were most likely to be useful.


# Finding Key Connectors

In [10]:
# The members of the network. Each record's id is its position in the list,
# which is what lets the friendship pairs below index straight into `users`.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate(
        ["Hero", "Dunn", "Sue", "Chi", "Thor",
         "Clive", "Hicks", "Devin", "Kate", "Klein"]
    )
]

# Friendships as pairs of user ids.
friendships = [
    (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9),
]


# Start every user off with an empty friend list ...
for user in users:
    user["friends"] = []

# ... then record each friendship on both of its endpoints, storing the
# friend's whole record rather than just the id.
for i, j in friendships:
    users[i]["friends"].append(users[j])
    users[j]["friends"].append(users[i])

# Find the average number of connections
def number_of_friends(user):
    """Return how many entries are in `user["friends"]`."""
    friend_list = user["friends"]
    return len(friend_list)

# Add up the length of every user's friend list.
total_connections = sum(map(number_of_friends, users))    # 24

# Mean number of friends per user.
num_users = len(users)
avg_connections = total_connections / num_users     # 2.4

# Sort from "most friends" to "least friends"
# First pair each user id with that user's friend count ...
num_friends_by_id = [(person["id"], number_of_friends(person))
                     for person in users]

# ... then order a copy by the count (second tuple item), largest first.
# The sort is stable, so users with equal counts keep their id order.
sorted_num_friends_by_id = list(num_friends_by_id)
sorted_num_friends_by_id.sort(key=lambda pair: pair[1], reverse=True)


# Result: [(1, 3), (2, 3), (3, 3), (5, 3), (8, 3), (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]
# print(sorted_num_friends_by_id)

What we just did is identify people who are somehow central to the network.

What we computed is the network metric **degree centrality**.

# "Data Scientists You May Know" suggester

In [14]:
def friends_of_friend_ids_bad(user):
    """Return the id of every friend of every friend of `user`.

    Nothing is filtered or de-duplicated: the result can contain the user's
    own id, ids of the user's direct friends, and repeated ids.
    """
    collected_ids = []
    for friend in user["friends"]:
        collected_ids.extend(foaf["id"] for foaf in friend["friends"])
    return collected_ids

# print(friends_of_friend_ids_bad(users[0]))

'''
Result: [0, 2, 3, 0, 1, 3]

users[0] is Hero.
His own id appears twice because he is himself a friend of both of his friends, Dunn (users[1]) and Sue (users[2]).
Chi (users[3]) also appears twice since she is the friend of both Dunn and Sue.
'''

from collections import Counter

def not_the_same(user, other_user):
    """Return True when the two user records carry different ids."""
    user_id, other_id = user["id"], other_user["id"]
    return user_id != other_id

def not_friends(user, other_user):
    """Return True when `other_user` matches none of `user`'s friends by id.

    A user with an empty friend list is "not friends" with everyone.
    """
    for friend in user["friends"]:
        if not not_the_same(friend, other_user):
            # Found a friend with the same id as other_user.
            return False
    return True

def friends_of_friend_ids(user):
    """Count, per id, how many of `user`'s friends know that person.

    The user themselves and anyone already in the user's friend list are
    left out. Returns a Counter mapping friend-of-friend id to the number
    of mutual friends.
    """
    mutual_counts = Counter()
    for friend in user["friends"]:          # for each of my friends
        for foaf in friend["friends"]:      # look at their friends
            # keep only people who are not me and aren't my friends
            if not_the_same(user, foaf) and not_friends(user, foaf):
                mutual_counts[foaf["id"]] += 1
    return mutual_counts

# Show the friend-of-friend counts for users[3]; the cell output follows.
print(friends_of_friend_ids(users[3]))

Counter({0: 2, 5: 1})


In [33]:
# (user_id, interest) pairs, laid out with each user's entries together.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"),
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"), (2, "numpy"),
    (2, "statsmodels"), (2, "pandas"),
    (3, "R"), (3, "Python"), (3, "statistics"), (3, "regression"),
    (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"),
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"), (5, "Haskell"),
    (5, "programming languages"),
    (6, "statistics"), (6, "probability"), (6, "mathematics"),
    (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"),
    (8, "neural networks"), (8, "deep learning"), (8, "Big Data"),
    (8, "artificial intelligence"),
    (9, "Hadoop"), (9, "Java"), (9, "MapReduce"), (9, "Big Data"),
]

def data_scientists_who_like(target_interest):
    """Return the ids of users whose interest equals `target_interest`.

    Scans the whole `interests` list on every call; ids come back in the
    order their pairs appear in that list.
    """
    matching_ids = []
    for user_id, user_interest in interests:
        if user_interest == target_interest:
            matching_ids.append(user_id)
    return matching_ids

# print(data_scientists_who_like("machine learning"))

from collections import defaultdict

# Index: interest -> ids of the users who listed it.
user_ids_by_interest = defaultdict(list)
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

# Reverse index: user id -> the interests that user listed.
interests_by_user_id = defaultdict(list)
for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)

def most_common_interests_with(user):
    """Count how many interests each other user shares with `user`.

    Walks `user`'s interests via `interests_by_user_id`, then every user id
    registered under each interest in `user_ids_by_interest`, skipping
    `user`'s own id. Returns a Counter mapping other-user id to the number
    of shared interests.
    """
    own_id = user["id"]
    shared_counts = Counter()
    for interest in interests_by_user_id[own_id]:
        for interested_user_id in user_ids_by_interest[interest]:
            if interested_user_id != own_id:
                shared_counts[interested_user_id] += 1
    return shared_counts

# Dump both lookup tables built from `interests`.
print(user_ids_by_interest)
print(interests_by_user_id)

# Other user ids mapped to how many interests they share with users[0].
print(most_common_interests_with(users[0]))

defaultdict(<class 'list'>, {'Hadoop': [0, 9], 'Big Data': [0, 8, 9], 'HBase': [0, 1], 'Java': [0, 5, 9], 'Spark': [0], 'Storm': [0], 'Cassandra': [0, 1], 'NoSQL': [1], 'MongoDB': [1], 'Postgres': [1], 'Python': [2, 3, 5], 'scikit-learn': [2, 7], 'scipy': [2], 'numpy': [2], 'statsmodels': [2], 'pandas': [2], 'R': [3, 5], 'statistics': [3, 6], 'regression': [3, 4], 'probability': [3, 6], 'machine learning': [4, 7], 'decision trees': [4], 'libsvm': [4], 'C++': [5], 'Haskell': [5], 'programming languages': [5], 'mathematics': [6], 'theory': [6], 'Mahout': [7], 'neural networks': [7, 8], 'deep learning': [8], 'artificial intelligence': [8], 'MapReduce': [9]})
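defaultdict(<class 'list'>, {0: ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra'], 1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'], 2: ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas'], 3: ['R', 'Python', 'statistics', 'regression', 'probability'], 4: ['machine learning', 'regression', 'decision trees', 'libsvm'], 5: ['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages'], 6: ['statistics', 'probability', 'mathematics', 'theory'], 7: ['machine learning', 'scikit-learn', 'Mahout', 'neural networks'], 8: ['neural networks', 'deep learning', 'Big Data', 'artificial intelligence'], 9: ['Hadoop', 'Java', 'MapReduce', 'Big Data']})
Counter({9: 3, 1: 2, 8: 1, 5: 1})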

# Salaries and Experience

In [38]:
# (salary, tenure) pairs, one per data scientist.
salaries_and_tenures = [
    (83000, 8.7), (88000, 8.1),
    (48000, 0.7), (76000, 6),
    (69000, 6.5), (76000, 7.5),
    (60000, 2.5), (83000, 10),
    (48000, 1.9), (63000, 4.2),
]

# keys are years, values are lists of the salaries for each tenure
salary_by_tenure = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# keys are years, each value is average salary for that tenure
average_salary_by_tenure = {
    years: sum(pay_list) / len(pay_list)
    for years, pay_list in salary_by_tenure.items()
}

print(average_salary_by_tenure)

def tenure_bucket(tenure):
    """Map a tenure in years onto one of three coarse bucket labels.

    Below 2 gives "less than two"; from 2 up to (but not including) 5 gives
    "between two and five"; everything else, including exactly 5, gives
    "more than five".
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"
    
# keys are tenure bucket labels, values are lists of the salaries in that bucket
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# keys are tenure bucket labels, each value is average salary for that bucket
# (the loop variable is named so it does not shadow the tenure_bucket function)
average_salary_by_bucket = {
    bucket_label: sum(bucket_salaries) / len(bucket_salaries)
    for bucket_label, bucket_salaries in salary_by_tenure_bucket.items()
}

print(average_salary_by_bucket)

{8.7: 83000.0, 8.1: 88000.0, 0.7: 48000.0, 6: 76000.0, 6.5: 69000.0, 7.5: 76000.0, 2.5: 60000.0, 10: 83000.0, 1.9: 48000.0, 4.2: 63000.0}
{'more than five': 79166.66666666667, 'less than two': 48000.0, 'between two and five': 61500.0}


Your possible analysis could be: "Data scientists with more than five years experience earn 65% more than data scientists with little or no experience!"

# Topics of Interest

In [44]:
# Lowercase every interest, split it on whitespace, and tally the words
# across all users.
words_and_counts = Counter(
    word
    for _user_id, interest in interests
    for word in interest.lower().split()
)

# Report, most frequent first, only the words that occur more than once.
for word, count in words_and_counts.most_common():
    if count <= 1:
        continue
    print(f"{word} {count}")

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2
