In [1]:
# One dict per user; ids are assigned 0..9 in list order, so users[i]["id"] == i.
users = list(
    {"id": user_id, "name": name}
    for user_id, name in enumerate([
        "Hero", "Dunn", "Sue", "Chi", "Thor",
        "Clive", "Hicks", "Devin", "Kate", "Klein",
    ])
)


In [2]:
# Each pair (i, j) is a friendship between the users with ids i and j.
friendships = [
    (0, 1), (0, 2),
    (1, 2), (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6), (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]


In [3]:
# Give every user dict a fresh, empty "friends" list.
for user in users:
    user["friends"] = []

In [4]:
# Record every friendship in both directions.
for i, j in friendships:
    # this works because users[i] is the user whose id is i
    # note: the user dicts themselves (not their ids) are appended, so the
    # resulting structure refers back to itself
    users[i]["friends"].append(users[j]) # add j as a friend of i
    users[j]["friends"].append(users[i]) # add i as a friend of j

In [5]:
def number_of_friends(user):
    """Return the number of entries in user["friends"]."""
    friend_list = user["friends"]
    return len(friend_list)


In [6]:
# Sum of every user's friend count (each friendship is counted from both ends).
total_connections = sum(map(number_of_friends, users))

In [7]:
total_connections

24

In [8]:
[number_of_friends(user) for user in users]

[2, 3, 3, 3, 2, 3, 2, 2, 3, 1]

In [9]:
from __future__ import division                   # integer division is lame
num_users = len(users)                            # number of user dicts
avg_connections = total_connections / num_users   # true division thanks to the __future__ import above; 2.4 for this data

In [10]:
avg_connections

2.4

In [11]:
# (user_id, friend count) pair for every user, in users order
num_friends_by_id = [(user["id"], len(user["friends"])) for user in users]

In [12]:
# Rank users from most to fewest friends. Each element is a
# (user_id, num_friends) pair; the key function indexes into the pair
# instead of unpacking it in the lambda signature (tuple parameters were
# removed from the language by PEP 3113).
sorted(num_friends_by_id,
       key=lambda pair: pair[1],   # sort on num_friends
       reverse=True)


[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

In [13]:
users[0]['friends']

[{'friends': [{'friends': [...], 'id': 0, 'name': 'Hero'},
   {'friends': [{'friends': [...], 'id': 0, 'name': 'Hero'},
     {...},
     {'friends': [{...},
       {...},
       {'friends': [{...},
         {'friends': [{...},
           {'friends': [{...},
             {'friends': [{...},
               {'friends': [{...}, {...}], 'id': 7, 'name': 'Devin'},
               {'friends': [{...}], 'id': 9, 'name': 'Klein'}],
              'id': 8,
              'name': 'Kate'}],
            'id': 6,
            'name': 'Hicks'},
           {'friends': [{...},
             {'friends': [{'friends': [{...}, {...}],
                'id': 6,
                'name': 'Hicks'},
               {...},
               {'friends': [{...}], 'id': 9, 'name': 'Klein'}],
              'id': 8,
              'name': 'Kate'}],
            'id': 7,
            'name': 'Devin'}],
          'id': 5,
          'name': 'Clive'}],
        'id': 4,
        'name': 'Thor'}],
      'id': 3,
      'name': 'Chi'}],
   

In [14]:
def friends_of_friend_ids_bad(user):
    """Return the ids of every friend-of-a-friend ("foaf") of user.

    Nothing is filtered out: the result can contain user's own id, ids of
    direct friends, and repeated ids.
    """
    foaf_ids = []
    for friend in user["friends"]:                 # each direct friend ...
        foaf_ids.extend(foaf["id"]                 # ... contributes all of
                        for foaf in friend["friends"])  # their own friends
    return foaf_ids

In [15]:
    friends_of_friend_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

In [16]:
print [friend["id"] for friend in users[0]["friends"]]  # [1, 2]
print [friend["id"] for friend in users[1]["friends"]]  # [0, 2, 3]
print [friend["id"] for friend in users[2]["friends"]]  # [0, 1, 3]

[1, 2]
[0, 2, 3]
[0, 1, 3]


In [17]:
all([True, True])

True

In [22]:
from collections import Counter                       # not loaded by default

def not_the_same(user, other_user):
    """Return True when the two user dicts carry different "id" values."""
    first_id, second_id = user["id"], other_user["id"]
    return first_id != second_id


def not_friends(user, other_user):
    """Return True when other_user does not appear in user["friends"].

    Membership is decided by id: other_user must be not_the_same as every
    entry of user["friends"].
    """
    for friend in user["friends"]:
        if not not_the_same(friend, other_user):
            return False        # found a friend with the same id
    return True

def friends_of_friend_ids(user):
    """Count friends-of-friends of user, keyed by their id.

    An id is counted once per direct friend that lists it; user itself and
    user's direct friends are excluded.
    """
    foaf_counts = Counter()
    for friend in user["friends"]:              # for each of my friends
        for foaf in friend["friends"]:          # look at *their* friends
            if not_the_same(user, foaf) and not_friends(user, foaf):
                foaf_counts[foaf["id"]] += 1    # neither me nor my friend
    return foaf_counts


In [23]:
friends_of_friend_ids(users[3])

Counter({0: 2, 5: 1})

In [24]:
# (user_id, interest) pairs, laid out one user per row.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"), (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"), (1, "Postgres"),
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"), (2, "numpy"), (2, "statsmodels"), (2, "pandas"),
    (3, "R"), (3, "Python"), (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"), (4, "libsvm"),
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"), (5, "Haskell"), (5, "programming languages"),
    (6, "statistics"), (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"), (7, "neural networks"),
    (8, "neural networks"), (8, "deep learning"), (8, "Big Data"), (8, "artificial intelligence"),
    (9, "Hadoop"), (9, "Java"), (9, "MapReduce"), (9, "Big Data"),
]

In [25]:
def data_scientists_who_like(target_interest):
    """Return the user ids paired with target_interest in `interests`.

    Scans the whole list on every call; the comparison is an exact
    string equality.
    """
    matching_ids = []
    for user_id, user_interest in interests:
        if user_interest == target_interest:
            matching_ids.append(user_id)
    return matching_ids

In [26]:
from collections import defaultdict

# keys are interests, values are lists of user_ids with that interest
# (defaultdict(list) supplies the empty list the first time a key is used)
user_ids_by_interest = defaultdict(list)

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

In [27]:
# keys are user_ids, values are lists of interests for that user_id
# (defaultdict(list) supplies the empty list the first time a key is used)
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)

In [31]:
def most_common_interests_with(user):
    """Count, per other user id, how many interests are shared with user.

    Walks user's interests via interests_by_user_id and, for each one, the
    ids listed in user_ids_by_interest; user's own id is skipped.
    """
    own_id = user["id"]
    shared_counts = Counter()
    for interest in interests_by_user_id[own_id]:
        for interested_user_id in user_ids_by_interest[interest]:
            if interested_user_id != own_id:
                shared_counts[interested_user_id] += 1
    return shared_counts

In [32]:
most_common_interests_with(users[0])

Counter({1: 2, 5: 1, 8: 1, 9: 3})

In [33]:
# (salary, tenure) pairs
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

In [34]:
# group the salaries under their exact tenure value
salary_by_tenure = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# mean salary for each distinct tenure value
average_salary_by_tenure = dict(
    (tenure, sum(salaries) / len(salaries))
    for tenure, salaries in salary_by_tenure.items()
)

In [35]:
average_salary_by_tenure

{0.7: 48000.0,
 1.9: 48000.0,
 2.5: 60000.0,
 4.2: 63000.0,
 6: 76000.0,
 6.5: 69000.0,
 7.5: 76000.0,
 8.1: 88000.0,
 8.7: 83000.0,
 10: 83000.0}

In [48]:
# same keys as the literal dict, every value incremented by one
my_map = dict(
    (key, value + 1)
    for key, value in {'a':1,'b':2,'c':5}.items()
)

In [49]:
my_map

{'a': 2, 'b': 3, 'c': 6}

In [50]:
def tenure_bucket(tenure):
    """Map a tenure to one of three labels.

    Below 2 -> "less than two"; from 2 up to (not including) 5 ->
    "between two and five"; everything else -> "more than five".
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"

In [51]:
# keys are tenure buckets, values are lists of salaries for that bucket
# (the bucket label for each pair comes from tenure_bucket)
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

In [52]:
# keys are tenure buckets, values are average salary for that bucket
# - .items() matches the other dict iterations in this notebook
# - the loop variable is named bucket_name so it does not shadow the
#   tenure_bucket function
# - float(...) keeps the mean a float without depending on the
#   `from __future__ import division` executed in an earlier cell
average_salary_by_bucket = {
    bucket_name: float(sum(salaries)) / len(salaries)
    for bucket_name, salaries in salary_by_tenure_bucket.items()
}

In [53]:
# keys are tenure buckets, values are average salary for that bucket
# NOTE(review): this cell repeats the previous cell verbatim; re-running it
# just rebuilds the same dict.
# - .items() matches the other dict iterations in this notebook
# - the loop variable is named bucket_name so it does not shadow the
#   tenure_bucket function
# - float(...) keeps the mean a float without depending on the
#   `from __future__ import division` executed in an earlier cell
average_salary_by_bucket = {
    bucket_name: float(sum(salaries)) / len(salaries)
    for bucket_name, salaries in salary_by_tenure_bucket.items()
}

In [54]:
average_salary_by_bucket

{'between two and five': 61500.0,
 'less than two': 48000.0,
 'more than five': 79166.66666666667}

In [55]:
def predict_paid_or_unpaid(years_experience):
    """Return "unpaid" for experience from 3.0 up to (not including) 8.5,
    and "paid" for anything below 3.0 or at/above 8.5."""
    in_unpaid_band = not years_experience < 3.0 and years_experience < 8.5
    return "unpaid" if in_unpaid_band else "paid"

In [56]:
# Count each lowercased, whitespace-separated word across all interest strings.
words_and_counts = Counter(
    word
    for pair in interests
    for word in pair[1].lower().split()
)

In [57]:
for word, count in words_and_counts.most_common():
    if count > 1:
        print word, count

learning 3
java 3
python 3
big 3
data 3
hbase 2
regression 2
cassandra 2
statistics 2
probability 2
hadoop 2
networks 2
machine 2
neural 2
scikit-learn 2
r 2


In [58]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [59]:
lookup = defaultdict(int)

In [60]:
lookup['a']

0

In [61]:
5/2

2.5

In [65]:
def funct(a, b):
    """Return a + b."""
    total = a + b
    return total

In [66]:
print funct(b=3,a=10)

13


In [67]:
x = range(10)
y = x[1:-1]   # everything except the first and last element
y[0] =3       # changes y only ...
print y
print x       # ... x still prints unchanged (see the output below)

[3, 2, 3, 4, 5, 6, 7, 8]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [68]:
x=5,

In [70]:
type(x)

tuple

In [71]:
x= [1,2,3]
# swap the first two elements in place with a tuple assignment
x[1],x[0] = x[0],x[1]

In [72]:
print x

[2, 1, 3]


In [73]:
# sample record used by the dict-method cells that follow
tweet = dict(
    user="joelgrus",
    text="Data Science is Awesome",
    retweet_count=100,
    hashtags=["#data", "#science", "#datascience", "#awesome", "#yolo"],
)

In [74]:
tweet_keys   = tweet.keys()

In [75]:
[tw for tw in tweet_keys]

['text', 'retweet_count', 'hashtags', 'user']

In [76]:
tweet_values = tweet.values()
[tw for tw  in tweet_values]

['Data Science is Awesome',
 100,
 ['#data', '#science', '#datascience', '#awesome', '#yolo'],
 'joelgrus']

In [82]:
[ (k,v) for k,v in tweet.items()]

[('text', 'Data Science is Awesome'),
 ('retweet_count', 100),
 ('hashtags', ['#data', '#science', '#datascience', '#awesome', '#yolo']),
 ('user', 'joelgrus')]

In [83]:
x=10

"big!" if x >= 10 else "small!"

'big!'

In [86]:
"if" if [] else "else"

'else'

In [87]:
"if" if 0 else "else"

'else'

In [89]:
"pepe" or "carlos"

'pepe'

In [90]:
"pepe" and "carlos"

'carlos'

In [91]:
1 and "last"

'last'

In [92]:
# `or` returns its first truthy operand, so the right-hand side is never
# evaluated here; it is written as the string "last" (as in the `and` cell
# above) rather than a bare, undefined name.
1 or "last"

1

In [93]:
all([])   

True

In [94]:
x = [4,1,2,3]

In [95]:
x.sort()

In [96]:
x

[1, 2, 3, 4]

In [97]:
y = sorted(x, reverse=True)
print x
print y


[1, 2, 3, 4]
[4, 3, 2, 1]


In [99]:
sorted([-4,1,-2,3], key=abs, reverse=True) 

[-4, 3, -2, 1]

In [100]:
{ x : x * x for x in range(5) }

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16}

In [106]:
{x*x for x in range(-5,5) }

{0, 1, 4, 9, 16, 25}

In [107]:
def lazy_range(n):
    """Generator yielding 0, 1, 2, ... one value at a time, stopping
    as soon as the next value is no longer below n."""
    current = 0
    while True:
        if not current < n:
            return
        yield current
        current += 1

In [108]:
# generator expression: nothing is computed until it is iterated
lazy_evens_below_20 = (
    number
    for number in lazy_range(20)
    if number % 2 == 0
)

In [109]:
lazy_evens_below_20

<generator object <genexpr> at 0x7fbc0c364820>

In [110]:
for x in lazy_evens_below_20:
    print x

0
2
4
6
8
10
12
14
16
18


In [112]:
import random
# random integer chosen from range(10), i.e. 0..9
# (was misspelled `randrane`, which the random module does not define)
random.randrange(10)

7

In [113]:
random.randrange(3, 6)

5

In [117]:
x = range(10)
random.shuffle(x)
x

[6, 7, 1, 2, 9, 3, 5, 4, 8, 0]

In [118]:
random.choice(["Alice", "Bob", "Charlie"])

'Charlie'

In [121]:
lottery_numbers = range(60)
random.sample(lottery_numbers, 6)

[6, 35, 53, 27, 40, 11]

In [122]:
import re
print all([                                # all of these are true, because
    not re.match("a", "cat"),              # * 'cat' doesn't start with 'a'
    re.search("a", "cat"),                 # * 'cat' has an 'a' in it
    not re.search("c", "dog"),             # * 'dog' doesn't have a 'c' in it
    3 == len(re.split("[ab]", "carbs")),   # * split on a or b to ['c','r','s']
    "R-D-" == re.sub("[0-9]", "-", "R2D2") # * replace digits with dashes
    ]) 

True


In [123]:
def exp(base, power):
    """Return base raised to power."""
    return pow(base, power)

In [124]:
from functools import partial
two_to_the = partial(exp, 2)     # is now a function of one variable
print two_to_the(3)         

8


In [125]:
def double(x):
    """Return 2 * x."""
    doubled = 2 * x
    return doubled

xs = [1, 2, 3, 4]
twice_xs = [double(x) for x in xs]        # [2, 4, 6, 8]
twice_xs = map(double, xs)                # same as above
list_doubler = partial(map, double)       # *function* that doubles a list
twice_xs = list_doubler(xs)               # again [2, 4, 6, 8]

In [126]:
map(lambda x:x**2, xrange(4))

[0, 1, 4, 9]

In [138]:
# multiply the zipped pairs element-wise; the pair is indexed rather than
# unpacked in the lambda signature (tuple parameters were removed from the
# language by PEP 3113)
map(lambda pair: pair[0] * pair[1], zip(xrange(4), xrange(10, 6, -1)))

[0, 9, 16, 21]

In [129]:
zip(xrange(4),xrange(4))

[(0, 0), (1, 1), (2, 2), (3, 3)]

In [139]:
def is_even(x):
    """Return True when x leaves no remainder on division by 2, else False."""
    remainder = x % 2
    return remainder == 0

x_evens = [x for x in xs if is_even(x)]    # [2, 4]
x_evens = filter(is_even, xs)              # same as above
list_evener = partial(filter, is_even)     # *function* that filters a list
x_evens = list_evener(xs)                  # again [2, 4]


In [141]:
reduce(lambda x,y:x*y, xrange(1,5)) 

24

In [142]:
def magic(*args, **kwargs):
    """Print the positional arguments it received (collected as a tuple)
    and the keyword arguments it received (collected as a dict)."""
    print "unnamed args:", args
    print "keyword args:", kwargs

magic(1, 2, key="word", key2="word2")

unnamed args: (1, 2)
keyword args: {'key2': 'word2', 'key': 'word'}


In [144]:
def other_way_magic(x, y, z):
    """Return x + y + z (added left to right)."""
    partial_sum = x + y
    return partial_sum + z

x_y_list = [1, 2]
z_dict = { "z" : 3 }
print other_way_magic(*x_y_list, **z_dict)

6


In [145]:
def f2(x, y):
    """Return x + y."""
    total = x + y
    return total

def doubler_correct(f):
    """Return a wrapper around f that yields 2 * (f's result).

    The wrapper accepts any positional and keyword arguments, so it works
    regardless of f's signature.
    """
    def g(*args, **kwargs):
        """forward all received arguments to f, then double its result"""
        result = f(*args, **kwargs)
        return 2 * result
    return g

g = doubler_correct(f2)
print g(1, 2) 

6


In [146]:
zip(('a', 1), ('b', 2), ('c', 3))

[('a', 'b', 'c'), (1, 2, 3)]