List Comprehensions and Generators

In [2]:
# Step 1: import modules
from collections import Counter
import calendar
import itertools
import random
import re
import requests
import string

List Comprehensions

In [2]:
names = 'pybites matt bob julian sam'.split()
names

['pybites', 'matt', 'bob', 'julian', 'sam']

In [3]:
# Print every name from the list in title case, one per line.
for name in names:
    print(name.title())

Pybites
Matt
Bob
Julian
Sam


In [4]:
# Next we only want names that start with a-m. The string module exposes the
# lowercase ASCII alphabet, so take its first 13 letters as a list of characters.
first_half_alphabet = list(string.ascii_lowercase[:13])
first_half_alphabet

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm']

In [6]:
# Explicit-loop version: collect the title-cased form of every name whose
# first character is one of the letters in first_half_alphabet.
new_names = []
for name in names:
    # name[0] is the first character; the sample names are all lowercase
    if name[0] in first_half_alphabet:
        new_names.append(name.title())
new_names

['Matt', 'Bob', 'Julian']

In [9]:
# Same result as the loop-based cell above, written as a single list
# comprehension: the trailing `if` filters and the leading expression transforms.
new_names2 = [name.title() for name in names if name[0] in first_half_alphabet]
new_names2

['Matt', 'Bob', 'Julian']

In this part of the notebook we use list comprehensions to clean up a list of words.

In [3]:
# Download the sample text and split it into lowercase, whitespace-separated tokens.
# The timeout keeps the cell from hanging indefinitely on an unresponsive server,
# and raise_for_status() stops us from tokenizing an HTTP error page by mistake.
resp = requests.get('http://projects.bobbelderbos.com/pcc/harry.txt', timeout=10)
resp.raise_for_status()
words = resp.text.lower().split()
words[:5]

['the', 'boy', 'who', 'lived', 'mr.']

Now to get the most common words.

In [4]:
# Counter tallies how many times each distinct token occurs in words.
cnt = Counter(words)
# Show the five tokens with the highest counts as (token, count) pairs.
cnt.most_common(5)

[('the', 202), ('he', 136), ('a', 108), ('and', 100), ('to', 93)]

Notice from the earlier output that some tokens still carry non-alphabetic characters (for example `'mr.'`), so let's get rid of those.

In [9]:
# \W matches any character that is not a word character (letter, digit, or
# underscore); \W+ removes each run of such characters from every token.
# Tokens made up only of such characters become empty strings.
words = [re.sub(r'\W+', r'', word) for word in words]

Now verify that the words were maintained but the extraneous characters were removed.

In [10]:
# Caveat: '_' counts as a word character, so \W+ never strips it; this check
# only confirms that no token is exactly '_'.
'_' in words

False

In [11]:
'the' in words

True

Now, let's try this again and try to reduce the number of cells required for this exercise.

In [13]:
# Download the stop-word list and split it into lowercase tokens.
# The timeout keeps the cell from hanging indefinitely on an unresponsive server,
# and raise_for_status() stops us from treating an HTTP error page as stop words.
resp = requests.get('http://projects.bobbelderbos.com/pcc/stopwords.txt', timeout=10)
resp.raise_for_status()
stopwords = resp.text.lower().split()
len(stopwords)

319

In [16]:
# Membership tests against a set are O(1) on average, versus a linear scan of
# the stopwords list for every token.
stopword_set = set(stopwords)
# Drop tokens that are empty or whitespace-only (e.g. tokens that were pure
# punctuation before the \W+ cleanup) as well as stop words.
words = [word for word in words if word.strip() and word not in stopword_set]
words
# NOTE: the parentheses on strip() matter. A bare `word.strip` is a bound method
# object, which is always truthy, so the emptiness filter would silently become
# a no-op instead of raising an error.

['boy',
 'lived',
 'mr',
 'mrs',
 'dursley',
 'number',
 'privet',
 'drive',
 'proud',
 'say',
 'perfectly',
 'normal',
 'thank',
 'people',
 'youd',
 'expect',
 'involved',
 'strange',
 'mysterious',
 'just',
 'didnt',
 'hold',
 'nonsense',
 'mr',
 'dursley',
 'director',
 'firm',
 'called',
 'grunnings',
 'drills',
 'big',
 'beefy',
 'man',
 'hardly',
 'neck',
 'did',
 'large',
 'mustache',
 'mrs',
 'dursley',
 'blonde',
 'nearly',
 'twice',
 'usual',
 'neck',
 'came',
 'useful',
 'spent',
 'time',
 'craning',
 'garden',
 'fences',
 'spying',
 'neighbors',
 'dursleys',
 'small',
 'son',
 'called',
 'dudley',
 'opinion',
 'finer',
 'boy',
 'dursleys',
 'wanted',
 'secret',
 'greatest',
 'fear',
 'somebody',
 'discover',
 'didnt',
 'think',
 'bear',
 'potters',
 'mrs',
 'potter',
 'mrs',
 'dursleys',
 'sister',
 'hadnt',
 'met',
 'years',
 'fact',
 'mrs',
 'dursley',
 'pretended',
 'didnt',
 'sister',
 'sister',
 'goodfornothing',
 'husband',
 'undursleyish',
 'possible',
 'dursleys',


Generators

In [1]:
# this is just a proof of concept on how generators work
def num_gen():
    """Yield the integers 0 through 4, one at a time, as they are requested."""
    # Delegate to range(5); each value is handed out lazily via the generator.
    yield from range(5)
        
gen = num_gen()

In [2]:
next(gen)

0

In [4]:
for i in gen:
    print(i)

1
2
3
4


In [5]:
# The for loop above consumed the remaining values, so the generator is now
# exhausted and this call to next() raises StopIteration (shown on purpose).
next(gen)

StopIteration: 

Now let's do a direct comparison between a list and generator.

In [1]:
import calendar
# list
def leap_years_list(n=1000000):
    """Return a list of every leap year from 1 through n (inclusive).

    The whole list is built in memory before it is returned, which is what
    makes it a useful contrast to the generator version.

    Args:
        n: Last year to test (inclusive). Values below 4 give an empty list.

    Returns:
        list[int]: Leap years in ascending order.
    """
    leap_years = []
    for year in range(1, n+1):
        # The stdlib function is calendar.isleap; calendar.isleapyear does not
        # exist and raises AttributeError.
        if calendar.isleap(year):
            leap_years.append(year)
    return leap_years

# generators
def leap_years_gen(n=1000000):
    """Lazily yield every leap year from 1 through n (inclusive).

    Unlike the list version, no result list is built: each leap year is
    produced only when the caller asks for the next value.

    Args:
        n: Last year to test (inclusive). Values below 4 yield nothing.

    Yields:
        int: Leap years in ascending order.
    """
    for year in range(1, n+1):
        # The stdlib function is calendar.isleap; calendar.isleapyear does not
        # exist and raises AttributeError.
        if calendar.isleap(year):
            yield year

In [4]:
%timeit -n1 leap_years_list()

NameError: name 'calendar' is not defined