In [1]:
import numpy as np
import pandas as pd

## Examples of list comprehensions

In [4]:
# the simplest list comprehension: collect every value produced by range(20)
[n for n in range(20)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [6]:
# add an 'if' clause to filter: keep only the even numbers
[n for n in range(20) if n % 2 == 0]

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [8]:
# combine conditions in the 'if' clause and transform the kept values:
# square the numbers that are divisible by both 2 and 3
[n * n for n in range(20) if n % 2 == 0 and n % 3 == 0]

[0, 36, 144, 324]

## List comprehensions for tokenizing a sentence

In [9]:
# short example sentence ending in punctuation
s = "This is a sentence!"
print(s)

This is a sentence!


In [10]:
# longer example in which every word is followed by punctuation
ls = "According; to; Vonnegut; you; should; never; use; semi-colons."
print(ls)

According; to; Vonnegut; you; should; never; use; semi-colons.


In [14]:
# str.split() with no arguments splits on whitespace only, so punctuation
# stays attached to its word (note the trailing '!' on 'sentence!')
s.split()

['This', 'is', 'a', 'sentence!']

In [13]:
# same whitespace split on the longer string: each ';' stays glued to its word
ls.split()

['According;',
 'to;',
 'Vonnegut;',
 'you;',
 'should;',
 'never;',
 'use;',
 'semi-colons.']

In [17]:
# lower-case each whitespace-separated token, keeping only the purely
# alphabetic ones; 'sentence!' is dropped because of its trailing '!'
[word.lower() for word in s.split() if word.isalpha()]

['this', 'is', 'a']

In [18]:
# the same filter on ls yields an empty list: every token contains a
# non-letter character (';', '.' or '-'), so isalpha() is False for all of them
[word.lower() for word in ls.split() if word.isalpha()]

[]

In [13]:
# TODO (exercise): use a list comprehension to keep only the alphanumeric words (see str.isalnum) and convert them to lower case

## Make this easier with NLTK

In [43]:
# Note: you may need to pip or conda install nltk

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# fetch the tokenizer model and the stop-word lists; nltk reports
# "already up-to-date" when they are already present locally
# NOTE(review): recent NLTK releases may also need the 'punkt_tab' resource
# for word_tokenize — confirm against the installed version
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop words held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Porter stemmer instance used to stem tokens further down
ps = PorterStemmer()

[nltk_data] Downloading package punkt to /Users/mme/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mme/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# word_tokenize splits punctuation into its own token, unlike str.split():
# the '!' is now separate from 'sentence'
word_tokenize(s)

['This', 'is', 'a', 'sentence', '!']

In [21]:
# every ';' and the final '.' come out as separate tokens
word_tokenize(ls)

['According',
 ';',
 'to',
 ';',
 'Vonnegut',
 ';',
 'you',
 ';',
 'should',
 ';',
 'never',
 ';',
 'use',
 ';',
 'semi-colons',
 '.']

In [38]:
# rebind ls to a variant containing a contraction ("can't") to see how it is tokenized;
# note this replaces the earlier value of ls for every cell run after this one
ls = "According; to; Vonnegut; you; can't use; semi-colons."
print(ls)

According; to; Vonnegut; you; can't use; semi-colons.


In [35]:
# echo ls to check which value it is currently bound to
ls

"According; to; Vonnegut; you; can't use; semi-colons."

In [39]:
# the contraction "can't" is split into two tokens: 'ca' and "n't"
word_tokenize(ls)

['According',
 ';',
 'to',
 ';',
 'Vonnegut',
 ';',
 'you',
 ';',
 'ca',
 "n't",
 'use',
 ';',
 'semi-colons',
 '.']

In [44]:
# tokenize ls, lower-case each token, drop stop words, then stem what is left.
# Lower-casing happens BEFORE the stop-word test: the stop-word list is
# lower-case, so checking the raw token would let capitalised stop words
# such as 'This' or 'You' slip through.
# Punctuation tokens are not stop words, so they are still kept here.
[
    ps.stem(token)
    for token in (w.lower() for w in word_tokenize(ls))
    if token not in stop_words
]

['accord',
 ';',
 ';',
 'vonnegut',
 ';',
 ';',
 'ca',
 "n't",
 'use',
 ';',
 'semi-colon',
 '.']

In [None]:
# TODO (exercise): create a function that uses list comprehensions to tokenize a string, remove stop words and non-alphanumeric tokens, and lower-case the result