In [1]:
# sample sentence reused by the later cells
s = "Hi, my name is Ian."

In [3]:
# case normalization: lower() returns a lowercased copy and leaves s itself unchanged
s.lower()

'hi, my name is ian.'

In [4]:
# tokenization, naive version
s.split()  # split on whitespace only, so punctuation stays glued to the words ('Hi,' and 'Ian.')

['Hi,', 'my', 'name', 'is', 'Ian.']

In [11]:
# for more complicated sentences, use NLTK's tokenizer: it emits punctuation as separate tokens
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data downloaded first
# (e.g. nltk.download('punkt')) -- confirm on a fresh environment
from nltk import word_tokenize
word_tokenize(s)

['Hi', ',', 'my', 'name', 'is', 'Ian', '.']

In [12]:
# test on an email address: the tokenizer breaks it apart around the '@'
word_tokenize("ian.noriega@queensu.ca")

['ian.noriega', '@', 'queensu.ca']

In [13]:
# test on a complicated sentence: the contraction is split ("Let" + "'s"), the
# abbreviation "U.N." is kept whole, and each "!" becomes its own token
word_tokenize("Let's go to the U.N. in NYC!!!")

['Let', "'s", 'go', 'to', 'the', 'U.N.', 'in', 'NYC', '!', '!', '!']

In [14]:
# build n-grams (tuples of n consecutive tokens) from the tokenized sentence
from nltk import ngrams
tokens = word_tokenize(s)
# ngrams() hands back a one-shot iterator; store the result in a list so the cell
# that prints the grams can be re-run without silently printing nothing
grams = list(ngrams(tokens, 2))  # second argument is n, the length of each n-gram

In [16]:
# each 2-gram is a tuple of two adjacent tokens; the 7 tokens give 6 of them
for gram in grams:
    print(gram)  # one 2-gram per line

('Hi', ',')
(',', 'my')
('my', 'name')
('name', 'is')
('is', 'Ian')
('Ian', '.')


In [17]:
# let's try for 3-grams
# ngrams() hands back a one-shot iterator; store a list so the printing cell can be re-run
grams = list(ngrams(tokens, 3))

In [18]:
# each 3-gram is a tuple of three adjacent tokens; the 7 tokens give 5 of them
for gram in grams:
    print(gram)  # one 3-gram per line

('Hi', ',', 'my')
(',', 'my', 'name')
('my', 'name', 'is')
('name', 'is', 'Ian')
('is', 'Ian', '.')


In [19]:
# regular expressions
import re
# returns a new string with every match of the pattern 'Ian' replaced by 'Steve'; s itself is not modified
re.sub(r'Ian', 'Steve', s)

'Hi, my name is Steve.'

In [20]:
# testing with a phrase containing accented (non-ASCII) characters
s2 = 'My name is àèù Ian'

In [24]:
import unidecode
# transliterate to plain ASCII: each accented letter is replaced by its unaccented counterpart
unidecode.unidecode(s2)

'My name is aeu Ian'

In [26]:
# remove numbers (similar to gsub in R, which substitutes matches; grepl only tests for a match)
# note: deleting the digits keeps the spaces on both sides, hence the double space in the result
re.sub(r'\d+', '', 'There are only 4 classes left')

'There are only  classes left'

In [30]:
# lemmatization (reduce a word to its dictionary/root form)
# NOTE(review): presumably requires the WordNet corpus (nltk.download('wordnet')) -- confirm
from nltk.stem import WordNetLemmatizer
WordNetLemmatizer().lemmatize('caresses')

'caress'

In [31]:
# with no pos argument the word is looked up as a noun, and the noun 'meeting' is already a lemma
WordNetLemmatizer().lemmatize('meeting')

'meeting'

In [39]:
# ...would expect deny... but the default pos='n' treats 'denied' as a noun, so it comes
# back unchanged; lemmatize('denied', pos='v') looks it up as a verb and gives 'deny'
WordNetLemmatizer().lemmatize('denied')

'denied'

In [40]:
# stopwords
# NOTE(review): presumably requires the stopwords corpus (nltk.download('stopwords')) -- confirm
from nltk.corpus import stopwords
stopwords.words('english')  # view the English stopwords (a list of lowercase strings)

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 ...]

In [41]:
# recall the tokens produced by word_tokenize(s) above
tokens

['Hi', ',', 'my', 'name', 'is', 'Ian', '.']

In [43]:
# stopword removal: drop every token that appears in NLTK's English stopword list
# build the lookup once -- calling stopwords.words() inside the comprehension rebuilds
# the whole list for every token, and a set makes each membership test constant-time
english_stopwords = set(stopwords.words('english'))
[t for t in tokens if t not in english_stopwords]
# observe "my" and "is" have been removed (the comparison is case-sensitive)

['Hi', ',', 'name', 'Ian', '.']

In [45]:
#vectorization
from sklearn.feature_extraction.text import CountVectorizer
# toy corpus: three short documents with overlapping vocabulary
myDocs = [
    'the quick brown furry fox jumped over a second furry brown fox',
    'the spares brown furry matrix',
    'the quick matrix',
]

In [46]:
# display the toy corpus
myDocs

['the quick brown furry fox jumped over a second furry brown fox',
 'the spares brown furry matrix',
 'the quick matrix']

In [48]:
# initialize count vectorizer and its parameters
# min_df=0.5  -> a float is a proportion: keep a term only if it occurs in at least 50% of the documents
# max_df=0.75 -> a float is a proportion: drop a term that occurs in more than 75% of the documents
#                (an int would instead be read as an absolute number of documents)
# ngram_range=(1, 3) -> extract unigrams, bigrams and trigrams; it must be a tuple, since
#                scikit-learn's parameter validation (1.2+) rejects a list here
tf_vectorizer = CountVectorizer(min_df=0.5, max_df=0.75, ngram_range=(1, 3))

# build document term matrix (rows = documents, columns = retained terms)
dtm_tf = tf_vectorizer.fit_transform(myDocs)

In [50]:
dtm_tf.shape  # observations: 3 documents (rows), 6 retained terms (columns) -- unigrams and bigrams, not only single words

(3, 6)

In [52]:
# print out our matrix as a dense array of raw counts: one row per document, one column per term
dtm_tf.toarray()

array([[2, 1, 2, 0, 1, 1],
       [1, 1, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 1]], dtype=int64)

In [53]:
# use pandas to create a nicer view: label each count column with its term
import pandas as pd
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); prefer the new method and fall back only on older releases
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()
df = pd.DataFrame(dtm_tf.toarray(), columns=feature_names)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [56]:
# now view the document-term matrix as a labelled table (rows = documents, columns = terms)
df
# nice!

Unnamed: 0,brown,brown furry,furry,matrix,quick,the quick
0,2,1,2,0,1,1
1,1,1,1,1,0,0
2,0,0,0,1,1,1
