In [4]:
import numpy as np

In [8]:
# Build one-hot vectors for a whitespace-tokenized sentence: one row per token
# (in sentence order) and one column per distinct token (sorted vocabulary).
sentence = "The dogs are barking at the ducks! Don't disturb them! Version 2.3"
token_sentence = sentence.split(' ')
vocab = sorted(set(token_sentence))
num_tokens = len(token_sentence)
vocab_size = len(vocab)
# Resolve each word's column once up front; calling vocab.index(word) inside
# the loop would rescan the vocabulary list for every single token.
vocab_index = {word: col for col, word in enumerate(vocab)}
onehot_vectors = np.zeros((num_tokens, vocab_size), int)
for i, word in enumerate(token_sentence):
    onehot_vectors[i, vocab_index[word]] = 1
onehot_vectors

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [5]:
import pandas as pd

In [7]:
# Label every one-hot column with its vocabulary word so the matrix is readable.
# NOTE(review): the table recorded below has only 7 token columns while the
# array output above has 12, and this cell's execution count (7) precedes the
# one-hot cell's (8) - the output looks stale; re-run to refresh it.
pd.DataFrame(data=onehot_vectors, columns=vocab)

Unnamed: 0,The,are,at,barking,dogs,ducks!,the
0,1,0,0,0,0,0,0
1,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0
3,0,0,0,1,0,0,0
4,0,0,1,0,0,0,0
5,0,0,0,0,0,0,1
6,0,0,0,0,0,1,0


In [17]:
# Bag of words: count how often each lower-cased, space-separated token occurs.
bow = {}
for token in sentence.lower().split(' '):
    if token in bow:
        bow[token] += 1
    else:
        bow[token] = 1
# Iterating a dict yields its keys, so this shows the sorted distinct tokens.
sorted(bow)

['are', 'at', 'barking', 'dogs', 'ducks!', 'the']

In [20]:
# Binary bag-of-words as a one-row frame: the row is labelled 'sent' and each
# distinct space-separated token gets its own column holding 1.
df = pd.DataFrame(
    pd.Series({token: 1 for token in sentence.split(' ')}),
    columns=['sent'],
).T

In [21]:
# Display the one-row bag-of-words frame.
# NOTE(review): the recorded output below lists only 7 tokens, which does not
# match the 12-token `sentence` defined at the top of the notebook - it appears
# to come from an earlier run; re-run to confirm.
df

Unnamed: 0,The,are,at,barking,dogs,ducks!,the
sent,1,1,1,1,1,1,1


In [24]:
# Binary bag-of-words for several sentences at once: `corpus` maps a generated
# sentence label ('sent0', 'sent1', ...) to a {token: 1} dict for that line.
sentences = "Here comes the sun!\nLittle darling\nHere comes the sun, and I say\nIt's alright"
corpus = {
    'sent{}'.format(line_no): {token: 1 for token in line.split()}
    for line_no, line in enumerate(sentences.split("\n"))
}
# Tokens missing from a sentence come out as NaN; turn them into 0, cast back
# to int, and transpose so that each sentence is a row.
records_frame = pd.DataFrame.from_records(corpus)
df = records_frame.fillna(0).astype(int).T
df

Unnamed: 0,Here,I,It's,Little,alright,and,comes,darling,say,sun!,"sun,",the
sent0,1,0,0,0,0,0,1,0,0,1,0,1
sent1,0,0,0,1,0,0,0,1,0,0,0,0
sent2,1,1,0,0,0,1,1,0,1,0,1,1
sent3,0,0,1,0,1,0,0,0,0,0,0,0


In [23]:
# Inspect the raw {sentence label: {token: 1}} mapping.
# NOTE(review): the recorded output below (execution count 23) predates the
# In [24] cell that builds `corpus`: it shows a single 'sent0' entry with fused
# tokens such as 'darlingHere', not the four per-line entries the current code
# produces. Re-run to refresh it.
corpus

{'sent0': {'Here': 1,
  'I': 1,
  'alright': 1,
  'and': 1,
  'comes': 1,
  'darlingHere': 1,
  "sayIt's": 1,
  'sun!Little': 1,
  'sun,': 1,
  'the': 1}}

# Five ways to do a dot product for vectors

In [26]:
# Two small example vectors, then the dot product via the ndarray.dot method.
# Build them with NumPy directly: the `pd.np` alias was deprecated in
# pandas 1.0 and removed in pandas 2.0, so `pd.np.array` fails on current pandas.
v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])
v1.dot(v2)

32

In [27]:
# Element-wise product of the two arrays, then the sum of those products.
(v1*v2).sum()

32

In [29]:
# Plain-Python version: pair the components up and add the pairwise products.
sum(a * b for a, b in zip(v1, v2))

32

In [30]:
# Matrix-product version with the @ operator: a (1, n) row times an (n, 1)
# column gives a 1x1 matrix holding the dot product.
v1.reshape(1, -1) @ v2.reshape(-1, 1)

array([[32]])

In [33]:
# Same row-times-column product, spelled with np.matmul instead of @.
np.matmul(v1.reshape(1, -1), v2.reshape(-1, 1))

array([[32]])

# Sentence overlap through dot product

In [34]:
# Transpose so that each sentence becomes a column (the frame was built with
# sentences as rows); the cells below rely on attribute access such as df.sent0.
# NOTE: this rebinds `df`, so running the cell a second time flips the frame
# back and breaks those cells.
df = df.T

In [37]:
# Overlap between two sentences = dot product of their 0/1 token columns.
df['sent0'].dot(df['sent3'])

0

In [40]:
# Element-wise AND of the two 0/1 columns; keep only the (token, flag) pairs
# whose flag is non-zero, i.e. the tokens both sentences contain.
[pair for pair in (df['sent0'] & df['sent2']).items() if pair[1]]

[('Here', 1), ('comes', 1), ('the', 1)]

# Tokenizing on punctuation

In [17]:
import re

In [22]:
# Split the sentence on runs of dashes, whitespace and basic punctuation.
pattern = re.compile(r'([-\s.,;!?])+')
# Because the pattern has a capturing group, re.split also returns the last
# delimiter character of every run, interleaved with the real tokens.
tokens = pattern.split(sentence)
# Drop empty strings and the captured delimiters. Test against the pattern
# itself rather than a hand-written string of characters: `\s` also matches
# '\r', '\f', '\v' and Unicode spaces, which a literal such as
# '- \t\n.,;!?' does not list, so those would otherwise survive as tokens.
tokens = [x for x in tokens if x and not pattern.fullmatch(x)]
tokens

['The',
 'dogs',
 'are',
 'barking',
 'at',
 'the',
 'ducks',
 "Don't",
 'disturb',
 'them',
 'Version',
 '2',
 '3']

In [2]:
from nltk.tokenize import RegexpTokenizer

In [9]:
# Token pattern, tried left to right: a run of word characters, OR a currency
# amount such as "$12.50", OR any other run of non-whitespace characters.
# The dollar sign must be escaped: a bare `$` is the end-of-string anchor, so
# `$[0-9.]+` could never match anything and the currency branch was dead.
tokenizer = RegexpTokenizer(r'\w+|\$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['The',
 'dogs',
 'are',
 'barking',
 'at',
 'the',
 'ducks',
 '!',
 'Don',
 "'t",
 'disturb',
 'them',
 '!',
 'Version',
 '2',
 '.3']

### The best results:

In [10]:
from nltk.tokenize import TreebankWordTokenizer
# NOTE: this rebinds `tokenizer`, replacing the RegexpTokenizer instance
# created earlier in the notebook.
tokenizer = TreebankWordTokenizer()
# Per the output below, '2.3' stays a single token and "Don't" is split into
# 'Do' + "n't".
tokenizer.tokenize(sentence)

['The',
 'dogs',
 'are',
 'barking',
 'at',
 'the',
 'ducks',
 '!',
 'Do',
 "n't",
 'disturb',
 'them',
 '!',
 'Version',
 '2.3']

### Twitter and SNS:

In [13]:
from nltk.tokenize.casual import casual_tokenize
# Informal, social-media-style text: elongated words, an @handle, an emoticon.
message = 'Heeeeeeyyy @duck! Aweesomee! :)'
# With default options the handle and the emoticon each come out as a single
# token (see the output below).
casual_tokenize(message)

['Heeeeeeyyy', '@duck', '!', 'Aweesomee', '!', ':)']

In [14]:
# Same message with both normalisation options on. Per the output below,
# reduce_len shortens 'Heeeeeeyyy' to 'Heeeyyy' and strip_handles drops '@duck'.
casual_tokenize(message, reduce_len=True, strip_handles=True)

['Heeeyyy', '!', 'Aweesomee', '!', ':)']

### N-Gram Tokenizer

In [24]:
from nltk.util import ngrams
# All 3-grams (sliding windows of three consecutive tokens) over `tokens`.
list(ngrams(tokens, 3)) # ngrams returns a generator; list() materialises it so the result is displayed

[('The', 'dogs', 'are'),
 ('dogs', 'are', 'barking'),
 ('are', 'barking', 'at'),
 ('barking', 'at', 'the'),
 ('at', 'the', 'ducks'),
 ('the', 'ducks', "Don't"),
 ('ducks', "Don't", 'disturb'),
 ("Don't", 'disturb', 'them'),
 ('disturb', 'them', 'Version'),
 ('them', 'Version', '2'),
 ('Version', '2', '3')]

In [25]:
# Every pair of consecutive tokens, joined with a space into a single string.
bigrams = [' '.join(pair) for pair in ngrams(tokens, 2)]
bigrams

['The dogs',
 'dogs are',
 'are barking',
 'barking at',
 'at the',
 'the ducks',
 "ducks Don't",
 "Don't disturb",
 'disturb them',
 'them Version',
 'Version 2',
 '2 3']

### Filtering out stop words

In [29]:
# A tiny hand-picked stop-word list for demonstration.
stop_words = ['a', 'an', 'the', 'at']
# Compare case-insensitively so that the capitalised 'The' is dropped as well.
tokens_without_stops = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
tokens_without_stops

['dogs',
 'are',
 'barking',
 'ducks',
 "Don't",
 'disturb',
 'them',
 'Version',
 '2',
 '3']

In [32]:
import nltk
# Fetch the NLTK stop-word corpus; per the log recorded below it is downloaded
# and unzipped into the user's nltk_data directory.
nltk.download('stopwords')
# English stop words from that corpus.
stops = nltk.corpus.stopwords.words('english')
# Number of entries in the list.
len(stops)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


179

In [33]:
# Display the whole NLTK English stop-word list (179 entries per the count above).
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [34]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stops
# Size of scikit-learn's built-in English stop-word collection, for comparison
# with the NLTK list.
len(sklearn_stops)

318

In [35]:
# Display the scikit-learn stop words (a frozenset, per the output below).
sklearn_stops

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

### Stemming and Lemmatizing

In [37]:
from nltk.stem.porter import PorterStemmer
# Stem every whitespace-separated word with the Porter stemmer.
stemmer = PorterStemmer()
sent = "dish washer's washed dishes"
# Strip any apostrophe left at either end of a stem before joining the stems
# back into one space-separated string.
' '.join(stemmer.stem(word).strip("'") for word in sent.split())

'dish washer wash dish'

In [38]:
from nltk.stem import WordNetLemmatizer
# NOTE(review): no nltk.download('wordnet') call is visible in this notebook -
# presumably the WordNet data must already be installed; confirm before a
# fresh Restart & Run All.
lemmatizer = WordNetLemmatizer()
# Without a `pos` argument the word comes back unchanged here ('better').
lemmatizer.lemmatize('better')

'better'

In [41]:
# Passing pos='a' changes the result to 'good' (presumably 'a' tags the word
# as an adjective - confirm against the NLTK WordNetLemmatizer docs).
lemmatizer.lemmatize('better', pos='a')

'good'