Bag of Words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm

In [7]:
class Category:
    BOOKS = "BOOKS"
    CLOTHING = "CLOTHING"

In [8]:
train_x = ["I love the book", "this is a great book", "the fit is great", "I love the shoes"]
train_y = [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]

In [9]:
vectorizer = CountVectorizer(binary=True, ngram_range=(1,2))
train_x_vectors = vectorizer.fit_transform(train_x)

print(train_x_vectors[0])
print(vectorizer.get_feature_names())
print(train_x_vectors.toarray())

  (0, 7)	1
  (0, 10)	1
  (0, 0)	1
  (0, 8)	1
  (0, 11)	1
['book', 'fit', 'fit is', 'great', 'great book', 'is', 'is great', 'love', 'love the', 'shoes', 'the', 'the book', 'the fit', 'the shoes', 'this', 'this is']
[[1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0]
 [1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1]
 [0 1 1 1 0 1 1 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0]]


In [10]:
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [11]:
test_x = vectorizer.transform(['i love the book'])

clf_svm.predict(test_x)

array(['BOOKS'], dtype='<U8')

Word Vectors

In [15]:
import spacy

In [13]:
!python -m spacy download en_core_web_md

[38;5;3m⚠ Skipping model package dependencies and setting `--no-deps`. You
don't seem to have the spaCy package itself installed (maybe because you've
built from source?), so installing the model dependencies would cause spaCy to
be downloaded, which probably isn't what you want. If the model package has
other dependencies, you'll have to install them manually.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [30]:
nlp = spacy.load("en_core_web_md")

ModuleNotFoundError: No module named 'thinc.types'

In [17]:
print(train_x)

['I love the book', 'this is a great book', 'the fit is great', 'I love the shoes']


In [18]:
docs = [nlp(text) for text in train_x]
print(docs[0].vector)

NameError: name 'nlp' is not defined

In [14]:
train_x_word_vectors = [x.vector for x in docs]

In [15]:
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_word_vectors, train_y)

SVC(kernel='linear')

In [22]:
text_x = ["these earings hurt"]
test_docs = [nlp(text) for text in text_x]
test_x_word_vectors = [x.vector for x in test_docs]


clf_svm_wv.predict(test_x_word_vectors)

array(['CLOTHING'], dtype='<U8')

Regexes

In [23]:
import re

In [25]:
regexp = re.compile(r"^ab[^\s]*cd$")

In [34]:
phrases = ["abcd", "xxx", "aaa abxxxcd ccc", "ab cd"]

matches = []
for phrase in phrases:
    if re.search(regexp, phrase):
        matches.append(phrase)

print (matches)

['abcd']


In [36]:
regexp = re.compile(r"read|story|book")

phrases = ["i like that history", "the car treaded up the hill", "this hat is nice"]

matches = []
for phrase in phrases:
    if re.search(regexp, phrase):
        matches.append(phrase)

print (matches)

['i like that history', 'the car treaded up the hill']


Stemming/Lemmatization

In [37]:
import nltk

In [38]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/romanokonesnikov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/romanokonesnikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/romanokonesnikov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [44]:
stemmer = PorterStemmer()

phrase = "reading the books. stories"
words = word_tokenize(phrase)

stemmed_words = []
for word in words:
    stemmed_words.append(stemmer.stem(word))
    
" ".join(stemmed_words)

'read the book . stori'

In [45]:
from nltk.stem import WordNetLemmatizer

In [48]:
lemmatizer = WordNetLemmatizer()

phrase = "reading the books"
words = word_tokenize(phrase)

lemmatized_words = []
for word in words:
    lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))
    
" ".join(lemmatized_words)

# by default all words are noun, we can change it to verb

'read the book'

Stopword Removal

In [49]:
from nltk.corpus import stopwords

In [52]:
stop_words = stopwords.words('english')
print(len(stop_words))

179


In [54]:
phrase = "Here is an example sentence demonstrating the removal of stopwords"

stripped_phrase = []
words = word_tokenize(phrase)
for word in words:
    if word not in stop_words:
        stripped_phrase.append(word)

" ".join(stripped_phrase)

'Here example sentence demonstrating removal stopwords'

Various other techniques (spell correction, sentiment, & pos tagging)

In [56]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 7.6 MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [57]:
from textblob import TextBlob

In [64]:
phrase = "the book was horrible"

tb_phrase = TextBlob(phrase)

tb_phrase.correct()
tb_phrase.tags
tb_phrase.sentiment

Sentiment(polarity=-1.0, subjectivity=1.0)

Transformer Architecture

In [31]:
!pip install spacy-transformers
!python -m spacy download en_trf_bertbaseuncased_lg

Collecting srsly<3.0.0,>=2.4.0
  Using cached srsly-2.4.0-cp38-cp38-macosx_10_9_x86_64.whl (449 kB)
Collecting catalogue<2.1.0,>=2.0.1
  Using cached catalogue-2.0.1-py3-none-any.whl (9.6 kB)
Installing collected packages: catalogue, srsly
  Attempting uninstall: catalogue
    Found existing installation: catalogue 1.0.0
    Uninstalling catalogue-1.0.0:
      Successfully uninstalled catalogue-1.0.0
  Attempting uninstall: srsly
    Found existing installation: srsly 1.0.5
    Uninstalling srsly-1.0.5:
      Successfully uninstalled srsly-1.0.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 7.4.1 requires catalogue<1.1.0,>=0.0.7, but you have catalogue 2.0.1 which is incompatible.
thinc 7.4.1 requires srsly<1.1.0,>=0.0.6, but you have srsly 2.4.0 which is incompatible.
spacy 2.3.2 requires catalogue<1.1.0,>=0.0.7, but you have catalogue 2.0.1 which

In [4]:
import torch

In [33]:
nlp = spacy.load('en_trf_bertbaseuncased_lg')
doc = nlp("Here is some text to encode.")

ImportError: [E048] Can't import language trf from spacy.lang: No module named 'spacy.lang.trf'

In [32]:
!python -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/Users/romanokonesnikov/opt/anaconda3/envs/2020-10-illumidesk-env/lib/python3.8/site-packages/spacy[0m

TYPE      NAME                        MODEL                       VERSION                            
package   en-trf-bertbaseuncased-lg   en_trf_bertbaseuncased_lg   [38;5;2m2.3.0[0m   [38;5;2m✔[0m
package   en-core-web-md              en_core_web_md              [38;5;2m2.3.1[0m   [38;5;2m✔[0m



In [28]:
!pip install spacy==2.3.2

Collecting spacy==2.3.2
  Using cached spacy-2.3.2-cp38-cp38-macosx_10_9_x86_64.whl (10.1 MB)
Collecting thinc==7.4.1
  Using cached thinc-7.4.1-cp38-cp38-macosx_10_9_x86_64.whl (2.1 MB)
Collecting catalogue<1.1.0,>=0.0.7
  Using cached catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting srsly<1.1.0,>=1.0.2
  Using cached srsly-1.0.5-cp38-cp38-macosx_10_9_x86_64.whl (177 kB)
Installing collected packages: srsly, catalogue, thinc, spacy
  Attempting uninstall: srsly
    Found existing installation: srsly 2.4.0
    Uninstalling srsly-2.4.0:
      Successfully uninstalled srsly-2.4.0
  Attempting uninstall: catalogue
    Found existing installation: catalogue 2.0.1
    Uninstalling catalogue-2.0.1:
      Successfully uninstalled catalogue-2.0.1
  Attempting uninstall: thinc
    Found existing installation: thinc 7.4.0
    Uninstalling thinc-7.4.0:
      Successfully uninstalled thinc-7.4.0
  Attempting uninstall: spacy
    Found existing installation: spacy 2.2.4
    Uninstalling spa