<a href="https://colab.research.google.com/github/philipsales/natural-language-processing-notes/blob/master/philipsales%20/%20natural-language-processing-notes/fast_ai_traditional_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
np.set_printoptions(suppress=True)

# Summary
1. Data Exploration
  1. Stop Words
    1. Stop words in SkLearn but not in Spacy
    1. Stop words in Spacy but not in SkLearn
  1. Stemming
  1. Lemmatization
  1. Spacy


1. Data Processing

# Data Exploration


In [3]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((2034,), (2034,))

In [5]:
print("\n--------".join(newsgroups_train.data[:3]))

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
--------

Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.
--------
 >In article <1993Apr19.020359.26996@sq.sq.c

In [6]:
np.array(newsgroups_train.target_names)[newsgroups_train.target[:4]]

array(['comp.graphics', 'talk.religion.misc', 'sci.space', 'alt.atheism'],
      dtype='<U18')

In [7]:
newsgroups_train.target[:3]

array([1, 3, 2])

## Stop Words, stemming, Lemmatization


### Stop words


In [8]:
from sklearn.feature_extraction import stop_words

sorted(list(stop_words.ENGLISH_STOP_WORDS))[:20]



['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst']

In [9]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [10]:
from nltk import stem

In [11]:
wnl = stem.WordNetLemmatizer()
porter = stem.porter.PorterStemmer()
lancaster = stem.lancaster.LancasterStemmer()

In [12]:
word_list = ['feet', 'foot', 'foots', 'footing']
word_list = ['fly', 'flies', 'flying', 'organize', 'organizes', 'organizing', 'universe', 'university']

In [13]:
[wnl.lemmatize(word) for word in word_list]

['fly',
 'fly',
 'flying',
 'organize',
 'organizes',
 'organizing',
 'universe',
 'university']

In [14]:
[porter.stem(word) for word in word_list]

['fli', 'fli', 'fli', 'organ', 'organ', 'organ', 'univers', 'univers']

In [15]:
[lancaster.stem(word) for word in word_list]

['fly', 'fli', 'fly', 'org', 'org', 'org', 'univers', 'univers']

### Spacy

In [16]:
!pip install -U spacy
#!spacy -m download en_core_web_sm
#!python -m spacy download en_core_web_sm

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/50/b2/12466d3018bb84b039139ef76436ea7a01e98125c2aee6a81e527eb4ebe1/spacy-2.3.4-cp36-cp36m-manylinux2014_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 4.4MB/s 
Collecting thinc<7.5.0,>=7.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/1b/c9/ce2e03720a5647fd90da575325376ff258653a05f357aa970fd87e6c1a55/thinc-7.4.3-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 40.4MB/s 
Installing collected packages: thinc, spacy
  Found existing installation: thinc 7.4.0
    Uninstalling thinc-7.4.0:
      Successfully uninstalled thinc-7.4.0
  Found existing installation: spacy 2.2.4
    Uninstalling spacy-2.2.4:
      Successfully uninstalled spacy-2.2.4
Successfully installed spacy-2.3.4 thinc-7.4.3


In [17]:
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lemmatizer = Lemmatizer(lookups)

In [18]:
[lemmatizer.lookup(word) for word in word_list]

['fly',
 'flies',
 'flying',
 'organize',
 'organizes',
 'organizing',
 'universe',
 'university']

In [19]:
nlp = spacy.load('en_core_web_sm')



In [20]:
sorted(list(nlp.Defaults.stop_words))[:20]

["'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also']

### Stop words in SkLearn but not in Spacy


In [28]:
stop_words.ENGLISH_STOP_WORDS - nlp.Defaults.stop_words

frozenset({'amoungst',
           'bill',
           'cant',
           'co',
           'con',
           'couldnt',
           'cry',
           'de',
           'describe',
           'detail',
           'eg',
           'etc',
           'fill',
           'find',
           'fire',
           'found',
           'hasnt',
           'ie',
           'inc',
           'interest',
           'ltd',
           'mill',
           'sincere',
           'system',
           'thick',
           'thin',
           'un'})

### Stop words in Spacy but not in SkLearn


In [21]:
nlp.Defaults.stop_words - stop_words.ENGLISH_STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'ca',
 'did',
 'does',
 'doing',
 'just',
 'make',
 "n't",
 'n‘t',
 'n’t',
 'quite',
 'really',
 'regarding',
 'say',
 'unless',
 'used',
 'using',
 'various',
 '‘d',
 '‘ll',
 '‘m',
 '‘re',
 '‘s',
 '‘ve',
 '’d',
 '’ll',
 '’m',
 '’re',
 '’s',
 '’ve'}

In [22]:
doc = nlp(u"Apple is looking at buying a U.K. startup for $1 billion")

for token in doc:
  print(token.text)

Apple
is
looking
at
buying
a
U.K.
startup
for
$
1
billion


In [23]:
doc = nlp(u"The striped bats are hanging on their feet for best")

for token in doc:
  print(token.lemma_)

the
stripe
bat
be
hang
on
-PRON-
foot
for
good


In [24]:
for word in doc:
    print(word.text, word.pos_)

The DET
striped VERB
bats NOUN
are AUX
hanging VERB
on ADP
their DET
feet NOUN
for ADP
best ADJ


In [25]:
from spacy import displacy

doc = nlp(u"My name is Jeff.")
displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


# Data Processing

In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [29]:
import nltk

In [30]:
vectorizer = CountVectorizer(stop_words='english')

In [32]:
vectors = vectorizer.fit_transform(newsgroups_test.data).todense()
vectors

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [33]:
vectors.shape

(1353, 21240)