<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/Tokenization_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tokenization

In [None]:
import nltk
from nltk.tokenize import(TreebankWordTokenizer,
                          TweetTokenizer,
                          MWETokenizer)

In [None]:
#Create tokenizers:
tree = TreebankWordTokenizer()
tweet = TweetTokenizer()
mwe = MWETokenizer()

# Create a string input
sent1 = 'There are more things in heaven and earth, Horatio, than are dreamt of in your philosophy'
     
# Use tokenize method
print(f'Treebank -> {tree.tokenize(sent1)}')
print(f'Tweettokenizer -> {tweet.tokenize(sent1)}')
print(f'MWEtokenizer -> {mwe.tokenize(sent1)}')

Treebank -> ['There', 'are', 'more', 'things', 'in', 'heaven', 'and', 'earth', ',', 'Horatio', ',', 'than', 'are', 'dreamt', 'of', 'in', 'your', 'philosophy']
Tweettokenizer -> ['There', 'are', 'more', 'things', 'in', 'heaven', 'and', 'earth', ',', 'Horatio', ',', 'than', 'are', 'dreamt', 'of', 'in', 'your', 'philosophy']
MWEtokenizer -> ['T', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ', 'm', 'o', 'r', 'e', ' ', 't', 'h', 'i', 'n', 'g', 's', ' ', 'i', 'n', ' ', 'h', 'e', 'a', 'v', 'e', 'n', ' ', 'a', 'n', 'd', ' ', 'e', 'a', 'r', 't', 'h', ',', ' ', 'H', 'o', 'r', 'a', 't', 'i', 'o', ',', ' ', 't', 'h', 'a', 'n', ' ', 'a', 'r', 'e', ' ', 'd', 'r', 'e', 'a', 'm', 't', ' ', 'o', 'f', ' ', 'i', 'n', ' ', 'y', 'o', 'u', 'r', ' ', 'p', 'h', 'i', 'l', 'o', 's', 'o', 'p', 'h', 'y']


**Neural Nets**

In [None]:
#This is a tokenization example while working with neural nets.  Info only,  this is not directly applicable to the current use case:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
sent2 = "Mary had a little lumb and, according to GPT3, ate it with the mint jelly"
encoded_input = tokenizer(sent2)
print(encoded_input.input_ids)

[101, 2090, 1125, 170, 1376, 181, 1818, 1830, 1105, 117, 2452, 1106, 15175, 1942, 1495, 117, 8756, 1122, 1114, 1103, 22532, 179, 23083, 102]


In [None]:
pip install spacy-transformers

In [1]:
import spacy
from termcolor import colored

In [11]:
#spacy custom tokenizer
from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

def custom_tokenizer(nlp):
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
    )

    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                rules=nlp.Defaults.tokenizer_exceptions)


nlp_sm.tokenizer = custom_tokenizer(nlp_sm)

In [2]:
import spacy.cli

In [3]:
spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [4]:
spacy.cli.download('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [5]:
nlp_sm = spacy.load('en_core_web_sm')

In [6]:
nlp_lg = spacy.load('en_core_web_lg')

In [None]:
nlp_trf = spacy.load("en_core_web_trf")

In [None]:
st = ['Make it so we can hide and unhide the carousel',
      'Mary had a little lamb and, according to GPT3, ate it with the mint jelly',
      '-The well-tested code',
      "I'M GONNA PUKE",
      'Much sleeker. Very attractive!..I would strongly recommend',
      'CoughROOTCough',
      "So...I'm very happy",
      'A starling among starlings',
      'It was a love-fest',
      "It's great!",
      'Kindle-Fire is on fire'
      ]

In [None]:
doc_sm = []
for item in st:
  doc_sm.append(nlp_sm(item))

In [12]:
res = nlp_sm("Mother-in-law loves riding mary-go-around while watching primHD"); 
for token in res:
  print(token.text, token.lemma_, token.pos_, token.is_stop, token.idx, token.idx+len(token.shape_))

Mother-in-law Mother-in-law PROPN False 0 12
loves love VERB False 14 18
riding ride VERB False 20 24
mary-go-around mary-go-around PROPN False 27 39
while while SCONJ True 42 46
watching watch VERB False 48 52
primHD primhd NOUN False 57 63


In [None]:
doc_lg = []
for item in st:
  doc_lg.append(nlp_lg(item))

In [None]:
doc_trf = []
for item in st:
  doc_trf.append(nlp_trf(item))

In [None]:
def res_prt(doc,st):
  for item, text in zip(doc,st):
    print(colored(text,'red'))
    for token in item:
      print(token.text,token.pos_, token.tag_)
    print(' ')

In [None]:
#print(colored('EN_CORE_WEB_LG','blue'))
#res_prt(doc_lg,st)
print(colored('EN_CORE_WEB_SM','blue'))
res_prt(doc_sm,st)

[34mEN_CORE_WEB_SM[0m
[31mMake it so we can hide and unhide the carousel[0m
Make VERB VB
it PRON PRP
so SCONJ IN
we PRON PRP
can VERB MD
hide VERB VB
and CCONJ CC
unhide VERB VB
the DET DT
carousel NOUN NN
 
[31mMary had a little lamb and, according to GPT3, ate it with the mint jelly[0m
Mary PROPN NNP
had AUX VBD
a DET DT
little ADJ JJ
lamb NOUN NN
and CCONJ CC
, PUNCT ,
according VERB VBG
to ADP IN
GPT3 PROPN NNP
, PUNCT ,
ate VERB VBD
it PRON PRP
with ADP IN
the DET DT
mint NOUN NN
jelly ADV RB
 
[31m-The well-tested code[0m
-The NUM CD
well ADV RB
- PUNCT HYPH
tested VERB VBN
code NOUN NN
 
[31mI'M GONNA PUKE[0m
I'M PROPN NNP
GONNA PROPN NNP
PUKE PROPN NNP
 
[31mMuch sleeker. Very attractive!..I would strongly recommend[0m
Much ADJ JJ
sleeker NOUN NN
. PUNCT .
Very ADV RB
attractive! PROPN NNP
.. PUNCT .
I PRON PRP
would VERB MD
strongly ADV RB
recommend VERB VB
 
[31mCoughROOTCough[0m
CoughROOTCough PROPN NNP
 
[31mSo...I'm very happy[0m
So ADV RB
... PUNCT NFP
I'm

In [None]:
res_prt(doc_trf,st)

In [None]:
for token in doc:
    print(token.text,token.pos_, token.tag_)

In [None]:
doc = nlp('I have limited bookshelf space.')

-The well-tested code<br> '\n-The well-tested code'<br>stopwords when spelled out: 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 15, 20, 40, 50, 60, 100<br>
not stopwords when spelled out: 7, 13, 14, 16, 17, 18, 19, 30, 70, 80, 90, 1000, 100000<br>Splits "3G", though not "401k"</br>Splits hyphenated words (including, e.g. "thirty-six", "x-ray", "wi-fi")<br>Doesn't catch multiword tokens like "in front of" or "according to"<br>I'M GONNA PUKE<br>Much sleeker. Very attractive!..I would strongly recommend<br>sturdy(something<br>Rosette calls "CoughROOTCough" a proper noun, which, sure.  Spacy calls it a number, which, what?<br>"So...I'm very happy."<br>"A starling among starlings."<br>"It was a love-fest"<br>'Its great!'


# Text Analysis

In [None]:
import json
import re
import numpy as np

In [None]:
from scipy.spatial import distance

In [None]:
# file upload while using Google Colab
from google.colab import files
uploaded = files.upload()

In [None]:
data = []
with open('kindle.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))

In [None]:
data[50]['text']

In [None]:
txts = [] 
for item in data:
  txts.append(re.sub('\n+', ' ', item['text']))

**Google Unviersal Encoder Model**

In [None]:
!pip install -q tensorflow-hub
import tensorflow_hub as hub

In [None]:
#using universal sentence encoder to get sentence encodings
#Load the Universal Sentence Encoder's TF Hub module
#param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
print ("module {} loaded".format(module_url))
model = hub.load(module_url)
def embed(input):
  return model(input)

**lumi embeddings**

In [None]:
test_data = np.load('vector.npz')
#test_data['vect'][0]

In [None]:
test_data['vect'].shape

In [None]:
concept_data = np.load('concept_vector.npz')
concept_data['vect'].shape

In [None]:
lumi = np.concatenate((test_data['vect'],concept_data['vect']), axis=0); lumi.shape

In [None]:
lumi = []
for row in test_data['vect']:
  lumi.append(row)

In [None]:
lumi[0][0]

**embedding using universal encoder**

In [None]:
concepts = ['Kindle','Amazon','apps','tablet','Kindle Fire','purchase','Kindle Fire HD','iPad','device','download']
concept_vectors = np.array(model(concepts))

In [None]:
embedding_tuples = []
for item in txts:
  tmp = embed([item]).numpy(),item
  embedding_tuples.append(tmp)

In [None]:
embedding_tuples[0]

In [None]:
test = []
for item in txts:
  test.append(embed([item]).numpy().flatten())

In [None]:
test = np.array(test); test.shape

In [None]:
test_test = np.concatenate((test,concept_vectors), axis=0); test_test.shape

In [None]:
# data visualization:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [None]:
# visualization code
def tsne_plot(emb):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    #for i in range(len(emb)):
        #tokens.append(emb[i][0])
        #labels.append(emb[i][1])
    tkns = np.array(emb)
    tkns = tkns.reshape(tkns.shape[0], -1)
    tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tkns)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    #Outputting all the embeddings
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
    plt.show()

In [None]:
#tsne_plot(embedding_tuples)
tsne_plot(lumi)

In [None]:
# some funcier visualization code
def tsne_plot_fancy(emb):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    #for i in range(len(emb)):
        #tokens.append(emb[i][0])
        #labels.append(emb[i][1])
    #tokens = np.array(emb)
    #print(tokens[:1])
    #tokens = tokens.reshape(tokens.shape[0], -1)
    #points = points.reshape(tkns.shape[0], -1)
    tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(emb)
    #concepts_values = tsne_model.fit_transform(points)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    print(new_values[-10:])
    x_p = []
    y_p = []
    for value in new_values[-10:]:
        x_p.append(value[0])
        y_p.append(value[1])    

        
    plt.figure(figsize=(16, 16)) 
    #Outputting all the embeddings and overlying concepts
    lb = ['Kindle','Amazon','apps','tablet','Kindle Fire','purchase','Kindle Fire HD','iPad','device','download']
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
    for i in range(len(x_p)):
        plt.scatter(x_p[i],y_p[i],s=500,c='darkblue')
        '''plt.annotate(lb[i],
              xy=(x_p[i], y_p[i]),
              xytext=(15, 15),
              textcoords='offset points',
              fontsize = 12,
              ha='right',
              va='bottom')  '''
    plt.show()

In [None]:
#tsne_plot with concepts overlay(embedding_tuples)
tsne_plot_fancy(test_test)

In [None]:
tsne_plot_fancy(lumi)

 [ -4.180652  -59.0469   ] -- 'Kindle' (1)<br>
 [ 40.489338  -18.087015 ] -- 'Amazon'(2)<br>
 [ 45.76149    -6.4174824] -- 'apps'(3)<br>
 [-28.779793  -39.422756 ] -- 'tablet' (4)<br>
 [ -4.189738  -59.018326 ] -- 'Kindle Fire (5)'<br>
 [ 53.536438  -26.654696 ] -- 'purchase'(6)<br>
 [ -4.195783  -58.99838  ] -- 'Kindle Fire HD' (7)<br>
 [ -8.17021   -43.90645  ] -- 'iPad' (8)<br>
 [-28.982622  -39.634308 ] -- 'device' (9)<br>
 [ 53.530563  -26.637133 ] -- 'download'(10)<br>
 ['Kindle','Amazon','apps','tablet','Kindle Fire','purchase','Kindle Fire HD','iPad','device','download']

**Documents to Concepts Measuring**

In [None]:
#512 - dimensional embeddings
top_20 = []
target = concept_vectors[4]
for ind, item in enumerate(embedding_tuples[:10]):
    tmp = distance.cosine(item[0],target),item[1],ind
    top_20.append(tmp)

top_20.sort()

for ind, item in enumerate(embedding_tuples[10:100]):
  tmp = distance.cosine(item[0],target),item[1],ind
  if tmp[0]<top_20[-1][0]:
    top_20.pop()
    top_20.append(tmp)
    top_20.sort

In [None]:
[item[2] for item in top_20]

[2, 6, 1, 4, 8, 7, 0, 5, 9, 51]

In [None]:
import pprint
pp = pp = pprint.PrettyPrinter(indent=1, width=100)
for text in top_20:
  pp.pprint(text[1])
  print(' ')

*USEFUL CODE SNIPPETS*

In [None]:
concepts = ['Kindle','Amazon','apps','tablet','Kindle Fire','purchase','Kindle Fire HD','iPad','device','download']
concept_vectors = np.array(model(concepts))

In [None]:
distance.cosine(concept_vectors[0],concept_vectors[4])

In [None]:
concept_data = np.load('concept_vector.npz')
concept_data['vect'].shape

In [None]:
distance.cosine(concept_data['vect'][4],concept_data['vect'][9])

In [None]:
#concept_vectors.shape
a=np.zeros((90,512))
concept_vectors_padded = np.concatenate((concept_vectors,a), axis=0); concept_vectors_padded.shape

In [None]:
str1 = 'I\'m no good'
str2 =  "I'm no good"
str3 = 'I am no good' 
inp = [str1,str2,str3]

In [None]:
out = np.array(model(inp))

In [None]:
distance.cosine(out[0],out[1])

In [None]:
!unzip file_location

In [None]:
for item in data:
  re.sub('\n+', ' ' item['text'])