# EDA paramgram embedding and Spacy tokenizer

In [1]:
import os
import re
import pandas as pd
import spacy
from collections import Counter

In [2]:
def token_info(tokens):
    print('Number of tokens: ', len(tokens))
    print('First 10 tokens: ', tokens[:10])

In [3]:
def token_types(tokens):
    num_tokens = []
    symbol_tokens = []
    low_tokens = []
    up_tokens = []
    caps_tokens = []
    num_re = re.compile('[0-9]')
    low_re = re.compile('[a-z]')
    up_re = re.compile('[A-Z]')
    caps_re = re.compile('^[A-Z]*$')
    letnum_re = re.compile('[a-zA-Z0-9]')
    for t in tokens:
        if num_re.search(t):
            num_tokens.append(t)
        if not letnum_re.search(t):
            symbol_tokens.append(t)
        if low_re.match(t):
            low_tokens.append(t)
        if up_re.match(t):
            up_tokens.append(t)
        if caps_re.match(t):
            caps_tokens.append(t)
    print('Number of num tokens: ', len(num_tokens))
    print('Number of symbol tokens: ', len(symbol_tokens))
    print('Number of lowcase tokens: ' , len(low_tokens))
    print('Number of upper case tokens: ', len(up_tokens))
    print('Nubmer of capslock tokens: ' , len(caps_tokens))
    
    print('\n')
    
    print('Freq num tokens: ', num_tokens[:10])
    print('Rare num tokens: ', num_tokens[-10:], '\n')
    print('Freq symbol tokens: ', symbol_tokens[:10])
    print('Rare symbol tokens: ', symbol_tokens[-10:], '\n')
    print('Freq low tokens: ', low_tokens[:10])
    print('Rare low tokens: ', low_tokens[-10:], '\n')
    print('Freq up tokens: ', up_tokens[:10])
    print('Rare up tokens: ', up_tokens[-10:], '\n')
    print('Freq caps tokens: ', caps_tokens[:10])
    print('Rare caps tokens: ', caps_tokens[-10:])

### Look at embedding

In [4]:
emb_path = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
data_dir = cache = '../data'

train_csv = os.path.join(data_dir, 'train.csv')
test_csv = os.path.join(data_dir, 'test.csv')
emb_path = os.path.join(data_dir, emb_path)

In [5]:
%%time
emb_tokens = []
errors = 0
with open(emb_path, 'rb') as f:
    for line in f:
        try:
            tok = line.rstrip().split(b" ")[0]
            tok = tok.decode('utf-8')
            emb_tokens.append(tok)
        except: 
            errors += 1
            continue
print(errors)

4
CPU times: user 17 s, sys: 1.9 s, total: 18.9 s
Wall time: 25.6 s


In [6]:
token_info(emb_tokens)

Number of tokens:  1703752
First 10 tokens:  [',', '.', 'the', 'and', 'to', 'of', 'a', 'in', '"', ':']


In [7]:
token_types(emb_tokens)

Number of num tokens:  510038
Number of symbol tokens:  2518
Number of lowcase tokens:  1265160
Number of upper case tokens:  1
Nubmer of capslock tokens:  1


Freq num tokens:  ['1', '2', '3', '4', '2012', '5', '2011', '10', '2010', '2009']
Rare num tokens:  ['crowdog89', 'data0053', 'doc2doc', 'i686.pet', 'km/11', 'mono-2', 'otherjuicystar07', 'own36', 'p263', 'sarah123'] 

Freq symbol tokens:  [',', '.', '"', ':', ')', '(', '-', '...', '!', '?']
Rare symbol tokens:  ['?!?!?!?!!', '?!?!?!?!?!?!?!?!?', 'ãƒˆã', 'ａ', '-------------------------------------------------------------------------------------------------------------------------------------------------', 'ɯ', 'вЂ', '回', 'ĵ', 'ÐÐ'] 

Freq low tokens:  ['the', 'and', 'to', 'of', 'a', 'in', 'is', 'for', 'i', 'that']
Rare low tokens:  ['windowstransgender', 'wordsforyoungmen', 'work.like', 'working.so', 'wried', 'wwent', 'xalisae', 'xtremecaffeine', 'zipout', 'zulchzulu'] 

Freq up tokens:  ['UUUNKKK']
Rare up tokens:  ['UUUNKKK'] 

### Tokenize data and build vocab

In [8]:
train = pd.read_csv(train_csv)['question_text'].values.tolist()
test = pd.read_csv(test_csv)['question_text'].values.tolist()
data = train + test

In [9]:
%%time
unknown_tokens = []
vocab = Counter()
spacy_en = spacy.load('en')
tokenizer = spacy_en.tokenizer
emb_tokens_set = set(emb_tokens)
for question in data:
    doc = tokenizer(question.lower())
    for token in doc:
        tok_text = token.text
        vocab.update([tok_text])
        if vocab[tok_text] == 1:
            if tok_text not in emb_tokens_set:
                unknown_tokens.append(tok_text)
        
#doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)

CPU times: user 2min 36s, sys: 139 ms, total: 2min 36s
Wall time: 2min 34s


In [10]:
vocab_tokens = dict(vocab.most_common()).keys()

In [11]:
token_types(vocab_tokens)

Number of num tokens:  28206
Number of symbol tokens:  1710
Number of lowcase tokens:  203672
Number of upper case tokens:  0
Nubmer of capslock tokens:  1


Freq num tokens:  ['2', '2017', '2018', '1', '3', '10', '5', '4', '12', '6']
Rare num tokens:  ['10–12', '61850', 'feb-2018', '2200-$2500/mo', '4000/mo', '999999', '450cc', '78.53', '900va', '6740'] 

Freq symbol tokens:  ['?', ',', '.', '"', '-', ')', '(', '/', "'", '&']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['the', 'what', 'is', 'a', 'to', 'in', 'of', 'i', 'how', 'do']
Rare low tokens:  ['youvision', 'enrollnment', 'noumber', 'sgill', 'clearly"bad', 'myair', 'rabels', 'photopicker', 'khayami', 'thetis'] 

Freq up tokens:  []
Rare up tokens:  [] 

Freq caps tokens:  ['\n']
Rare caps tokens:  ['\n']


### Capslock tokens

In [12]:
caps_re = re.compile('^[A-Z]*$')
caps_tokens = []
for t in vocab_tokens:
    if caps_re.match(t):
        caps_tokens.append(t)

In [13]:
emotional_caps_tokens = []
for t in caps_tokens:
    if t not in emb_tokens_set and t.lower() in emb_tokens_set:
        emotional_caps_tokens.append(t)
print(len(emotional_caps_tokens))

0


In [14]:
emotional_caps_tokens

[]

### Unknown tokens

In [15]:
unk_vocab = {}
for w in unknown_tokens:
    unk_vocab[w] = vocab[w]
unk_vocab = Counter(unk_vocab)

In [16]:
print('Total number of tokens: ', sum(vocab.values()))
print('Number of unique tokens: ', len(vocab))
print('Total number of unknown tokens: ', sum(unk_vocab.values()))
print('Number of unique unknown tokens: ', len(unknown_tokens))
print('Percent of unknown tokens: ', 100 * sum(unk_vocab.values())/sum(vocab.values()))

Total number of tokens:  19840784
Number of unique tokens:  225503
Total number of unknown tokens:  102982
Number of unique unknown tokens:  73283
Percent of unknown tokens:  0.519041989469771


In [17]:
unk_common = dict(unk_vocab.most_common())
print('Number of 100 most common unk tokens: ', sum(list(unk_common.values())[:100]))

Number of 100 most common unk tokens:  9625


In [18]:
unk_common

{'..': 888,
 'quorans': 884,
 'brexit': 542,
 'cryptocurrencies': 525,
 'redmi': 398,
 '/math': 231,
 'x^2': 217,
 'f(x': 192,
 '^2': 147,
 'coinbase': 146,
 'oneplus': 143,
 '.net': 143,
 'uceed': 126,
 'bhakts': 118,
 'demonetisation': 118,
 'machedo': 112,
 'gdpr': 110,
 'adityanath': 108,
 'upwork': 106,
 'boruto': 105,
 'bnbr': 105,
 'alshamsi': 100,
 '\\frac': 96,
 'dceu': 94,
 'iiest': 91,
 'litecoin': 90,
 'sjws': 89,
 'unacademy': 89,
 'zerodha': 85,
 'qoura': 84,
 'tensorflow': 81,
 "qur'an": 79,
 'a+': 77,
 'lnmiit': 73,
 'kavalireddi': 70,
 'doklam': 70,
 'muoet': 67,
 'nicmar': 65,
 '1/': 65,
 'vajiram': 62,
 'srmjee': 61,
 'adhaar': 60,
 'x^3': 59,
 'elitmus': 58,
 'altcoin': 58,
 'zebpay': 58,
 'w/': 57,
 'altcoins': 56,
 'jiren': 56,
 'awdhesh': 55,
 "5'4": 53,
 'hackerrank': 53,
 'ryzen': 51,
 "a2a'd": 49,
 'y^2': 49,
 'baahubali': 48,
 'koinex': 48,
 '.what': 47,
 'mhcet': 47,
 'b+': 46,
 "5'9": 46,
 'byju': 46,
 'binance': 45,
 '\\sqrt': 44,
 '^3': 44,
 'srmjeee': 44

In [19]:
token_types(list(unk_common.keys()))

Number of num tokens:  15637
Number of symbol tokens:  1508
Number of lowcase tokens:  61302
Number of upper case tokens:  0
Nubmer of capslock tokens:  1


Freq num tokens:  ['x^2', '^2', '1/', 'x^3', "5'4", "a2a'd", 'y^2', "5'9", '^3', "5'5"]
Rare num tokens:  ['a0=', 'an+1=', 'an=2', 'nba2k18', '3.71/4.44', '39/389', 'z2600', '10–12', 'feb-2018', '2200-$2500/mo'] 

Freq symbol tokens:  ['..', ':(', '₹', '"-', 'करना', '\ufeff', '\n', ':/', '️', '\\\\']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['quorans', 'brexit', 'cryptocurrencies', 'redmi', 'x^2', 'f(x', 'coinbase', 'oneplus', 'uceed', 'bhakts']
Rare low tokens:  ['die.is', 'youvision', 'enrollnment', 'noumber', 'sgill', 'clearly"bad', 'myair', 'rabels', 'photopicker', 'khayami'] 

Freq up tokens:  []
Rare up tokens:  [] 

Freq caps tokens:  ['\n']
Rare caps tokens:  ['\n']


### Read embedding

In [69]:
import numpy as np
embs = {}
with open(emb_path) as f:
    count = 0
    for line in f:
        if count > 90000:
            break
        count += 1
        cut = line.find(' ')
        tok = line[:cut]
        emb = line[cut + 1:].split()
        emb = np.array([float(e) for e in emb])
        embs[tok] = emb

In [74]:
low = embs['much']
up = embs['more']

In [75]:
def sim(v1, v2):
    result = sum(v1*v2)/np.linalg.norm(v1)/np.linalg.norm(v2)
    return result

In [76]:
print(sim(low, up))

0.2550954033558647
