# EDA google news embedding and Spacy tokenizer

In [1]:
import os
import re
import pandas as pd
import spacy
from collections import Counter

In [2]:
def token_info(tokens):
    print('Number of tokens: ', len(tokens))
    print('First 10 tokens: ', tokens[:10])

In [3]:
def token_types(tokens):
    num_tokens = []
    symbol_tokens = []
    low_tokens = []
    up_tokens = []
    caps_tokens = []
    num_re = re.compile('[0-9]')
    low_re = re.compile('[a-z]')
    up_re = re.compile('[A-Z]')
    caps_re = re.compile('^[A-Z]*$')
    letnum_re = re.compile('[a-zA-Z0-9]')
    for t in tokens:
        if num_re.search(t):
            num_tokens.append(t)
        if not letnum_re.search(t):
            symbol_tokens.append(t)
        if low_re.match(t):
            low_tokens.append(t)
        if up_re.match(t):
            up_tokens.append(t)
        if caps_re.match(t):
            caps_tokens.append(t)
    print('Number of num tokens: ', len(num_tokens))
    print('Number of symbol tokens: ', len(symbol_tokens))
    print('Number of lowcase tokens: ' , len(low_tokens))
    print('Number of upper case tokens: ', len(up_tokens))
    print('Nubmer of capslock tokens: ' , len(caps_tokens))
    
    print('\n')
    
    print('Freq num tokens: ', num_tokens[:10])
    print('Rare num tokens: ', num_tokens[-10:], '\n')
    print('Freq symbol tokens: ', symbol_tokens[:10])
    print('Rare symbol tokens: ', symbol_tokens[-10:], '\n')
    print('Freq low tokens: ', low_tokens[:10])
    print('Rare low tokens: ', low_tokens[-10:], '\n')
    print('Freq up tokens: ', up_tokens[:10])
    print('Rare up tokens: ', up_tokens[-10:], '\n')
    print('Freq caps tokens: ', caps_tokens[:10])
    print('Rare caps tokens: ', caps_tokens[-10:])

### Look at embedding

In [50]:
emb_path = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
data_dir = cache = '../data'

train_csv = os.path.join(data_dir, 'train.csv')
test_csv = os.path.join(data_dir, 'test.csv')
emb_path = os.path.join(data_dir, emb_path)

In [51]:
%%time
emb_tokens = []
errors = 0
with open(emb_path, 'rb') as f:
    for line in f:
        try:
            tok = line.rstrip().split(b" ")[0]
            tok = tok.decode('utf-8')
            emb_tokens.append(tok)
        except: 
            errors += 1
            continue
print(errors)

4
CPU times: user 17.3 s, sys: 1.63 s, total: 18.9 s
Wall time: 24.9 s


In [52]:
token_info(emb_tokens)

Number of tokens:  1703752
First 10 tokens:  [',', '.', 'the', 'and', 'to', 'of', 'a', 'in', '"', ':']


In [53]:
token_types(emb_tokens)

Number of num tokens:  510038
Number of symbol tokens:  2518
Number of lowcase tokens:  1265160
Number of upper case tokens:  1
Nubmer of capslock tokens:  1


Freq num tokens:  ['1', '2', '3', '4', '2012', '5', '2011', '10', '2010', '2009']
Rare num tokens:  ['crowdog89', 'data0053', 'doc2doc', 'i686.pet', 'km/11', 'mono-2', 'otherjuicystar07', 'own36', 'p263', 'sarah123'] 

Freq symbol tokens:  [',', '.', '"', ':', ')', '(', '-', '...', '!', '?']
Rare symbol tokens:  ['?!?!?!?!!', '?!?!?!?!?!?!?!?!?', 'ãƒˆã', 'ａ', '-------------------------------------------------------------------------------------------------------------------------------------------------', 'ɯ', 'вЂ', '回', 'ĵ', 'ÐÐ'] 

Freq low tokens:  ['the', 'and', 'to', 'of', 'a', 'in', 'is', 'for', 'i', 'that']
Rare low tokens:  ['windowstransgender', 'wordsforyoungmen', 'work.like', 'working.so', 'wried', 'wwent', 'xalisae', 'xtremecaffeine', 'zipout', 'zulchzulu'] 

Freq up tokens:  ['UUUNKKK']
Rare up tokens:  ['UUUNKKK'] 

### Tokenize data and build vocab

In [54]:
train = pd.read_csv(train_csv)['question_text'].values.tolist()
test = pd.read_csv(test_csv)['question_text'].values.tolist()
data = train + test

In [55]:
%%time
unknown_tokens = []
vocab = Counter()
spacy_en = spacy.load('en')
tokenizer = spacy_en.tokenizer
emb_tokens_set = set(emb_tokens)
for question in data:
    doc = tokenizer(question)
    for token in doc:
        tok_text = token.text
        vocab.update([tok_text])
        if vocab[tok_text] == 1:
            if tok_text not in emb_tokens_set:
                unknown_tokens.append(tok_text)
        
#doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)

CPU times: user 2min 30s, sys: 98.3 ms, total: 2min 30s
Wall time: 2min 28s


In [56]:
vocab_tokens = dict(vocab.most_common()).keys()

In [57]:
token_types(vocab_tokens)

Number of num tokens:  29530
Number of symbol tokens:  1727
Number of lowcase tokens:  132699
Number of upper case tokens:  113484
Nubmer of capslock tokens:  15169


Freq num tokens:  ['2', '2017', '2018', '1', '3', '10', '5', '4', '12', '6']
Rare num tokens:  ['10–12', '61850', 'Feb-2018', '2200-$2500/mo', '4000/mo', '999999', '450cc', '78.53', '900va', '6740'] 

Freq symbol tokens:  ['?', ',', '.', '"', '-', ')', '(', '/', "'", '&']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['the', 'to', 'a', 'in', 'is', 'of', 'and', 'do', 'are', 'for']
Rare low tokens:  ['noumber', 'sml', 'sgill', 'jauhar', 'clearly"bad', 'myAIR', 'cybershot', 'ganpat', 'rabels', 'photopicker'] 

Freq up tokens:  ['What', 'I', 'How', 'Why', 'Is', 'Can', 'Which', 'Do', 'If', 'India']
Rare up tokens:  ['Dime', 'Entanglements', 'Tamilar', 'Flurry', 'Routines', 'ULTIMATELY', 'Hb', 'Generate', 'Khayami', 'Thetis'] 

Freq caps tokens:  ['

### Capslock tokens

In [58]:
caps_re = re.compile('^[A-Z]*$')
caps_tokens = []
for t in vocab_tokens:
    if caps_re.match(t):
        caps_tokens.append(t)

In [59]:
emotional_caps_tokens = []
for t in caps_tokens:
    if t not in emb_tokens_set and t.lower() in emb_tokens_set:
        emotional_caps_tokens.append(t)
print(len(emotional_caps_tokens))

13152


In [60]:
emotional_caps_tokens

['I',
 'US',
 'USA',
 'A',
 'JEE',
 'TV',
 'UK',
 'IIT',
 'MBA',
 'IT',
 'CSE',
 'UPSC',
 'MS',
 'IQ',
 'CBSE',
 'CA',
 'NEET',
 'C',
 'CS',
 'NIT',
 'BJP',
 'MBBS',
 'PC',
 'IAS',
 'SSC',
 'B',
 'AI',
 'GST',
 'GATE',
 'EU',
 'DC',
 'ECE',
 'X',
 'CAT',
 'AIIMS',
 'DNA',
 'VIT',
 'MIT',
 'IIM',
 'SBI',
 'CGL',
 'SEO',
 'PG',
 'SAT',
 'GPA',
 'G',
 'OBC',
 'BITS',
 'BA',
 'PM',
 'K',
 'SC',
 'AC',
 'II',
 'ISIS',
 'BBA',
 'GRE',
 'HR',
 'D',
 'ICSE',
 'PO',
 'PR',
 'SAP',
 'SRM',
 'AP',
 'PHP',
 'TCS',
 'NDA',
 'WWII',
 'R',
 'FBI',
 'OS',
 'HTML',
 'PDF',
 'NASA',
 'IIIT',
 'BITSAT',
 'CEO',
 'CGPA',
 'IELTS',
 'CET',
 'IP',
 'IPL',
 'M',
 'NBA',
 'NYC',
 'E',
 'UN',
 'API',
 'RBI',
 'DU',
 'UAE',
 'NCERT',
 'PCM',
 'T',
 'HIV',
 'OK',
 'SSB',
 'NFL',
 'AWS',
 'HP',
 'MCU',
 'IS',
 'MA',
 'USB',
 'ISRO',
 'GB',
 'SQL',
 'CLAT',
 'ADHD',
 'BCA',
 'ID',
 'GDP',
 'PSU',
 'ISC',
 'IPS',
 'FIFA',
 'IBPS',
 'V',
 'S',
 'BPD',
 'BHU',
 'NIFT',
 'INR',
 'IB',
 'IES',
 'UP',
 'UG',
 'UX',
 'LG

### Unknown tokens

In [61]:
unk_vocab = {}
for w in unknown_tokens:
    unk_vocab[w] = vocab[w]
unk_vocab = Counter(unk_vocab)

In [62]:
print('Total number of tokens: ', sum(vocab.values()))
print('Number of unique tokens: ', len(vocab))
print('Total number of unknown tokens: ', sum(unk_vocab.values()))
print('Number of unique unknown tokens: ', len(unknown_tokens))
print('Percent of unknown tokens: ', 100 * sum(unk_vocab.values())/sum(vocab.values()))

Total number of tokens:  19837964
Number of unique tokens:  268601
Total number of unknown tokens:  3354250
Number of unique unknown tokens:  165972
Percent of unknown tokens:  16.90823715578877


In [63]:
unk_common = dict(unk_vocab.most_common())
print('Number of 100 most common unk tokens: ', sum(list(unk_common.values())[:100]))

Number of 100 most common unk tokens:  2100848


In [64]:
unk_common

{'What': 452309,
 'I': 344086,
 'How': 274523,
 'Why': 151622,
 'Is': 115278,
 'Can': 55112,
 'Which': 49416,
 'Do': 42778,
 'If': 36135,
 'India': 34181,
 'Are': 30882,
 'Does': 24583,
 'Who': 23507,
 'Where': 20267,
 'Should': 17794,
 'Quora': 16573,
 'Will': 15335,
 'When': 15253,
 'Trump': 14824,
 'Indian': 14231,
 'US': 13251,
 'The': 11015,
 'Would': 10740,
 'In': 10644,
 'My': 8627,
 'China': 8356,
 'English': 8069,
 'Did': 8028,
 'American': 7952,
 'Have': 7571,
 'Chinese': 6614,
 'America': 6571,
 'Has': 6121,
 'USA': 5917,
 'Google': 5883,
 'A': 5697,
 'Americans': 5620,
 'University': 5617,
 'Canada': 5510,
 'Muslims': 5373,
 'Facebook': 5190,
 'JEE': 5153,
 'TV': 4887,
 'As': 4865,
 'Could': 4789,
 'United': 4679,
 'Was': 4648,
 'Indians': 4613,
 'UK': 4455,
 'God': 4353,
 'Pakistan': 4307,
 'It': 4287,
 'Delhi': 4245,
 'Muslim': 4212,
 'IIT': 4164,
 'North': 4017,
 'States': 3943,
 'Android': 3855,
 'Donald': 3831,
 'MBA': 3810,
 'New': 3765,
 'And': 3693,
 'YouTube': 3675

In [65]:
token_types(list(unk_common.keys()))

Number of num tokens:  19491
Number of symbol tokens:  1518
Number of lowcase tokens:  39402
Number of upper case tokens:  113484
Nubmer of capslock tokens:  15169


Freq num tokens:  ['WW2', '3D', 'PS4', 'H1B', 'F1', 'B2B', 'WW1', 'CO2', 'x^2', '2D']
Rare num tokens:  ['An=2', 'NBA2K18', '3.71/4.44', '39/389', 'GD1', 'N4200', 'Z2600', '10–12', 'Feb-2018', '2200-$2500/mo'] 

Freq symbol tokens:  ['..', ':(', '₹', '"-', 'करना', '\ufeff', '\n', ':/', '️', '\\\\']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['iPhone', 'iOS', 'cryptocurrencies', 'iPad', 'eBay', 'iTunes', 'pH', 'x^2', 'f(x', 'iPhones']
Rare low tokens:  ['heritor', 'proportionals', 'youvision', 'enrollnment', 'noumber', 'sgill', 'clearly"bad', 'myAIR', 'rabels', 'photopicker'] 

Freq up tokens:  ['What', 'I', 'How', 'Why', 'Is', 'Can', 'Which', 'Do', 'If', 'India']
Rare up tokens:  ['Dime', 'Entanglements', 'Tamilar', 'Flurry', 'Routines', 'UL

### Look at spacy tokenizer 

In [66]:
from spacy.lang.en.stop_words import STOP_WORDS

In [67]:
print(STOP_WORDS)

{'around', 'every', 'hereafter', 'no', 'across', 'rather', 'side', 'their', 'enough', 'anything', 'via', 'whereupon', 'nevertheless', 'anywhere', 'name', 'using', 'everyone', 'six', 'along', 'former', 'afterwards', 'below', 'to', 'others', 'well', 'wherever', 'at', 'nor', 'above', 'beside', 'eight', 'quite', 'toward', 'onto', 'another', 'latter', 'by', 'third', 'put', 'whoever', 'used', 'through', 'but', 'against', 'who', 'off', 'most', 'someone', 'his', 'together', 'yours', 'seeming', 'did', 'eleven', 'that', 'from', 'alone', 'too', 'whereby', 'as', 'each', 'next', 'other', 'any', 'everything', 'towards', 'few', 'until', 'before', 'front', 'get', 'some', 'within', 'him', 'out', 'less', 'then', 'almost', 'doing', 'has', 'herein', 'hereupon', 'elsewhere', 're', 'serious', 'this', 'hereby', 'in', 'very', 'three', 'give', 'several', 'there', 'else', 'hers', 'neither', 'down', 'hundred', 'between', 'made', 'nothing', 'never', 'seem', 'thereby', 'your', 'keep', 'seemed', 'same', 'after', 'i

### Read embedding

In [69]:
import numpy as np
embs = {}
with open(emb_path) as f:
    count = 0
    for line in f:
        if count > 90000:
            break
        count += 1
        cut = line.find(' ')
        tok = line[:cut]
        emb = line[cut + 1:].split()
        emb = np.array([float(e) for e in emb])
        embs[tok] = emb

In [74]:
low = embs['much']
up = embs['more']

In [75]:
def sim(v1, v2):
    result = sum(v1*v2)/np.linalg.norm(v1)/np.linalg.norm(v2)
    return result

In [76]:
print(sim(low, up))

0.2550954033558647
